From bd08d0ecbe355b9e0de7d07e8b9ff6ccdb150e73 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 31 Jan 2020 07:07:54 +0900 Subject: [PATCH 01/26] ARROW-7712: [CI] [Crossbow] Delete fuzzit jobs We will focus on OSS-Fuzz, now that we have been accepted there. Also add an OSS-Fuzz badge in the README. Closes #6328 from pitrou/ARROW-7712-delete-fuzzit-jobs and squashes the following commits: 6c6d2c462 Remove remaining fuzzit-related configuration a6abaa9ba ARROW-7712: Delete fuzzit jobs Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- README.md | 2 +- ci/docker/linux-apt-fuzzit.dockerfile | 47 --------------------------- ci/scripts/fuzzit_build.sh | 46 -------------------------- dev/tasks/tasks.yml | 29 ----------------- docker-compose.yml | 24 -------------- 5 files changed, 1 insertion(+), 147 deletions(-) delete mode 100644 ci/docker/linux-apt-fuzzit.dockerfile delete mode 100755 ci/scripts/fuzzit_build.sh diff --git a/README.md b/README.md index f7dbb5852c1..e6b5718a8c6 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Build Status](https://ci.appveyor.com/api/projects/status/github/apache/arrow/branch/master?svg=true)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/branch/master) [![Coverage Status](https://codecov.io/gh/apache/arrow/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) -[![Fuzzit Status](https://app.fuzzit.dev/badge?org_id=yMxZh42xl9qy6bvg3EiJ&branch=master)](https://app.fuzzit.dev/admin/yMxZh42xl9qy6bvg3EiJ/dashboard) +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt) [![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) diff --git a/ci/docker/linux-apt-fuzzit.dockerfile b/ci/docker/linux-apt-fuzzit.dockerfile deleted file mode 100644 index 5cb14020ae4..00000000000 --- a/ci/docker/linux-apt-fuzzit.dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG base -FROM ${base} - -RUN apt-get update && \ - apt-get install -y -q \ - clang-7 \ - libclang-7-dev \ - clang-format-7 \ - clang-tidy-7 \ - clang-tools-7 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN wget -O /usr/local/bin/fuzzit https://github.com/fuzzitdev/fuzzit/releases/latest/download/fuzzit_Linux_x86_64 && \ - chmod a+x /usr/local/bin/fuzzit - -ENV ARROW_FUZZING="ON" \ - ARROW_USE_ASAN="ON" \ - CC="clang-7" \ - CXX="clang++-7" \ - ARROW_BUILD_TYPE="RelWithDebInfo" \ - ARROW_FLIGHT="OFF" \ - ARROW_GANDIVA="OFF" \ - ARROW_ORC="OFF" \ - ARROW_PARQUET="OFF" \ - ARROW_PLASMA="OFF" \ - ARROW_WITH_BZ2="OFF" \ - ARROW_WITH_ZSTD="OFF" \ - ARROW_BUILD_BENCHMARKS="OFF" \ - ARROW_BUILD_UTILITIES="OFF" diff --git a/ci/scripts/fuzzit_build.sh b/ci/scripts/fuzzit_build.sh deleted file mode 100755 index b73c49cddb4..00000000000 --- a/ci/scripts/fuzzit_build.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -euxo pipefail - -export TARGET_ID=apache-arrow/arrow-ipc-fuzzing - -source_dir=${1}/cpp -build_dir=${2}/cpp - -pushd ${build_dir}/relwithdebinfo - -mkdir -p out - -cp arrow-ipc-fuzzing-test out/fuzzer -ldd arrow-ipc-fuzzing-test | grep "=> /" | awk '{print $3}' | xargs -I '{}' cp -v '{}' out/ - -pushd out -tar -czvf fuzzer.tar.gz * -stat fuzzer.tar.gz -popd - -fuzzit create job \ - --type $FUZZIT_JOB_TYPE \ - --host $FUZZIT_HOST \ - --revision $CI_ARROW_SHA \ - --branch $CI_ARROW_BRANCH \ - $TARGET_ID \ - out/fuzzer.tar.gz - -popd diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 7142491fea6..38023f7f6a4 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -122,8 +122,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - - test-ubuntu-fuzzit-fuzzing - - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-2.7-pandas-master - test-conda-python-3.7-pandas-latest @@ -300,9 +298,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - # https://issues.apache.org/jira/browse/ARROW-7712 - # - test-ubuntu-fuzzit-fuzzing - # - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-3.7-pandas-latest - test-conda-python-3.7-pandas-master @@ -2058,30 +2053,6 @@ tasks: - docker-compose build ubuntu-docs - docker-compose run ubuntu-docs - test-ubuntu-fuzzit-fuzzing: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull --ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=fuzzing ubuntu-fuzzit - - 
test-ubuntu-fuzzit-regression: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull --ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=regression ubuntu-fuzzit - ############################## Integration tests ############################ test-conda-python-2.7-pandas-latest: diff --git a/docker-compose.yml b/docker-compose.yml index 4eb92bfa14c..108a039adca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -951,30 +951,6 @@ services: command: > /bin/bash -c "pip install -e /arrow/dev/archery && archery lint" - ubuntu-fuzzit: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-fuzzit - # docker-compose run ubuntu-fuzzit - build: - context: . - dockerfile: ci/docker/linux-apt-fuzzit.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-fuzzit - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - environment: - <<: *ccache - FUZZIT_API_KEY: c0b760d37db6012fcaafd8ca5f412ba7bcd297ac969650502994b51aa11798153824442f999a067e1ef67821989ed664 - FUZZIT_HOST: bionic-llvm7 - CI_ARROW_SHA: ${CI_ARROW_SHA:-UNSET} - CI_ARROW_BRANCH: ${CI_ARROW_BRANCH:-UNSET} - volumes: *debian-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/fuzzit_build.sh /arrow /build" - ######################### Integration Tests ################################# postgres: From 942a4d0324a2d8d77a0f06eb54e6e5f5f45e3ce0 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Sun, 2 Feb 2020 19:39:48 -0800 Subject: [PATCH 02/26] ARROW-6738: [Java] Fix problems with current union comparison logic There are some problems with the current union comparison logic. For example: 1. For type check, we should not require fields to be equal. It is possible that two vectors' value ranges are equal but their fields are different. 2. We should not compare the number of sub vectors, as it is possible that two union vectors have different numbers of sub vectors, but have equal values in the range. 
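To make the element-wise comparison idea above concrete, here is a minimal, self-contained Java sketch. It is not the Arrow implementation (see the RangeEqualsVisitor changes in this patch for that); the UnionSlot and rangeEquals names are hypothetical, and the point is only that each position in the compared range is resolved to its own type tag and value, so two unions can match over a sub-range even when their children or values outside the range differ.

import java.util.List;
import java.util.Objects;

// Illustrative model only: a "union slot" is a (type tag, value) pair, similar in
// spirit to one entry of a union vector. Names here are hypothetical, not Arrow API.
final class UnionSlot {
  final String typeTag; // which child type holds the value, e.g. "INT" or "UINT4"
  final Object value;   // the value itself, null when the slot is null

  UnionSlot(String typeTag, Object value) {
    this.typeTag = typeTag;
    this.value = value;
  }
}

public final class UnionRangeEqualsSketch {
  // Compares `length` slots starting at leftStart/rightStart, element by element:
  // each pair must carry the same type tag and an equal value. The number of child
  // types and any positions outside the range are deliberately ignored.
  static boolean rangeEquals(List<UnionSlot> left, int leftStart,
                             List<UnionSlot> right, int rightStart, int length) {
    for (int i = 0; i < length; i++) {
      UnionSlot l = left.get(leftStart + i);
      UnionSlot r = right.get(rightStart + i);
      if (!l.typeTag.equals(r.typeTag) || !Objects.equals(l.value, r.value)) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<UnionSlot> a = List.of(
        new UnionSlot("UINT4", 10), new UnionSlot("INT", 20),
        new UnionSlot("INT", 20), new UnionSlot("INT", 20));
    List<UnionSlot> b = List.of(
        new UnionSlot("UINT4", 10), new UnionSlot("INT", 20),
        new UnionSlot("INT", 20), new UnionSlot("UINT4", 10));
    System.out.println(rangeEquals(a, 0, b, 0, 4)); // false: last slot differs
    System.out.println(rangeEquals(a, 1, b, 1, 2)); // true: the sub-range matches
  }
}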
Closes #5544 from liyafan82/fly_0930_share and squashes the following commits: d6ef3d220 Refine test case c008289ec Resolve test failure after rebasing c5153931e Rule out the change for union type comparison bab74028f Compare fields for all vectors except union vectors 5b2225e3c Fix the bug with decimal vector 4d8b570df Fix problems with current union comparison logic Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../main/codegen/templates/UnionVector.java | 2 +- .../vector/compare/ApproxEqualsVisitor.java | 6 ++- .../vector/compare/RangeEqualsVisitor.java | 46 +++++++++-------- .../compare/TestRangeEqualsVisitor.java | 51 +++++++++++++++++++ 4 files changed, 82 insertions(+), 23 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index db3c8a89f5e..2c28c66beb3 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -550,7 +550,7 @@ public Iterator iterator() { return vectors.iterator(); } - private ValueVector getVector(int index) { + public ValueVector getVector(int index) { int type = typeBuffer.getByte(index * TYPE_WIDTH); switch (MinorType.values()[type]) { case NULL: diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java index ab5ce94e7e3..bcf8c64e0ce 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java @@ -109,8 +109,10 @@ public Boolean visit(BaseFixedWidthVector left, Range range) { } @Override - protected ApproxEqualsVisitor createInnerVisitor(ValueVector left, ValueVector right) { - return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone()); + protected ApproxEqualsVisitor createInnerVisitor( + ValueVector left, ValueVector right, + BiFunction typeComparator) { + return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone(), typeComparator); } private boolean float4ApproxEquals(Range range) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index cbba0b82039..49fa69f5ac7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -24,7 +24,6 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; -import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; @@ -193,12 +192,9 @@ public Boolean visit(NullVector left, Range range) { return true; } - /** - * Creates a visitor to visit child vectors. - * It is used for complex vector types. - * @return the visitor for child vecors. 
- */ - protected RangeEqualsVisitor createInnerVisitor(ValueVector leftInner, ValueVector rightInner) { + protected RangeEqualsVisitor createInnerVisitor( + ValueVector leftInner, ValueVector rightInner, + BiFunction typeComparator) { return new RangeEqualsVisitor(leftInner, rightInner, typeComparator); } @@ -206,16 +202,23 @@ protected boolean compareUnionVectors(Range range) { UnionVector leftVector = (UnionVector) left; UnionVector rightVector = (UnionVector) right; - List leftChildren = leftVector.getChildrenFromFields(); - List rightChildren = rightVector.getChildrenFromFields(); - - if (leftChildren.size() != rightChildren.size()) { - return false; - } - - for (int k = 0; k < leftChildren.size(); k++) { - RangeEqualsVisitor visitor = createInnerVisitor(leftChildren.get(k), rightChildren.get(k)); - if (!visitor.rangeEquals(range)) { + Range subRange = new Range(0, 0, 1); + for (int i = 0; i < range.getLength(); i++) { + subRange.setLeftStart(range.getLeftStart() + i).setRightStart(range.getRightStart() + i); + ValueVector leftSubVector = leftVector.getVector(range.getLeftStart() + i); + ValueVector rightSubVector = rightVector.getVector(range.getRightStart() + i); + + if (leftSubVector == null || rightSubVector == null) { + if (leftSubVector == rightSubVector) { + continue; + } else { + return false; + } + } + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightSubVector); + RangeEqualsVisitor visitor = + createInnerVisitor(leftSubVector, rightSubVector, (left, right) -> typeVisitor.equals(left)); + if (!visitor.rangeEquals(subRange)) { return false; } } @@ -232,7 +235,8 @@ protected boolean compareStructVectors(Range range) { } for (String name : leftChildNames) { - RangeEqualsVisitor visitor = createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name)); + RangeEqualsVisitor visitor = + createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name), /*type comparator*/ null); if (!visitor.rangeEquals(range)) { return false; } @@ -311,7 +315,8 @@ protected boolean compareListVectors(Range range) { ListVector leftVector = (ListVector) left; ListVector rightVector = (ListVector) right; - RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector()); + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); Range innerRange = new Range(); for (int i = 0; i < range.getLength(); i++) { @@ -357,7 +362,8 @@ protected boolean compareFixedSizeListVectors(Range range) { } int listSize = leftVector.getListSize(); - RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector()); + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); Range innerRange = new Range(0, 0, listSize); for (int i = 0; i < range.getLength(); i++) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 183c8abac18..c35fd4107a9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -282,6 +282,57 @@ public void testUnionVectorRangeEquals() { } } + /** + * Test comparing two union vectors. 
+ * The two vectors are different in total, but have a range with equal values. + */ + @Test + public void testUnionVectorSubRangeEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, null, null); + final UnionVector vector2 = new UnionVector("union", allocator, null, null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + + vector1.setType(3, Types.MinorType.INT); + vector1.setSafe(3, intHolder); + + vector1.setValueCount(4); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + + vector2.setType(3, Types.MinorType.UINT4); + vector2.setSafe(3, uInt4Holder); + + vector2.setValueCount(4); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 4))); + assertTrue(visitor.rangeEquals(new Range(1, 1, 2))); + } + } + @Ignore @Test public void testEqualsWithOutTypeCheck() { From 49aada21d6f4a0e0e0a8cc651884fe402eaeb5fd Mon Sep 17 00:00:00 2001 From: David Li Date: Sun, 2 Feb 2020 19:42:55 -0800 Subject: [PATCH 03/26] ARROW-7734: [C++] check status details for nullptr in equality When checking statuses for equality, check to make sure that both have status detail objects before proceeding to compare those objects. Closes #6332 from lidavidm/arrow-7734 and squashes the following commits: 9084bcec7 ARROW-7734: check status details for nullptr in equality Authored-by: David Li Signed-off-by: Micah Kornfield --- cpp/src/arrow/status.h | 8 ++++++-- cpp/src/arrow/status_test.cc | 13 +++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index c44e311b7b2..df3ea6b9af6 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -292,6 +292,7 @@ class ARROW_EXPORT Status : public util::EqualityComparable, } bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } + bool IsAlreadyExists() const { return code() == StatusCode::AlreadyExists; } /// \brief Return a string representation of this status suitable for printing. 
/// @@ -385,8 +386,11 @@ bool Status::Equals(const Status& s) const { return false; } - if (detail() != s.detail() && !(*detail() == *s.detail())) { - return false; + if (detail() != s.detail()) { + if ((detail() && !s.detail()) || (!detail() && s.detail())) { + return false; + } + return *detail() == *s.detail(); } return code() == s.code() && message() == s.message(); diff --git a/cpp/src/arrow/status_test.cc b/cpp/src/arrow/status_test.cc index 593ad2194bf..fc5a7ec45cf 100644 --- a/cpp/src/arrow/status_test.cc +++ b/cpp/src/arrow/status_test.cc @@ -114,4 +114,17 @@ TEST(StatusTest, TestEquality) { ASSERT_NE(Status::Invalid("error"), Status::Invalid("other error")); } +TEST(StatusTest, TestDetailEquality) { + const auto status_with_detail = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_with_detail2 = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_without_detail = arrow::Status::IOError(""); + + ASSERT_EQ(*status_with_detail.detail(), *status_with_detail2.detail()); + ASSERT_EQ(status_with_detail, status_with_detail2); + ASSERT_NE(status_with_detail, status_without_detail); + ASSERT_NE(status_without_detail, status_with_detail); +} + } // namespace arrow From af24bb7ec0a73b725d5b0ac54f14a27b938eb377 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 2 Feb 2020 19:47:01 -0800 Subject: [PATCH 04/26] ARROW-6724: [C++] Allow simpler BufferOutputStream creation Make the initial capacity argument optional. Closes #6327 from pitrou/ARROW-6724-bufferoutstream-simpler-ctor and squashes the following commits: b563f4601 ARROW-6724: Allow simpler BufferOutputStream creation Authored-by: Antoine Pitrou Signed-off-by: Micah Kornfield --- cpp/src/arrow/dataset/file_ipc_test.cc | 7 ++----- cpp/src/arrow/extension_type_test.cc | 4 ++-- cpp/src/arrow/io/compressed_test.cc | 2 +- cpp/src/arrow/io/memory.h | 2 +- cpp/src/arrow/ipc/feather_test.cc | 4 ++-- cpp/src/parquet/reader_test.cc | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc index 29cfc384a42..49fb6879008 100644 --- a/cpp/src/arrow/dataset/file_ipc_test.cc +++ b/cpp/src/arrow/dataset/file_ipc_test.cc @@ -34,7 +34,6 @@ namespace arrow { namespace dataset { -constexpr int64_t kDefaultOutputStreamSize = 1024; constexpr int64_t kBatchSize = 1UL << 12; constexpr int64_t kBatchRepetitions = 1 << 5; constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; @@ -42,8 +41,7 @@ constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; class ArrowIpcWriterMixin : public ::testing::Test { public: std::shared_ptr Write(std::vector readers) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); auto writer_schema = readers[0]->schema(); EXPECT_OK_AND_ASSIGN(auto writer, @@ -69,8 +67,7 @@ class ArrowIpcWriterMixin : public ::testing::Test { } std::shared_ptr Write(const Table& table) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); EXPECT_OK_AND_ASSIGN(auto writer, ipc::RecordBatchFileWriter::Open(sink.get(), table.schema())); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index aead8ab25e4..a26f57e667f 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ 
b/cpp/src/arrow/extension_type_test.cc @@ -217,7 +217,7 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { auto RoundtripBatch = [](const std::shared_ptr& batch, std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); @@ -255,7 +255,7 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { // Write full IPC stream including schema, then unregister type, then read // and ensure that a plain instance of the storage type is created - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); diff --git a/cpp/src/arrow/io/compressed_test.cc b/cpp/src/arrow/io/compressed_test.cc index 71b2012b67d..df969feb3b6 100644 --- a/cpp/src/arrow/io/compressed_test.cc +++ b/cpp/src/arrow/io/compressed_test.cc @@ -127,7 +127,7 @@ void CheckCompressedInputStream(Codec* codec, const std::vector& data) void CheckCompressedOutputStream(Codec* codec, const std::vector& data, bool do_flush) { // Create compressed output stream - ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create()); ASSERT_OK_AND_ASSIGN(auto stream, CompressedOutputStream::Make(codec, buffer_writer)); ASSERT_OK_AND_EQ(0, stream->Tell()); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 185a972a6b6..0b710aff325 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -46,7 +46,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { /// \param[in,out] pool a MemoryPool to use for allocations /// \return the created stream static Result> Create( - int64_t initial_capacity, MemoryPool* pool = default_memory_pool()); + int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool()); ARROW_DEPRECATED("Use Result-returning overload") static Status Create(int64_t initial_capacity, MemoryPool* pool, diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 74b94a10169..b2c2994aece 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -281,7 +281,7 @@ void CheckBatches(const RecordBatch& expected, const RecordBatch& result) { class TestTableReader : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } @@ -356,7 +356,7 @@ TEST_F(TestTableReader, ReadNames) { class TestTableWriter : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 3ec4246eac4..f28b22639d3 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -412,7 +412,7 @@ TEST(TestFileReader, BufferedReads) { std::shared_ptr writer_props = WriterProperties::Builder().write_batch_size(64)->data_pagesize(128)->build(); - ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create(1024)); + 
ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create()); std::shared_ptr file_writer = ParquetFileWriter::Open(out_file, schema, writer_props); From 85996e6c2e9af4700a3410b68d5a917605e24cf1 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sun, 2 Feb 2020 19:51:30 -0800 Subject: [PATCH 05/26] ARROW-6871: [Java] Enhance TransferPair related parameters check and tests Related to [ARROW-6871](https://issues.apache.org/jira/browse/ARROW-6871). TransferPair related param checks in different classes have potential problems: i. splitAndTansfer has no indices check in classes like VarcharVector ii. splitAndTranser indices check in classes like UnionVector is not correct (Preconditions.checkArgument(startIndex + length <= valueCount)), should check params separately. iii. should add more UT to cover corner cases. Closes #5645 from tianchen92/ARROW-6871 and squashes the following commits: f3b897ddd fix style 0d3c7eab3 add benchmark 01f9a48f2 revert changes in copyFrom a22d58abb ARROW-6871: Enhance TransferPair related parameters check and tests Authored-by: tianchen Signed-off-by: Micah Kornfield --- .../sort/FixedWidthInPlaceVectorSorter.java | 1 + .../vector/util/TransferPairBenchmarks.java | 123 +++++++++++++ .../main/codegen/templates/UnionVector.java | 5 +- .../arrow/vector/BaseFixedWidthVector.java | 5 +- .../arrow/vector/BaseVariableWidthVector.java | 5 +- .../org/apache/arrow/vector/BitVector.java | 5 +- .../vector/complex/FixedSizeListVector.java | 5 +- .../arrow/vector/complex/ListVector.java | 5 +- .../arrow/vector/complex/StructVector.java | 5 +- .../arrow/vector/TestSplitAndTransfer.java | 166 ++++++++++++++++-- 10 files changed, 306 insertions(+), 19 deletions(-) create mode 100644 java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java index bac25cf8165..c5ecd8acb9b 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java @@ -46,6 +46,7 @@ public void sortInPlace(V vec, VectorValueComparator comparator) { this.comparator = comparator; this.pivotBuffer = (V) vec.getField().createVector(vec.getAllocator()); this.pivotBuffer.allocateNew(1); + this.pivotBuffer.setValueCount(1); comparator.attachVectors(vec, pivotBuffer); quickSort(); diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java new file mode 100644 index 00000000000..235eca53c84 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * Benchmarks for {@link TransferPair}. + */ +@State(Scope.Benchmark) +public class TransferPairBenchmarks { + + private static final int VECTOR_LENGTH = 1024; + + private static final int ALLOCATOR_CAPACITY = 1024 * 1024; + + private BufferAllocator allocator; + + private IntVector intVector; + + private VarCharVector varCharVector; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + allocator = new RootAllocator(ALLOCATOR_CAPACITY); + intVector = new IntVector("intVector", allocator); + varCharVector = new VarCharVector("varcharVector", allocator); + + intVector.allocateNew(VECTOR_LENGTH); + varCharVector.allocateNew(VECTOR_LENGTH); + + for (int i = 0; i < VECTOR_LENGTH; i++) { + if (i % 3 == 0) { + intVector.setNull(i); + varCharVector.setNull(i); + } else { + intVector.setSafe(i, i * i); + varCharVector.setSafe(i, ("teststring" + i).getBytes(StandardCharsets.UTF_8)); + } + } + intVector.setValueCount(VECTOR_LENGTH); + varCharVector.setValueCount(VECTOR_LENGTH); + } + + /** + * Tear down benchmarks. 
+ */ + @TearDown + public void tearDown() { + intVector.close(); + varCharVector.close();; + allocator.close(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MICROSECONDS) + public int splitAndTransferIntVector() { + IntVector toVector = new IntVector("intVector", allocator); + toVector.setValueCount(VECTOR_LENGTH); + TransferPair transferPair = intVector.makeTransferPair(toVector); + transferPair.splitAndTransfer(0, VECTOR_LENGTH); + toVector.close(); + return 0; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MICROSECONDS) + public int splitAndTransferVarcharVector() { + VarCharVector toVector = new VarCharVector("varcharVector", allocator); + toVector.setValueCount(VECTOR_LENGTH); + TransferPair transferPair = varCharVector.makeTransferPair(toVector); + transferPair.splitAndTransfer(0, VECTOR_LENGTH); + toVector.close(); + return 0; + } + + public static void main(String [] args) throws RunnerException { + Options opt = new OptionsBuilder() + .include(TransferPairBenchmarks.class.getSimpleName()) + .forks(1) + .build(); + + new Runner(opt).run(); + } +} diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 2c28c66beb3..d760fe53fc6 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -469,7 +469,10 @@ public void transfer() { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); to.clear(); internalStructVectorTransferPair.splitAndTransfer(startIndex, length); final int startPoint = startIndex * TYPE_WIDTH; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index ed91d6e659b..a4e94bcac09 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -584,7 +584,10 @@ public void transferTo(BaseFixedWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 7e276445643..89395ef7e22 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -714,6 +714,10 @@ public void transferTo(BaseVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseVariableWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", 
startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); @@ -750,7 +754,6 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariab */ private void splitAndTransferValidityBuffer(int startIndex, int length, BaseVariableWidthVector target) { - Preconditions.checkArgument(startIndex + length <= valueCount); int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 2fa5fdef4ef..b41c62677fc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -158,6 +158,10 @@ public int getBufferSize() { * @param target destination vector */ public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); target.validityBuffer = splitAndTransferBuffer(startIndex, length, target, @@ -174,7 +178,6 @@ private ArrowBuf splitAndTransferBuffer( BaseFixedWidthVector target, ArrowBuf sourceBuffer, ArrowBuf destBuffer) { - assert startIndex + length <= valueCount; int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index 653719feafe..8fa43fb06b7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -590,7 +590,10 @@ public void transfer() { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); final int startPoint = listSize * startIndex; final int sliceLength = listSize * length; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 59481d39399..312a3556082 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -493,7 +493,10 @@ public void transfer() { */ @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", 
length); final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index ec922a032a9..7b22835963a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -195,7 +195,10 @@ public void copyValueSafe(int fromIndex, int toIndex) { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); super.splitAndTransfer(startIndex, length); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 6405e256b22..d5e4a00d599 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -18,16 +18,17 @@ package org.apache.arrow.vector; import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.junit.jupiter.api.Assertions; public class TestSplitAndTransfer { private BufferAllocator allocator; @@ -41,6 +42,17 @@ public void init() { public void terminate() throws Exception { allocator.close(); } + + private void populateVarcharVector(final VarCharVector vector, int valueCount, String[] compareArray) { + for (int i = 0; i < valueCount; i += 3) { + final String s = String.format("%010d", i); + vector.set(i, s.getBytes()); + if (compareArray != null) { + compareArray[i] = s; + } + } + vector.setValueCount(valueCount); + } @Test /* VarCharVector */ public void test() throws Exception { @@ -50,12 +62,7 @@ public void test() throws Exception { final int valueCount = 500; final String[] compareArray = new String[valueCount]; - for (int i = 0; i < valueCount; i += 3) { - final String s = String.format("%010d", i); - varCharVector.set(i, s.getBytes()); - compareArray[i] = s; - } - varCharVector.setValueCount(valueCount); + populateVarcharVector(varCharVector, valueCount, compareArray); final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); @@ -89,11 +96,7 @@ public void testMemoryConstrainedTransfer() { final int valueCount = 1000; - for (int i = 0; i < valueCount; i += 3) { - final String s = String.format("%010d", i); - varCharVector.set(i, s.getBytes()); - } - varCharVector.setValueCount(valueCount); + populateVarcharVector(varCharVector, valueCount, null); final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = 
(VarCharVector) tp.getTo(); @@ -107,4 +110,143 @@ public void testMemoryConstrainedTransfer() { } } } + + @Test + public void testTransfer() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + populateVarcharVector(varCharVector, valueCount, compareArray); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + tp.transfer(); + + assertEquals(0, varCharVector.valueCount); + assertEquals(valueCount, newVarCharVector.valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[i].getBytes(); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + + newVarCharVector.clear(); + } + } + + @Test + public void testCopyValueSafe() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. + for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(varCharVector.isNull(i)); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(varCharVector.get(i), newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, 0); + assertEquals(0, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testSplitAndTransferAll() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidStartIndex() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = 
Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid startIndex: 500", e.getMessage()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid length: 1000", e.getMessage()); + + newVarCharVector.clear(); + } + } } From 4c7bfc7e1ca9efef00a0cb0d5539b13fd67acf7b Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Sun, 2 Feb 2020 20:28:40 -0800 Subject: [PATCH 06/26] ARROW-7301: [Java] Sql type DATE should correspond to DateDayVector According to SQL convertion, sql type DATE should correspond to a format of YYYY-MM-DD, without the components for hour/minute/second/millis Therefore, JDBC type DATE should correspond to DateDayVector, with a type width of 4, instead of 8. Closes #5944 from liyafan82/fly_1203_date and squashes the following commits: a6de37756 Remove division in time conversion be731925d Resolve comments eea8b7911 Sql type DATE should correspond to DateDayVector Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../arrow/adapter/jdbc/JdbcToArrowUtils.java | 8 ++-- .../adapter/jdbc/consumer/DateConsumer.java | 46 ++++++++++++++----- .../adapter/jdbc/JdbcToArrowTestHelper.java | 12 ++--- .../jdbc/h2/JdbcToArrowDataTypesTest.java | 6 +-- .../adapter/jdbc/h2/JdbcToArrowNullTest.java | 10 ++-- .../adapter/jdbc/h2/JdbcToArrowTest.java | 6 +-- .../jdbc/h2/JdbcToArrowTimeZoneTest.java | 6 +-- .../h2/JdbcToArrowVectorIteratorTest.java | 14 +++--- .../resources/h2/test1_all_datatypes_h2.yml | 2 +- ...t1_all_datatypes_selected_null_rows_h2.yml | 2 +- .../src/test/resources/h2/test1_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_est_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_gmt_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_pst_date_h2.yml | 20 ++++---- .../apache/arrow/vector/DateDayVector.java | 1 + 15 files changed, 110 insertions(+), 83 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index 21278e4cabd..f7bf3248eb4 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -59,7 +59,7 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.Float4Vector; @@ -210,7 +210,7 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig *
  BINARY --> ArrowType.Binary
 *   VARBINARY --> ArrowType.Binary
 *   LONGVARBINARY --> ArrowType.Binary
- *   DATE --> ArrowType.Date(DateUnit.MILLISECOND)
+ *   DATE --> ArrowType.Date(DateUnit.DAY)
 *   TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
 *   TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)
 *   CLOB --> ArrowType.Utf8
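The mapping change above switches DATE from millisecond to day granularity. As a hedged illustration (the DateDayExample class name is hypothetical and not part of this patch), the sketch below shows the conversion the new day-based mapping implies, mirroring the TimeUnit.MILLISECONDS.toDays call used by the consumer later in this patch: a 4-byte day count replaces the 8-byte millisecond value.

import java.util.concurrent.TimeUnit;

// Illustrative only: converts a JDBC date's epoch milliseconds into the whole-day
// count that a day-granularity date vector would store.
public class DateDayExample {
  public static void main(String[] args) {
    java.sql.Date date = java.sql.Date.valueOf("2020-02-02");
    long millis = date.getTime();                            // epoch milliseconds
    int days = (int) TimeUnit.MILLISECONDS.toDays(millis);   // whole days since epoch
    System.out.println(millis + " ms -> " + days + " days");
  }
}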
  • @@ -265,7 +265,7 @@ public static ArrowType getArrowTypeForJdbcField(JdbcFieldInfo fieldInfo, Calend case Types.CLOB: return new ArrowType.Utf8(); case Types.DATE: - return new ArrowType.Date(DateUnit.MILLISECOND); + return new ArrowType.Date(DateUnit.DAY); case Types.TIME: return new ArrowType.Time(TimeUnit.MILLISECOND, 32); case Types.TIMESTAMP: @@ -402,7 +402,7 @@ static JdbcConsumer getConsumer(ResultSet resultSet, int columnIndex, int jdbcCo case Types.LONGNVARCHAR: return VarCharConsumer.createConsumer((VarCharVector) vector, columnIndex, nullable); case Types.DATE: - return DateConsumer.createConsumer((DateMilliVector) vector, columnIndex, nullable, calendar); + return DateConsumer.createConsumer((DateDayVector) vector, columnIndex, nullable, calendar); case Types.TIME: return TimeConsumer.createConsumer((TimeMilliVector) vector, columnIndex, nullable, calendar); case Types.TIMESTAMP: diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java index 2fbffa08cf7..bb9ab1cf0b9 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java @@ -17,11 +17,15 @@ package org.apache.arrow.adapter.jdbc.consumer; +import java.sql.Date; import java.sql.ResultSet; import java.sql.SQLException; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Calendar; -import java.util.Date; +import java.util.concurrent.TimeUnit; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; /** @@ -30,11 +34,23 @@ */ public class DateConsumer { + public static final int MAX_DAY; + + static { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + try { + java.util.Date date = dateFormat.parse("9999-12-31"); + MAX_DAY = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + } catch (ParseException e) { + throw new IllegalArgumentException("Failed to parse max day", e); + } + } + /** * Creates a consumer for {@link DateMilliVector}. */ - public static JdbcConsumer createConsumer( - DateMilliVector vector, int index, boolean nullable, Calendar calendar) { + public static JdbcConsumer createConsumer( + DateDayVector vector, int index, boolean nullable, Calendar calendar) { if (nullable) { return new NullableDateConsumer(vector, index, calendar); } else { @@ -45,21 +61,21 @@ public static JdbcConsumer createConsumer( /** * Nullable consumer for date. */ - static class NullableDateConsumer extends BaseConsumer { + static class NullableDateConsumer extends BaseConsumer { protected final Calendar calendar; /** * Instantiate a DateConsumer. */ - public NullableDateConsumer(DateMilliVector vector, int index) { + public NullableDateConsumer(DateDayVector vector, int index) { this(vector, index, /* calendar */null); } /** * Instantiate a DateConsumer. */ - public NullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) { + public NullableDateConsumer(DateDayVector vector, int index, Calendar calendar) { super(vector, index); this.calendar = calendar; } @@ -69,7 +85,11 @@ public void consume(ResultSet resultSet) throws SQLException { Date date = calendar == null ? 
resultSet.getDate(columnIndexInResultSet) : resultSet.getDate(columnIndexInResultSet, calendar); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex, date.getTime()); + int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + if (day < 0 || day > MAX_DAY) { + throw new IllegalArgumentException("Day overflow: " + day); + } + vector.setSafe(currentIndex, day); } currentIndex++; } @@ -78,21 +98,21 @@ public void consume(ResultSet resultSet) throws SQLException { /** * Non-nullable consumer for date. */ - static class NonNullableDateConsumer extends BaseConsumer { + static class NonNullableDateConsumer extends BaseConsumer { protected final Calendar calendar; /** * Instantiate a DateConsumer. */ - public NonNullableDateConsumer(DateMilliVector vector, int index) { + public NonNullableDateConsumer(DateDayVector vector, int index) { this(vector, index, /* calendar */null); } /** * Instantiate a DateConsumer. */ - public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) { + public NonNullableDateConsumer(DateDayVector vector, int index, Calendar calendar) { super(vector, index); this.calendar = calendar; } @@ -101,7 +121,11 @@ public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calen public void consume(ResultSet resultSet) throws SQLException { Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) : resultSet.getDate(columnIndexInResultSet, calendar); - vector.setSafe(currentIndex, date.getTime()); + int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + if (day < 0 || day > MAX_DAY) { + throw new IllegalArgumentException("Day overflow: " + day); + } + vector.setSafe(currentIndex, day); currentIndex++; } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java index 3e21144c0ca..c194dfbc355 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java @@ -32,7 +32,7 @@ import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -173,14 +173,14 @@ public static void assertTimeVectorValues(TimeMilliVector timeMilliVector, int r } } - public static void assertDateVectorValues(DateMilliVector dateMilliVector, int rowCount, Long[] values) { - assertEquals(rowCount, dateMilliVector.getValueCount()); + public static void assertDateVectorValues(DateDayVector dateDayVector, int rowCount, Integer[] values) { + assertEquals(rowCount, dateDayVector.getValueCount()); - for (int j = 0; j < dateMilliVector.getValueCount(); j++) { + for (int j = 0; j < dateDayVector.getValueCount(); j++) { if (values[j] == null) { - assertTrue(dateMilliVector.isNull(j)); + assertTrue(dateDayVector.isNull(j)); } else { - assertEquals(values[j].longValue(), dateMilliVector.get(j)); + assertEquals(values[j].longValue(), dateDayVector.get(j)); } } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index 
650c766e02a..cafb7a050d7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -50,7 +50,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { table.getCharValues()); break; case DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case TIME: assertTimeVectorValues((TimeMilliVector) root.getVector(table.getVector()), table.getValues().length, diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index 199b821ba19..57fcf566d7d 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -58,7 +58,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -206,8 +206,8 @@ private void testAllVectorValues(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); @@ -242,7 +242,7 @@ public void sqlToArrowTestNullValues(String[] vectors, VectorSchemaRoot root, in assertNullValues((Float8Vector) root.getVector(vectors[6]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[7]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[8]), rowCount); - assertNullValues((DateMilliVector) root.getVector(vectors[9]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[9]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[10]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[11]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[12]), rowCount); @@ -265,7 +265,7 @@ public void sqlToArrowTestSelectedNullColumnsValues(String[] vectors, VectorSche assertNullValues((Float8Vector) root.getVector(vectors[2]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[3]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[4]), rowCount); - assertNullValues((DateMilliVector) 
root.getVector(vectors[5]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[5]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[6]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[7]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[8]), rowCount); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index dfcdb90be7c..b7d517f26e1 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -79,7 +79,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 002a1239101..ef2b406d120 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -37,7 +37,7 @@ import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; import org.apache.arrow.adapter.jdbc.Table; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeStampVector; import org.apache.arrow.vector.VectorSchemaRoot; @@ -145,8 +145,8 @@ public void testDataSets(VectorSchemaRoot root) { case EST_DATE: case GMT_DATE: case PST_DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case EST_TIME: case GMT_TIME: diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java index d041f082b0b..fa75c1cdbaf 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java @@ -38,7 +38,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import 
org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -51,6 +51,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -66,6 +67,7 @@ public JdbcToArrowVectorIteratorTest(Table table) { super(table); } + @Test @Override public void testJdbcToArrowValues() throws SQLException, IOException { @@ -91,7 +93,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep List vectorsForChar = new ArrayList<>(); List vectorsForBit = new ArrayList<>(); List vectorsForBool = new ArrayList<>(); - List dateMilliVectors = new ArrayList<>(); + List dateDayVectors = new ArrayList<>(); List timeMilliVectors = new ArrayList<>(); List timeStampVectors = new ArrayList<>(); List decimalVectors = new ArrayList<>(); @@ -115,7 +117,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep vectorsForChar.add((VarCharVector) root.getVector(CHAR)); vectorsForBit.add((BitVector) root.getVector(BIT)); vectorsForBool.add((BitVector) root.getVector(BOOL)); - dateMilliVectors.add((DateMilliVector) root.getVector(DATE)); + dateDayVectors.add((DateDayVector) root.getVector(DATE)); timeMilliVectors.add((TimeMilliVector) root.getVector(TIME)); timeStampVectors.add((TimeStampVector) root.getVector(TIMESTAMP)); decimalVectors.add((DecimalVector) root.getVector(DECIMAL)); @@ -134,7 +136,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep assertVarCharVectorValues(vectorsForChar, table.getRowCount(), getCharArray(table.getValues(), CHAR)); assertBitVectorValues(vectorsForBit, table.getRowCount(), getIntValues(table.getValues(), BIT)); assertBooleanVectorValues(vectorsForBool, table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateMilliVectorValues(dateMilliVectors, table.getRowCount(), getLongValues(table.getValues(), DATE)); + assertDateDayVectorValues(dateDayVectors, table.getRowCount(), getLongValues(table.getValues(), DATE)); assertTimeMilliVectorValues(timeMilliVectors, table.getRowCount(), getLongValues(table.getValues(), TIME)); assertTimeStampVectorValues(timeStampVectors, table.getRowCount(), getLongValues(table.getValues(), TIMESTAMP)); assertDecimalVectorValues(decimalVectors, table.getRowCount(), getDecimalValues(table.getValues(), DECIMAL)); @@ -205,12 +207,12 @@ private void assertTimeMilliVectorValues(List vectors, int rowC } } - private void assertDateMilliVectorValues(List vectors, int rowCount, Long[] values) { + private void assertDateDayVectorValues(List vectors, int rowCount, Long[] values) { int valueCount = vectors.stream().mapToInt(ValueVector::getValueCount).sum(); assertEquals(rowCount, valueCount); int index = 0; - for (DateMilliVector vector : vectors) { + for (DateDayVector vector : vectors) { for (int i = 0; i < vector.getValueCount(); i++) { assertEquals(values[index++].longValue(), vector.get(i)); } diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml index fc9784df7ff..9baae643aef 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml +++ 
b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml @@ -85,7 +85,7 @@ values: - 'DECIMAL_FIELD6=17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23' - 'DOUBLE_FIELD7=56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345' - 'TIME_FIELD9=45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000' - - 'DATE_FIELD10=1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000' + - 'DATE_FIELD10=17574,17574,17574,17574,17574,17574,17574,17574,17574,17574' - 'TIMESTAMP_FIELD11=1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000' - 'CHAR_FIELD16=some char text,some char text,some char text,some char text,some char text, some char text,some char text,some char text,some char text,some char text' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml index bb9bfb81548..4be8ab86ecc 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml @@ -71,7 +71,7 @@ values: - 'DECIMAL_FIELD6=null,17345667789.23,null,17345667789.23,null' - 'DOUBLE_FIELD7=null,56478356785.345,null,56478356785.345,null' - 'TIME_FIELD9=null,45935000,null,45935000,null' - - 'DATE_FIELD10=null,1518393600000,null,1518393600000,null' + - 'DATE_FIELD10=null,17574,null,17574,null' - 'TIMESTAMP_FIELD11=null,1518439535000,null,1518439535000,null' - 'CHAR_FIELD16=null,some char text,null,some char text,null' - 'VARCHAR_FIELD13=null,some text that needs to be converted to varchar,null, diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml index 45aa56c7417..da33d9d47cd 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml @@ -34,13 +34,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml index 290c32ebafa..1868db3adeb 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git 
a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml index 03929c936e4..65824861a8a 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml index 81a668f37a4..798cfc7d67d 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java index 7f244bfa658..9448a1a1453 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java @@ -37,6 +37,7 @@ * maintained to track which elements in the vector are null. */ public final class DateDayVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; private final FieldReader reader; From ab3d86e70adb4ac95b5fa7e3f6e46048dd7648f4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 3 Feb 2020 11:56:47 +0100 Subject: [PATCH 07/26] ARROW-7736: [Release] Retry binary download on transient error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes binary verification more robust. 
Closes #6335 from kou/release-verification-binaries-retry and squashes the following commits: 38944a9c7 Retry binary download on transient error Authored-by: Sutou Kouhei Signed-off-by: Krisztián Szűcs --- dev/release/download_rc_binaries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 dev/release/download_rc_binaries.py diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py old mode 100644 new mode 100755 index b2dfc867bc0..da66e483d37 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -112,7 +112,7 @@ def _download_file(self, dest, info): bintray_abspath = os.path.join(BINTRAY_DL_ROOT, self.repo, relpath) cmd = [ - 'curl', '--fail', '--location', + 'curl', '--fail', '--location', '--retry', '5', '--output', dest_path, bintray_abspath ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, From 19b0d4b1164a7f73cf46b298426f1d242f29dd62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 08:35:27 -0500 Subject: [PATCH 08/26] ARROW-7466: [CI][Java] Fix gandiva-jar-osx nightly build failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #6331 from kszucs/ARROW-7466 and squashes the following commits: d50b8c8975 don't install already installed python3 d956297c74 re-enable steps 421142dba8 start all arguments at the same column... 2131836b6f travis multi line string b50865e70c use travis_build_dir 6f1beb6851 debug paths 7b054ecabf queue path 6929f3d2c7 fix crossbow path 2a2d7c378b deploy using crossbow Authored-by: Krisztián Szűcs Signed-off-by: François Saint-Jacques --- dev/tasks/gandiva-jars/travis.osx.yml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dev/tasks/gandiva-jars/travis.osx.yml b/dev/tasks/gandiva-jars/travis.osx.yml index 5626b1b795d..8a43e3a2429 100644 --- a/dev/tasks/gandiva-jars/travis.osx.yml +++ b/dev/tasks/gandiva-jars/travis.osx.yml @@ -33,10 +33,10 @@ env: - ARROW_TRAVIS_GANDIVA=1 before_script: + - pwd - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - if [ $CROSSBOW_USE_COMMIT_ID = true ]; then git -C arrow checkout {{ arrow.head }}; else git -C arrow checkout FETCH_HEAD; fi - - export TRAVIS_BUILD_DIR=$TRAVIS_BUILD_DIR/arrow - brew update - brew install bison flex llvm@7 @@ -52,15 +52,17 @@ script: - mkdir -p dist - dev/tasks/gandiva-jars/build-cpp-osx.sh || travis_terminate 1 - dev/tasks/gandiva-jars/build-java.sh || travis_terminate 1 - -deploy: - provider: releases - api_key: $CROSSBOW_GITHUB_TOKEN - file_glob: true - file: dist/*.jar - skip_cleanup: true - on: - tags: true + # deploy using crossbow + - brew install libgit2 + - pip3 install click github3.py jinja2 jira pygit2 ruamel.yaml setuptools_scm toolz + - > + python3 dev/tasks/crossbow.py + --queue-path $TRAVIS_BUILD_DIR + --queue-remote {{ queue.remote_url }} + upload-artifacts + --sha {{ task.branch }} + --tag {{ task.tag }} + --pattern "dist/*.jar" notifications: email: From d091894a6d5c24543e57722e59224a76ca04e7b0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 3 Feb 2020 08:18:12 -0700 Subject: [PATCH 09/26] ARROW-7684: [Rust] Example Flight client and server for DataFusion This PR adds DataFusion examples for a Flight client and server where the client can send a SQL query to the server and then receive the results. I have manually tested with a Java client as well to confirm that it works. 
Closes #6308 from andygrove/datafusion-flight-example and squashes the following commits: 788feef8c code cleanup 9c473389a Complete flight client's record batch reader 1337b9890 parse recordbatch 459bef386 client parses schema from ipc batches 31c894bbc update release test script efe05aef5 update release test script 5ecea83b1 formatting 8b419dab3 update release test script 03d2c8437 client streams results 0a39a513d client can stream batches e72c60545 add starting point for flight-client example ab28da82e get schema from query plan instead of from first batch 0901a3f88 Merge branch 'datafusion-flight-example' of https://github.com/andygrove/arrow into datafusion-flight-example ad2e3b066 send schema before batches 996f2a4a2 Use PARQUET_TEST_DATA env var 260f9cac1 fix license violation 516b66dc8 add helpers to convert record batch to flight data proto message 6beb4eafb WIP example Flight server for DataFusion Lead-authored-by: Andy Grove Co-authored-by: Neville Dipale Signed-off-by: Andy Grove --- dev/release/00-prepare-test.rb | 8 +- rust/arrow/Cargo.toml | 4 +- rust/arrow/src/flight/mod.rs | 83 +++++++++++ rust/arrow/src/ipc/convert.rs | 6 + rust/arrow/src/ipc/reader.rs | 2 +- rust/arrow/src/ipc/writer.rs | 37 +++-- rust/arrow/src/lib.rs | 2 + rust/datafusion/Cargo.toml | 6 + rust/datafusion/examples/README.md | 28 ++++ rust/datafusion/examples/flight-client.rs | 62 ++++++++ rust/datafusion/examples/flight-server.rs | 174 ++++++++++++++++++++++ 11 files changed, 394 insertions(+), 18 deletions(-) create mode 100644 rust/arrow/src/flight/mod.rs create mode 100644 rust/datafusion/examples/README.md create mode 100644 rust/datafusion/examples/flight-client.rs create mode 100644 rust/datafusion/examples/flight-server.rs diff --git a/dev/release/00-prepare-test.rb b/dev/release/00-prepare-test.rb index e9894d0bc8a..afabe88fa7e 100644 --- a/dev/release/00-prepare-test.rb +++ b/dev/release/00-prepare-test.rb @@ -276,7 +276,9 @@ def test_version_pre_tag ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", "-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"] + "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"], + ["-arrow-flight = { path = \"../arrow-flight\", version = \"#{@snapshot_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }"] ], }, { @@ -458,7 +460,9 @@ def test_version_post_tag ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", "-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"] + "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"], + ["-arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@next_snapshot_version}\" }"] ], }, { diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 29829610889..57fc261985c 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -50,10 +50,12 @@ packed_simd = { version = "0.3.1", optional = true } chrono = "0.4" flatbuffers = "0.6.0" hex = "0.4" +arrow-flight = { path = "../arrow-flight", optional = true } [features] simd = ["packed_simd"] -default = ["simd"] 
+flight = ["arrow-flight"] +default = ["simd", "flight"] [dev-dependencies] criterion = "0.2" diff --git a/rust/arrow/src/flight/mod.rs b/rust/arrow/src/flight/mod.rs new file mode 100644 index 00000000000..6d09ca48c2a --- /dev/null +++ b/rust/arrow/src/flight/mod.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities to assist with reading and writing Arrow data as Flight messages + +use std::convert::TryFrom; +use std::sync::Arc; + +use flight::FlightData; + +use crate::datatypes::Schema; +use crate::error::{ArrowError, Result}; +use crate::ipc::{convert, reader, writer}; +use crate::record_batch::RecordBatch; + +/// Convert a `RecordBatch` to `FlightData` by getting the header and body as bytes +impl From<&RecordBatch> for FlightData { + fn from(batch: &RecordBatch) -> Self { + let (header, body) = writer::record_batch_to_bytes(batch); + Self { + flight_descriptor: None, + app_metadata: vec![], + data_header: header, + data_body: body, + } + } +} + +/// Convert a `Schema` to `FlightData` by converting to an IPC message +impl From<&Schema> for FlightData { + fn from(schema: &Schema) -> Self { + let schema = writer::schema_to_bytes(schema); + Self { + flight_descriptor: None, + app_metadata: vec![], + data_header: schema, + data_body: vec![], + } + } +} + +/// Try convert `FlightData` into an Arrow Schema +/// +/// Returns an error if the `FlightData` header is not a valid IPC schema +impl TryFrom<&FlightData> for Schema { + type Error = ArrowError; + fn try_from(data: &FlightData) -> Result { + convert::schema_from_bytes(&data.data_header[..]).ok_or(ArrowError::ParseError( + "Unable to convert flight data to Arrow schema".to_string(), + )) + } +} + +/// Convert a FlightData message to a RecordBatch +pub fn flight_data_to_batch( + data: &FlightData, + schema: Arc, +) -> Result> { + // check that the data_header is a record batch message + let message = crate::ipc::get_root_as_message(&data.data_header[..]); + let batch_header = message + .header_as_record_batch() + .ok_or(ArrowError::ParseError( + "Unable to convert flight data header to a record batch".to_string(), + ))?; + reader::read_record_batch(&data.data_body, batch_header, schema) +} + +// TODO: add more explicit conversion that expoess flight descriptor and metadata options diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index a38975fbf1c..f53914e31d2 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -152,6 +152,12 @@ pub(crate) fn fb_to_schema(fb: ipc::Schema) -> Schema { Schema::new_with_metadata(fields, metadata) } +/// Deserialize an IPC message into a schema +pub(crate) fn schema_from_bytes(bytes: &[u8]) -> Option { + let ipc = ipc::get_root_as_message(bytes); + 
ipc.header_as_schema().map(|schema| fb_to_schema(schema)) +} + /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field) -> DataType { match field.type_type() { diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 99173641d32..25ba0cd3df1 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -348,7 +348,7 @@ fn create_list_array( } /// Creates a record batch from binary data using the `ipc::RecordBatch` indexes and the `Schema` -fn read_record_batch( +pub(crate) fn read_record_batch( buf: &Vec, batch: ipc::RecordBatch, schema: Arc, diff --git a/rust/arrow/src/ipc/writer.rs b/rust/arrow/src/ipc/writer.rs index 36c89c7a666..c872c8286d7 100644 --- a/rust/arrow/src/ipc/writer.rs +++ b/rust/arrow/src/ipc/writer.rs @@ -209,8 +209,7 @@ impl Drop for StreamWriter { } } -/// Convert the schema to its IPC representation, and write it to the `writer` -fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result { +pub(crate) fn schema_to_bytes(schema: &Schema) -> Vec { let mut fbb = FlatBufferBuilder::new(); let schema = { let fb = ipc::convert::schema_to_fb_offset(&mut fbb, schema); @@ -227,9 +226,13 @@ fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result< fbb.finish(data, None); let data = fbb.finished_data(); - let written = write_padded_data(writer, data, WriteDataType::Header); + data.to_vec() +} - written +/// Convert the schema to its IPC representation, and write it to the `writer` +fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result { + let data = schema_to_bytes(schema); + write_padded_data(writer, &data[..], WriteDataType::Header) } /// The message type being written. This determines whether to write the data length or not. 
@@ -266,13 +269,8 @@ fn write_padded_data( Ok(total_len as usize) } -/// Write a record batch to the writer, writing the message size before the message -/// if the record batch is being written to a stream -fn write_record_batch( - writer: &mut BufWriter, - batch: &RecordBatch, - is_stream: bool, -) -> Result<(usize, usize)> { +/// Write a `RecordBatch` into a tuple of bytes, one for the header (ipc::Message) and the other for the batch's data +pub(crate) fn record_batch_to_bytes(batch: &RecordBatch) -> (Vec, Vec) { let mut fbb = FlatBufferBuilder::new(); let mut nodes: Vec = vec![]; @@ -313,13 +311,24 @@ fn write_record_batch( let root = message.finish(); fbb.finish(root, None); let finished_data = fbb.finished_data(); + + (finished_data.to_vec(), arrow_data) +} + +/// Write a record batch to the writer, writing the message size before the message +/// if the record batch is being written to a stream +fn write_record_batch( + writer: &mut BufWriter, + batch: &RecordBatch, + is_stream: bool, +) -> Result<(usize, usize)> { + let (meta_data, arrow_data) = record_batch_to_bytes(batch); // write the length of data if writing to stream if is_stream { - let total_len: u32 = finished_data.len() as u32; + let total_len: u32 = meta_data.len() as u32; writer.write(&total_len.to_le_bytes()[..])?; } - let meta_written = - write_padded_data(writer, fbb.finished_data(), WriteDataType::Body)?; + let meta_written = write_padded_data(writer, &meta_data[..], WriteDataType::Body)?; let arrow_data_written = write_padded_data(writer, &arrow_data[..], WriteDataType::Body)?; Ok((meta_written, arrow_data_written)) diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs index 899bb62f08a..4383922be09 100644 --- a/rust/arrow/src/lib.rs +++ b/rust/arrow/src/lib.rs @@ -33,6 +33,8 @@ pub mod compute; pub mod csv; pub mod datatypes; pub mod error; +#[cfg(feature = "flight")] +pub mod flight; pub mod ipc; pub mod json; pub mod memory; diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml index 14fc4e54548..f489d2b0d6d 100644 --- a/rust/datafusion/Cargo.toml +++ b/rust/datafusion/Cargo.toml @@ -56,6 +56,12 @@ crossbeam = "0.7.1" [dev-dependencies] criterion = "0.2.0" tempdir = "0.3.7" +futures = "0.3" +prost = "0.6" +tokio = { version = "0.2", features = ["macros"] } +tonic = "0.1" +flatbuffers = "0.6.0" +arrow-flight = { path = "../arrow-flight", version = "0.16.0-SNAPSHOT" } [[bench]] name = "aggregate_query_sql" diff --git a/rust/datafusion/examples/README.md b/rust/datafusion/examples/README.md new file mode 100644 index 00000000000..163ef3d952b --- /dev/null +++ b/rust/datafusion/examples/README.md @@ -0,0 +1,28 @@ + + +# DataFusion Examples + +## Single Process + +The examples `csv_sql.rs` and `parquet_sql.rs` demonstrate building a query plan from a SQL statement and then executing the query plan against local CSV and Parquet files, respectively. + +## Distributed + +The `flight-client.rs` and `flight-server.rs` examples demonstrate how to run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. \ No newline at end of file diff --git a/rust/datafusion/examples/flight-client.rs b/rust/datafusion/examples/flight-client.rs new file mode 100644 index 00000000000..97db7934620 --- /dev/null +++ b/rust/datafusion/examples/flight-client.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::convert::TryFrom; +use std::sync::Arc; + +use arrow::array::Int32Array; +use arrow::datatypes::Schema; +use arrow::flight::flight_data_to_batch; +use flight::flight_service_client::FlightServiceClient; +use flight::Ticket; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let mut client = FlightServiceClient::connect("http://localhost:50051").await?; + + let request = tonic::Request::new(Ticket { + ticket: "SELECT id FROM alltypes_plain".into(), + }); + + let mut stream = client.do_get(request).await?.into_inner(); + + // the schema should be the first message returned, else client should error + let flight_data = stream.message().await?.unwrap(); + // convert FlightData to a stream + let schema = Arc::new(Schema::try_from(&flight_data)?); + println!("Schema: {:?}", schema); + + // all the remaining stream messages should be dictionary and record batches + while let Some(flight_data) = stream.message().await? { + // the unwrap is infallible and thus safe + let record_batch = flight_data_to_batch(&flight_data, schema.clone())?.unwrap(); + + println!( + "record_batch has {} columns and {} rows", + record_batch.num_columns(), + record_batch.num_rows() + ); + let column = record_batch.column(0); + let column = column + .as_any() + .downcast_ref::() + .expect("Unable to get column"); + println!("Column 1: {:?}", column); + } + + Ok(()) +} diff --git a/rust/datafusion/examples/flight-server.rs b/rust/datafusion/examples/flight-server.rs new file mode 100644 index 00000000000..e6cc22ac503 --- /dev/null +++ b/rust/datafusion/examples/flight-server.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::pin::Pin; + +use futures::Stream; +use tonic::transport::Server; +use tonic::{Request, Response, Status, Streaming}; + +use datafusion::execution::context::ExecutionContext; + +use flight::{ + flight_service_server::FlightService, flight_service_server::FlightServiceServer, + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, +}; + +#[derive(Clone)] +pub struct FlightServiceImpl {} + +#[tonic::async_trait] +impl FlightService for FlightServiceImpl { + type HandshakeStream = Pin< + Box> + Send + Sync + 'static>, + >; + type ListFlightsStream = + Pin> + Send + Sync + 'static>>; + type DoGetStream = + Pin> + Send + Sync + 'static>>; + type DoPutStream = + Pin> + Send + Sync + 'static>>; + type DoActionStream = Pin< + Box> + Send + Sync + 'static>, + >; + type ListActionsStream = + Pin> + Send + Sync + 'static>>; + + async fn do_get( + &self, + request: Request, + ) -> Result, Status> { + let ticket = request.into_inner(); + match String::from_utf8(ticket.ticket.to_vec()) { + Ok(sql) => { + println!("do_get: {}", sql); + + // create local execution context + let mut ctx = ExecutionContext::new(); + + let testdata = std::env::var("PARQUET_TEST_DATA") + .expect("PARQUET_TEST_DATA not defined"); + + // register parquet file with the execution context + ctx.register_parquet( + "alltypes_plain", + &format!("{}/alltypes_plain.parquet", testdata), + ) + .map_err(|e| to_tonic_err(&e))?; + + // create the query plan + let plan = ctx + .create_logical_plan(&sql) + .and_then(|plan| ctx.optimize(&plan)) + .and_then(|plan| ctx.create_physical_plan(&plan, 1024 * 1024)) + .map_err(|e| to_tonic_err(&e))?; + + // execute the query + let results = ctx.collect(plan.as_ref()).map_err(|e| to_tonic_err(&e))?; + if results.is_empty() { + return Err(Status::internal("There were no results from ticket")); + } + + // add an initial FlightData message that sends schema + let schema = plan.schema(); + let mut flights: Vec> = + vec![Ok(FlightData::from(schema.as_ref()))]; + + let mut batches: Vec> = results + .iter() + .map(|batch| Ok(FlightData::from(batch))) + .collect(); + + // append batch vector to schema vector, so that the first message sent is the schema + flights.append(&mut batches); + + let output = futures::stream::iter(flights); + + Ok(Response::new(Box::pin(output) as Self::DoGetStream)) + } + Err(e) => Err(Status::invalid_argument(format!("Invalid ticket: {:?}", e))), + } + } + + async fn handshake( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn list_flights( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn get_flight_info( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn get_schema( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn do_put( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn do_action( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn list_actions( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } +} + +fn to_tonic_err(e: &datafusion::error::ExecutionError) -> Status { + 
Status::internal(format!("{:?}", e)) +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let addr = "0.0.0.0:50051".parse()?; + let service = FlightServiceImpl {}; + + let svc = FlightServiceServer::new(service); + + println!("Listening on {:?}", addr); + + Server::builder().add_service(svc).serve(addr).await?; + + Ok(()) +} From 68c2f3c3edc04f4478853b9045962ff138e7ad24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 17:13:40 +0100 Subject: [PATCH 10/26] ARROW-7729: [Python][CI] Pin pandas version to 0.25 in the dask integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additionally test agains dask's latest release not just the master revision. Closes #6326 from kszucs/dask-pandas-pin and squashes the following commits: b5cb40eec pin pandas depending on dask's version 083221ce9 pin pandas in the dask integration test Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- ci/docker/conda-python-dask.dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index e567fecfd13..cf19c05237f 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -23,3 +23,6 @@ FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest COPY ci/scripts/install_dask.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_dask.sh ${dask} + +# The Spark tests currently break with pandas >= 1.0 +RUN if [ ${dask} == "latest" ]; then conda install pandas=0.25.3; fi \ No newline at end of file From cb81f7d84c26cd75f4568ae1b29744cb4218b578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 17:30:19 +0100 Subject: [PATCH 11/26] ARROW-7735: [Release][Python] Use pip to install dependencies for wheel verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wheel verification script fails for python 3.5. At the same time the wheel properly works for python 3.5 docker images without conda environment. Conda forge doesn't maintain packages for python 3.5 anymore and something must have mixed with the numpy versions. This change fixed the wheel verification locally for me. 
Closes #6339 from kszucs/wheel-verification and squashes the following commits: 3e9694958 remove pytest verbose flags 026e5fb34 use pip to install dependencies for wheel verification Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/release/verify-release-candidate.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 000af83a172..9d68b7ce18b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -609,8 +609,10 @@ test_linux_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[mu.]/}-cp${py_arch//./}-manylinux${ml_spec}_x86_64.whl check_python_imports py_arch - # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + + # execute the tests pytest --pyargs pyarrow done @@ -640,8 +642,10 @@ test_macos_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[m.]/}-cp${py_arch//./}-${macos_suffix}.whl check_python_imports py_arch - # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + + # execute the tests pytest --pyargs pyarrow conda deactivate From 4b549977fa0ddc9bf0a6031486b7b1987e66baec Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Feb 2020 06:44:19 +0900 Subject: [PATCH 12/26] ARROW-7726: [CI] [C++] Use boost binaries on Windows GHA build The binaries are installed using Chocolatey, which takes a bit of time (it's a 2+GB install...), but less so than recompiling Boost from scratch during the CMake build. [skip appveyor] Closes #6325 from pitrou/ARROW-7726-download-boost-gha and squashes the following commits: e877622b9 Revert "Try a more flexible way of finding Boost" eb5db8fc1 Try a more flexible way of finding Boost d57064960 ARROW-7726: Use boost binaries on Windows GHA build Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index abc3b8f4d91..cf746c327db 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -175,10 +175,15 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BOOST_USE_SHARED: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF - BOOST_SOURCE: BUNDLED + BOOST_ROOT: C:\local\boost_1_67_0 + BOOST_LIBRARYDIR: C:\local\boost_1_67_0\lib64-msvc-14.1\ steps: - name: Disable Crash Dialogs run: reg add "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" /v DontShowUI /t REG_DWORD /d 1 /f + - name: Installed Packages + run: choco list -l + - name: Install Boost + run: choco install -y boost-msvc-14.1 - name: Checkout Arrow uses: actions/checkout@v1 with: From bc261d1e43aea04f5277ff0c3252a21582e7ab61 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Mon, 3 Feb 2020 21:08:10 -0800 Subject: [PATCH 13/26] ARROW-7073: [Java] Support concating vectors values in batch We need a way to copy vector values in batch. Currently, we have copyFrom and copyFromSafe APIs. However, they are not enough, as copying values individually is not performant. 
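As a rough usage sketch only (not part of this patch): assuming the VectorBatchAppender utility introduced below, and with hypothetical vector names and values, appending one IntVector to another in batch might look like this:

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.util.VectorBatchAppender;

    public class BatchAppendExample {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
             IntVector target = new IntVector("target", allocator);
             IntVector delta = new IntVector("delta", allocator)) {
          // populate the target vector with [0, 1, 2]
          target.allocateNew(3);
          for (int i = 0; i < 3; i++) {
            target.setSafe(i, i);
          }
          target.setValueCount(3);

          // populate the delta vector with [10, 20]
          delta.allocateNew(2);
          delta.setSafe(0, 10);
          delta.setSafe(1, 20);
          delta.setValueCount(2);

          // append the delta vector's values to the target in a single batch call
          VectorBatchAppender.batchAppend(target, delta);

          // target should now hold [0, 1, 2, 10, 20] with a value count of 5
          System.out.println("value count after append: " + target.getValueCount());
        }
      }
    }

Because the appender is implemented as a VectorVisitor, the same batchAppend call covers fixed-width, variable-width, list, fixed-size list, struct and union vectors, provided the target and delta vectors have matching types.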
Closes #5916 from liyafan82/fly_1125_veccat and squashes the following commits: 94b407c85 Support dense union vector ee49dc678 Add tests with null values ad33e234c Rewrite tests with vector populator for result verification c89211abc Rewrite tests with vector populator and provide static utility 7c13ede14 Support concating vectors values in batch Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../main/codegen/templates/UnionVector.java | 12 +- .../arrow/vector/util/VectorAppender.java | 326 +++++++++++++++++ .../vector/util/VectorBatchAppender.java | 39 ++ .../testing/ValueVectorDataPopulator.java | 73 +++- .../arrow/vector/util/TestVectorAppender.java | 346 ++++++++++++++++++ .../vector/util/TestVectorBatchAppender.java | 72 ++++ 6 files changed, 863 insertions(+), 5 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index d760fe53fc6..8bf5d8a9b20 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -553,9 +553,13 @@ public Iterator iterator() { return vectors.iterator(); } - public ValueVector getVector(int index) { - int type = typeBuffer.getByte(index * TYPE_WIDTH); - switch (MinorType.values()[type]) { + public ValueVector getVector(int index) { + int type = typeBuffer.getByte(index * TYPE_WIDTH); + return getVectorByType(type); + } + + public ValueVector getVectorByType(int typeId) { + switch (MinorType.values()[typeId]) { case NULL: return null; <#list vv.types as type> @@ -574,7 +578,7 @@ public ValueVector getVector(int index) { case LIST: return getList(); default: - throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); + throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[typeId]); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java new file mode 100644 index 00000000000..8dac7676858 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.util.HashSet; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +import io.netty.util.internal.PlatformDependent; + +/** + * Utility to append two vectors together. + */ +class VectorAppender implements VectorVisitor { + + /** + * The targetVector to be appended. + */ + private final ValueVector targetVector; + + private final TypeEqualsVisitor typeVisitor; + + /** + * Constructs a new targetVector appender, with the given targetVector. + * @param targetVector the targetVector to be appended. + */ + VectorAppender(ValueVector targetVector) { + this.targetVector = targetVector; + typeVisitor = new TypeEqualsVisitor(targetVector, false, true); + } + + @Override + public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), + deltaVector.getTypeWidth() * deltaVector.getValueCount()); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetDataSize = targetVector.getOffsetBuffer().getInt( + targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaDataSize = deltaVector.getOffsetBuffer().getInt( + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int newValueCapacity = targetDataSize + deltaDataSize; + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + while (targetVector.getDataBuffer().capacity() < newValueCapacity) { + ((BaseVariableWidthVector) targetVector).reallocDataBuffer(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + 
deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); + + // copy offset buffer + PlatformDependent.copyMemory( + deltaVector.getOffsetBuffer().memoryAddress() + BaseVariableWidthVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + BaseVariableWidthVector.OFFSET_WIDTH, + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt( + (targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH, oldOffset + targetDataSize); + } + ((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(ListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetVector.getOffsetBuffer().getInt( + targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaListSize = deltaVector.getOffsetBuffer().getInt( + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + ListVector targetListVector = (ListVector) targetVector; + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append offset buffer + PlatformDependent.copyMemory(deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + ListVector.OFFSET_WIDTH, + deltaVector.getValueCount() * ListVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = + targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH, + oldOffset + targetListSize); + } + targetListVector.setLastSet(newValueCount - 1); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(FixedSizeListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the 
targetVector being appended"); + + FixedSizeListVector targetListVector = (FixedSizeListVector) targetVector; + + Preconditions.checkArgument(targetListVector.getListSize() == deltaVector.getListSize(), + "FixedSizeListVector must have the same list size to append"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetListVector.getValueCount() * targetListVector.getListSize(); + int deltaListSize = deltaVector.getValueCount() * deltaVector.getListSize(); + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(NonNullableStructVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the targetVector being appended"); + + NonNullableStructVector targetStructVector = (NonNullableStructVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append child vectors + for (int i = 0; i < targetStructVector.getChildrenFromFields().size(); i++) { + ValueVector targetChild = targetStructVector.getVectorById(i); + ValueVector deltaChild = deltaVector.getVectorById(i); + + targetChild.setValueCount(targetStructVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(UnionVector deltaVector, Void value) { + // we only make sure that both vectors are union vectors. 
+ Preconditions.checkArgument(targetVector.getMinorType() == deltaVector.getMinorType(), + "The vector to append must have the same type as the targetVector being appended"); + + UnionVector targetUnionVector = (UnionVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetUnionVector.getValueCapacity() < newValueCount) { + targetUnionVector.reAlloc(); + } + + // append type buffers + PlatformDependent.copyMemory(deltaVector.getValidityBufferAddress(), + targetUnionVector.getValidityBufferAddress() + targetVector.getValueCount(), + deltaVector.getValueCount()); + + // build the hash set for all types + HashSet targetTypes = new HashSet<>(); + for (int i = 0; i < targetUnionVector.getValueCount(); i++) { + targetTypes.add((int) targetUnionVector.getValidityBuffer().getByte(i)); + } + HashSet deltaTypes = new HashSet<>(); + for (int i = 0; i < deltaVector.getValueCount(); i++) { + deltaTypes.add((int) deltaVector.getValidityBuffer().getByte(i)); + } + + // append child vectors + for (int i = 0; i < Byte.MAX_VALUE; i++) { + if (targetTypes.contains(i) || deltaTypes.contains(i)) { + ValueVector targetChild = targetUnionVector.getVectorByType(i); + if (!targetTypes.contains(i)) { + // if the vector type does not exist in the target, it must be newly created + // and we must make sure it has enough capacity. + while (targetChild.getValueCapacity() < newValueCount) { + targetChild.reAlloc(); + } + } + + if (deltaTypes.contains(i)) { + // append child vectors + ValueVector deltaChild = deltaVector.getVectorByType(i); + + targetChild.setValueCount(targetUnionVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + targetChild.setValueCount(newValueCount); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(DenseUnionVector left, Void value) { + throw new UnsupportedOperationException(); + } + + @Override + public ValueVector visit(NullVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + return targetVector; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java new file mode 100644 index 00000000000..570783d1070 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import org.apache.arrow.vector.ValueVector; + +/** + * Utility to add vector values in batch. + */ +public class VectorBatchAppender { + + /** + * Add value vectors in batch. + * @param targetVector the target vector. + * @param vectorsToAppend the vectors to append. + * @param the vector type. + */ + public static void batchAppend(V targetVector, V... vectorsToAppend) { + VectorAppender appender = new VectorAppender(targetVector); + for (V delta : vectorsToAppend) { + delta.accept(appender, null); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index ab3f81a6446..b50a3441322 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -17,10 +17,14 @@ package org.apache.arrow.vector.testing; +import static org.junit.Assert.assertEquals; + import java.nio.charset.StandardCharsets; +import java.util.List; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; @@ -51,7 +55,12 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; /** * Utility for populating {@link org.apache.arrow.vector.ValueVector}. @@ -533,10 +542,72 @@ public static void setVector(VarCharVector vector, String... values) { vector.allocateNewSafe(); for (int i = 0; i < length; i++) { if (values[i] != null) { - vector.set(i, values[i].getBytes(StandardCharsets.UTF_8)); + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); } } vector.setValueCount(length); } + /** + * Populate values for {@link ListVector}. + */ + public static void setVector(ListVector vector, List... values) { + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + vector.getOffsetBuffer().setInt(0, curPos); + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.set(curPos, value); + curPos += 1; + } + } + vector.getOffsetBuffer().setInt((i + 1) * BaseRepeatedValueVector.OFFSET_WIDTH, curPos); + } + dataVector.setValueCount(curPos); + vector.setLastSet(values.length - 1); + vector.setValueCount(values.length); + } + + /** + * Populate values for {@link FixedSizeListVector}. + */ + public static void setVector(FixedSizeListVector vector, List... 
values) { + for (int i = 0; i < values.length; i++) { + if (values[i] != null) { + assertEquals(vector.getListSize(), values[i].size()); + } + } + + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.set(curPos, value); + curPos += 1; + } + } + } + dataVector.setValueCount(curPos); + vector.setValueCount(values.length); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java new file mode 100644 index 00000000000..83fc24fe90a --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorAppender}. 
+ */ +public class TestVectorAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testAppendFixedWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (IntVector target = new IntVector("", allocator); + IntVector delta = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta.allocateNew(length2); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + ValueVectorDataPopulator.setVector(delta, null, 11, 12, 13, 14); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + + try (IntVector expected = new IntVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, 0, 1, 2, 3, 4, 5, 6, null, 8, 9, null, 11, 12, 13, 14); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendVariableWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector(target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + ValueVectorDataPopulator.setVector(delta, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, + "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9", "a10", "a11", "a12", "a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendListVector() { + final int length1 = 5; + final int length2 = 2; + try (ListVector target = ListVector.empty("target", allocator); + ListVector delta = ListVector.empty("delta", allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1), + Arrays.asList(2, 3), + null, + Arrays.asList(6, 7), + Arrays.asList(8, 9)); + assertEquals(length1, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(length2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(7, target.getValueCount()); + + List expected = Arrays.asList(0, 1); + assertEquals(expected, target.getObject(0)); + + expected = Arrays.asList(2, 3); + assertEquals(expected, target.getObject(1)); + + assertTrue(target.isNull(2)); + + expected = Arrays.asList(6, 7); + assertEquals(expected, target.getObject(3)); + + expected = Arrays.asList(8, 9); + assertEquals(expected, target.getObject(4)); + + expected = Arrays.asList(10, 11, 12, 13, 14); + assertEquals(expected, target.getObject(5)); + + expected = Arrays.asList(15, 16, 17, 18, 19); + assertEquals(expected, target.getObject(6)); + } + } + + @Test + public void testAppendFixedSizeListVector() { + try (FixedSizeListVector target = FixedSizeListVector.empty("target", 5, allocator); + FixedSizeListVector delta = FixedSizeListVector.empty("delta", 5, allocator)) { + + target.allocateNew(); + 
ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(4, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + assertEquals(Arrays.asList(10, 11, 12, 13, 14), target.getObject(2)); + assertEquals(Arrays.asList(15, 16, 17, 18, 19), target.getObject(3)); + } + } + + @Test + public void testAppendStructVector() { + final int length1 = 10; + final int length2 = 5; + try (final StructVector target = StructVector.empty("target", allocator); + final StructVector delta = StructVector.empty("delta", allocator)) { + + IntVector targetChild1 = target.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector targetChild2 = target.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + targetChild1.allocateNew(); + targetChild2.allocateNew(); + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(targetChild2, "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + target.setValueCount(length1); + + IntVector deltaChild1 = delta.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector deltaChild2 = delta.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + deltaChild1.allocateNew(); + deltaChild2.allocateNew(); + ValueVectorDataPopulator.setVector(deltaChild1, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(deltaChild2, "a10", "a11", "a12", "a13", "a14"); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + IntVector child1 = (IntVector) target.getVectorById(0); + VarCharVector child2 = (VarCharVector) target.getVectorById(1); + + try (IntVector expected1 = new IntVector("expected1", allocator); + VarCharVector expected2 = new VarCharVector("expected2", allocator)) { + expected1.allocateNew(); + expected2.allocateNew(); + + ValueVectorDataPopulator.setVector(expected1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(expected2, + "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9", "a10", "a11", "a12", "a13", "a14"); + + assertVectorsEqual(expected1, target.getChild("f0")); + assertVectorsEqual(expected2, target.getChild("f1")); + } + } + } + + @Test + public void testAppendUnionVector() { + final int length1 = 10; + final int length2 = 5; + + try (final UnionVector target = UnionVector.empty("target", allocator); + final UnionVector delta = UnionVector.empty("delta", allocator)) { + + // alternating ints and big ints + target.setType(0, Types.MinorType.INT); + target.setType(1, Types.MinorType.BIGINT); + target.setType(2, Types.MinorType.INT); + target.setType(3, Types.MinorType.BIGINT); + target.setType(4, Types.MinorType.INT); + target.setType(5, Types.MinorType.BIGINT); + target.setType(6, Types.MinorType.INT); + target.setType(7, Types.MinorType.BIGINT); + target.setType(8, Types.MinorType.INT); + target.setType(9, Types.MinorType.BIGINT); + target.setType(10, 
Types.MinorType.INT); + target.setType(11, Types.MinorType.BIGINT); + target.setType(12, Types.MinorType.INT); + target.setType(13, Types.MinorType.BIGINT); + target.setType(14, Types.MinorType.INT); + target.setType(15, Types.MinorType.BIGINT); + target.setType(16, Types.MinorType.INT); + target.setType(17, Types.MinorType.BIGINT); + target.setType(18, Types.MinorType.INT); + target.setType(19, Types.MinorType.BIGINT); + + IntVector targetIntVec = target.getIntVector(); + targetIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetIntVec, + 0, null, 1, null, 2, null, 3, null, 4, null, 5, null, 6, null, 7, null, 8, null, 9, null); + assertEquals(length1 * 2, targetIntVec.getValueCount()); + + BigIntVector targetBigIntVec = target.getBigIntVector(); + targetBigIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetBigIntVec, + null, 0L, null, 1L, null, 2L, null, 3L, null, 4L, null, 5L, null, 6L, null, 7L, null, 8L, null, 9L); + assertEquals(length1 * 2, targetBigIntVec.getValueCount()); + + target.setValueCount(length1 * 2); + + // populate the delta vector + delta.setType(0, Types.MinorType.FLOAT4); + delta.setType(1, Types.MinorType.FLOAT4); + delta.setType(2, Types.MinorType.FLOAT4); + delta.setType(3, Types.MinorType.FLOAT4); + delta.setType(4, Types.MinorType.FLOAT4); + + Float4Vector deltaFloatVector = delta.getFloat4Vector(); + deltaFloatVector.allocateNew(); + ValueVectorDataPopulator.setVector(deltaFloatVector, 10f, 11f, 12f, 13f, 14f); + assertEquals(length2, deltaFloatVector.getValueCount()); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 * 2 + length2, target.getValueCount()); + + for (int i = 0; i < length1; i++) { + Object intObj = target.getObject(i * 2); + assertTrue(intObj instanceof Integer); + assertEquals(i, ((Integer) intObj).intValue()); + + Object longObj = target.getObject(i * 2 + 1); + assertTrue(longObj instanceof Long); + assertEquals(i, ((Long) longObj).longValue()); + } + + for (int i = 0; i < length2; i++) { + Object floatObj = target.getObject(length1 * 2 + i); + assertTrue(floatObj instanceof Float); + assertEquals(i + length1, ((Float) floatObj).intValue()); + } + } + } + + @Test + public void testAppendVectorNegative() { + final int vectorLength = 10; + try (IntVector target = new IntVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(vectorLength); + delta.allocateNew(vectorLength); + + VectorAppender appender = new VectorAppender(target); + + assertThrows(IllegalArgumentException.class, + () -> delta.accept(appender, null)); + } + } + + public static void assertVectorsEqual(ValueVector vector1, ValueVector vector2) { + assertEquals(vector1.getValueCount(), vector2.getValueCount()); + + TypeEqualsVisitor typeEqualsVisitor = new TypeEqualsVisitor(vector1, false, false); + RangeEqualsVisitor equalsVisitor = + new RangeEqualsVisitor(vector1, vector2, (v1, v2) -> typeEqualsVisitor.equals(vector2)); + assertTrue(equalsVisitor.rangeEquals(new Range(0, 0, vector1.getValueCount()))); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java new file mode 100644 index 00000000000..799c25c0ad7 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorBatchAppender}. + */ +public class TestVectorBatchAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testBatchAppendIntVector() { + final int length1 = 10; + final int length2 = 5; + final int length3 = 7; + try (IntVector target = new IntVector("", allocator); + IntVector delta1 = new IntVector("", allocator); + IntVector delta2 = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta1.allocateNew(length2); + delta2.allocateNew(length3); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(delta1, 10, 11, 12, 13, 14); + ValueVectorDataPopulator.setVector(delta2, 15, 16, 17, 18, 19, 20, 21); + + VectorBatchAppender.batchAppend(target, delta1, delta2); + + assertEquals(length1 + length2 + length3, target.getValueCount()); + for (int i = 0; i < target.getValueCount(); i++) { + assertEquals(i, target.get(i)); + } + } + } +} From a6054174f0fe8d8058e9b0c04355bda32eb7a80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 4 Feb 2020 12:50:50 +0100 Subject: [PATCH 14/26] ARROW-7750: [Release] Make the source release verification script restartable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Executing the verification script can take quite some time, so creating a new environment in case if anything fails is time consuming. Let the script reuse the same build directory for source release verification. Need to export `TMPDIR` environment variable. @kou shall we use an argument instead? 
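Concretely, pointing TMPDIR at a persistent scratch directory before invoking the script makes later runs reuse that directory (the miniconda installation, the cloned arrow-testing and parquet-testing repositories, and the existing cpp/build tree) instead of starting over in a fresh mktemp directory.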
Closes #6344 from kszucs/restartable-verification and squashes the following commits: 6d4723da3 Support for restarting the release verification script Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/release/verify-release-candidate.sh | 35 ++++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 9d68b7ce18b..24c60ea632b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -28,6 +28,9 @@ # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to # LD_LIBRARY_PATH. +# +# To reuse build artifacts between runs set TMPDIR environment variable to +# a directory where the temporary files should be placed to. case $# in 3) ARTIFACT="$1" @@ -205,7 +208,12 @@ setup_tempdir() { fi } trap cleanup EXIT - TMPDIR=$(mktemp -d -t "$1.XXXXX") + + if [ -z "${TMPDIR}" ]; then + TMPDIR=$(mktemp -d -t "$1.XXXXX") + else + mkdir -p "${TMPDIR}" + fi } @@ -219,9 +227,12 @@ setup_miniconda() { MINICONDA=$PWD/test-miniconda - wget -O miniconda.sh $MINICONDA_URL - bash miniconda.sh -b -p $MINICONDA - rm -f miniconda.sh + if [ ! -d "${MINICONDA}" ]; then + # Setup miniconda only if the directory doesn't exist yet + wget -O miniconda.sh $MINICONDA_URL + bash miniconda.sh -b -p $MINICONDA + rm -f miniconda.sh + fi . $MINICONDA/etc/profile.d/conda.sh } @@ -249,7 +260,7 @@ test_and_install_cpp() { cython conda activate arrow-test - mkdir cpp/build + mkdir -p cpp/build pushd cpp/build ARROW_CMAKE_OPTIONS=" @@ -523,10 +534,14 @@ test_source_distribution() { NPROC=$(nproc) fi - git clone https://github.com/apache/arrow-testing.git + # Clone testing repositories if not cloned already + if [ ! -d "arrow-testing" ]; then + git clone https://github.com/apache/arrow-testing.git + fi + if [ ! -d "parquet-testing" ]; then + git clone https://github.com/apache/parquet-testing.git + fi export ARROW_TEST_DATA=$PWD/arrow-testing/data - - git clone https://github.com/apache/parquet-testing.git export PARQUET_TEST_DATA=$PWD/parquet-testing/data if [ ${TEST_JAVA} -gt 0 ]; then @@ -612,7 +627,7 @@ test_linux_wheels() { # install test requirements pip install -r ${ARROW_DIR}/python/requirements-test.txt - # execute the tests + # execute the python unit tests pytest --pyargs pyarrow done @@ -645,7 +660,7 @@ test_macos_wheels() { # install test requirements pip install -r ${ARROW_DIR}/python/requirements-test.txt - # execute the tests + # execute the python unit tests pytest --pyargs pyarrow conda deactivate From 0326ea34b63ae399582a99d60f0d23cc03aaa628 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Feb 2020 13:42:03 +0100 Subject: [PATCH 15/26] ARROW-7751: [Release] macOS wheel verification also needs arrow-testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This addition follows the pattern of `test_source_distribution` (as it is in #6344). There are also two error message patches to make them consistent with everywhere else that references this env var (though FWIW `testing/data` is still not correct in the verification script, it's `arrow-testing/data` 🤷‍♂). 
Closes #6345 from nealrichardson/flight-testing-data and squashes the following commits: a29e88a7e factor out testing repository cloning df9ef255d Move addition and fix lint e165d54f1 Make sure macOS wheel verification has test data Lead-authored-by: Neal Richardson Co-authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- cpp/src/arrow/flight/test_util.cc | 3 ++- dev/release/verify-release-candidate.sh | 24 +++++++++++++++--------- python/pyarrow/tests/test_flight.py | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index 5ad7a899c66..5d83353d281 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -454,7 +454,8 @@ Status TestClientBasicAuthHandler::GetToken(std::string* token) { Status GetTestResourceRoot(std::string* out) { const char* c_root = std::getenv("ARROW_TEST_DATA"); if (!c_root) { - return Status::IOError("Test resources not found, set ARROW_TEST_DATA"); + return Status::IOError( + "Test resources not found, set ARROW_TEST_DATA to /testing/data"); } *out = std::string(c_root); return Status::OK(); diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 24c60ea632b..bae762d3274 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -522,6 +522,18 @@ test_integration() { $INTEGRATION_TEST_ARGS } +clone_testing_repositories() { + # Clone testing repositories if not cloned already + if [ ! -d "arrow-testing" ]; then + git clone https://github.com/apache/arrow-testing.git + fi + if [ ! -d "parquet-testing" ]; then + git clone https://github.com/apache/parquet-testing.git + fi + export ARROW_TEST_DATA=$PWD/arrow-testing/data + export PARQUET_TEST_DATA=$PWD/parquet-testing/data +} + test_source_distribution() { export ARROW_HOME=$TMPDIR/install export PARQUET_HOME=$TMPDIR/install @@ -534,15 +546,7 @@ test_source_distribution() { NPROC=$(nproc) fi - # Clone testing repositories if not cloned already - if [ ! -d "arrow-testing" ]; then - git clone https://github.com/apache/arrow-testing.git - fi - if [ ! -d "parquet-testing" ]; then - git clone https://github.com/apache/parquet-testing.git - fi - export ARROW_TEST_DATA=$PWD/arrow-testing/data - export PARQUET_TEST_DATA=$PWD/parquet-testing/data + clone_testing_repositories if [ ${TEST_JAVA} -gt 0 ]; then test_package_java @@ -668,6 +672,8 @@ test_macos_wheels() { } test_wheels() { + clone_testing_repositories + local download_dir=binaries mkdir -p ${download_dir} diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index ff31b64105c..fe0470f7e5d 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -60,7 +60,7 @@ def resource_root(): """Get the path to the test resources directory.""" if not os.environ.get("ARROW_TEST_DATA"): raise RuntimeError("Test resources not found; set " - "ARROW_TEST_DATA to /testing") + "ARROW_TEST_DATA to /testing/data") return pathlib.Path(os.environ["ARROW_TEST_DATA"]) / "flight" From d6b6f878d951833bbe594131de9c093f37fbfdf1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Feb 2020 15:41:19 +0100 Subject: [PATCH 16/26] ARROW-7691: [C++] Check non-scalar Flatbuffers fields are not null We're discussing whether to make those fields required in the schema definitions (which would make validation automatic by the flatbuffers generated verifier), but in the meantime we can check those fields manually. 
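To make the failure mode concrete: non-scalar Flatbuffers fields (strings, vectors, sub-tables) are optional in the wire format, so a malformed or fuzzer-generated IPC message can simply omit them, and the generated accessor then returns a null pointer. A minimal sketch of the manual guard, using a hypothetical ReadFieldName helper rather than the actual code in this patch:

    // Hypothetical helper; flatbuf::Field and Status::IOError are the real
    // Arrow types, the function itself is only illustrative.
    Status ReadFieldName(const flatbuf::Field* field, std::string* out) {
      auto name = field->name();   // may be null for malformed metadata
      if (name == nullptr) {
        return Status::IOError("Unexpected null Field.name in flatbuffer-encoded metadata");
      }
      *out = name->str();          // safe to dereference after the check
      return Status::OK();
    }

The CHECK_FLATBUFFERS_NOT_NULL macro added by this patch centralizes exactly this pattern.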
This should fix a bunch of issues detected by OSS-Fuzz. Closes #6293 from pitrou/ARROW-7691-check-fb-fields-not-null and squashes the following commits: 02478a620 Use a function rather than a macro e6d3d8852 ARROW-7691: Check non-scalar Flatbuffers fields are not null Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/ipc/metadata_internal.cc | 77 ++++++-------------------- cpp/src/arrow/ipc/metadata_internal.h | 25 +++++++-- cpp/src/arrow/ipc/reader.cc | 21 +++---- 3 files changed, 46 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 05f508df31c..94bb6cbd839 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -286,11 +286,7 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::Timestamp: { auto ts_type = static_cast(type_data); TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit()); - if (ts_type->timezone() != 0 && ts_type->timezone()->Length() > 0) { - *out = timestamp(unit, ts_type->timezone()->str()); - } else { - *out = timestamp(unit); - } + *out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone())); return Status::OK(); } case flatbuf::Type::Duration: { @@ -369,10 +365,7 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field, const KeyValueMetadata* field_metadata, std::shared_ptr* out) { auto type_data = field->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type"); RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out)); // Look for extension metadata in custom_metadata field @@ -474,14 +467,8 @@ Status KeyValueMetadataFromFlatbuffer(const KVVector* fb_metadata, metadata->reserve(fb_metadata->size()); for (const auto& pair : *fb_metadata) { - if (pair->key() == nullptr) { - return Status::IOError( - "Key-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } - if (pair->value() == nullptr) { - return Status::IOError( - "Value-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); + CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); } @@ -776,9 +763,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // Reconstruct the data type auto children = field->children(); - if (children == nullptr) { - return Status::IOError("Children-pointer of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(children, "Field.children"); std::vector> child_fields(children->size()); for (int i = 0; i < static_cast(children->size()); ++i) { RETURN_NOT_OK( @@ -786,6 +771,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona } RETURN_NOT_OK(TypeFromFlatbuffer(field, child_fields, metadata.get(), &type)); + auto field_name = StringFromFlatbuffers(field->name()); + const flatbuf::DictionaryEncoding* encoding = field->dictionary(); if (encoding != nullptr) { @@ -794,22 +781,14 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // dictionary_memo std::shared_ptr index_type; auto int_data = encoding->indexType(); - if (int_data == nullptr) { - return Status::IOError( - "indexType-pointer in custom metadata of flatbuffer-encoded 
DictionaryEncoding " - "is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType"); RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type)); ARROW_ASSIGN_OR_RAISE(type, DictionaryType::Make(index_type, type, encoding->isOrdered())); - *out = ::arrow::field(field->name()->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); RETURN_NOT_OK(dictionary_memo->AddField(encoding->id(), *out)); } else { - auto name = field->name(); - if (name == nullptr) { - return Status::IOError("Name-pointer of flatbuffer-encoded Field is null."); - } - *out = ::arrow::field(name->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); } return Status::OK(); } @@ -1183,17 +1162,15 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr* out) { auto schema = static_cast(opaque_schema); - if (schema->fields() == nullptr) { - return Status::IOError("Fields-pointer of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(schema, "schema"); + CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields"); int num_fields = static_cast(schema->fields()->size()); std::vector> fields(num_fields); for (int i = 0; i < num_fields; ++i) { const flatbuf::Field* field = schema->fields()->Get(i); - if (field == nullptr) { - return Status::IOError("Field-pointer of flatbuffer-encoded Schema is null."); - } + // XXX I don't think this check is necessary (AP) + CHECK_FLATBUFFERS_NOT_NULL(field, "DictionaryEncoding.indexType"); RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i])); } @@ -1225,12 +1202,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type auto dim = tensor->shape()->Get(i); shape->push_back(dim->size()); - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } if (tensor->strides() && tensor->strides()->size() > 0) { @@ -1239,11 +1211,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type } } - auto type_data = tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Tensor is null."); - } + auto type_data = tensor->type(); // Required return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type); } @@ -1283,12 +1251,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } if (dim_names) { - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } } } @@ -1324,11 +1287,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } } - auto type_data = sparse_tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null."); - } + auto type_data = sparse_tensor->type(); // Required if (type) { return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type); } else { diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index b2ac81f86e7..f5ffee3fa75 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -31,20 
+31,16 @@ #include "arrow/buffer.h" #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" -#include "arrow/memory_pool.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" #include "generated/Message_generated.h" #include "generated/Schema_generated.h" namespace arrow { -class DataType; -class Schema; -class Tensor; -class SparseTensor; - namespace flatbuf = org::apache::arrow::flatbuf; namespace io { @@ -92,6 +88,23 @@ struct FileBlock { int64_t body_length; }; +// Low-level utilities to help with reading Flatbuffers data. + +#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \ + if ((fb_value) == NULLPTR) { \ + return Status::IOError("Unexpected null field ", name, \ + " in flatbuffer-encoded metadata"); \ + } + +template +inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector* vec) { + return (vec == NULLPTR) ? 0 : vec->size(); +} + +inline std::string StringFromFlatbuffers(const flatbuffers::String* s) { + return (s == NULLPTR) ? "" : s->str(); +} + // Read interface classes. We do not fully deserialize the flatbuffers so that // individual fields metadata can be retrieved from very large schema without // diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8712d46d1c9..f99037a0d34 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -102,10 +102,7 @@ class IpcComponentSource { Status GetBuffer(int buffer_index, std::shared_ptr* out) { auto buffers = metadata_->buffers(); - if (buffers == nullptr) { - return Status::IOError( - "Buffers-pointer of flatbuffer-encoded RecordBatch is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast(buffers->size())) { return Status::IOError("buffer_index out of range."); } @@ -127,9 +124,7 @@ class IpcComponentSource { Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); - if (nodes == nullptr) { - return Status::IOError("Nodes-pointer of flatbuffer-encoded Table is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); // pop off a field if (field_index >= static_cast(nodes->size())) { return Status::Invalid("Ran out of field metadata, likely malformed"); @@ -441,6 +436,7 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // The dictionary is embedded in a record batch with a single column std::shared_ptr batch; auto batch_meta = dictionary_batch->data(); + CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data"); RETURN_NOT_OK(ReadRecordBatch(batch_meta, ::arrow::schema({value_field}), dictionary_memo, options, file, &batch)); if (batch->num_columns() != 1) { @@ -475,9 +471,6 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { } CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type()); CHECK_HAS_NO_BODY(*message); - if (message->header() == nullptr) { - return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); - } return internal::GetSchema(message->header(), &dictionary_memo_, &schema_); } @@ -663,9 +656,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { return Status::OK(); } - int num_dictionaries() const { return footer_->dictionaries()->size(); } + int num_dictionaries() const { + return static_cast(internal::FlatBuffersVectorSize(footer_->dictionaries())); + } - int num_record_batches() const { return footer_->recordBatches()->size(); } + int num_record_batches() const { + return 
static_cast(internal::FlatBuffersVectorSize(footer_->recordBatches())); + } MetadataVersion version() const { return internal::GetMetadataVersion(footer_->version()); From 09059d51e42d4c930e72016a09e551e04c6f6b60 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 4 Feb 2020 12:02:29 -0600 Subject: [PATCH 17/26] ARROW-7760: [Release] Fix verify-release-candidate.sh since pip3 seems to no longer be in miniconda, install miniconda unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change was necessary for me to get the script to finish to completion. Closes #6348 from wesm/pip3-to-pip and squashes the following commits: fcf3ae643 conda environment must be activated for other steps to work if C++ is disabled. Fix selective disabling of integration test components in archery 26da75982 always set up miniconda 490ceacfe pip3 no longer in miniconda Lead-authored-by: Wes McKinney Co-authored-by: Krisztián Szűcs Signed-off-by: Wes McKinney --- dev/archery/archery/integration/runner.py | 12 +++---- dev/release/verify-release-candidate.sh | 44 +++++++++++++---------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 9fa3f6194c7..60ee8f3b856 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -227,23 +227,23 @@ def get_static_json_files(): ] -def run_all_tests(enable_cpp=True, enable_java=True, enable_js=True, - enable_go=True, run_flight=False, +def run_all_tests(with_cpp=True, with_java=True, with_js=True, + with_go=True, run_flight=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp() testers = [] - if enable_cpp: + if with_cpp: testers.append(CPPTester(kwargs)) - if enable_java: + if with_java: testers.append(JavaTester(kwargs)) - if enable_js: + if with_js: testers.append(JSTester(kwargs)) - if enable_go: + if with_go: testers.append(GoTester(kwargs)) static_json_files = get_static_json_files() diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index bae762d3274..88974529ecf 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -30,7 +30,8 @@ # LD_LIBRARY_PATH. # # To reuse build artifacts between runs set TMPDIR environment variable to -# a directory where the temporary files should be placed to. +# a directory where the temporary files should be placed to, note that this +# directory is not cleaned up automatically. case $# in 3) ARTIFACT="$1" @@ -126,7 +127,7 @@ test_binary() { local download_dir=binaries mkdir -p ${download_dir} - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} } @@ -207,11 +208,13 @@ setup_tempdir() { echo "Failed to verify release candidate. See ${TMPDIR} for details." fi } - trap cleanup EXIT if [ -z "${TMPDIR}" ]; then + # clean up automatically if TMPDIR is not defined TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT else + # don't clean up automatically mkdir -p "${TMPDIR}" fi } @@ -235,6 +238,15 @@ setup_miniconda() { fi . 
$MINICONDA/etc/profile.d/conda.sh + + conda create -n arrow-test -y -q -c conda-forge \ + python=3.6 \ + nomkl \ + numpy \ + pandas \ + six \ + cython + conda activate arrow-test } # Build and test Java (Requires newer Maven -- I used 3.3.9) @@ -251,15 +263,6 @@ test_package_java() { # Build and test C++ test_and_install_cpp() { - conda create -n arrow-test -y -q -c conda-forge \ - python=3.6 \ - nomkl \ - numpy \ - pandas \ - six \ - cython - conda activate arrow-test - mkdir -p cpp/build pushd cpp/build @@ -503,7 +506,7 @@ test_integration() { export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/release - pip3 install -e dev/archery + pip install -e dev/archery INTEGRATION_TEST_ARGS="" @@ -552,7 +555,6 @@ test_source_distribution() { test_package_java fi if [ ${TEST_CPP} -gt 0 ]; then - setup_miniconda test_and_install_cpp fi if [ ${TEST_CSHARP} -gt 0 ]; then @@ -686,9 +688,9 @@ test_wheels() { conda create -yq -n py3-base python=3.7 conda activate py3-base - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ - --regex=${filter_regex} \ - --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ + --regex=${filter_regex} \ + --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} @@ -751,6 +753,9 @@ setup_tempdir "arrow-${VERSION}" echo "Working in sandbox ${TMPDIR}" cd ${TMPDIR} +setup_miniconda +echo "Using miniconda environment ${MINICONDA}" + if [ "${ARTIFACT}" == "source" ]; then dist_name="apache-arrow-${VERSION}" if [ ${TEST_SOURCE} -gt 0 ]; then @@ -759,6 +764,10 @@ if [ "${ARTIFACT}" == "source" ]; then tar xf ${dist_name}.tar.gz else mkdir -p ${dist_name} + if [ ! -f ${TEST_ARCHIVE} ]; then + echo "${TEST_ARCHIVE} not found, did you mean to pass TEST_SOURCE=1?" + exit 1 + fi tar xf ${TEST_ARCHIVE} -C ${dist_name} --strip-components=1 fi pushd ${dist_name} @@ -766,7 +775,6 @@ if [ "${ARTIFACT}" == "source" ]; then popd elif [ "${ARTIFACT}" == "wheels" ]; then import_gpg_keys - setup_miniconda test_wheels else import_gpg_keys From b7dbbccae93839e3d5a65286d366a56d92548061 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 4 Feb 2020 12:35:14 -0600 Subject: [PATCH 18/26] ARROW-6757: [Release] Use same CMake generator for C++ and Python when verifying RC, remove Python 3.5 from wheel verification This resolves the issues I was having as described in ARROW-6757. 
This does not fix the Python 3.8 wheel, though Closes #6350 from wesm/windows-rc-verify-fixes and squashes the following commits: a9d4c6656 Fixes for Windows release verification scripts Authored-by: Wes McKinney Signed-off-by: Wes McKinney --- dev/release/verify-release-candidate-wheels.bat | 7 ++----- dev/release/verify-release-candidate.bat | 12 +++++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index 0f8e047752e..19056cd77b6 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -25,7 +25,7 @@ @echo on set _CURRENT_DIR=%CD% -set _VERIFICATION_DIR=C:\tmp\arrow-verify-release +set _VERIFICATION_DIR=C:\tmp\arrow-verify-release-wheels if not exist "C:\tmp\" mkdir C:\tmp if exist %_VERIFICATION_DIR% rd %_VERIFICATION_DIR% /s /q @@ -33,9 +33,6 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR% cd %_VERIFICATION_DIR% -CALL :verify_wheel 3.5 %1 %2 -if errorlevel 1 GOTO error - CALL :verify_wheel 3.6 %1 %2 if errorlevel 1 GOTO error @@ -62,7 +59,7 @@ set ARROW_VERSION=%2 set RC_NUMBER=%3 set PY_VERSION_NO_PERIOD=%PY_VERSION:.=% -set CONDA_ENV_PATH=C:\tmp\arrow-verify-release\_verify-wheel-%PY_VERSION% +set CONDA_ENV_PATH=%_VERIFICATION_DIR%\_verify-wheel-%PY_VERSION% call conda create -p %CONDA_ENV_PATH% ^ --no-shortcuts -f -q -y python=%PY_VERSION% ^ || EXIT /B 1 diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 70d47347823..8596b96f870 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -71,7 +71,10 @@ pushd %ARROW_SOURCE%\cpp\build @rem This is the path for Visual Studio Community 2017 call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 -cmake -G "Ninja" ^ +@rem NOTE(wesm): not using Ninja for now to be able to more easily control the +@rem generator used + +cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BOOST_USE_SHARED=ON ^ @@ -90,11 +93,13 @@ cmake -G "Ninja" ^ -DARROW_PARQUET=ON ^ .. || exit /B +cmake --build . --target INSTALL --config Release + @rem NOTE(wesm): Building googletest is flaky for me with ninja. Building it @rem first fixes the problem -ninja googletest_ep || exit /B -ninja install || exit /B +@rem ninja googletest_ep || exit /B +@rem ninja install || exit /B @rem Get testing datasets for Parquet unit tests git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing @@ -112,6 +117,7 @@ popd @rem Build and import pyarrow pushd %ARROW_SOURCE%\python +set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_WITH_FLIGHT=1 set PYARROW_WITH_PARQUET=1 python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B From 992d9fcff72c8ee3bd22292ce44a450acffe044f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 5 Feb 2020 06:42:37 +0900 Subject: [PATCH 19/26] ARROW-7752: [Release] Enable and test dataset in the verification script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're not testing the dataset feature in the verifications scripts yet. 
Closes #6346 from kszucs/dataset-verification and squashes the following commits: b8530ea39 Test dataset during the verification Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- dev/release/verify-release-candidate-wheels.bat | 1 + dev/release/verify-release-candidate.bat | 2 ++ dev/release/verify-release-candidate.sh | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index 19056cd77b6..6bf1c111218 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -75,6 +75,7 @@ python -c "import pyarrow" || EXIT /B 1 python -c "import pyarrow.flight" || EXIT /B 1 python -c "import pyarrow.gandiva" || EXIT /B 1 python -c "import pyarrow.parquet" || EXIT /B 1 +python -c "import pyarrow.dataset" || EXIT /B 1 call deactivate diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 8596b96f870..bcef9f51f6e 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -90,6 +90,7 @@ cmake -G "%GENERATOR%" ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_FLIGHT=ON ^ -DARROW_PYTHON=ON ^ + -DARROW_DATASET=ON ^ -DARROW_PARQUET=ON ^ .. || exit /B @@ -120,6 +121,7 @@ pushd %ARROW_SOURCE%\python set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_WITH_FLIGHT=1 set PYARROW_WITH_PARQUET=1 +set PYARROW_WITH_DATASET=1 python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B py.test pyarrow -v -s --parquet || exit /B diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 88974529ecf..60a9d544bd9 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -276,6 +276,7 @@ ${ARROW_CMAKE_OPTIONS:-} -DARROW_PYTHON=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON +-DARROW_DATASET=ON -DPARQUET_REQUIRE_ENCRYPTION=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON @@ -365,6 +366,7 @@ test_python() { pip install -r requirements.txt -r requirements-test.txt + export PYARROW_WITH_DATASET=1 export PYARROW_WITH_GANDIVA=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 @@ -605,7 +607,8 @@ check_python_imports() { python -c "import pyarrow.fs" if [[ "$py_arch" =~ ^3 ]]; then - # Flight and Gandiva are only available for py3 + # Flight, Gandiva and Dataset are only available for py3 + python -c "import pyarrow.dataset" python -c "import pyarrow.flight" python -c "import pyarrow.gandiva" fi From 67e34c53b3be4c88348369f8109626b4a8a997aa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Feb 2020 18:57:12 -0600 Subject: [PATCH 20/26] ARROW-7766: [Python][Packaging] Windows py38 wheels are built with wrong ABI tag This is fixed in the latest release of `wheel` but we were pinning to an old version Closes #6353 from nealrichardson/fix-win-38-wheels and squashes the following commits: e3a865bc0 Remove wheel pin in requirements-wheel.txt Authored-by: Neal Richardson Signed-off-by: Wes McKinney --- python/requirements-wheel.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/requirements-wheel.txt b/python/requirements-wheel.txt index 99a2666dd42..18e79636e39 100644 --- a/python/requirements-wheel.txt +++ b/python/requirements-wheel.txt @@ -6,7 +6,6 @@ pandas setuptools_scm==3.2.0 six>=1.0.0 # TODO: TensorFlow doesn't support Python 3.8 yet. 
+# See https://github.com/tensorflow/tensorflow/issues/33374 tensorflow; python_version >= "3.0" and python_version < "3.8" -# pin wheel, because auditwheel is not compatible with wheel=0.32 -# TODO(kszucs): remove after auditwheel properly supports wheel>0.31 -wheel==0.31.1 +wheel From 184f8288d2ed18175c3a030dcc268aed0946eb8c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 Feb 2020 18:58:37 -0600 Subject: [PATCH 21/26] ARROW-7762: [Python] Do not ignore exception for invalid version in ParquetWriter Closes #6352 from jorisvandenbossche/ARROW-7762 and squashes the following commits: 935365831 ARROW-7762: Do not ignore exception for invalid version in ParquetWriter Authored-by: Joris Van den Bossche Signed-off-by: Wes McKinney --- python/pyarrow/_parquet.pyx | 5 +++-- python/pyarrow/tests/test_parquet.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 256c3cd2179..cc2b25da72b 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1304,14 +1304,15 @@ cdef class ParquetWriter: else: props.disallow_truncated_timestamps() - cdef void _set_version(self, WriterProperties.Builder* props): + cdef int _set_version(self, WriterProperties.Builder* props) except -1: if self.version is not None: if self.version == "1.0": props.version(ParquetVersion_V1) elif self.version == "2.0": props.version(ParquetVersion_V2) else: - raise ArrowException("Unsupported Parquet format version") + raise ValueError("Unsupported Parquet format version: {0}" + .format(self.version)) cdef void _set_compression_props(self, WriterProperties.Builder* props): if isinstance(self.compression, basestring): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f106da3267c..e18075e4aab 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -177,6 +177,12 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): tm.assert_frame_equal(df, df_read) +def test_parquet_invalid_version(tempdir): + table = pa.table({'a': [1, 2, 3]}) + with pytest.raises(ValueError, match="Unsupported Parquet format version"): + _write_table(table, tempdir / 'test_version.parquet', version="2.2") + + def test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) From 25fd97b81a65df7aca5d5f3ce482a1126bb01b83 Mon Sep 17 00:00:00 2001 From: Martin Radev Date: Tue, 4 Feb 2020 19:10:53 -0600 Subject: [PATCH 22/26] PARQUET-1716: [C++] Add BYTE_STREAM_SPLIT encoder and decoder The patch implements an encoder and decoder for Parquet's BYTE_STREAM_SPLIT encoding. The patch also adds tests for the new encoding. 
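For context, BYTE_STREAM_SPLIT performs no compression on its own; it only transposes the bytes of the values so that byte j of every value is stored contiguously in stream j, which tends to make a subsequent general-purpose compressor much more effective on floating-point columns. A minimal standalone sketch of that transposition for floats, with an illustrative function name rather than the encoder class added by this patch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scatter byte j of each value into stream j; the encoded buffer is then
    // [byte 0 of all values][byte 1 of all values]... as in FlushValues below.
    std::vector<uint8_t> ByteStreamSplitEncode(const float* values, size_t n) {
      constexpr size_t kNumStreams = sizeof(float);  // 4 streams for float
      std::vector<uint8_t> out(n * kNumStreams);
      const uint8_t* raw = reinterpret_cast<const uint8_t*>(values);
      for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < kNumStreams; ++j) {
          out[j * n + i] = raw[i * kNumStreams + j];
        }
      }
      return out;
    }

Decoding is the inverse gather of one byte from each stream per value, as in ByteStreamSplitDecoder::Decode below.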
Closes #6005 from martinradev/byte_stream_split_submit and squashes the following commits: 5a78f8b6b ARROW-5913: Add BYTE_STREAM_SPLIT encoder and decoder Authored-by: Martin Radev Signed-off-by: Wes McKinney --- cpp/src/parquet/column_reader.cc | 6 + cpp/src/parquet/encoding.cc | 188 +++++++++++++++++++++++++++++++ cpp/src/parquet/encoding_test.cc | 187 ++++++++++++++++++++++++++++++ cpp/src/parquet/types.cc | 2 + cpp/src/parquet/types.h | 1 + 5 files changed, 384 insertions(+) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index dded399240e..3cfb8938175 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -577,6 +577,12 @@ class ColumnReaderImplBase { decoders_[static_cast(encoding)] = std::move(decoder); break; } + case Encoding::BYTE_STREAM_SPLIT: { + auto decoder = MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT, descr_); + current_decoder_ = decoder.get(); + decoders_[static_cast(encoding)] = std::move(decoder); + break; + } case Encoding::RLE_DICTIONARY: throw ParquetException("Dictionary page must be before data page."); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a61bcecc974..dc06436ff9c 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -815,6 +815,104 @@ void DictEncoderImpl::PutDictionary(const arrow::Array& values) { } } +// ---------------------------------------------------------------------- +// ByteStreamSplitEncoder implementations + +template +class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + using T = typename DType::c_type; + using TypedEncoder::Put; + + explicit ByteStreamSplitEncoder( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + int64_t EstimatedDataEncodedSize() override; + std::shared_ptr FlushValues() override; + + void Put(const T* buffer, int num_values) override; + void Put(const arrow::Array& values) override; + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override; + + protected: + arrow::TypedBufferBuilder values_; + + private: + void PutArrowArray(const arrow::Array& values); +}; + +template +ByteStreamSplitEncoder::ByteStreamSplitEncoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) + : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool), values_{pool} {} + +template +int64_t ByteStreamSplitEncoder::EstimatedDataEncodedSize() { + return values_.length() * sizeof(T); +} + +template +std::shared_ptr ByteStreamSplitEncoder::FlushValues() { + constexpr size_t num_streams = sizeof(T); + std::shared_ptr output_buffer = + AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize()); + uint8_t* output_buffer_raw = output_buffer->mutable_data(); + const size_t num_values = values_.length(); + const uint8_t* raw_values = reinterpret_cast(values_.data()); + for (size_t i = 0; i < num_values; ++i) { + for (size_t j = 0U; j < num_streams; ++j) { + const uint8_t byte_in_value = raw_values[i * num_streams + j]; + output_buffer_raw[j * num_values + i] = byte_in_value; + } + } + values_.Reset(); + return std::move(output_buffer); +} + +template +void ByteStreamSplitEncoder::Put(const T* buffer, int num_values) { + PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values)); +} + +template +void ByteStreamSplitEncoder::Put(const ::arrow::Array& values) { + PutArrowArray(values); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + 
DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template +void ByteStreamSplitEncoder::PutSpaced(const T* src, int num_values, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); +} + // ---------------------------------------------------------------------- // Encoder and decoder factory functions @@ -863,6 +961,18 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin DCHECK(false) << "Encoder not implemented"; break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + case Type::DOUBLE: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } @@ -2236,6 +2346,74 @@ class DeltaByteArrayDecoder : public DecoderImpl, ByteArray last_value_; }; +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT + +template +class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder { + public: + using T = typename DType::c_type; + explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr); + + int Decode(T* buffer, int max_values) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override; + + void SetData(int num_values, const uint8_t* data, int len) override; + + private: + int num_values_in_buffer{0U}; +}; + +template +ByteStreamSplitDecoder::ByteStreamSplitDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {} + +template +void ByteStreamSplitDecoder::SetData(int num_values, const uint8_t* data, + int len) { + DecoderImpl::SetData(num_values, data, len); + num_values_in_buffer = num_values; +} + +template +int ByteStreamSplitDecoder::Decode(T* buffer, int max_values) { + constexpr size_t num_streams = sizeof(T); + const int values_to_decode = std::min(num_values_, max_values); + const int num_decoded_previously = num_values_in_buffer - num_values_; + for (int i = 0; i < values_to_decode; ++i) { + uint8_t gathered_byte_data[num_streams]; + for (size_t b = 0; b < num_streams; ++b) { + const size_t byte_index = b * num_values_in_buffer + num_decoded_previously + i; + gathered_byte_data[b] = data_[byte_index]; + } + buffer[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + } + num_values_ -= values_to_decode; + len_ -= sizeof(T) * values_to_decode; + return values_to_decode; +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + 
int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + // ---------------------------------------------------------------------- std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, @@ -2261,6 +2439,16 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin default: break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + case Type::DOUBLE: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 473b985b35a..4a40d0162f8 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -29,6 +29,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "parquet/encoding.h" #include "parquet/platform.h" @@ -38,6 +39,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; // TODO(hatemhelal): investigate whether this can be replaced with GTEST_SKIP in a future // gtest release that contains https://github.com/google/googletest/pull/1544 @@ -862,5 +864,190 @@ TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) { CheckDict(actual_num_values, *builder); } +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT encode/decode tests. 
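// For reference, the transform exercised by the tests below can be written as a
// few lines of standalone C++. ByteStreamSplitEncode is a hypothetical helper
// used only for illustration here, not the encoder defined above: with
// K = sizeof(T) byte streams and N values, stream j holds the j-th byte of every
// value, so encoded[j * N + i] = raw[i * K + j].

#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
std::vector<std::uint8_t> ByteStreamSplitEncode(const std::vector<T>& values) {
  constexpr std::size_t K = sizeof(T);  // one byte stream per byte of T
  const std::size_t N = values.size();
  std::vector<std::uint8_t> raw(N * K), encoded(N * K);
  std::memcpy(raw.data(), values.data(), raw.size());
  for (std::size_t i = 0; i < N; ++i) {
    for (std::size_t j = 0; j < K; ++j) {
      encoded[j * N + i] = raw[i * K + j];  // byte j of value i goes to stream j
    }
  }
  return encoded;
}

// Decoding is the inverse gather, raw[i * K + j] = encoded[j * N + i], which is
// what ByteStreamSplitDecoder::Decode() above performs.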
+ +template +void TestByteStreamSplitDecodePath(const uint8_t* encoded_data, + const int64_t encoded_data_size, + const typename DType::c_type* expected_decoded_data, + const int num_elements, + const bool request_more_values) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + decoder->SetData(num_elements, encoded_data, static_cast(encoded_data_size)); + std::vector decoded_data(num_elements); + int num_elements_to_decode = num_elements; + if (request_more_values) { + num_elements_to_decode += 100; + } + int num_decoded_elements = decoder->Decode(decoded_data.data(), num_elements_to_decode); + ASSERT_EQ(num_elements, num_decoded_elements); + for (size_t i = 0U; i < decoded_data.size(); ++i) { + ASSERT_EQ(expected_decoded_data[i], decoded_data[i]); + } + ASSERT_EQ(0, decoder->values_left()); +} + +template +void TestByteStreamSplitRoundTrip(const typename DType::c_type* input_data, const int n) { + auto encoder = MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->Put(input_data, n); + const int64_t estimated_num_bytes = encoder->EstimatedDataEncodedSize(); + ASSERT_EQ(static_cast(n) * sizeof(typename DType::c_type), + estimated_num_bytes); + std::shared_ptr buffer = encoder->FlushValues(); + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), input_data, n, + false); +} + +template +void TestEncodeDecodeWithBigInput() { + const int nvalues = 10000; + using T = typename DType::c_type; + std::vector data(nvalues); + GenerateData(nvalues, data.data(), nullptr); + TestByteStreamSplitRoundTrip(data.data(), nvalues); +} + +// Check that the encoder can handle input with one element. +TEST(ByteStreamSplitEncodeDecode, EncodeOneLenInput) { + const float value = 1.0f; + TestByteStreamSplitRoundTrip(&value, 1); +} + +// Check that the decoder can handle empty input. +TEST(ByteStreamSplitEncodeDecode, DecodeZeroLenInput) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + decoder->SetData(0, NULL, 0); + ASSERT_EQ(0U, decoder->Decode(NULL, 0)); +} + +TEST(ByteStreamSplitEncodeDecode, DecodeOneLenInput) { + const uint8_t data[] = {0x47U, 0x24U, 0xa7U, 0x44U}; + TestByteStreamSplitDecodePath( + data, 4, reinterpret_cast(&data[0]), 1, false); +} + +// Check that requesting to decode more elements than is available in the storage +// of the decoder works correctly. +TEST(ByteStreamSplitEncodeDecode, DecodeLargerPortion) { + const uint8_t data[] = {0xDEU, 0xC0U, 0x37U, 0x13U, 0x11U, 0x22U, 0x33U, 0x44U, + 0xAAU, 0xBBU, 0xCCU, 0xDDU, 0x55U, 0x66U, 0x77U, 0x88U}; + const uint64_t expected_output[2] = {0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL}; + TestByteStreamSplitDecodePath( + data, sizeof(data), reinterpret_cast(&expected_output[0]), 2, true); +} + +// Check that the decoder can decode the input in smaller steps. 
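// Successive Decode() calls are expected to pick up where the previous call
// stopped within every byte stream, since the decoder tracks how many values it
// has already emitted.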
+TEST(ByteStreamSplitEncodeDecode, DecodeMultipleTimes) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + const int num_values = 100; + std::vector data(num_values * 4); + for (size_t i = 0; i < data.size(); ++i) { + data[i] = static_cast(i & 0xFFU); + } + decoder->SetData(num_values, data.data(), num_values * 4); + + const int step = 25; + std::vector decoded_data(step); + for (int i = 0; i < num_values; i += step) { + int num_decoded = decoder->Decode(decoded_data.data(), step); + ASSERT_EQ(step, num_decoded); + for (int j = 0; j < step; ++j) { + const uint32_t assembled_value = + static_cast(data[i + j]) | + (static_cast(data[(i + j) + num_values]) << 8U) | + (static_cast(data[(i + j) + num_values * 2]) << 16U) | + (static_cast(data[(i + j) + num_values * 3]) << 24U); + const float assembled_value_as_float = + *reinterpret_cast(&assembled_value); + ASSERT_EQ(assembled_value_as_float, decoded_data[j]); + } + } +} + +// Check that an encode-decode pipeline produces the original small input. +// This small-input test is added to ease debugging in case of changes to +// the encoder/decoder implementation. +TEST(ByteStreamSplitEncodeDecode, SmallInput) { + const float data[] = {-166.166f, -0.2566f, .0f, 322.0f, 178888.189f}; + const int num_values = sizeof(data) / sizeof(data[0U]); + TestByteStreamSplitRoundTrip(data, num_values); +} + +TEST(ByteStreamSplitEncodeDecode, PutSpaced) { + const float data[] = {-1.0f, .0f, .0f, 3.0f, .0f, 22.1234f, + .0f, 198891.0f, .0f, -223345.4455f, 24443.124f}; + const float valid_data[] = {-1.0f, 3.0f, 22.1234f, + 198891.0f, -223345.4455f, 24443.124f}; + // The valid ones are the ones which are non-zero. + // The enable bits are: 10010101 011. + const uint8_t valid_bits[2] = {0xA9U, 0x6U}; + const int num_values = sizeof(data) / sizeof(data[0U]); + const int num_valid_values = sizeof(valid_data) / sizeof(valid_data[0U]); + std::unique_ptr> encoder = + MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->PutSpaced(data, num_values, valid_bits, 0); + std::shared_ptr buffer = encoder->FlushValues(); + + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), valid_data, + num_valid_values, false); +} + +TEST(ByteStreamSplitEncodeDecode, PutArrow) { + arrow::random::RandomArrayGenerator rag{1337}; + const int num_values = 123; + auto arr = rag.Float32(num_values, -2048.0f, 2048.0f, 0); + std::unique_ptr> encoder = + MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->Put(*arr); + std::shared_ptr buffer = encoder->FlushValues(); + + auto raw_values = checked_cast(*arr).raw_values(); + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), raw_values, + num_values, false); +} + +// Test that the encode-decode pipeline can handle big 32-bit FP input. +TEST(ByteStreamSplitEncodeDecode, BigInputFloat) { + TestEncodeDecodeWithBigInput(); +} + +// Test that the encode-decode pipeline can handle big 64-bit FP input. +TEST(ByteStreamSplitEncodeDecode, BigInputDouble) { + TestEncodeDecodeWithBigInput(); +} + +TEST(ByteStreamSplitEncodeDecode, InvalidDataTypes) { + // First check encoders. 
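// Every non-floating-point Parquet type should be rejected, since the factories
// above only wire BYTE_STREAM_SPLIT up for FLOAT and DOUBLE.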
+ ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), ParquetException); + + // Then check decoders. + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), ParquetException); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index e7efe6a263c..9dc13e42191 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -157,6 +157,8 @@ std::string EncodingToString(Encoding::type t) { return "DELTA_BYTE_ARRAY"; case Encoding::RLE_DICTIONARY: return "RLE_DICTIONARY"; + case Encoding::BYTE_STREAM_SPLIT: + return "BYTE_STREAM_SPLIT"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index ebb8c2446e3..420494a8951 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -452,6 +452,7 @@ struct Encoding { DELTA_LENGTH_BYTE_ARRAY = 6, DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, + BYTE_STREAM_SPLIT = 9, UNKNOWN = 999 }; }; From c02d376ed31d51dce0f37e13b9a2f4cb0c39a792 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 5 Feb 2020 15:45:32 +0100 Subject: [PATCH 23/26] ARROW-4226: [C++] Add sparse CSF tensor support This is to resolve [ARROW-4226](https://issues.apache.org/jira/browse/ARROW-4226). Closes #5716 from rok/ARROW-4226 and squashes the following commits: 9ca93ab60 Implementing review feedback. 1b922f6ae Implementing review feedback. 11b81bb04 Factoring out index incrementing for dense to COO and CSF indices. 6f4f4a8f9 Implementing feedback review. 28d38cb5e Removing backslashes from comments. 3291abc90 Marking indptrBuffers, indicesBuffers and axisOrder required. d9ff47e67 Further work and implementing review feedback. 24a831f3e Style. 4f2bf00dd Work on CSF index tests. 6ceb406b6 Implementing review feedback. bd0d8c2f8 Dense to sparse CSF conversion now in order of dimension size. eb519471d Switching SparseCSFIndex to '2D' data structure. a322ff5b2 Adding tests for multiple index value types for SparseCSFIndex. f44d92cfd Adding SparseCSFIndex::Make. 7d17995a4 Adding Tensor to SparseCSFTensor conversion. 05a47a546 Using axis_order in CSF. 6b938f7da Documentation. 
2d1010405 WIP Authored-by: Rok Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compare.cc | 12 + cpp/src/arrow/python/serialize.cc | 5 + cpp/src/arrow/sparse_tensor.cc | 345 ++++++++++++++++++++++++++-- cpp/src/arrow/sparse_tensor.h | 67 ++++++ cpp/src/arrow/sparse_tensor_test.cc | 222 ++++++++++++++++++ format/SparseTensor.fbs | 2 +- 6 files changed, 628 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f7431f80f5f..d2322009ea8 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1196,6 +1196,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csf); + } + default: return false; } @@ -1232,6 +1239,11 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { return SparseTensorEqualsImplDispatch(left_csc, right); } + case SparseTensorFormat::CSF: { + const auto& left_csf = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csf, right); + } + default: return false; } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 09a322b1060..06c85648591 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -654,6 +654,7 @@ Status CountSparseTensors( OwnedRef num_sparse_tensors(PyDict_New()); size_t num_coo = 0; size_t num_csr = 0; + size_t num_csf = 0; for (const auto& sparse_tensor : sparse_tensors) { switch (sparse_tensor->format_id()) { @@ -663,6 +664,9 @@ Status CountSparseTensors( case SparseTensorFormat::CSR: ++num_csr; break; + case SparseTensorFormat::CSF: + ++num_csf; + break; case SparseTensorFormat::CSC: // TODO(mrkn): support csc break; @@ -671,6 +675,7 @@ Status CountSparseTensors( PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo)); PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr)); + PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf)); RETURN_IF_PYERROR(); *out = num_sparse_tensors.detach(); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index d42bdf4ca61..549223c0798 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -23,9 +23,11 @@ #include #include +#include "arrow/buffer_builder.h" #include "arrow/compare.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -55,6 +57,38 @@ class SparseTensorConverter { Status Convert() { return Status::Invalid("Unsupported sparse index"); } }; +// ---------------------------------------------------------------------- +// IncrementIndex for SparseCOOIndex and SparseCSFIndex + +inline void IncrementIndex(std::vector& coord, + const std::vector& shape) { + const int64_t ndim = shape.size(); + ++coord[ndim - 1]; + if (coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } +} + +inline void IncrementIndex(std::vector& coord, const std::vector& shape, + const std::vector& axis_order) { + const int64_t ndim = shape.size(); + const int64_t last_axis = axis_order[ndim - 1]; + ++coord[last_axis]; + if (coord[last_axis] == shape[last_axis]) { + int64_t d = ndim - 1; + while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { + coord[axis_order[d]] = 0; + ++coord[axis_order[d - 1]]; + --d; + } + } +} + // 
---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex @@ -128,17 +162,7 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { - int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; - --d; - } - } + IncrementIndex(coord, shape); } } @@ -419,6 +443,144 @@ class SparseTensorConverter inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSFIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using typename BaseClass::NumericTensorType; + using typename BaseClass::value_type; + + SparseTensorConverter(const NumericTensorType& tensor, + const std::shared_ptr& index_value_type, + MemoryPool* pool) + : BaseClass(tensor, index_value_type, pool) {} + + template + Status Convert() { + using c_index_value_type = typename IndexValueType::c_type; + RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); + + const int64_t ndim = tensor_.ndim(); + // Axis order as ascending order of dimension size is a good heuristic but is not + // necessarily optimal. + std::vector axis_order = internal::ArgSort(tensor_.shape()); + int64_t nonzero_count = -1; + RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); + + std::shared_ptr values_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + std::vector counts(ndim, 0); + std::vector coord(ndim, 0); + std::vector previous_coord(ndim, -1); + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + const std::vector& shape = tensor_.shape(); + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + + if (x != 0) { + bool tree_split = false; + *values++ = x; + + for (int64_t i = 0; i < ndim; ++i) { + int64_t dimension = axis_order[i]; + + tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); + if (tree_split) { + if (i < ndim - 1) { + RETURN_NOT_OK(indptr_buffer_builders[i].Append( + static_cast(counts[i + 1]))); + } + RETURN_NOT_OK(indices_buffer_builders[i].Append( + static_cast(coord[dimension]))); + ++counts[i]; + } + } + previous_coord = coord; + } + IncrementIndex(coord, shape, axis_order); + } + } + + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); + } + + // make results + data = values_buffer; + + std::vector> indptr_buffers(ndim - 1); + std::vector> indices_buffers(ndim); + std::vector indptr_shapes(counts.begin(), counts.end() - 1); + std::vector indices_shapes = counts; + + for (int64_t column = 0; column < ndim; ++column) { + RETURN_NOT_OK( + indices_buffer_builders[column].Finish(&indices_buffers[column], true)); + } + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); + } + + ARROW_ASSIGN_OR_RAISE( + sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, + indptr_buffers, 
indices_buffers)); + return Status::OK(); + } + +#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return Convert(); + + Status Convert() { + switch (index_value_type_->id()) { + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); + // LCOV_EXCL_START: The following invalid causes program failure. + default: + return Status::TypeError("Unsupported SparseTensor index value type"); + // LCOV_EXCL_STOP + } + } + +#undef CALL_TYPE_SPECIFIC_CONVERT + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::index_value_type_; + using BaseClass::pool_; + using BaseClass::tensor_; + + template + inline Status CheckMaximumValue(const c_value_type type_max) const { + auto max_dimension = + *std::max_element(tensor_.shape().begin(), tensor_.shape().end()); + if (static_cast(type_max) < max_dimension) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("The bit width of the index value type is too small"); + // LCOV_EXCL_STOP + } + return Status::OK(); + } + + inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); } + + inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } +}; + // ---------------------------------------------------------------------- // Instantiate templates @@ -438,6 +600,7 @@ class SparseTensorConverter INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSFIndex); } // namespace @@ -500,6 +663,10 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, case SparseTensorFormat::CSC: return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); + case SparseTensorFormat::CSF: + return MakeSparseTensorFromTensor(tensor, index_value_type, pool, + out_sparse_index, out_data); + // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); @@ -507,6 +674,35 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } } +namespace { + +template +void ExpandSparseCSFTensorValues(int64_t dimension, int64_t dense_offset, + int64_t first_ptr, int64_t last_ptr, + const SparseCSFIndex& sparse_index, const TYPE* raw_data, + const std::vector& strides, + const std::vector& axis_order, TYPE* out) { + int64_t ndim = axis_order.size(); + + for (int64_t i = first_ptr; i < last_ptr; ++i) { + int64_t tmp_dense_offset = + dense_offset + sparse_index.indices()[dimension]->Value({i}) * + strides[axis_order[dimension]]; + + if (dimension < ndim - 1) { + ExpandSparseCSFTensorValues( + dimension + 1, tmp_dense_offset, + sparse_index.indptr()[dimension]->Value({i}), + sparse_index.indptr()[dimension]->Value({i + 1}), sparse_index, + raw_data, strides, axis_order, out); + } else { + out[tmp_dense_offset] = raw_data[i]; + } + } +} + +} // namespace + template Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor, std::shared_ptr* out) { @@ -521,18 +717,20 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t std::fill_n(values, sparse_tensor->size(), static_cast(0)); + std::vector strides(sparse_tensor->ndim(), 1); + for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { + strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; + } + std::vector empty_strides; + + const auto raw_data = reinterpret_cast(sparse_tensor->raw_data()); + switch 
(sparse_tensor->format_id()) { case SparseTensorFormat::COO: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr coords = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); - std::vector strides(sparse_tensor->ndim(), 1); - for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { - strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; - } for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) { std::vector coord(sparse_tensor->ndim()); int64_t offset = 0; @@ -543,7 +741,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t values[offset] = raw_data[i]; } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -552,8 +751,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t i = 0; i < indptr->size() - 1; ++i) { @@ -565,7 +762,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -574,8 +772,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t j = 0; j < indptr->size() - 1; ++j) { @@ -587,7 +783,21 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); + return Status::OK(); + } + + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + + ExpandSparseCSFTensorValues( + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, sparse_index, raw_data, strides, + sparse_index.axis_order(), values); + *out = std::make_shared(sparse_tensor->type(), values_buffer, + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } } @@ -625,6 +835,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t const std::shared_ptr indices = sparse_index.indices(); type = indices->type(); break; + } + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::vector> indices = sparse_index.indices(); + type = indices[0]->type(); + break; } // LCOV_EXCL_START: ignore program failure default: @@ -754,6 +971,86 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, } // namespace internal +// ---------------------------------------------------------------------- +// SparseCSFIndex + +namespace { + +inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const int64_t num_indptrs, + 
const int64_t num_indices, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const int64_t axis_order_size) { + if (!is_integer(indptr_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indptr must be integer"); + } + if (!is_integer(indices_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indices must be integer"); + } + if (num_indptrs + 1 != num_indices) { + return Status::Invalid( + "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); + } + if (axis_order_size != num_indices) { + return Status::Invalid( + "Length of indices must be equal to number of dimensions for SparseCSFIndex."); + } + return Status::OK(); +} + +} // namespace + +Result> SparseCSFIndex::Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) + indptr[i] = std::make_shared(indptr_type, indptr_data[i], + std::vector({indices_shapes[i] + 1})); + for (int64_t i = 0; i < ndim; ++i) + indices[i] = std::make_shared(indices_type, indices_data[i], + std::vector({indices_shapes[i]})); + + RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size())); + + return std::make_shared(indptr, indices, axis_order); +} + +// Constructor with two index vectors +SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order) + : SparseIndexBase(indices.back()->size()), + indptr_(indptr), + indices_(indices), + axis_order_(axis_order) { + ARROW_CHECK_OK(CheckSparseCSFIndexValidity( + indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), + indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size())); +} + +std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } + +bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { + for (int64_t i = 0; i < static_cast(indices().size()); ++i) { + if (!indices()[i]->Equals(*other.indices()[i])) return false; + } + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + } + return axis_order() == other.axis_order(); +} + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index f736f7b7576..33a53761e14 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -40,6 +40,8 @@ struct SparseTensorFormat { CSR, /// Compressed sparse column (CSC) format. CSC, + /// Compressed sparse fiber (CSF) format. + CSF }; }; @@ -329,6 +331,68 @@ class ARROW_EXPORT SparseCSCIndex using SparseCSXIndex::SparseCSXIndex; }; +// ---------------------------------------------------------------------- +// SparseCSFIndex class + +/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor +/// +/// A CSF sparse index manages the location of its non-zero values by set of +/// prefix trees. Each path from a root to leaf forms one tensor non-zero index. +/// CSF is implemented with three vectors. 
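///
/// As an illustration (a sketch, not drawn from the code below): a 3 x 4 matrix
/// with non-zero values a at (0, 1), b at (0, 3) and c at (2, 2), traversed in
/// axis order {0, 1}, is described by
///   indices[0] = {0, 2}       rows containing non-zero values
///   indptr[0]  = {0, 2, 3}    child range of each such row within indices[1]
///   indices[1] = {1, 3, 2}    column of each non-zero value, grouped by row
/// with the value buffer holding {a, b, c}.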
+/// +/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the +/// number of dimensions. Axis_order is a vector of integers of legth N. Indptr and +/// indices describe the set of prefix trees. Trees traverse dimensions in order given by +/// axis_order. +class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { + public: + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; + static constexpr char const* kTypeName = "SparseCSFIndex"; + + /// \brief Make SparseCSFIndex from raw properties + static Result> Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data); + + /// \brief Make SparseCSFIndex from raw properties + static Result> Make( + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data, + indices_data); + } + + /// \brief Construct SparseCSFIndex from two index vectors + explicit SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order); + + /// \brief Return a 1D vector of indptr tensors + const std::vector>& indptr() const { return indptr_; } + + /// \brief Return a 1D vector of indices tensors + const std::vector>& indices() const { return indices_; } + + /// \brief Return a 1D vector specifying the order of axes + const std::vector& axis_order() const { return axis_order_; } + + /// \brief Return a string representation of the sparse index + std::string ToString() const override; + + /// \brief Return whether the CSF indices are equal + bool Equals(const SparseCSFIndex& other) const; + + protected: + std::vector> indptr_; + std::vector> indices_; + std::vector axis_order_; +}; + // ---------------------------------------------------------------------- // SparseTensor class @@ -527,6 +591,9 @@ using SparseCSRMatrix = SparseTensorImpl; /// \brief EXPERIMENTAL: Type alias for CSC sparse matrix using SparseCSCMatrix = SparseTensorImpl; +/// \brief EXPERIMENTAL: Type alias for CSF sparse matrix +using SparseCSFTensor = SparseTensorImpl; + } // namespace arrow #endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 198aa8f5f8d..45cb8dcc8f3 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -30,6 +30,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/sort.h" namespace arrow { @@ -910,4 +911,225 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +template +class TestSparseCSFTensorBase : public ::testing::Test { + public: + void SetUp() { + dim_names_ = {"a", "b", "c", "d"}; + shape_ = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor_(int16(), dense_buffer, shape_, {}, dim_names_); + ASSERT_OK_AND_ASSIGN( + 
sparse_tensor_from_dense_, + SparseCSFTensor::Make(dense_tensor_, + TypeTraits::type_singleton())); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; + +template +class TestSparseCSFTensorForIndexValueType + : public TestSparseCSFTensorBase { + protected: + std::shared_ptr MakeSparseCSFIndex( + const std::vector& axis_order, + const std::vector>& indptr_values, + const std::vector>& indices_values) + const { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) { + indptr[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indptr_values[i]), + std::vector({static_cast(indptr_values[i].size())})); + } + for (int64_t i = 0; i < ndim; ++i) { + indices[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indices_values[i]), + std::vector({static_cast(indices_values[i].size())})); + } + return std::make_shared(indptr, indices, axis_order); + } + + template + std::shared_ptr MakeSparseTensor( + const std::shared_ptr& si, std::vector& sparse_values, + const std::vector& shape, + const std::vector& dim_names) const { + auto data_buffer = Buffer::Wrap(sparse_values); + return std::make_shared( + si, CTypeTraits::type_singleton(), data_buffer, shape, dim_names); + } +}; + +TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {2, 3, 4, 5}; + std::vector dim_names = {"a", "b", "c", "d"}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector> indptr_values = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 1, 1}, {0, 0, 1, 1}, {1, 2, 0, 2, 0, 0, 1, 2}}; + + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, dim_names); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { + std::vector dim_names = {"a", "b", "c", "d"}; + ASSERT_EQ(8, this->sparse_tensor_from_dense_->non_zero_length()); + ASSERT_TRUE(this->sparse_tensor_from_dense_->is_mutable()); + ASSERT_EQ(dim_names, this->sparse_tensor_from_dense_->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { + std::vector shape = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + ASSERT_TRUE(dense_tensor.Equals(*dt)); + ASSERT_EQ(dense_tensor.dim_names(), dt->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestRoundTrip) { + using IndexValueType = TypeParam; + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + 
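  // Re-encode the reconstructed dense tensor and check that it compares equal to
  // the original sparse CSF tensor.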
std::shared_ptr st; + ASSERT_OK_AND_ASSIGN( + st, SparseCSFTensor::Make(*dt, TypeTraits::type_singleton())); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestAlternativeAxisOrder) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector dense_values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape = {4, 6}; + std::vector dim_names = {"a", "b"}; + std::shared_ptr dense_buffer = Buffer::Wrap(dense_values); + Tensor tensor(int16(), dense_buffer, shape, {}, dim_names); + + // Axis order 1 + std::vector axis_order_1 = {0, 1}; + std::vector sparse_values_1 = {1, 3, 2, 4, 5}; + std::vector> indptr_values_1 = {{0, 2, 3, 5}}; + std::vector> indices_values_1 = {{0, 1, 3}, + {0, 3, 1, 3, 5}}; + auto si_1 = this->MakeSparseCSFIndex(axis_order_1, indptr_values_1, indices_values_1); + auto st_1 = this->MakeSparseTensor(si_1, sparse_values_1, shape, dim_names); + + // Axis order 2 + std::vector axis_order_2 = {1, 0}; + std::vector sparse_values_2 = {1, 2, 3, 4, 5}; + std::vector> indptr_values_2 = {{0, 1, 2, 4, 5}}; + std::vector> indices_values_2 = {{0, 1, 3, 5}, + {0, 1, 0, 3, 3}}; + auto si_2 = this->MakeSparseCSFIndex(axis_order_2, indptr_values_2, indices_values_2); + auto st_2 = this->MakeSparseTensor(si_2, sparse_values_2, shape, dim_names); + + std::shared_ptr dt_1, dt_2; + ASSERT_OK(st_1->ToTensor(&dt_1)); + ASSERT_OK(st_2->ToTensor(&dt_2)); + + ASSERT_FALSE(st_1->Equals(*st_2)); + ASSERT_TRUE(dt_1->Equals(*dt_2)); + ASSERT_TRUE(dt_1->Equals(tensor)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {5, 2, 3, 4}; + int16_t dense_values[5][2][3][4] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN( + sparse_tensor, + SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); + + std::vector> indptr_values = { + {0, 1, 3}, {0, 2, 4, 7}, {0, 1, 2, 3, 4, 6, 7, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 0, 1}, {1, 2, 0, 2, 0, 1, 2}, {0, 0, 0, 0, 0, 1, 1, 1}}; + std::vector axis_order = {1, 2, 3, 0}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(dt->Equals(dense_tensor)); + ASSERT_TRUE(st->Equals(*sparse_tensor)); +} + +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor, + TestTensorToSparseTensor, TestSparseTensorToTensor, + TestAlternativeAxisOrder, TestNonAscendingShape, + TestRoundTrip); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, 
Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCSFTensorForIndexValueType, + UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, + UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType, + UInt64Type); } // namespace arrow diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 1de67eed19a..9c8ddae0b7c 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -116,7 +116,7 @@ table SparseMatrixIndexCSX { union SparseTensorIndex { SparseTensorIndexCOO, - SparseMatrixIndexCSX, + SparseMatrixIndexCSX } table SparseTensor { From a1ba1cbe1713d252e473a56a3595df58dcbd8a28 Mon Sep 17 00:00:00 2001 From: Takashi Hashida Date: Wed, 5 Feb 2020 12:10:31 -0600 Subject: [PATCH 24/26] ARROW-7514: [C#] Make GetValueOffset Obsolete * Add an [Obsolete] attribute to `BinaryArray.GetValueOffset` * `ListArray.GetValueOffset` already has the [Obsolete] attribute, so it is not changed * Avoid using `GetValueOffset` in the product source code As a precaution, I added tests for `ValueOffsets` and left tests for `GetValueOffset`. Closes #6333 from HashidaTKS/ARROW-7514_make_getvalueoffset_obsolete and squashes the following commits: 1dbaf39f0 ARROW-7514_make_getvalueoffset_obsolete 92b14c055 ARROW-7514_make_getvalueoffset_obsolete 07d106c48 ARROW-7514_make_getvalueoffset_obsolete Authored-by: Takashi Hashida Signed-off-by: Eric Erhardt --- csharp/src/Apache.Arrow/Arrays/BinaryArray.cs | 9 ++++++--- csharp/src/Apache.Arrow/Arrays/ListArray.cs | 2 +- csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs | 8 ++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index 49c483fc41f..e2cf0bc772a 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -171,6 +171,7 @@ public BinaryArray(IArrowType dataType, int length, public ReadOnlySpan Values => ValueBuffer.Span.CastTo(); [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Obsolete("This method has been deprecated. Please use ValueOffsets[index] instead.")] public int GetValueOffset(int index) { if (index < 0 || index > Length) @@ -193,10 +194,12 @@ public int GetValueLength(int index) public ReadOnlySpan GetBytes(int index) { - var offset = GetValueOffset(index); - var length = GetValueLength(index); + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } - return ValueBuffer.Span.Slice(offset, length); + return ValueBuffer.Span.Slice(ValueOffsets[index], GetValueLength(index)); } } diff --git a/csharp/src/Apache.Arrow/Arrays/ListArray.cs b/csharp/src/Apache.Arrow/Arrays/ListArray.cs index ff7095d5da6..bc171e6ffbd 100644 --- a/csharp/src/Apache.Arrow/Arrays/ListArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/ListArray.cs @@ -123,7 +123,7 @@ private ListArray(ArrayData data, IArrowArray values) : base(data) public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); - [Obsolete] + [Obsolete("This method has been deprecated. 
Please use ValueOffsets[index] instead.")] public int GetValueOffset(int index) { if (index < 0 || index > Length) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index f2a8e58c89f..a1a7b231314 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -40,11 +40,19 @@ public void ThrowsWhenGetValueAndOffsetIndexOutOfBounds() Assert.Equal(1, array.GetValueLength(1)); Assert.Throws(() => array.GetValueLength(2)); +#pragma warning disable 618 Assert.Throws(() => array.GetValueOffset(-1)); Assert.Equal(0, array.GetValueOffset(0)); Assert.Equal(1, array.GetValueOffset(1)); Assert.Equal(2, array.GetValueOffset(2)); Assert.Throws(() => array.GetValueOffset(3)); +#pragma warning restore 618 + + Assert.Throws(() => array.ValueOffsets[-1]); + Assert.Equal(0, array.ValueOffsets[0]); + Assert.Equal(1, array.ValueOffsets[1]); + Assert.Equal(2, array.ValueOffsets[2]); + Assert.Throws(() => array.ValueOffsets[3]); } From 12d58e3239302e84febddadc90971cb5a6d521c6 Mon Sep 17 00:00:00 2001 From: Richard Artoul Date: Wed, 5 Feb 2020 20:39:17 -0500 Subject: [PATCH 25/26] Add Reset method to buffer and cleanup comments --- go/arrow/memory/buffer.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/go/arrow/memory/buffer.go b/go/arrow/memory/buffer.go index 234f5d4337f..006e45577b1 100644 --- a/go/arrow/memory/buffer.go +++ b/go/arrow/memory/buffer.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/arrow/internal/debug" ) +// Buffer is a wrapper type for a buffer of bytes. type Buffer struct { refCount int64 buf []byte @@ -35,7 +36,7 @@ func NewBufferBytes(data []byte) *Buffer { return &Buffer{refCount: 0, buf: data, length: len(data)} } -// NewBuffer creates a mutable, resizable buffer with an Allocator for managing memory. +// NewResizableBuffer creates a mutable, resizable buffer with an Allocator for managing memory. func NewResizableBuffer(mem Allocator) *Buffer { return &Buffer{refCount: 1, mutable: true, mem: mem} } @@ -60,15 +61,29 @@ func (b *Buffer) Release() { } } +// Reset resets the buffer for reuse. +func (b *Buffer) Reset(buf []byte) { + b.refCount = 0 + b.buf = buf + b.length = len(buf) +} + // Buf returns the slice of memory allocated by the Buffer, which is adjusted by calling Reserve. func (b *Buffer) Buf() []byte { return b.buf } // Bytes returns a slice of size Len, which is adjusted by calling Resize. func (b *Buffer) Bytes() []byte { return b.buf[:b.length] } + +// Mutable returns a bool indicating whether the buffer is mutable or not. func (b *Buffer) Mutable() bool { return b.mutable } -func (b *Buffer) Len() int { return b.length } -func (b *Buffer) Cap() int { return len(b.buf) } +// Len returns the length of the buffer. +func (b *Buffer) Len() int { return b.length } + +// Cap returns the capacity of the buffer. +func (b *Buffer) Cap() int { return len(b.buf) } + +// Reserve reserves the provided amount of capacity for the buffer. func (b *Buffer) Reserve(capacity int) { if capacity > len(b.buf) { newCap := roundUpToMultipleOf64(capacity) @@ -80,10 +95,13 @@ func (b *Buffer) Reserve(capacity int) { } } +// Resize resizes the buffer to the target size. func (b *Buffer) Resize(newSize int) { b.resize(newSize, true) } +// ResizeNoShrink resizes the buffer to the target size, but will not +// shrink it. 
func (b *Buffer) ResizeNoShrink(newSize int) { b.resize(newSize, false) } From fb575241b4130c068c1153613cacd8c9c508077a Mon Sep 17 00:00:00 2001 From: Richard Artoul Date: Wed, 5 Feb 2020 20:43:35 -0500 Subject: [PATCH 26/26] Dont reset refcount --- go/arrow/memory/buffer.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/arrow/memory/buffer.go b/go/arrow/memory/buffer.go index 006e45577b1..57c0db487d1 100644 --- a/go/arrow/memory/buffer.go +++ b/go/arrow/memory/buffer.go @@ -63,7 +63,6 @@ func (b *Buffer) Release() { // Reset resets the buffer for reuse. func (b *Buffer) Reset(buf []byte) { - b.refCount = 0 b.buf = buf b.length = len(buf) }