diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index abc3b8f4d91..cf746c327db 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -175,10 +175,15 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BOOST_USE_SHARED: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF - BOOST_SOURCE: BUNDLED + BOOST_ROOT: C:\local\boost_1_67_0 + BOOST_LIBRARYDIR: C:\local\boost_1_67_0\lib64-msvc-14.1\ steps: - name: Disable Crash Dialogs run: reg add "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" /v DontShowUI /t REG_DWORD /d 1 /f + - name: Installed Packages + run: choco list -l + - name: Install Boost + run: choco install -y boost-msvc-14.1 - name: Checkout Arrow uses: actions/checkout@v1 with: diff --git a/README.md b/README.md index f7dbb5852c1..e6b5718a8c6 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Build Status](https://ci.appveyor.com/api/projects/status/github/apache/arrow/branch/master?svg=true)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/branch/master) [![Coverage Status](https://codecov.io/gh/apache/arrow/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) -[![Fuzzit Status](https://app.fuzzit.dev/badge?org_id=yMxZh42xl9qy6bvg3EiJ&branch=master)](https://app.fuzzit.dev/admin/yMxZh42xl9qy6bvg3EiJ/dashboard) +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt) [![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index e567fecfd13..cf19c05237f 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -23,3 +23,6 @@ FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest COPY ci/scripts/install_dask.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_dask.sh ${dask} + +# The Spark tests currently break with pandas >= 1.0 +RUN if [ ${dask} == "latest" ]; then conda install pandas=0.25.3; fi \ No newline at end of file diff --git a/ci/docker/linux-apt-fuzzit.dockerfile b/ci/docker/linux-apt-fuzzit.dockerfile deleted file mode 100644 index 5cb14020ae4..00000000000 --- a/ci/docker/linux-apt-fuzzit.dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG base -FROM ${base} - -RUN apt-get update && \ - apt-get install -y -q \ - clang-7 \ - libclang-7-dev \ - clang-format-7 \ - clang-tidy-7 \ - clang-tools-7 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN wget -O /usr/local/bin/fuzzit https://github.com/fuzzitdev/fuzzit/releases/latest/download/fuzzit_Linux_x86_64 && \ - chmod a+x /usr/local/bin/fuzzit - -ENV ARROW_FUZZING="ON" \ - ARROW_USE_ASAN="ON" \ - CC="clang-7" \ - CXX="clang++-7" \ - ARROW_BUILD_TYPE="RelWithDebInfo" \ - ARROW_FLIGHT="OFF" \ - ARROW_GANDIVA="OFF" \ - ARROW_ORC="OFF" \ - ARROW_PARQUET="OFF" \ - ARROW_PLASMA="OFF" \ - ARROW_WITH_BZ2="OFF" \ - ARROW_WITH_ZSTD="OFF" \ - ARROW_BUILD_BENCHMARKS="OFF" \ - ARROW_BUILD_UTILITIES="OFF" diff --git a/ci/scripts/fuzzit_build.sh b/ci/scripts/fuzzit_build.sh deleted file mode 100755 index b73c49cddb4..00000000000 --- a/ci/scripts/fuzzit_build.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -euxo pipefail - -export TARGET_ID=apache-arrow/arrow-ipc-fuzzing - -source_dir=${1}/cpp -build_dir=${2}/cpp - -pushd ${build_dir}/relwithdebinfo - -mkdir -p out - -cp arrow-ipc-fuzzing-test out/fuzzer -ldd arrow-ipc-fuzzing-test | grep "=> /" | awk '{print $3}' | xargs -I '{}' cp -v '{}' out/ - -pushd out -tar -czvf fuzzer.tar.gz * -stat fuzzer.tar.gz -popd - -fuzzit create job \ - --type $FUZZIT_JOB_TYPE \ - --host $FUZZIT_HOST \ - --revision $CI_ARROW_SHA \ - --branch $CI_ARROW_BRANCH \ - $TARGET_ID \ - out/fuzzer.tar.gz - -popd diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f7431f80f5f..d2322009ea8 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1196,6 +1196,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csf); + } + default: return false; } @@ -1232,6 +1239,11 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { return SparseTensorEqualsImplDispatch(left_csc, right); } + case SparseTensorFormat::CSF: { + const auto& left_csf = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csf, right); + } + default: return false; } diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc index 29cfc384a42..49fb6879008 100644 --- a/cpp/src/arrow/dataset/file_ipc_test.cc +++ b/cpp/src/arrow/dataset/file_ipc_test.cc @@ -34,7 +34,6 @@ namespace arrow { namespace dataset { -constexpr int64_t kDefaultOutputStreamSize = 1024; constexpr int64_t kBatchSize = 1UL << 12; constexpr int64_t kBatchRepetitions = 1 << 5; constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; @@ -42,8 +41,7 @@ constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; class ArrowIpcWriterMixin : 
public ::testing::Test { public: std::shared_ptr Write(std::vector readers) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); auto writer_schema = readers[0]->schema(); EXPECT_OK_AND_ASSIGN(auto writer, @@ -69,8 +67,7 @@ class ArrowIpcWriterMixin : public ::testing::Test { } std::shared_ptr Write(const Table& table) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); EXPECT_OK_AND_ASSIGN(auto writer, ipc::RecordBatchFileWriter::Open(sink.get(), table.schema())); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index aead8ab25e4..a26f57e667f 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -217,7 +217,7 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { auto RoundtripBatch = [](const std::shared_ptr& batch, std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); @@ -255,7 +255,7 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { // Write full IPC stream including schema, then unregister type, then read // and ensure that a plain instance of the storage type is created - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index 5ad7a899c66..5d83353d281 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -454,7 +454,8 @@ Status TestClientBasicAuthHandler::GetToken(std::string* token) { Status GetTestResourceRoot(std::string* out) { const char* c_root = std::getenv("ARROW_TEST_DATA"); if (!c_root) { - return Status::IOError("Test resources not found, set ARROW_TEST_DATA"); + return Status::IOError( + "Test resources not found, set ARROW_TEST_DATA to /testing/data"); } *out = std::string(c_root); return Status::OK(); diff --git a/cpp/src/arrow/io/compressed_test.cc b/cpp/src/arrow/io/compressed_test.cc index 71b2012b67d..df969feb3b6 100644 --- a/cpp/src/arrow/io/compressed_test.cc +++ b/cpp/src/arrow/io/compressed_test.cc @@ -127,7 +127,7 @@ void CheckCompressedInputStream(Codec* codec, const std::vector& data) void CheckCompressedOutputStream(Codec* codec, const std::vector& data, bool do_flush) { // Create compressed output stream - ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create()); ASSERT_OK_AND_ASSIGN(auto stream, CompressedOutputStream::Make(codec, buffer_writer)); ASSERT_OK_AND_EQ(0, stream->Tell()); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 185a972a6b6..0b710aff325 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -46,7 +46,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { /// \param[in,out] pool a MemoryPool to use for allocations /// \return the created stream static Result> Create( - int64_t initial_capacity, MemoryPool* pool = 
default_memory_pool()); + int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool()); ARROW_DEPRECATED("Use Result-returning overload") static Status Create(int64_t initial_capacity, MemoryPool* pool, diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 74b94a10169..b2c2994aece 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -281,7 +281,7 @@ void CheckBatches(const RecordBatch& expected, const RecordBatch& result) { class TestTableReader : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } @@ -356,7 +356,7 @@ TEST_F(TestTableReader, ReadNames) { class TestTableWriter : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 05f508df31c..94bb6cbd839 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -286,11 +286,7 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::Timestamp: { auto ts_type = static_cast(type_data); TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit()); - if (ts_type->timezone() != 0 && ts_type->timezone()->Length() > 0) { - *out = timestamp(unit, ts_type->timezone()->str()); - } else { - *out = timestamp(unit); - } + *out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone())); return Status::OK(); } case flatbuf::Type::Duration: { @@ -369,10 +365,7 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field, const KeyValueMetadata* field_metadata, std::shared_ptr* out) { auto type_data = field->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type"); RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out)); // Look for extension metadata in custom_metadata field @@ -474,14 +467,8 @@ Status KeyValueMetadataFromFlatbuffer(const KVVector* fb_metadata, metadata->reserve(fb_metadata->size()); for (const auto& pair : *fb_metadata) { - if (pair->key() == nullptr) { - return Status::IOError( - "Key-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } - if (pair->value() == nullptr) { - return Status::IOError( - "Value-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); + CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); } @@ -776,9 +763,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // Reconstruct the data type auto children = field->children(); - if (children == nullptr) { - return Status::IOError("Children-pointer of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(children, "Field.children"); std::vector> child_fields(children->size()); for (int i = 0; i < static_cast(children->size()); ++i) { RETURN_NOT_OK( @@ -786,6 +771,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona } 
        FieldFromFlatbuffer(children->Get(i), dictionary_memo, &child_fields[i]));
   }
   RETURN_NOT_OK(TypeFromFlatbuffer(field, child_fields, metadata.get(), &type));
 
+  auto field_name = StringFromFlatbuffers(field->name());
+
   const flatbuf::DictionaryEncoding* encoding = field->dictionary();
   if (encoding != nullptr) {
@@ -794,22 +781,14 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona
     // dictionary_memo
     std::shared_ptr<DataType> index_type;
     auto int_data = encoding->indexType();
-    if (int_data == nullptr) {
-      return Status::IOError(
-          "indexType-pointer in custom metadata of flatbuffer-encoded DictionaryEncoding "
-          "is null.");
-    }
+    CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType");
     RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type));
     ARROW_ASSIGN_OR_RAISE(type,
                           DictionaryType::Make(index_type, type, encoding->isOrdered()));
-    *out = ::arrow::field(field->name()->str(), type, field->nullable(), metadata);
+    *out = ::arrow::field(field_name, type, field->nullable(), metadata);
     RETURN_NOT_OK(dictionary_memo->AddField(encoding->id(), *out));
   } else {
-    auto name = field->name();
-    if (name == nullptr) {
-      return Status::IOError("Name-pointer of flatbuffer-encoded Field is null.");
-    }
-    *out = ::arrow::field(name->str(), type, field->nullable(), metadata);
+    *out = ::arrow::field(field_name, type, field->nullable(), metadata);
   }
   return Status::OK();
 }
@@ -1183,17 +1162,15 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti
 Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
                  std::shared_ptr<Schema>* out) {
   auto schema = static_cast<const flatbuf::Schema*>(opaque_schema);
-  if (schema->fields() == nullptr) {
-    return Status::IOError("Fields-pointer of flatbuffer-encoded Schema is null.");
-  }
+  CHECK_FLATBUFFERS_NOT_NULL(schema, "schema");
+  CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields");
   int num_fields = static_cast<int>(schema->fields()->size());
   std::vector<std::shared_ptr<Field>> fields(num_fields);
   for (int i = 0; i < num_fields; ++i) {
     const flatbuf::Field* field = schema->fields()->Get(i);
-    if (field == nullptr) {
-      return Status::IOError("Field-pointer of flatbuffer-encoded Schema is null.");
-    }
+    // XXX I don't think this check is necessary (AP)
+    CHECK_FLATBUFFERS_NOT_NULL(field, "Schema.fields");
     RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i]));
   }
@@ -1225,12 +1202,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type
     auto dim = tensor->shape()->Get(i);
 
     shape->push_back(dim->size());
-    auto fb_name = dim->name();
-    if (fb_name == 0) {
-      dim_names->push_back("");
-    } else {
-      dim_names->push_back(fb_name->str());
-    }
+    dim_names->push_back(StringFromFlatbuffers(dim->name()));
   }
 
   if (tensor->strides() && tensor->strides()->size() > 0) {
@@ -1239,11 +1211,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type
     }
   }
 
-  auto type_data = tensor->type();
-  if (type_data == nullptr) {
-    return Status::IOError(
-        "Type-pointer in custom metadata of flatbuffer-encoded Tensor is null.");
-  }
+  auto type_data = tensor->type();  // Required
   return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
 }
@@ -1283,12 +1251,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr
       }
       if (dim_names) {
-        auto fb_name = dim->name();
-        if (fb_name == 0) {
-          dim_names->push_back("");
-        } else {
-          dim_names->push_back(fb_name->str());
-        }
+        dim_names->push_back(StringFromFlatbuffers(dim->name()));
       }
     }
   }
@@ -1324,11 +1287,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr
     }
   }
 
-  auto type_data =
sparse_tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null."); - } + auto type_data = sparse_tensor->type(); // Required if (type) { return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type); } else { diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index b2ac81f86e7..f5ffee3fa75 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -31,20 +31,16 @@ #include "arrow/buffer.h" #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" -#include "arrow/memory_pool.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" #include "generated/Message_generated.h" #include "generated/Schema_generated.h" namespace arrow { -class DataType; -class Schema; -class Tensor; -class SparseTensor; - namespace flatbuf = org::apache::arrow::flatbuf; namespace io { @@ -92,6 +88,23 @@ struct FileBlock { int64_t body_length; }; +// Low-level utilities to help with reading Flatbuffers data. + +#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \ + if ((fb_value) == NULLPTR) { \ + return Status::IOError("Unexpected null field ", name, \ + " in flatbuffer-encoded metadata"); \ + } + +template +inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector* vec) { + return (vec == NULLPTR) ? 0 : vec->size(); +} + +inline std::string StringFromFlatbuffers(const flatbuffers::String* s) { + return (s == NULLPTR) ? "" : s->str(); +} + // Read interface classes. We do not fully deserialize the flatbuffers so that // individual fields metadata can be retrieved from very large schema without // diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8712d46d1c9..f99037a0d34 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -102,10 +102,7 @@ class IpcComponentSource { Status GetBuffer(int buffer_index, std::shared_ptr* out) { auto buffers = metadata_->buffers(); - if (buffers == nullptr) { - return Status::IOError( - "Buffers-pointer of flatbuffer-encoded RecordBatch is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast(buffers->size())) { return Status::IOError("buffer_index out of range."); } @@ -127,9 +124,7 @@ class IpcComponentSource { Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); - if (nodes == nullptr) { - return Status::IOError("Nodes-pointer of flatbuffer-encoded Table is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); // pop off a field if (field_index >= static_cast(nodes->size())) { return Status::Invalid("Ran out of field metadata, likely malformed"); @@ -441,6 +436,7 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // The dictionary is embedded in a record batch with a single column std::shared_ptr batch; auto batch_meta = dictionary_batch->data(); + CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data"); RETURN_NOT_OK(ReadRecordBatch(batch_meta, ::arrow::schema({value_field}), dictionary_memo, options, file, &batch)); if (batch->num_columns() != 1) { @@ -475,9 +471,6 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { } CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type()); CHECK_HAS_NO_BODY(*message); - if (message->header() == nullptr) { - return Status::IOError("Header-pointer of 
flatbuffer-encoded Message is null."); - } return internal::GetSchema(message->header(), &dictionary_memo_, &schema_); } @@ -663,9 +656,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { return Status::OK(); } - int num_dictionaries() const { return footer_->dictionaries()->size(); } + int num_dictionaries() const { + return static_cast(internal::FlatBuffersVectorSize(footer_->dictionaries())); + } - int num_record_batches() const { return footer_->recordBatches()->size(); } + int num_record_batches() const { + return static_cast(internal::FlatBuffersVectorSize(footer_->recordBatches())); + } MetadataVersion version() const { return internal::GetMetadataVersion(footer_->version()); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 09a322b1060..06c85648591 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -654,6 +654,7 @@ Status CountSparseTensors( OwnedRef num_sparse_tensors(PyDict_New()); size_t num_coo = 0; size_t num_csr = 0; + size_t num_csf = 0; for (const auto& sparse_tensor : sparse_tensors) { switch (sparse_tensor->format_id()) { @@ -663,6 +664,9 @@ Status CountSparseTensors( case SparseTensorFormat::CSR: ++num_csr; break; + case SparseTensorFormat::CSF: + ++num_csf; + break; case SparseTensorFormat::CSC: // TODO(mrkn): support csc break; @@ -671,6 +675,7 @@ Status CountSparseTensors( PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo)); PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr)); + PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf)); RETURN_IF_PYERROR(); *out = num_sparse_tensors.detach(); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index d42bdf4ca61..549223c0798 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -23,9 +23,11 @@ #include #include +#include "arrow/buffer_builder.h" #include "arrow/compare.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -55,6 +57,38 @@ class SparseTensorConverter { Status Convert() { return Status::Invalid("Unsupported sparse index"); } }; +// ---------------------------------------------------------------------- +// IncrementIndex for SparseCOOIndex and SparseCSFIndex + +inline void IncrementIndex(std::vector& coord, + const std::vector& shape) { + const int64_t ndim = shape.size(); + ++coord[ndim - 1]; + if (coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } +} + +inline void IncrementIndex(std::vector& coord, const std::vector& shape, + const std::vector& axis_order) { + const int64_t ndim = shape.size(); + const int64_t last_axis = axis_order[ndim - 1]; + ++coord[last_axis]; + if (coord[last_axis] == shape[last_axis]) { + int64_t d = ndim - 1; + while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { + coord[axis_order[d]] = 0; + ++coord[axis_order[d - 1]]; + --d; + } + } +} + // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex @@ -128,17 +162,7 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { - int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; - --d; 
- } - } + IncrementIndex(coord, shape); } } @@ -419,6 +443,144 @@ class SparseTensorConverter inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSFIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using typename BaseClass::NumericTensorType; + using typename BaseClass::value_type; + + SparseTensorConverter(const NumericTensorType& tensor, + const std::shared_ptr& index_value_type, + MemoryPool* pool) + : BaseClass(tensor, index_value_type, pool) {} + + template + Status Convert() { + using c_index_value_type = typename IndexValueType::c_type; + RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); + + const int64_t ndim = tensor_.ndim(); + // Axis order as ascending order of dimension size is a good heuristic but is not + // necessarily optimal. + std::vector axis_order = internal::ArgSort(tensor_.shape()); + int64_t nonzero_count = -1; + RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); + + std::shared_ptr values_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + std::vector counts(ndim, 0); + std::vector coord(ndim, 0); + std::vector previous_coord(ndim, -1); + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + const std::vector& shape = tensor_.shape(); + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + + if (x != 0) { + bool tree_split = false; + *values++ = x; + + for (int64_t i = 0; i < ndim; ++i) { + int64_t dimension = axis_order[i]; + + tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); + if (tree_split) { + if (i < ndim - 1) { + RETURN_NOT_OK(indptr_buffer_builders[i].Append( + static_cast(counts[i + 1]))); + } + RETURN_NOT_OK(indices_buffer_builders[i].Append( + static_cast(coord[dimension]))); + ++counts[i]; + } + } + previous_coord = coord; + } + IncrementIndex(coord, shape, axis_order); + } + } + + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); + } + + // make results + data = values_buffer; + + std::vector> indptr_buffers(ndim - 1); + std::vector> indices_buffers(ndim); + std::vector indptr_shapes(counts.begin(), counts.end() - 1); + std::vector indices_shapes = counts; + + for (int64_t column = 0; column < ndim; ++column) { + RETURN_NOT_OK( + indices_buffer_builders[column].Finish(&indices_buffers[column], true)); + } + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); + } + + ARROW_ASSIGN_OR_RAISE( + sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, + indptr_buffers, indices_buffers)); + return Status::OK(); + } + +#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return Convert(); + + Status Convert() { + switch (index_value_type_->id()) { + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); + // LCOV_EXCL_START: The following invalid causes program failure. 
+ default: + return Status::TypeError("Unsupported SparseTensor index value type"); + // LCOV_EXCL_STOP + } + } + +#undef CALL_TYPE_SPECIFIC_CONVERT + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::index_value_type_; + using BaseClass::pool_; + using BaseClass::tensor_; + + template + inline Status CheckMaximumValue(const c_value_type type_max) const { + auto max_dimension = + *std::max_element(tensor_.shape().begin(), tensor_.shape().end()); + if (static_cast(type_max) < max_dimension) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("The bit width of the index value type is too small"); + // LCOV_EXCL_STOP + } + return Status::OK(); + } + + inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); } + + inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } +}; + // ---------------------------------------------------------------------- // Instantiate templates @@ -438,6 +600,7 @@ class SparseTensorConverter INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSFIndex); } // namespace @@ -500,6 +663,10 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, case SparseTensorFormat::CSC: return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); + case SparseTensorFormat::CSF: + return MakeSparseTensorFromTensor(tensor, index_value_type, pool, + out_sparse_index, out_data); + // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); @@ -507,6 +674,35 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } } +namespace { + +template +void ExpandSparseCSFTensorValues(int64_t dimension, int64_t dense_offset, + int64_t first_ptr, int64_t last_ptr, + const SparseCSFIndex& sparse_index, const TYPE* raw_data, + const std::vector& strides, + const std::vector& axis_order, TYPE* out) { + int64_t ndim = axis_order.size(); + + for (int64_t i = first_ptr; i < last_ptr; ++i) { + int64_t tmp_dense_offset = + dense_offset + sparse_index.indices()[dimension]->Value({i}) * + strides[axis_order[dimension]]; + + if (dimension < ndim - 1) { + ExpandSparseCSFTensorValues( + dimension + 1, tmp_dense_offset, + sparse_index.indptr()[dimension]->Value({i}), + sparse_index.indptr()[dimension]->Value({i + 1}), sparse_index, + raw_data, strides, axis_order, out); + } else { + out[tmp_dense_offset] = raw_data[i]; + } + } +} + +} // namespace + template Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor, std::shared_ptr* out) { @@ -521,18 +717,20 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t std::fill_n(values, sparse_tensor->size(), static_cast(0)); + std::vector strides(sparse_tensor->ndim(), 1); + for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { + strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; + } + std::vector empty_strides; + + const auto raw_data = reinterpret_cast(sparse_tensor->raw_data()); + switch (sparse_tensor->format_id()) { case SparseTensorFormat::COO: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr coords = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); - std::vector strides(sparse_tensor->ndim(), 1); - for (int i = sparse_tensor->ndim() - 1; i 
> 0; --i) { - strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; - } for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) { std::vector coord(sparse_tensor->ndim()); int64_t offset = 0; @@ -543,7 +741,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t values[offset] = raw_data[i]; } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -552,8 +751,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t i = 0; i < indptr->size() - 1; ++i) { @@ -565,7 +762,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -574,8 +772,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t j = 0; j < indptr->size() - 1; ++j) { @@ -587,7 +783,21 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); + return Status::OK(); + } + + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + + ExpandSparseCSFTensorValues( + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, sparse_index, raw_data, strides, + sparse_index.axis_order(), values); + *out = std::make_shared(sparse_tensor->type(), values_buffer, + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } } @@ -625,6 +835,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t const std::shared_ptr indices = sparse_index.indices(); type = indices->type(); break; + } + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::vector> indices = sparse_index.indices(); + type = indices[0]->type(); + break; } // LCOV_EXCL_START: ignore program failure default: @@ -754,6 +971,86 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, } // namespace internal +// ---------------------------------------------------------------------- +// SparseCSFIndex + +namespace { + +inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const int64_t num_indptrs, + const int64_t num_indices, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const int64_t axis_order_size) { + if (!is_integer(indptr_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indptr must be integer"); + } + if (!is_integer(indices_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indices must be 
integer"); + } + if (num_indptrs + 1 != num_indices) { + return Status::Invalid( + "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); + } + if (axis_order_size != num_indices) { + return Status::Invalid( + "Length of indices must be equal to number of dimensions for SparseCSFIndex."); + } + return Status::OK(); +} + +} // namespace + +Result> SparseCSFIndex::Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) + indptr[i] = std::make_shared(indptr_type, indptr_data[i], + std::vector({indices_shapes[i] + 1})); + for (int64_t i = 0; i < ndim; ++i) + indices[i] = std::make_shared(indices_type, indices_data[i], + std::vector({indices_shapes[i]})); + + RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size())); + + return std::make_shared(indptr, indices, axis_order); +} + +// Constructor with two index vectors +SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order) + : SparseIndexBase(indices.back()->size()), + indptr_(indptr), + indices_(indices), + axis_order_(axis_order) { + ARROW_CHECK_OK(CheckSparseCSFIndexValidity( + indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), + indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size())); +} + +std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } + +bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { + for (int64_t i = 0; i < static_cast(indices().size()); ++i) { + if (!indices()[i]->Equals(*other.indices()[i])) return false; + } + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + } + return axis_order() == other.axis_order(); +} + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index f736f7b7576..33a53761e14 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -40,6 +40,8 @@ struct SparseTensorFormat { CSR, /// Compressed sparse column (CSC) format. CSC, + /// Compressed sparse fiber (CSF) format. + CSF }; }; @@ -329,6 +331,68 @@ class ARROW_EXPORT SparseCSCIndex using SparseCSXIndex::SparseCSXIndex; }; +// ---------------------------------------------------------------------- +// SparseCSFIndex class + +/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor +/// +/// A CSF sparse index manages the location of its non-zero values by set of +/// prefix trees. Each path from a root to leaf forms one tensor non-zero index. +/// CSF is implemented with three vectors. +/// +/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the +/// number of dimensions. Axis_order is a vector of integers of legth N. Indptr and +/// indices describe the set of prefix trees. Trees traverse dimensions in order given by +/// axis_order. 
+class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
+ public:
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
+  static constexpr char const* kTypeName = "SparseCSFIndex";
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indptr_type,
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data);
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+    return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Construct SparseCSFIndex from the indptr and indices vectors
+  explicit SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+                          const std::vector<std::shared_ptr<Tensor>>& indices,
+                          const std::vector<int64_t>& axis_order);
+
+  /// \brief Return a 1D vector of indptr tensors
+  const std::vector<std::shared_ptr<Tensor>>& indptr() const { return indptr_; }
+
+  /// \brief Return a 1D vector of indices tensors
+  const std::vector<std::shared_ptr<Tensor>>& indices() const { return indices_; }
+
+  /// \brief Return a 1D vector specifying the order of axes
+  const std::vector<int64_t>& axis_order() const { return axis_order_; }
+
+  /// \brief Return a string representation of the sparse index
+  std::string ToString() const override;
+
+  /// \brief Return whether the CSF indices are equal
+  bool Equals(const SparseCSFIndex& other) const;
+
+ protected:
+  std::vector<std::shared_ptr<Tensor>> indptr_;
+  std::vector<std::shared_ptr<Tensor>> indices_;
+  std::vector<int64_t> axis_order_;
+};
+
 // ----------------------------------------------------------------------
 // SparseTensor class
 
@@ -527,6 +591,9 @@ using SparseCSRMatrix = SparseTensorImpl<SparseCSRIndex>;
 /// \brief EXPERIMENTAL: Type alias for CSC sparse matrix
 using SparseCSCMatrix = SparseTensorImpl<SparseCSCIndex>;
 
+/// \brief EXPERIMENTAL: Type alias for CSF sparse tensor
+using SparseCSFTensor = SparseTensorImpl<SparseCSFIndex>;
+
 }  // namespace arrow
 
 #endif  // ARROW_SPARSE_TENSOR_H
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index 198aa8f5f8d..45cb8dcc8f3 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -30,6 +30,7 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
+#include "arrow/util/sort.h"
 
 namespace arrow {
 
@@ -910,4 +911,225 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) {
   ASSERT_TRUE(tensor.Equals(*dense_tensor));
 }
 
+template <typename IndexValueType>
+class TestSparseCSFTensorBase : public ::testing::Test {
+ public:
+  void SetUp() {
+    dim_names_ = {"a", "b", "c", "d"};
+    shape_ = {2, 3, 4, 5};
+    int16_t dense_values[2][3][4][5] = {};  // zero-initialized
+
+    dense_values[0][0][0][1] = 1;
+    dense_values[0][0][0][2] = 2;
+    dense_values[0][1][0][0] = 3;
+    dense_values[0][1][0][2] = 4;
+    dense_values[0][1][1][0] = 5;
+    dense_values[1][1][1][0] = 6;
+    dense_values[1][1][1][1] = 7;
+    dense_values[1][1][1][2] = 8;
+
+    auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values));
+    Tensor dense_tensor_(int16(), dense_buffer, shape_, {}, dim_names_);
+    ASSERT_OK_AND_ASSIGN(
+        sparse_tensor_from_dense_,
+        SparseCSFTensor::Make(dense_tensor_,
+                              TypeTraits<IndexValueType>::type_singleton()));
+  }
+
+ protected:
+  std::vector<int64_t> shape_;
+  std::vector<std::string> dim_names_;
+  std::shared_ptr<SparseCSFTensor> sparse_tensor_from_dense_;
+};
+
+class TestSparseCSFTensor : public TestSparseCSFTensorBase<Int64Type> {};
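The new SparseCSFTensor alias plus the Make and ToTensor entry points compose into a dense-to-CSF-and-back round trip. A minimal sketch, using only calls exercised by the tests in this file (the helper name RoundTripThroughCsf is ours, and arrow::int64() stands in for TypeTraits<Int64Type>::type_singleton()):

#include "arrow/result.h"
#include "arrow/sparse_tensor.h"
#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/type.h"

// Sketch: convert a dense tensor to CSF with int64 indices, densify it again,
// and verify the round trip, mirroring what TestRoundTrip below asserts.
arrow::Status RoundTripThroughCsf(const arrow::Tensor& dense) {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::SparseCSFTensor> sparse,
                        arrow::SparseCSFTensor::Make(dense, arrow::int64()));
  std::shared_ptr<arrow::Tensor> roundtripped;
  ARROW_RETURN_NOT_OK(sparse->ToTensor(&roundtripped));
  if (!roundtripped->Equals(dense)) {
    return arrow::Status::Invalid("CSF round trip changed tensor contents");
  }
  return arrow::Status::OK();
}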
+template +class TestSparseCSFTensorForIndexValueType + : public TestSparseCSFTensorBase { + protected: + std::shared_ptr MakeSparseCSFIndex( + const std::vector& axis_order, + const std::vector>& indptr_values, + const std::vector>& indices_values) + const { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) { + indptr[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indptr_values[i]), + std::vector({static_cast(indptr_values[i].size())})); + } + for (int64_t i = 0; i < ndim; ++i) { + indices[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indices_values[i]), + std::vector({static_cast(indices_values[i].size())})); + } + return std::make_shared(indptr, indices, axis_order); + } + + template + std::shared_ptr MakeSparseTensor( + const std::shared_ptr& si, std::vector& sparse_values, + const std::vector& shape, + const std::vector& dim_names) const { + auto data_buffer = Buffer::Wrap(sparse_values); + return std::make_shared( + si, CTypeTraits::type_singleton(), data_buffer, shape, dim_names); + } +}; + +TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {2, 3, 4, 5}; + std::vector dim_names = {"a", "b", "c", "d"}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector> indptr_values = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 1, 1}, {0, 0, 1, 1}, {1, 2, 0, 2, 0, 0, 1, 2}}; + + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, dim_names); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { + std::vector dim_names = {"a", "b", "c", "d"}; + ASSERT_EQ(8, this->sparse_tensor_from_dense_->non_zero_length()); + ASSERT_TRUE(this->sparse_tensor_from_dense_->is_mutable()); + ASSERT_EQ(dim_names, this->sparse_tensor_from_dense_->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { + std::vector shape = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + ASSERT_TRUE(dense_tensor.Equals(*dt)); + ASSERT_EQ(dense_tensor.dim_names(), dt->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestRoundTrip) { + using IndexValueType = TypeParam; + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + std::shared_ptr st; + ASSERT_OK_AND_ASSIGN( + st, SparseCSFTensor::Make(*dt, TypeTraits::type_singleton())); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestAlternativeAxisOrder) { + using IndexValueType = TypeParam; + 
using c_index_value_type = typename IndexValueType::c_type; + + std::vector dense_values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape = {4, 6}; + std::vector dim_names = {"a", "b"}; + std::shared_ptr dense_buffer = Buffer::Wrap(dense_values); + Tensor tensor(int16(), dense_buffer, shape, {}, dim_names); + + // Axis order 1 + std::vector axis_order_1 = {0, 1}; + std::vector sparse_values_1 = {1, 3, 2, 4, 5}; + std::vector> indptr_values_1 = {{0, 2, 3, 5}}; + std::vector> indices_values_1 = {{0, 1, 3}, + {0, 3, 1, 3, 5}}; + auto si_1 = this->MakeSparseCSFIndex(axis_order_1, indptr_values_1, indices_values_1); + auto st_1 = this->MakeSparseTensor(si_1, sparse_values_1, shape, dim_names); + + // Axis order 2 + std::vector axis_order_2 = {1, 0}; + std::vector sparse_values_2 = {1, 2, 3, 4, 5}; + std::vector> indptr_values_2 = {{0, 1, 2, 4, 5}}; + std::vector> indices_values_2 = {{0, 1, 3, 5}, + {0, 1, 0, 3, 3}}; + auto si_2 = this->MakeSparseCSFIndex(axis_order_2, indptr_values_2, indices_values_2); + auto st_2 = this->MakeSparseTensor(si_2, sparse_values_2, shape, dim_names); + + std::shared_ptr dt_1, dt_2; + ASSERT_OK(st_1->ToTensor(&dt_1)); + ASSERT_OK(st_2->ToTensor(&dt_2)); + + ASSERT_FALSE(st_1->Equals(*st_2)); + ASSERT_TRUE(dt_1->Equals(*dt_2)); + ASSERT_TRUE(dt_1->Equals(tensor)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {5, 2, 3, 4}; + int16_t dense_values[5][2][3][4] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN( + sparse_tensor, + SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); + + std::vector> indptr_values = { + {0, 1, 3}, {0, 2, 4, 7}, {0, 1, 2, 3, 4, 6, 7, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 0, 1}, {1, 2, 0, 2, 0, 1, 2}, {0, 0, 0, 0, 0, 1, 1, 1}}; + std::vector axis_order = {1, 2, 3, 0}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(dt->Equals(dense_tensor)); + ASSERT_TRUE(st->Equals(*sparse_tensor)); +} + +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor, + TestTensorToSparseTensor, TestSparseTensorToTensor, + TestAlternativeAxisOrder, TestNonAscendingShape, + TestRoundTrip); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCSFTensorForIndexValueType, + UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, + 
UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType, + UInt64Type); } // namespace arrow diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index c44e311b7b2..df3ea6b9af6 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -292,6 +292,7 @@ class ARROW_EXPORT Status : public util::EqualityComparable, } bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } + bool IsAlreadyExists() const { return code() == StatusCode::AlreadyExists; } /// \brief Return a string representation of this status suitable for printing. /// @@ -385,8 +386,11 @@ bool Status::Equals(const Status& s) const { return false; } - if (detail() != s.detail() && !(*detail() == *s.detail())) { - return false; + if (detail() != s.detail()) { + if ((detail() && !s.detail()) || (!detail() && s.detail())) { + return false; + } + return *detail() == *s.detail(); } return code() == s.code() && message() == s.message(); diff --git a/cpp/src/arrow/status_test.cc b/cpp/src/arrow/status_test.cc index 593ad2194bf..fc5a7ec45cf 100644 --- a/cpp/src/arrow/status_test.cc +++ b/cpp/src/arrow/status_test.cc @@ -114,4 +114,17 @@ TEST(StatusTest, TestEquality) { ASSERT_NE(Status::Invalid("error"), Status::Invalid("other error")); } +TEST(StatusTest, TestDetailEquality) { + const auto status_with_detail = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_with_detail2 = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_without_detail = arrow::Status::IOError(""); + + ASSERT_EQ(*status_with_detail.detail(), *status_with_detail2.detail()); + ASSERT_EQ(status_with_detail, status_with_detail2); + ASSERT_NE(status_with_detail, status_without_detail); + ASSERT_NE(status_without_detail, status_with_detail); +} + } // namespace arrow diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index dded399240e..3cfb8938175 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -577,6 +577,12 @@ class ColumnReaderImplBase { decoders_[static_cast(encoding)] = std::move(decoder); break; } + case Encoding::BYTE_STREAM_SPLIT: { + auto decoder = MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT, descr_); + current_decoder_ = decoder.get(); + decoders_[static_cast(encoding)] = std::move(decoder); + break; + } case Encoding::RLE_DICTIONARY: throw ParquetException("Dictionary page must be before data page."); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a61bcecc974..dc06436ff9c 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -815,6 +815,104 @@ void DictEncoderImpl::PutDictionary(const arrow::Array& values) { } } +// ---------------------------------------------------------------------- +// ByteStreamSplitEncoder implementations + +template +class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + using T = typename DType::c_type; + using TypedEncoder::Put; + + explicit ByteStreamSplitEncoder( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + int64_t EstimatedDataEncodedSize() override; + std::shared_ptr FlushValues() override; + + void Put(const T* buffer, int num_values) override; + void Put(const arrow::Array& values) override; + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t 
valid_bits_offset) override; + + protected: + arrow::TypedBufferBuilder values_; + + private: + void PutArrowArray(const arrow::Array& values); +}; + +template +ByteStreamSplitEncoder::ByteStreamSplitEncoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) + : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool), values_{pool} {} + +template +int64_t ByteStreamSplitEncoder::EstimatedDataEncodedSize() { + return values_.length() * sizeof(T); +} + +template +std::shared_ptr ByteStreamSplitEncoder::FlushValues() { + constexpr size_t num_streams = sizeof(T); + std::shared_ptr output_buffer = + AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize()); + uint8_t* output_buffer_raw = output_buffer->mutable_data(); + const size_t num_values = values_.length(); + const uint8_t* raw_values = reinterpret_cast(values_.data()); + for (size_t i = 0; i < num_values; ++i) { + for (size_t j = 0U; j < num_streams; ++j) { + const uint8_t byte_in_value = raw_values[i * num_streams + j]; + output_buffer_raw[j * num_values + i] = byte_in_value; + } + } + values_.Reset(); + return std::move(output_buffer); +} + +template +void ByteStreamSplitEncoder::Put(const T* buffer, int num_values) { + PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values)); +} + +template +void ByteStreamSplitEncoder::Put(const ::arrow::Array& values) { + PutArrowArray(values); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template +void ByteStreamSplitEncoder::PutSpaced(const T* src, int num_values, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); +} + // ---------------------------------------------------------------------- // Encoder and decoder factory functions @@ -863,6 +961,18 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin DCHECK(false) << "Encoder not implemented"; break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + case Type::DOUBLE: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } @@ -2236,6 +2346,74 @@ class DeltaByteArrayDecoder : public DecoderImpl, ByteArray last_value_; }; +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT + +template +class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder { + public: + using T = typename DType::c_type; + explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr); + + int Decode(T* buffer, int max_values) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + 
int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override; + + void SetData(int num_values, const uint8_t* data, int len) override; + + private: + int num_values_in_buffer{0U}; +}; + +template +ByteStreamSplitDecoder::ByteStreamSplitDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {} + +template +void ByteStreamSplitDecoder::SetData(int num_values, const uint8_t* data, + int len) { + DecoderImpl::SetData(num_values, data, len); + num_values_in_buffer = num_values; +} + +template +int ByteStreamSplitDecoder::Decode(T* buffer, int max_values) { + constexpr size_t num_streams = sizeof(T); + const int values_to_decode = std::min(num_values_, max_values); + const int num_decoded_previously = num_values_in_buffer - num_values_; + for (int i = 0; i < values_to_decode; ++i) { + uint8_t gathered_byte_data[num_streams]; + for (size_t b = 0; b < num_streams; ++b) { + const size_t byte_index = b * num_values_in_buffer + num_decoded_previously + i; + gathered_byte_data[b] = data_[byte_index]; + } + buffer[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + } + num_values_ -= values_to_decode; + len_ -= sizeof(T) * values_to_decode; + return values_to_decode; +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + // ---------------------------------------------------------------------- std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, @@ -2261,6 +2439,16 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin default: break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + case Type::DOUBLE: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 473b985b35a..4a40d0162f8 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -29,6 +29,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "parquet/encoding.h" #include "parquet/platform.h" @@ -38,6 +39,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; // TODO(hatemhelal): investigate whether this can be replaced with GTEST_SKIP in a future // gtest release that contains https://github.com/google/googletest/pull/1544 @@ -862,5 +864,190 @@ TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) { CheckDict(actual_num_values, *builder); } +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT encode/decode tests. 
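The cases below all exercise the same byte transposition: BYTE_STREAM_SPLIT stores byte j of value i at offset j * num_values + i, so each of the sizeof(T) output streams holds bytes of equal significance. A standalone sketch of the scatter direction (a hypothetical free function, not part of the patch):

#include <cstdint>
#include <cstring>
#include <vector>

// Scatter: byte j of value i goes to stream j at element offset i. This
// mirrors the nested loop in ByteStreamSplitEncoder::FlushValues above.
std::vector<uint8_t> ByteStreamSplitEncode(const float* values, size_t num_values) {
  std::vector<uint8_t> out(num_values * sizeof(float));
  for (size_t i = 0; i < num_values; ++i) {
    uint8_t bytes[sizeof(float)];
    std::memcpy(bytes, &values[i], sizeof(float));
    for (size_t j = 0; j < sizeof(float); ++j) {
      out[j * num_values + i] = bytes[j];
    }
  }
  return out;
}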
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index 473b985b35a..4a40d0162f8 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -29,6 +29,7 @@
 #include "arrow/testing/random.h"
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
 #include "parquet/encoding.h"
 #include "parquet/platform.h"
 
@@ -38,6 +39,7 @@
 
 using arrow::default_memory_pool;
 using arrow::MemoryPool;
+using arrow::internal::checked_cast;
 
 // TODO(hatemhelal): investigate whether this can be replaced with GTEST_SKIP in a future
 // gtest release that contains https://github.com/google/googletest/pull/1544
 
@@ -862,5 +864,190 @@ TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) {
   CheckDict(actual_num_values, *builder);
 }
 
+// ----------------------------------------------------------------------
+// BYTE_STREAM_SPLIT encode/decode tests.
+
+template <typename DType>
+void TestByteStreamSplitDecodePath(const uint8_t* encoded_data,
+                                   const int64_t encoded_data_size,
+                                   const typename DType::c_type* expected_decoded_data,
+                                   const int num_elements,
+                                   const bool request_more_values) {
+  std::unique_ptr<TypedDecoder<DType>> decoder =
+      MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT);
+  decoder->SetData(num_elements, encoded_data, static_cast<int>(encoded_data_size));
+  std::vector<typename DType::c_type> decoded_data(num_elements);
+  int num_elements_to_decode = num_elements;
+  if (request_more_values) {
+    num_elements_to_decode += 100;
+  }
+  int num_decoded_elements = decoder->Decode(decoded_data.data(), num_elements_to_decode);
+  ASSERT_EQ(num_elements, num_decoded_elements);
+  for (size_t i = 0U; i < decoded_data.size(); ++i) {
+    ASSERT_EQ(expected_decoded_data[i], decoded_data[i]);
+  }
+  ASSERT_EQ(0, decoder->values_left());
+}
+
+template <typename DType>
+void TestByteStreamSplitRoundTrip(const typename DType::c_type* input_data, const int n) {
+  auto encoder = MakeTypedEncoder<DType>(Encoding::BYTE_STREAM_SPLIT);
+  encoder->Put(input_data, n);
+  const int64_t estimated_num_bytes = encoder->EstimatedDataEncodedSize();
+  ASSERT_EQ(static_cast<int64_t>(n) * sizeof(typename DType::c_type),
+            estimated_num_bytes);
+  std::shared_ptr<Buffer> buffer = encoder->FlushValues();
+  TestByteStreamSplitDecodePath<DType>(buffer->data(), buffer->size(), input_data, n,
+                                       false);
+}
+
+template <typename DType>
+void TestEncodeDecodeWithBigInput() {
+  const int nvalues = 10000;
+  using T = typename DType::c_type;
+  std::vector<T> data(nvalues);
+  GenerateData(nvalues, data.data(), nullptr);
+  TestByteStreamSplitRoundTrip<DType>(data.data(), nvalues);
+}
+
+// Check that the encoder can handle input with one element.
+TEST(ByteStreamSplitEncodeDecode, EncodeOneLenInput) {
+  const float value = 1.0f;
+  TestByteStreamSplitRoundTrip<FloatType>(&value, 1);
+}
+
+// Check that the decoder can handle empty input.
+TEST(ByteStreamSplitEncodeDecode, DecodeZeroLenInput) {
+  std::unique_ptr<TypedDecoder<FloatType>> decoder =
+      MakeTypedDecoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
+  decoder->SetData(0, NULL, 0);
+  ASSERT_EQ(0U, decoder->Decode(NULL, 0));
+}
+
+TEST(ByteStreamSplitEncodeDecode, DecodeOneLenInput) {
+  const uint8_t data[] = {0x47U, 0x24U, 0xa7U, 0x44U};
+  TestByteStreamSplitDecodePath<FloatType>(
+      data, 4, reinterpret_cast<const float*>(&data[0]), 1, false);
+}
+
+// Check that requesting to decode more elements than is available in the storage
+// of the decoder works correctly.
+TEST(ByteStreamSplitEncodeDecode, DecodeLargerPortion) {
+  const uint8_t data[] = {0xDEU, 0xC0U, 0x37U, 0x13U, 0x11U, 0x22U, 0x33U, 0x44U,
+                          0xAAU, 0xBBU, 0xCCU, 0xDDU, 0x55U, 0x66U, 0x77U, 0x88U};
+  const uint64_t expected_output[2] = {0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL};
+  TestByteStreamSplitDecodePath<DoubleType>(
+      data, sizeof(data), reinterpret_cast<const double*>(&expected_output[0]), 2, true);
+}
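// [Editor's note, not part of the patch] The helpers above reduce to this round
// trip through the public factories added in encoding.cc; a sketch using only
// APIs introduced by this patch:
//
//   auto encoder = MakeTypedEncoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
//   std::vector<float> input = {1.0f, 2.0f, 3.0f};
//   encoder->Put(input.data(), static_cast<int>(input.size()));
//   std::shared_ptr<Buffer> encoded = encoder->FlushValues();
//
//   auto decoder = MakeTypedDecoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
//   decoder->SetData(static_cast<int>(input.size()), encoded->data(),
//                    static_cast<int>(encoded->size()));
//   std::vector<float> output(input.size());
//   decoder->Decode(output.data(), static_cast<int>(output.size()));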
+
+// Check that the decoder can decode the input in smaller steps.
+TEST(ByteStreamSplitEncodeDecode, DecodeMultipleTimes) {
+  std::unique_ptr<TypedDecoder<FloatType>> decoder =
+      MakeTypedDecoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
+  const int num_values = 100;
+  std::vector<uint8_t> data(num_values * 4);
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = static_cast<uint8_t>(i & 0xFFU);
+  }
+  decoder->SetData(num_values, data.data(), num_values * 4);
+
+  const int step = 25;
+  std::vector<float> decoded_data(step);
+  for (int i = 0; i < num_values; i += step) {
+    int num_decoded = decoder->Decode(decoded_data.data(), step);
+    ASSERT_EQ(step, num_decoded);
+    for (int j = 0; j < step; ++j) {
+      const uint32_t assembled_value =
+          static_cast<uint32_t>(data[i + j]) |
+          (static_cast<uint32_t>(data[(i + j) + num_values]) << 8U) |
+          (static_cast<uint32_t>(data[(i + j) + num_values * 2]) << 16U) |
+          (static_cast<uint32_t>(data[(i + j) + num_values * 3]) << 24U);
+      const float assembled_value_as_float =
+          *reinterpret_cast<const float*>(&assembled_value);
+      ASSERT_EQ(assembled_value_as_float, decoded_data[j]);
+    }
+  }
+}
+
+// Check that an encode-decode pipeline produces the original small input.
+// This small-input test is added to ease debugging in case of changes to
+// the encoder/decoder implementation.
+TEST(ByteStreamSplitEncodeDecode, SmallInput) {
+  const float data[] = {-166.166f, -0.2566f, .0f, 322.0f, 178888.189f};
+  const int num_values = sizeof(data) / sizeof(data[0U]);
+  TestByteStreamSplitRoundTrip<FloatType>(data, num_values);
+}
+
+TEST(ByteStreamSplitEncodeDecode, PutSpaced) {
+  const float data[] = {-1.0f, .0f, .0f, 3.0f, .0f, 22.1234f,
+                        .0f, 198891.0f, .0f, -223345.4455f, 24443.124f};
+  const float valid_data[] = {-1.0f, 3.0f, 22.1234f,
+                              198891.0f, -223345.4455f, 24443.124f};
+  // The valid ones are the ones which are non-zero.
+  // The enable bits are: 10010101 011.
+  const uint8_t valid_bits[2] = {0xA9U, 0x6U};
+  const int num_values = sizeof(data) / sizeof(data[0U]);
+  const int num_valid_values = sizeof(valid_data) / sizeof(valid_data[0U]);
+  std::unique_ptr<TypedEncoder<FloatType>> encoder =
+      MakeTypedEncoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
+  encoder->PutSpaced(data, num_values, valid_bits, 0);
+  std::shared_ptr<Buffer> buffer = encoder->FlushValues();
+
+  TestByteStreamSplitDecodePath<FloatType>(buffer->data(), buffer->size(), valid_data,
+                                           num_valid_values, false);
+}
+
+TEST(ByteStreamSplitEncodeDecode, PutArrow) {
+  arrow::random::RandomArrayGenerator rag{1337};
+  const int num_values = 123;
+  auto arr = rag.Float32(num_values, -2048.0f, 2048.0f, 0);
+  std::unique_ptr<TypedEncoder<FloatType>> encoder =
+      MakeTypedEncoder<FloatType>(Encoding::BYTE_STREAM_SPLIT);
+  encoder->Put(*arr);
+  std::shared_ptr<Buffer> buffer = encoder->FlushValues();
+
+  auto raw_values = checked_cast<const arrow::FloatArray&>(*arr).raw_values();
+  TestByteStreamSplitDecodePath<FloatType>(buffer->data(), buffer->size(), raw_values,
+                                           num_values, false);
+}
+
+// Test that the encode-decode pipeline can handle big 32-bit FP input.
+TEST(ByteStreamSplitEncodeDecode, BigInputFloat) {
+  TestEncodeDecodeWithBigInput<FloatType>();
+}
+
+// Test that the encode-decode pipeline can handle big 64-bit FP input.
+TEST(ByteStreamSplitEncodeDecode, BigInputDouble) {
+  TestEncodeDecodeWithBigInput<DoubleType>();
+}
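// [Editor's note, not part of the patch] A quick way to read the validity
// bitmap in the PutSpaced test above: Arrow validity bitmaps are LSB-first,
// so 0xA9 = 0b10101001 marks indices 0, 3, 5 and 7 of the first byte as valid,
// and 0x06 = 0b110 marks indices 9 and 10 of the second. Illustrative helper:
//
//   #include <cstdint>
//   #include <vector>
//
//   std::vector<int> ValidIndices(const uint8_t* bitmap, int num_values) {
//     std::vector<int> valid;
//     for (int i = 0; i < num_values; ++i) {
//       if (bitmap[i / 8] & (1 << (i % 8))) {  // bit i, least significant first
//         valid.push_back(i);
//       }
//     }
//     return valid;  // for {0xA9, 0x06} and 11 values: {0, 3, 5, 7, 9, 10}
//   }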
+
+TEST(ByteStreamSplitEncodeDecode, InvalidDataTypes) {
+  // First check encoders.
+  ASSERT_THROW(MakeTypedEncoder<Int32Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedEncoder<Int64Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedEncoder<Int96Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedEncoder<BooleanType>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedEncoder<ByteArrayType>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedEncoder<FLBAType>(Encoding::BYTE_STREAM_SPLIT), ParquetException);
+
+  // Then check decoders.
+  ASSERT_THROW(MakeTypedDecoder<Int32Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedDecoder<Int64Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedDecoder<Int96Type>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedDecoder<BooleanType>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedDecoder<ByteArrayType>(Encoding::BYTE_STREAM_SPLIT),
+               ParquetException);
+  ASSERT_THROW(MakeTypedDecoder<FLBAType>(Encoding::BYTE_STREAM_SPLIT), ParquetException);
+}
+
 }  // namespace test
 }  // namespace parquet
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 3ec4246eac4..f28b22639d3 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -412,7 +412,7 @@ TEST(TestFileReader, BufferedReads) {
   std::shared_ptr<WriterProperties> writer_props =
       WriterProperties::Builder().write_batch_size(64)->data_pagesize(128)->build();
 
-  ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create(1024));
+  ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create());
 
   std::shared_ptr<ParquetFileWriter> file_writer =
       ParquetFileWriter::Open(out_file, schema, writer_props);
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index e7efe6a263c..9dc13e42191 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -157,6 +157,8 @@ std::string EncodingToString(Encoding::type t) {
       return "DELTA_BYTE_ARRAY";
     case Encoding::RLE_DICTIONARY:
       return "RLE_DICTIONARY";
+    case Encoding::BYTE_STREAM_SPLIT:
+      return "BYTE_STREAM_SPLIT";
     default:
       return "UNKNOWN";
   }
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index ebb8c2446e3..420494a8951 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -452,6 +452,7 @@ struct Encoding {
     DELTA_LENGTH_BYTE_ARRAY = 6,
     DELTA_BYTE_ARRAY = 7,
     RLE_DICTIONARY = 8,
+    BYTE_STREAM_SPLIT = 9,
     UNKNOWN = 999
   };
 };
diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
index 49c483fc41f..e2cf0bc772a 100644
--- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
@@ -171,6 +171,7 @@ public BinaryArray(IArrowType dataType, int length,
         public ReadOnlySpan<byte> Values => ValueBuffer.Span.CastTo<byte>();
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Obsolete("This method has been deprecated. Please use ValueOffsets[index] instead.")]
         public int GetValueOffset(int index)
         {
             if (index < 0 || index > Length)
@@ -193,10 +194,12 @@ public int GetValueLength(int index)
 
         public ReadOnlySpan<byte> GetBytes(int index)
         {
-            var offset = GetValueOffset(index);
-            var length = GetValueLength(index);
+            if (index < 0 || index >= Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index));
+            }
 
-            return ValueBuffer.Span.Slice(offset, length);
+            return ValueBuffer.Span.Slice(ValueOffsets[index], GetValueLength(index));
         }
     }
diff --git a/csharp/src/Apache.Arrow/Arrays/ListArray.cs b/csharp/src/Apache.Arrow/Arrays/ListArray.cs
index ff7095d5da6..bc171e6ffbd 100644
--- a/csharp/src/Apache.Arrow/Arrays/ListArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/ListArray.cs
@@ -123,7 +123,7 @@ private ListArray(ArrayData data, IArrowArray values) : base(data)
 
         public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);
 
-        [Obsolete]
+        [Obsolete("This method has been deprecated. Please use ValueOffsets[index] instead.")]
         public int GetValueOffset(int index)
         {
             if (index < 0 || index > Length)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
index f2a8e58c89f..a1a7b231314 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
@@ -40,11 +40,19 @@ public void ThrowsWhenGetValueAndOffsetIndexOutOfBounds()
             Assert.Equal(1, array.GetValueLength(1));
             Assert.Throws<ArgumentOutOfRangeException>(() => array.GetValueLength(2));
 
+#pragma warning disable 618
             Assert.Throws<ArgumentOutOfRangeException>(() => array.GetValueOffset(-1));
             Assert.Equal(0, array.GetValueOffset(0));
             Assert.Equal(1, array.GetValueOffset(1));
             Assert.Equal(2, array.GetValueOffset(2));
             Assert.Throws<ArgumentOutOfRangeException>(() => array.GetValueOffset(3));
+#pragma warning restore 618
+
+            Assert.Throws<IndexOutOfRangeException>(() => array.ValueOffsets[-1]);
+            Assert.Equal(0, array.ValueOffsets[0]);
+            Assert.Equal(1, array.ValueOffsets[1]);
+            Assert.Equal(2, array.ValueOffsets[2]);
+            Assert.Throws<IndexOutOfRangeException>(() => array.ValueOffsets[3]);
         }
diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py
index 9fa3f6194c7..60ee8f3b856 100644
--- a/dev/archery/archery/integration/runner.py
+++ b/dev/archery/archery/integration/runner.py
@@ -227,23 +227,23 @@ def get_static_json_files():
     ]
 
 
-def run_all_tests(enable_cpp=True, enable_java=True, enable_js=True,
-                  enable_go=True, run_flight=False,
+def run_all_tests(with_cpp=True, with_java=True, with_js=True,
+                  with_go=True, run_flight=False,
                   tempdir=None, **kwargs):
     tempdir = tempdir or tempfile.mkdtemp()
 
     testers = []
 
-    if enable_cpp:
+    if with_cpp:
         testers.append(CPPTester(**kwargs))
 
-    if enable_java:
+    if with_java:
         testers.append(JavaTester(**kwargs))
 
-    if enable_js:
+    if with_js:
         testers.append(JSTester(**kwargs))
 
-    if enable_go:
+    if with_go:
         testers.append(GoTester(**kwargs))
 
     static_json_files = get_static_json_files()
diff --git a/dev/release/00-prepare-test.rb b/dev/release/00-prepare-test.rb
index e9894d0bc8a..afabe88fa7e 100644
--- a/dev/release/00-prepare-test.rb
+++ b/dev/release/00-prepare-test.rb
@@ -276,7 +276,9 @@ def test_version_pre_tag
           ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }",
            "-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }",
            "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }",
-           "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"]
+           "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"],
+          ["-arrow-flight = { path = 
\"../arrow-flight\", version = \"#{@snapshot_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }"] ], }, { @@ -458,7 +460,9 @@ def test_version_post_tag ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", "-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"] + "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"], + ["-arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@next_snapshot_version}\" }"] ], }, { diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py old mode 100644 new mode 100755 index b2dfc867bc0..da66e483d37 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -112,7 +112,7 @@ def _download_file(self, dest, info): bintray_abspath = os.path.join(BINTRAY_DL_ROOT, self.repo, relpath) cmd = [ - 'curl', '--fail', '--location', + 'curl', '--fail', '--location', '--retry', '5', '--output', dest_path, bintray_abspath ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index 0f8e047752e..6bf1c111218 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -25,7 +25,7 @@ @echo on set _CURRENT_DIR=%CD% -set _VERIFICATION_DIR=C:\tmp\arrow-verify-release +set _VERIFICATION_DIR=C:\tmp\arrow-verify-release-wheels if not exist "C:\tmp\" mkdir C:\tmp if exist %_VERIFICATION_DIR% rd %_VERIFICATION_DIR% /s /q @@ -33,9 +33,6 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR% cd %_VERIFICATION_DIR% -CALL :verify_wheel 3.5 %1 %2 -if errorlevel 1 GOTO error - CALL :verify_wheel 3.6 %1 %2 if errorlevel 1 GOTO error @@ -62,7 +59,7 @@ set ARROW_VERSION=%2 set RC_NUMBER=%3 set PY_VERSION_NO_PERIOD=%PY_VERSION:.=% -set CONDA_ENV_PATH=C:\tmp\arrow-verify-release\_verify-wheel-%PY_VERSION% +set CONDA_ENV_PATH=%_VERIFICATION_DIR%\_verify-wheel-%PY_VERSION% call conda create -p %CONDA_ENV_PATH% ^ --no-shortcuts -f -q -y python=%PY_VERSION% ^ || EXIT /B 1 @@ -78,6 +75,7 @@ python -c "import pyarrow" || EXIT /B 1 python -c "import pyarrow.flight" || EXIT /B 1 python -c "import pyarrow.gandiva" || EXIT /B 1 python -c "import pyarrow.parquet" || EXIT /B 1 +python -c "import pyarrow.dataset" || EXIT /B 1 call deactivate diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 70d47347823..bcef9f51f6e 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -71,7 +71,10 @@ pushd %ARROW_SOURCE%\cpp\build @rem This is the path for Visual Studio Community 2017 call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 -cmake -G "Ninja" ^ +@rem NOTE(wesm): not using Ninja for now to be able to more easily control the +@rem generator used + +cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BOOST_USE_SHARED=ON ^ @@ -87,14 +90,17 @@ cmake -G "Ninja" ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_FLIGHT=ON ^ -DARROW_PYTHON=ON ^ + -DARROW_DATASET=ON ^ -DARROW_PARQUET=ON ^ .. || exit /B +cmake --build . 
--target INSTALL --config Release + @rem NOTE(wesm): Building googletest is flaky for me with ninja. Building it @rem first fixes the problem -ninja googletest_ep || exit /B -ninja install || exit /B +@rem ninja googletest_ep || exit /B +@rem ninja install || exit /B @rem Get testing datasets for Parquet unit tests git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing @@ -112,8 +118,10 @@ popd @rem Build and import pyarrow pushd %ARROW_SOURCE%\python +set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_WITH_FLIGHT=1 set PYARROW_WITH_PARQUET=1 +set PYARROW_WITH_DATASET=1 python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B py.test pyarrow -v -s --parquet || exit /B diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 000af83a172..60a9d544bd9 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -28,6 +28,10 @@ # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to # LD_LIBRARY_PATH. +# +# To reuse build artifacts between runs set TMPDIR environment variable to +# a directory where the temporary files should be placed to, note that this +# directory is not cleaned up automatically. case $# in 3) ARTIFACT="$1" @@ -123,7 +127,7 @@ test_binary() { local download_dir=binaries mkdir -p ${download_dir} - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} } @@ -204,8 +208,15 @@ setup_tempdir() { echo "Failed to verify release candidate. See ${TMPDIR} for details." fi } - trap cleanup EXIT - TMPDIR=$(mktemp -d -t "$1.XXXXX") + + if [ -z "${TMPDIR}" ]; then + # clean up automatically if TMPDIR is not defined + TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT + else + # don't clean up automatically + mkdir -p "${TMPDIR}" + fi } @@ -219,11 +230,23 @@ setup_miniconda() { MINICONDA=$PWD/test-miniconda - wget -O miniconda.sh $MINICONDA_URL - bash miniconda.sh -b -p $MINICONDA - rm -f miniconda.sh + if [ ! -d "${MINICONDA}" ]; then + # Setup miniconda only if the directory doesn't exist yet + wget -O miniconda.sh $MINICONDA_URL + bash miniconda.sh -b -p $MINICONDA + rm -f miniconda.sh + fi . 
$MINICONDA/etc/profile.d/conda.sh + + conda create -n arrow-test -y -q -c conda-forge \ + python=3.6 \ + nomkl \ + numpy \ + pandas \ + six \ + cython + conda activate arrow-test } # Build and test Java (Requires newer Maven -- I used 3.3.9) @@ -240,16 +263,7 @@ test_package_java() { # Build and test C++ test_and_install_cpp() { - conda create -n arrow-test -y -q -c conda-forge \ - python=3.6 \ - nomkl \ - numpy \ - pandas \ - six \ - cython - conda activate arrow-test - - mkdir cpp/build + mkdir -p cpp/build pushd cpp/build ARROW_CMAKE_OPTIONS=" @@ -262,6 +276,7 @@ ${ARROW_CMAKE_OPTIONS:-} -DARROW_PYTHON=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON +-DARROW_DATASET=ON -DPARQUET_REQUIRE_ENCRYPTION=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON @@ -351,6 +366,7 @@ test_python() { pip install -r requirements.txt -r requirements-test.txt + export PYARROW_WITH_DATASET=1 export PYARROW_WITH_GANDIVA=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 @@ -492,7 +508,7 @@ test_integration() { export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/release - pip3 install -e dev/archery + pip install -e dev/archery INTEGRATION_TEST_ARGS="" @@ -511,6 +527,18 @@ test_integration() { $INTEGRATION_TEST_ARGS } +clone_testing_repositories() { + # Clone testing repositories if not cloned already + if [ ! -d "arrow-testing" ]; then + git clone https://github.com/apache/arrow-testing.git + fi + if [ ! -d "parquet-testing" ]; then + git clone https://github.com/apache/parquet-testing.git + fi + export ARROW_TEST_DATA=$PWD/arrow-testing/data + export PARQUET_TEST_DATA=$PWD/parquet-testing/data +} + test_source_distribution() { export ARROW_HOME=$TMPDIR/install export PARQUET_HOME=$TMPDIR/install @@ -523,17 +551,12 @@ test_source_distribution() { NPROC=$(nproc) fi - git clone https://github.com/apache/arrow-testing.git - export ARROW_TEST_DATA=$PWD/arrow-testing/data - - git clone https://github.com/apache/parquet-testing.git - export PARQUET_TEST_DATA=$PWD/parquet-testing/data + clone_testing_repositories if [ ${TEST_JAVA} -gt 0 ]; then test_package_java fi if [ ${TEST_CPP} -gt 0 ]; then - setup_miniconda test_and_install_cpp fi if [ ${TEST_CSHARP} -gt 0 ]; then @@ -584,7 +607,8 @@ check_python_imports() { python -c "import pyarrow.fs" if [[ "$py_arch" =~ ^3 ]]; then - # Flight and Gandiva are only available for py3 + # Flight, Gandiva and Dataset are only available for py3 + python -c "import pyarrow.dataset" python -c "import pyarrow.flight" python -c "import pyarrow.gandiva" fi @@ -609,8 +633,10 @@ test_linux_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[mu.]/}-cp${py_arch//./}-manylinux${ml_spec}_x86_64.whl check_python_imports py_arch + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas pytest --pyargs pyarrow done @@ -640,8 +666,10 @@ test_macos_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[m.]/}-cp${py_arch//./}-${macos_suffix}.whl check_python_imports py_arch + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas pytest --pyargs pyarrow conda deactivate @@ -649,6 +677,8 @@ test_macos_wheels() { } test_wheels() { + clone_testing_repositories + local 
download_dir=binaries mkdir -p ${download_dir} @@ -661,9 +691,9 @@ test_wheels() { conda create -yq -n py3-base python=3.7 conda activate py3-base - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ - --regex=${filter_regex} \ - --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ + --regex=${filter_regex} \ + --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} @@ -726,6 +756,9 @@ setup_tempdir "arrow-${VERSION}" echo "Working in sandbox ${TMPDIR}" cd ${TMPDIR} +setup_miniconda +echo "Using miniconda environment ${MINICONDA}" + if [ "${ARTIFACT}" == "source" ]; then dist_name="apache-arrow-${VERSION}" if [ ${TEST_SOURCE} -gt 0 ]; then @@ -734,6 +767,10 @@ if [ "${ARTIFACT}" == "source" ]; then tar xf ${dist_name}.tar.gz else mkdir -p ${dist_name} + if [ ! -f ${TEST_ARCHIVE} ]; then + echo "${TEST_ARCHIVE} not found, did you mean to pass TEST_SOURCE=1?" + exit 1 + fi tar xf ${TEST_ARCHIVE} -C ${dist_name} --strip-components=1 fi pushd ${dist_name} @@ -741,7 +778,6 @@ if [ "${ARTIFACT}" == "source" ]; then popd elif [ "${ARTIFACT}" == "wheels" ]; then import_gpg_keys - setup_miniconda test_wheels else import_gpg_keys diff --git a/dev/tasks/gandiva-jars/travis.osx.yml b/dev/tasks/gandiva-jars/travis.osx.yml index 5626b1b795d..8a43e3a2429 100644 --- a/dev/tasks/gandiva-jars/travis.osx.yml +++ b/dev/tasks/gandiva-jars/travis.osx.yml @@ -33,10 +33,10 @@ env: - ARROW_TRAVIS_GANDIVA=1 before_script: + - pwd - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - if [ $CROSSBOW_USE_COMMIT_ID = true ]; then git -C arrow checkout {{ arrow.head }}; else git -C arrow checkout FETCH_HEAD; fi - - export TRAVIS_BUILD_DIR=$TRAVIS_BUILD_DIR/arrow - brew update - brew install bison flex llvm@7 @@ -52,15 +52,17 @@ script: - mkdir -p dist - dev/tasks/gandiva-jars/build-cpp-osx.sh || travis_terminate 1 - dev/tasks/gandiva-jars/build-java.sh || travis_terminate 1 - -deploy: - provider: releases - api_key: $CROSSBOW_GITHUB_TOKEN - file_glob: true - file: dist/*.jar - skip_cleanup: true - on: - tags: true + # deploy using crossbow + - brew install libgit2 + - pip3 install click github3.py jinja2 jira pygit2 ruamel.yaml setuptools_scm toolz + - > + python3 dev/tasks/crossbow.py + --queue-path $TRAVIS_BUILD_DIR + --queue-remote {{ queue.remote_url }} + upload-artifacts + --sha {{ task.branch }} + --tag {{ task.tag }} + --pattern "dist/*.jar" notifications: email: diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 7142491fea6..38023f7f6a4 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -122,8 +122,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - - test-ubuntu-fuzzit-fuzzing - - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-2.7-pandas-master - test-conda-python-3.7-pandas-latest @@ -300,9 +298,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - # https://issues.apache.org/jira/browse/ARROW-7712 - # - test-ubuntu-fuzzit-fuzzing - # - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-3.7-pandas-latest - test-conda-python-3.7-pandas-master @@ -2058,30 +2053,6 @@ tasks: - docker-compose build ubuntu-docs - docker-compose run ubuntu-docs - test-ubuntu-fuzzit-fuzzing: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull 
--ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=fuzzing ubuntu-fuzzit - - test-ubuntu-fuzzit-regression: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull --ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=regression ubuntu-fuzzit - ############################## Integration tests ############################ test-conda-python-2.7-pandas-latest: diff --git a/docker-compose.yml b/docker-compose.yml index 4eb92bfa14c..108a039adca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -951,30 +951,6 @@ services: command: > /bin/bash -c "pip install -e /arrow/dev/archery && archery lint" - ubuntu-fuzzit: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-fuzzit - # docker-compose run ubuntu-fuzzit - build: - context: . - dockerfile: ci/docker/linux-apt-fuzzit.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-fuzzit - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - environment: - <<: *ccache - FUZZIT_API_KEY: c0b760d37db6012fcaafd8ca5f412ba7bcd297ac969650502994b51aa11798153824442f999a067e1ef67821989ed664 - FUZZIT_HOST: bionic-llvm7 - CI_ARROW_SHA: ${CI_ARROW_SHA:-UNSET} - CI_ARROW_BRANCH: ${CI_ARROW_BRANCH:-UNSET} - volumes: *debian-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/fuzzit_build.sh /arrow /build" - ######################### Integration Tests ################################# postgres: diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 1de67eed19a..9c8ddae0b7c 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -116,7 +116,7 @@ table SparseMatrixIndexCSX { union SparseTensorIndex { SparseTensorIndexCOO, - SparseMatrixIndexCSX, + SparseMatrixIndexCSX } table SparseTensor { diff --git a/go/arrow/memory/buffer.go b/go/arrow/memory/buffer.go index 234f5d4337f..57c0db487d1 100644 --- a/go/arrow/memory/buffer.go +++ b/go/arrow/memory/buffer.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/arrow/internal/debug" ) +// Buffer is a wrapper type for a buffer of bytes. type Buffer struct { refCount int64 buf []byte @@ -35,7 +36,7 @@ func NewBufferBytes(data []byte) *Buffer { return &Buffer{refCount: 0, buf: data, length: len(data)} } -// NewBuffer creates a mutable, resizable buffer with an Allocator for managing memory. +// NewResizableBuffer creates a mutable, resizable buffer with an Allocator for managing memory. func NewResizableBuffer(mem Allocator) *Buffer { return &Buffer{refCount: 1, mutable: true, mem: mem} } @@ -60,15 +61,28 @@ func (b *Buffer) Release() { } } +// Reset resets the buffer for reuse. +func (b *Buffer) Reset(buf []byte) { + b.buf = buf + b.length = len(buf) +} + // Buf returns the slice of memory allocated by the Buffer, which is adjusted by calling Reserve. func (b *Buffer) Buf() []byte { return b.buf } // Bytes returns a slice of size Len, which is adjusted by calling Resize. func (b *Buffer) Bytes() []byte { return b.buf[:b.length] } + +// Mutable returns a bool indicating whether the buffer is mutable or not. 
func (b *Buffer) Mutable() bool { return b.mutable } -func (b *Buffer) Len() int { return b.length } -func (b *Buffer) Cap() int { return len(b.buf) } +// Len returns the length of the buffer. +func (b *Buffer) Len() int { return b.length } + +// Cap returns the capacity of the buffer. +func (b *Buffer) Cap() int { return len(b.buf) } + +// Reserve reserves the provided amount of capacity for the buffer. func (b *Buffer) Reserve(capacity int) { if capacity > len(b.buf) { newCap := roundUpToMultipleOf64(capacity) @@ -80,10 +94,13 @@ func (b *Buffer) Reserve(capacity int) { } } +// Resize resizes the buffer to the target size. func (b *Buffer) Resize(newSize int) { b.resize(newSize, true) } +// ResizeNoShrink resizes the buffer to the target size, but will not +// shrink it. func (b *Buffer) ResizeNoShrink(newSize int) { b.resize(newSize, false) } diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index 21278e4cabd..f7bf3248eb4 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -59,7 +59,7 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.Float4Vector; @@ -210,7 +210,7 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig *
 *   <li>BINARY --> ArrowType.Binary</li>
 *   <li>VARBINARY --> ArrowType.Binary</li>
 *   <li>LONGVARBINARY --> ArrowType.Binary</li>
- *   <li>DATE --> ArrowType.Date(DateUnit.MILLISECOND)</li>
+ *   <li>DATE --> ArrowType.Date(DateUnit.DAY)</li>
 *   <li>TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)</li>
 *   <li>TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)</li>
 *   <li>CLOB --> ArrowType.Utf8</li>
@@ -265,7 +265,7 @@ public static ArrowType getArrowTypeForJdbcField(JdbcFieldInfo fieldInfo, Calend
       case Types.CLOB:
         return new ArrowType.Utf8();
       case Types.DATE:
-        return new ArrowType.Date(DateUnit.MILLISECOND);
+        return new ArrowType.Date(DateUnit.DAY);
       case Types.TIME:
         return new ArrowType.Time(TimeUnit.MILLISECOND, 32);
       case Types.TIMESTAMP:
@@ -402,7 +402,7 @@ static JdbcConsumer getConsumer(ResultSet resultSet, int columnIndex, int jdbcCo
       case Types.LONGNVARCHAR:
         return VarCharConsumer.createConsumer((VarCharVector) vector, columnIndex, nullable);
       case Types.DATE:
-        return DateConsumer.createConsumer((DateMilliVector) vector, columnIndex, nullable, calendar);
+        return DateConsumer.createConsumer((DateDayVector) vector, columnIndex, nullable, calendar);
       case Types.TIME:
         return TimeConsumer.createConsumer((TimeMilliVector) vector, columnIndex, nullable, calendar);
       case Types.TIMESTAMP:
diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java
index 2fbffa08cf7..bb9ab1cf0b9 100644
--- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java
+++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java
@@ -17,11 +17,15 @@
 package org.apache.arrow.adapter.jdbc.consumer;
 
+import java.sql.Date;
 import java.sql.ResultSet;
 import java.sql.SQLException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.Calendar;
-import java.util.Date;
+import java.util.concurrent.TimeUnit;
 
+import org.apache.arrow.vector.DateDayVector;
 import org.apache.arrow.vector.DateMilliVector;
 
 /**
@@ -30,11 +34,23 @@
  */
 public class DateConsumer {
 
+  public static final int MAX_DAY;
+
+  static {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    try {
+      java.util.Date date = dateFormat.parse("9999-12-31");
+      MAX_DAY = (int) TimeUnit.MILLISECONDS.toDays(date.getTime());
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Failed to parse max day", e);
+    }
+  }
+
   /**
    * Creates a consumer for {@link DateMilliVector}.
    */
-  public static JdbcConsumer<DateMilliVector> createConsumer(
-      DateMilliVector vector, int index, boolean nullable, Calendar calendar) {
+  public static JdbcConsumer<DateDayVector> createConsumer(
+      DateDayVector vector, int index, boolean nullable, Calendar calendar) {
     if (nullable) {
       return new NullableDateConsumer(vector, index, calendar);
     } else {
@@ -45,21 +61,21 @@ public static JdbcConsumer createConsumer(
   /**
    * Nullable consumer for date.
    */
-  static class NullableDateConsumer extends BaseConsumer<DateMilliVector> {
+  static class NullableDateConsumer extends BaseConsumer<DateDayVector> {
 
     protected final Calendar calendar;
 
     /**
     * Instantiate a DateConsumer.
     */
-    public NullableDateConsumer(DateMilliVector vector, int index) {
+    public NullableDateConsumer(DateDayVector vector, int index) {
       this(vector, index, /* calendar */null);
     }
 
     /**
    * Instantiate a DateConsumer.
    */
-    public NullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) {
+    public NullableDateConsumer(DateDayVector vector, int index, Calendar calendar) {
       super(vector, index);
       this.calendar = calendar;
     }
@@ -69,7 +85,11 @@ public void consume(ResultSet resultSet) throws SQLException {
       Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) :
          resultSet.getDate(columnIndexInResultSet, calendar);
       if (!resultSet.wasNull()) {
-        vector.setSafe(currentIndex, date.getTime());
+        int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime());
+        if (day < 0 || day > MAX_DAY) {
+          throw new IllegalArgumentException("Day overflow: " + day);
+        }
+        vector.setSafe(currentIndex, day);
       }
       currentIndex++;
     }
@@ -78,21 +98,21 @@ public void consume(ResultSet resultSet) throws SQLException {
   /**
    * Non-nullable consumer for date.
    */
-  static class NonNullableDateConsumer extends BaseConsumer<DateMilliVector> {
+  static class NonNullableDateConsumer extends BaseConsumer<DateDayVector> {
 
     protected final Calendar calendar;
 
     /**
    * Instantiate a DateConsumer.
    */
-    public NonNullableDateConsumer(DateMilliVector vector, int index) {
+    public NonNullableDateConsumer(DateDayVector vector, int index) {
       this(vector, index, /* calendar */null);
     }
 
     /**
    * Instantiate a DateConsumer.
    */
-    public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) {
+    public NonNullableDateConsumer(DateDayVector vector, int index, Calendar calendar) {
       super(vector, index);
       this.calendar = calendar;
     }
@@ -101,7 +121,11 @@ public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calen
     public void consume(ResultSet resultSet) throws SQLException {
       Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) :
           resultSet.getDate(columnIndexInResultSet, calendar);
-      vector.setSafe(currentIndex, date.getTime());
+      int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime());
+      if (day < 0 || day > MAX_DAY) {
+        throw new IllegalArgumentException("Day overflow: " + day);
+      }
+      vector.setSafe(currentIndex, day);
       currentIndex++;
     }
   }
diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java
index 3e21144c0ca..c194dfbc355 100644
--- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java
+++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java
@@ -32,7 +32,7 @@
 import org.apache.arrow.vector.BaseValueVector;
 import org.apache.arrow.vector.BigIntVector;
 import org.apache.arrow.vector.BitVector;
-import org.apache.arrow.vector.DateMilliVector;
+import org.apache.arrow.vector.DateDayVector;
 import org.apache.arrow.vector.DecimalVector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
@@ -173,14 +173,14 @@ public static void assertTimeVectorValues(TimeMilliVector timeMilliVector, int r
     }
   }
 
-  public static void assertDateVectorValues(DateMilliVector dateMilliVector, int rowCount, Long[] values) {
-    assertEquals(rowCount, dateMilliVector.getValueCount());
+  public static void assertDateVectorValues(DateDayVector dateDayVector, int rowCount, Integer[] values) {
+    assertEquals(rowCount, dateDayVector.getValueCount());
 
-    for (int j = 0; j < dateMilliVector.getValueCount(); j++) {
+    for (int j = 0; j < dateDayVector.getValueCount(); j++) {
       if (values[j] == null) {
-        assertTrue(dateMilliVector.isNull(j));
+        assertTrue(dateDayVector.isNull(j));
       } else {
-        assertEquals(values[j].longValue(), dateMilliVector.get(j));
+        assertEquals(values[j].longValue(), dateDayVector.get(j));
       }
     }
   }
diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java
index 
650c766e02a..cafb7a050d7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -50,7 +50,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { table.getCharValues()); break; case DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case TIME: assertTimeVectorValues((TimeMilliVector) root.getVector(table.getVector()), table.getValues().length, diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index 199b821ba19..57fcf566d7d 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -58,7 +58,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -206,8 +206,8 @@ private void testAllVectorValues(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); @@ -242,7 +242,7 @@ public void sqlToArrowTestNullValues(String[] vectors, VectorSchemaRoot root, in assertNullValues((Float8Vector) root.getVector(vectors[6]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[7]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[8]), rowCount); - assertNullValues((DateMilliVector) root.getVector(vectors[9]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[9]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[10]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[11]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[12]), rowCount); @@ -265,7 +265,7 @@ public void sqlToArrowTestSelectedNullColumnsValues(String[] vectors, VectorSche assertNullValues((Float8Vector) root.getVector(vectors[2]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[3]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[4]), rowCount); - assertNullValues((DateMilliVector) 
root.getVector(vectors[5]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[5]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[6]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[7]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[8]), rowCount); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index dfcdb90be7c..b7d517f26e1 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -79,7 +79,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 002a1239101..ef2b406d120 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -37,7 +37,7 @@ import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; import org.apache.arrow.adapter.jdbc.Table; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeStampVector; import org.apache.arrow.vector.VectorSchemaRoot; @@ -145,8 +145,8 @@ public void testDataSets(VectorSchemaRoot root) { case EST_DATE: case GMT_DATE: case PST_DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case EST_TIME: case GMT_TIME: diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java index d041f082b0b..fa75c1cdbaf 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java @@ -38,7 +38,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import 
org.apache.arrow.vector.BitVector;
-import org.apache.arrow.vector.DateMilliVector;
+import org.apache.arrow.vector.DateDayVector;
 import org.apache.arrow.vector.DecimalVector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
@@ -51,6 +51,7 @@
 import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
+import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
@@ -66,6 +67,7 @@ public JdbcToArrowVectorIteratorTest(Table table) {
     super(table);
   }
 
+  @Test
   @Override
   public void testJdbcToArrowValues()
       throws SQLException, IOException {
@@ -91,7 +93,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep
     List<VarCharVector> vectorsForChar = new ArrayList<>();
     List<BitVector> vectorsForBit = new ArrayList<>();
    List<BitVector> vectorsForBool = new ArrayList<>();
-    List<DateMilliVector> dateMilliVectors = new ArrayList<>();
+    List<DateDayVector> dateDayVectors = new ArrayList<>();
     List<TimeMilliVector> timeMilliVectors = new ArrayList<>();
     List<TimeStampVector> timeStampVectors = new ArrayList<>();
     List<DecimalVector> decimalVectors = new ArrayList<>();
@@ -115,7 +117,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep
       vectorsForChar.add((VarCharVector) root.getVector(CHAR));
       vectorsForBit.add((BitVector) root.getVector(BIT));
       vectorsForBool.add((BitVector) root.getVector(BOOL));
-      dateMilliVectors.add((DateMilliVector) root.getVector(DATE));
+      dateDayVectors.add((DateDayVector) root.getVector(DATE));
       timeMilliVectors.add((TimeMilliVector) root.getVector(TIME));
       timeStampVectors.add((TimeStampVector) root.getVector(TIMESTAMP));
       decimalVectors.add((DecimalVector) root.getVector(DECIMAL));
@@ -134,7 +136,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep
     assertVarCharVectorValues(vectorsForChar, table.getRowCount(), getCharArray(table.getValues(), CHAR));
     assertBitVectorValues(vectorsForBit, table.getRowCount(), getIntValues(table.getValues(), BIT));
     assertBooleanVectorValues(vectorsForBool, table.getRowCount(), getBooleanValues(table.getValues(), BOOL));
-    assertDateMilliVectorValues(dateMilliVectors, table.getRowCount(), getLongValues(table.getValues(), DATE));
+    assertDateDayVectorValues(dateDayVectors, table.getRowCount(), getLongValues(table.getValues(), DATE));
     assertTimeMilliVectorValues(timeMilliVectors, table.getRowCount(), getLongValues(table.getValues(), TIME));
     assertTimeStampVectorValues(timeStampVectors, table.getRowCount(), getLongValues(table.getValues(), TIMESTAMP));
     assertDecimalVectorValues(decimalVectors, table.getRowCount(), getDecimalValues(table.getValues(), DECIMAL));
@@ -205,12 +207,12 @@ private void assertTimeMilliVectorValues(List<TimeMilliVector> vectors, int rowC
     }
   }
 
-  private void assertDateMilliVectorValues(List<DateMilliVector> vectors, int rowCount, Long[] values) {
+  private void assertDateDayVectorValues(List<DateDayVector> vectors, int rowCount, Long[] values) {
     int valueCount = vectors.stream().mapToInt(ValueVector::getValueCount).sum();
     assertEquals(rowCount, valueCount);
 
     int index = 0;
-    for (DateMilliVector vector : vectors) {
+    for (DateDayVector vector : vectors) {
       for (int i = 0; i < vector.getValueCount(); i++) {
         assertEquals(values[index++].longValue(), vector.get(i));
       }
diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml
index fc9784df7ff..9baae643aef 100644
--- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml
+++ 
b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml @@ -85,7 +85,7 @@ values: - 'DECIMAL_FIELD6=17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23' - 'DOUBLE_FIELD7=56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345' - 'TIME_FIELD9=45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000' - - 'DATE_FIELD10=1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000' + - 'DATE_FIELD10=17574,17574,17574,17574,17574,17574,17574,17574,17574,17574' - 'TIMESTAMP_FIELD11=1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000' - 'CHAR_FIELD16=some char text,some char text,some char text,some char text,some char text, some char text,some char text,some char text,some char text,some char text' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml index bb9bfb81548..4be8ab86ecc 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml @@ -71,7 +71,7 @@ values: - 'DECIMAL_FIELD6=null,17345667789.23,null,17345667789.23,null' - 'DOUBLE_FIELD7=null,56478356785.345,null,56478356785.345,null' - 'TIME_FIELD9=null,45935000,null,45935000,null' - - 'DATE_FIELD10=null,1518393600000,null,1518393600000,null' + - 'DATE_FIELD10=null,17574,null,17574,null' - 'TIMESTAMP_FIELD11=null,1518439535000,null,1518439535000,null' - 'CHAR_FIELD16=null,some char text,null,some char text,null' - 'VARCHAR_FIELD13=null,some text that needs to be converted to varchar,null, diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml index 45aa56c7417..da33d9d47cd 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml @@ -34,13 +34,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml index 290c32ebafa..1868db3adeb 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git 
a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml index 03929c936e4..65824861a8a 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml index 81a668f37a4..798cfc7d67d 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java index bac25cf8165..c5ecd8acb9b 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java @@ -46,6 +46,7 @@ public void sortInPlace(V vec, VectorValueComparator comparator) { this.comparator = comparator; this.pivotBuffer = (V) vec.getField().createVector(vec.getAllocator()); this.pivotBuffer.allocateNew(1); + this.pivotBuffer.setValueCount(1); comparator.attachVectors(vec, pivotBuffer); quickSort(); diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java new file mode 100644 index 00000000000..235eca53c84 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * Benchmarks for {@link TransferPair}. + */ +@State(Scope.Benchmark) +public class TransferPairBenchmarks { + + private static final int VECTOR_LENGTH = 1024; + + private static final int ALLOCATOR_CAPACITY = 1024 * 1024; + + private BufferAllocator allocator; + + private IntVector intVector; + + private VarCharVector varCharVector; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + allocator = new RootAllocator(ALLOCATOR_CAPACITY); + intVector = new IntVector("intVector", allocator); + varCharVector = new VarCharVector("varcharVector", allocator); + + intVector.allocateNew(VECTOR_LENGTH); + varCharVector.allocateNew(VECTOR_LENGTH); + + for (int i = 0; i < VECTOR_LENGTH; i++) { + if (i % 3 == 0) { + intVector.setNull(i); + varCharVector.setNull(i); + } else { + intVector.setSafe(i, i * i); + varCharVector.setSafe(i, ("teststring" + i).getBytes(StandardCharsets.UTF_8)); + } + } + intVector.setValueCount(VECTOR_LENGTH); + varCharVector.setValueCount(VECTOR_LENGTH); + } + + /** + * Tear down benchmarks. 
+   */
+  @TearDown
+  public void tearDown() {
+    intVector.close();
+    varCharVector.close();
+    allocator.close();
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.MICROSECONDS)
+  public int splitAndTransferIntVector() {
+    IntVector toVector = new IntVector("intVector", allocator);
+    toVector.setValueCount(VECTOR_LENGTH);
+    TransferPair transferPair = intVector.makeTransferPair(toVector);
+    transferPair.splitAndTransfer(0, VECTOR_LENGTH);
+    toVector.close();
+    return 0;
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.MICROSECONDS)
+  public int splitAndTransferVarcharVector() {
+    VarCharVector toVector = new VarCharVector("varcharVector", allocator);
+    toVector.setValueCount(VECTOR_LENGTH);
+    TransferPair transferPair = varCharVector.makeTransferPair(toVector);
+    transferPair.splitAndTransfer(0, VECTOR_LENGTH);
+    toVector.close();
+    return 0;
+  }
+
+  public static void main(String[] args) throws RunnerException {
+    Options opt = new OptionsBuilder()
+        .include(TransferPairBenchmarks.class.getSimpleName())
+        .forks(1)
+        .build();
+
+    new Runner(opt).run();
+  }
+}
diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java
index db3c8a89f5e..8bf5d8a9b20 100644
--- a/java/vector/src/main/codegen/templates/UnionVector.java
+++ b/java/vector/src/main/codegen/templates/UnionVector.java
@@ -469,7 +469,10 @@ public void transfer() {
 
     @Override
     public void splitAndTransfer(int startIndex, int length) {
-      Preconditions.checkArgument(startIndex + length <= valueCount);
+      Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount,
+          "Invalid startIndex: %s", startIndex);
+      Preconditions.checkArgument(startIndex + length <= valueCount,
+          "Invalid length: %s", length);
       to.clear();
       internalStructVectorTransferPair.splitAndTransfer(startIndex, length);
       final int startPoint = startIndex * TYPE_WIDTH;
@@ -550,9 +553,13 @@ public Iterator<ValueVector> iterator() {
     return vectors.iterator();
   }
 
-  private ValueVector getVector(int index) {
-    int type = typeBuffer.getByte(index * TYPE_WIDTH);
-    switch (MinorType.values()[type]) {
+  public ValueVector getVector(int index) {
+    int type = typeBuffer.getByte(index * TYPE_WIDTH);
+    return getVectorByType(type);
+  }
+
+  public ValueVector getVectorByType(int typeId) {
+    switch (MinorType.values()[typeId]) {
     case NULL:
       return null;
     <#list vv.types as type>
@@ -571,7 +578,7 @@ private ValueVector getVector(int index) {
     case LIST:
       return getList();
     default:
-      throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]);
+      throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[typeId]);
   }
 }
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index ed91d6e659b..a4e94bcac09 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -584,7 +584,10 @@ public void transferTo(BaseFixedWidthVector target) {
    */
   public void splitAndTransferTo(int startIndex, int length,
       BaseFixedWidthVector target) {
-    Preconditions.checkArgument(startIndex + length <= valueCount);
+    Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount,
+        "Invalid startIndex: %s", startIndex);
+    Preconditions.checkArgument(startIndex + length <= valueCount,
+        "Invalid length: %s",
length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 7e276445643..89395ef7e22 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -714,6 +714,10 @@ public void transferTo(BaseVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseVariableWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); @@ -750,7 +754,6 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariab */ private void splitAndTransferValidityBuffer(int startIndex, int length, BaseVariableWidthVector target) { - Preconditions.checkArgument(startIndex + length <= valueCount); int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 2fa5fdef4ef..b41c62677fc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -158,6 +158,10 @@ public int getBufferSize() { * @param target destination vector */ public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); target.validityBuffer = splitAndTransferBuffer(startIndex, length, target, @@ -174,7 +178,6 @@ private ArrowBuf splitAndTransferBuffer( BaseFixedWidthVector target, ArrowBuf sourceBuffer, ArrowBuf destBuffer) { - assert startIndex + length <= valueCount; int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java index 7f244bfa658..9448a1a1453 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java @@ -37,6 +37,7 @@ * maintained to track which elements in the vector are null. 
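The hunks above replace the bare `startIndex + length <= valueCount` checks (and, in `BitVector`, a plain `assert` that is compiled out unless assertions are enabled) with explicit range validation that names the offending argument. The observable contract, sketched for a vector with `valueCount == 100` (a fragment; `vector` and `target` assumed in scope):

```java
// Both calls now fail fast with a descriptive IllegalArgumentException
// instead of tripping an assert or silently reading past the end.
try {
  vector.splitAndTransferTo(100, 1, target);  // startIndex == valueCount
} catch (IllegalArgumentException e) {
  System.out.println(e.getMessage());         // "Invalid startIndex: 100"
}
try {
  vector.splitAndTransferTo(0, 200, target);  // startIndex + length > valueCount
} catch (IllegalArgumentException e) {
  System.out.println(e.getMessage());         // "Invalid length: 200"
}
```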
  */
 public final class DateDayVector extends BaseFixedWidthVector {
+  private static final byte TYPE_WIDTH = 4;
 
   private final FieldReader reader;
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java
index ab5ce94e7e3..bcf8c64e0ce 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java
@@ -109,8 +109,10 @@ public Boolean visit(BaseFixedWidthVector left, Range range) {
   }
 
   @Override
-  protected ApproxEqualsVisitor createInnerVisitor(ValueVector left, ValueVector right) {
-    return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone());
+  protected ApproxEqualsVisitor createInnerVisitor(
+      ValueVector left, ValueVector right,
+      BiFunction<ValueVector, ValueVector, Boolean> typeComparator) {
+    return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone(), typeComparator);
   }
 
   private boolean float4ApproxEquals(Range range) {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
index cbba0b82039..49fa69f5ac7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java
@@ -24,7 +24,6 @@
 import org.apache.arrow.util.Preconditions;
 import org.apache.arrow.vector.BaseFixedWidthVector;
 import org.apache.arrow.vector.BaseVariableWidthVector;
-import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.NullVector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.complex.BaseRepeatedValueVector;
@@ -193,12 +192,9 @@ public Boolean visit(NullVector left, Range range) {
     return true;
   }
 
-  /**
-   * Creates a visitor to visit child vectors.
-   * It is used for complex vector types.
-   * @return the visitor for child vecors.
-   */
-  protected RangeEqualsVisitor createInnerVisitor(ValueVector leftInner, ValueVector rightInner) {
+  protected RangeEqualsVisitor createInnerVisitor(
+      ValueVector leftInner, ValueVector rightInner,
+      BiFunction<ValueVector, ValueVector, Boolean> typeComparator) {
     return new RangeEqualsVisitor(leftInner, rightInner, typeComparator);
   }
 
@@ -206,16 +202,23 @@ protected boolean compareUnionVectors(Range range) {
     UnionVector leftVector = (UnionVector) left;
     UnionVector rightVector = (UnionVector) right;
 
-    List<FieldVector> leftChildren = leftVector.getChildrenFromFields();
-    List<FieldVector> rightChildren = rightVector.getChildrenFromFields();
-
-    if (leftChildren.size() != rightChildren.size()) {
-      return false;
-    }
-
-    for (int k = 0; k < leftChildren.size(); k++) {
-      RangeEqualsVisitor visitor = createInnerVisitor(leftChildren.get(k), rightChildren.get(k));
-      if (!visitor.rangeEquals(range)) {
+    Range subRange = new Range(0, 0, 1);
+    for (int i = 0; i < range.getLength(); i++) {
+      subRange.setLeftStart(range.getLeftStart() + i).setRightStart(range.getRightStart() + i);
+      ValueVector leftSubVector = leftVector.getVector(range.getLeftStart() + i);
+      ValueVector rightSubVector = rightVector.getVector(range.getRightStart() + i);
+
+      if (leftSubVector == null || rightSubVector == null) {
+        if (leftSubVector == rightSubVector) {
+          continue;
+        } else {
+          return false;
+        }
+      }
+      TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightSubVector);
+      RangeEqualsVisitor visitor =
+          createInnerVisitor(leftSubVector, rightSubVector, (left, right) -> typeVisitor.equals(left));
+      if (!visitor.rangeEquals(subRange)) {
         return false;
       }
     }
@@ -232,7 +235,8 @@ protected boolean compareStructVectors(Range range) {
     }
 
     for (String name : leftChildNames) {
-      RangeEqualsVisitor visitor = createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name));
+      RangeEqualsVisitor visitor =
+          createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name), /*type comparator*/ null);
       if (!visitor.rangeEquals(range)) {
         return false;
       }
@@ -311,7 +315,8 @@ protected boolean compareListVectors(Range range) {
     ListVector leftVector = (ListVector) left;
     ListVector rightVector = (ListVector) right;
 
-    RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector());
+    RangeEqualsVisitor innerVisitor =
+        createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null);
     Range innerRange = new Range();
 
     for (int i = 0; i < range.getLength(); i++) {
@@ -357,7 +362,8 @@ protected boolean compareFixedSizeListVectors(Range range) {
     }
 
     int listSize = leftVector.getListSize();
-    RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector());
+    RangeEqualsVisitor innerVisitor =
+        createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null);
     Range innerRange = new Range(0, 0, listSize);
 
     for (int i = 0; i < range.getLength(); i++) {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
index 653719feafe..8fa43fb06b7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
@@ -590,7 +590,10 @@ public void transfer() {
 
   @Override
   public void splitAndTransfer(int startIndex, int length) {
-    Preconditions.checkArgument(startIndex + length <= valueCount);
+
Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); final int startPoint = listSize * startIndex; final int sliceLength = listSize * length; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 59481d39399..312a3556082 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -493,7 +493,10 @@ public void transfer() { */ @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index ec922a032a9..7b22835963a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -195,7 +195,10 @@ public void copyValueSafe(int fromIndex, int toIndex) { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); super.splitAndTransfer(startIndex, length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java new file mode 100644 index 00000000000..8dac7676858 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
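With `createInnerVisitor` now threading a type comparator through (see the `RangeEqualsVisitor` changes above), callers can relax the default type check, for example to compare vectors whose fields differ only in name. A hedged fragment, with `left` and `right` assumed in scope:

```java
// Compare two vectors while ignoring field names and metadata: a
// TypeEqualsVisitor configured accordingly is supplied as the
// BiFunction type comparator.
TypeEqualsVisitor typeVisitor =
    new TypeEqualsVisitor(right, /*checkName*/ false, /*checkMetadata*/ false);
RangeEqualsVisitor visitor =
    new RangeEqualsVisitor(left, right, (v1, v2) -> typeVisitor.equals(v1));
boolean equal = visitor.rangeEquals(new Range(0, 0, left.getValueCount()));
```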
+ */
+
+package org.apache.arrow.vector.util;
+
+import java.util.HashSet;
+
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.BaseFixedWidthVector;
+import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BitVectorHelper;
+import org.apache.arrow.vector.NullVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.compare.TypeEqualsVisitor;
+import org.apache.arrow.vector.compare.VectorVisitor;
+import org.apache.arrow.vector.complex.DenseUnionVector;
+import org.apache.arrow.vector.complex.FixedSizeListVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.NonNullableStructVector;
+import org.apache.arrow.vector.complex.UnionVector;
+
+import io.netty.util.internal.PlatformDependent;
+
+/**
+ * Utility to append two vectors together.
+ */
+class VectorAppender implements VectorVisitor<ValueVector, Void> {
+
+  /**
+   * The targetVector to be appended.
+   */
+  private final ValueVector targetVector;
+
+  private final TypeEqualsVisitor typeVisitor;
+
+  /**
+   * Constructs a new targetVector appender, with the given targetVector.
+   * @param targetVector the targetVector to be appended.
+   */
+  VectorAppender(ValueVector targetVector) {
+    this.targetVector = targetVector;
+    typeVisitor = new TypeEqualsVisitor(targetVector, false, true);
+  }
+
+  @Override
+  public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) {
+    Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()),
+        "The targetVector to append must have the same type as the targetVector being appended");
+
+    int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount();
+
+    // make sure there is enough capacity
+    while (targetVector.getValueCapacity() < newValueCount) {
+      targetVector.reAlloc();
+    }
+
+    // append validity buffer
+    BitVectorHelper.concatBits(
+        targetVector.getValidityBuffer(), targetVector.getValueCount(),
+        deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer());
+
+    // append data buffer
+    PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(),
+        targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(),
+        deltaVector.getTypeWidth() * deltaVector.getValueCount());
+    targetVector.setValueCount(newValueCount);
+    return targetVector;
+  }
+
+  @Override
+  public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) {
+    Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()),
+        "The targetVector to append must have the same type as the targetVector being appended");
+
+    int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount();
+
+    int targetDataSize = targetVector.getOffsetBuffer().getInt(
+        targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH);
+    int deltaDataSize = deltaVector.getOffsetBuffer().getInt(
+        deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH);
+    int newValueCapacity = targetDataSize + deltaDataSize;
+
+    // make sure there is enough capacity
+    while (targetVector.getValueCapacity() < newValueCount) {
+      targetVector.reAlloc();
+    }
+    while (targetVector.getDataBuffer().capacity() < newValueCapacity) {
+      ((BaseVariableWidthVector) targetVector).reallocDataBuffer();
+    }
+
+    // append validity buffer
+    BitVectorHelper.concatBits(
+        targetVector.getValidityBuffer(), targetVector.getValueCount(),
deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); + + // copy offset buffer + PlatformDependent.copyMemory( + deltaVector.getOffsetBuffer().memoryAddress() + BaseVariableWidthVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + BaseVariableWidthVector.OFFSET_WIDTH, + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt( + (targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH, oldOffset + targetDataSize); + } + ((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(ListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetVector.getOffsetBuffer().getInt( + targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaListSize = deltaVector.getOffsetBuffer().getInt( + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + ListVector targetListVector = (ListVector) targetVector; + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append offset buffer + PlatformDependent.copyMemory(deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + ListVector.OFFSET_WIDTH, + deltaVector.getValueCount() * ListVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = + targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH, + oldOffset + targetListSize); + } + targetListVector.setLastSet(newValueCount - 1); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(FixedSizeListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the 
targetVector being appended"); + + FixedSizeListVector targetListVector = (FixedSizeListVector) targetVector; + + Preconditions.checkArgument(targetListVector.getListSize() == deltaVector.getListSize(), + "FixedSizeListVector must have the same list size to append"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetListVector.getValueCount() * targetListVector.getListSize(); + int deltaListSize = deltaVector.getValueCount() * deltaVector.getListSize(); + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(NonNullableStructVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the targetVector being appended"); + + NonNullableStructVector targetStructVector = (NonNullableStructVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append child vectors + for (int i = 0; i < targetStructVector.getChildrenFromFields().size(); i++) { + ValueVector targetChild = targetStructVector.getVectorById(i); + ValueVector deltaChild = deltaVector.getVectorById(i); + + targetChild.setValueCount(targetStructVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(UnionVector deltaVector, Void value) { + // we only make sure that both vectors are union vectors. 
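The variable-width and list branches of `VectorAppender` above copy the delta's offset buffer (skipping its leading zero) and then rebase every copied entry by the target's current data size. The same arithmetic on plain arrays (a worked example, no Arrow API involved):

```java
// target offsets [0, 2, 5]: 2 values, 5 units of child data
// delta  offsets [0, 3, 4]: 2 values, 4 units of child data
int[] target = {0, 2, 5};
int[] delta = {0, 3, 4};
int targetDataSize = target[target.length - 1];            // 5
int[] merged = new int[target.length + delta.length - 1];  // delta's leading 0 is dropped
System.arraycopy(target, 0, merged, 0, target.length);
for (int i = 1; i < delta.length; i++) {
  merged[target.length + i - 1] = delta[i] + targetDataSize;
}
// merged == [0, 2, 5, 8, 9]
```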
+ Preconditions.checkArgument(targetVector.getMinorType() == deltaVector.getMinorType(), + "The vector to append must have the same type as the targetVector being appended"); + + UnionVector targetUnionVector = (UnionVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetUnionVector.getValueCapacity() < newValueCount) { + targetUnionVector.reAlloc(); + } + + // append type buffers + PlatformDependent.copyMemory(deltaVector.getValidityBufferAddress(), + targetUnionVector.getValidityBufferAddress() + targetVector.getValueCount(), + deltaVector.getValueCount()); + + // build the hash set for all types + HashSet targetTypes = new HashSet<>(); + for (int i = 0; i < targetUnionVector.getValueCount(); i++) { + targetTypes.add((int) targetUnionVector.getValidityBuffer().getByte(i)); + } + HashSet deltaTypes = new HashSet<>(); + for (int i = 0; i < deltaVector.getValueCount(); i++) { + deltaTypes.add((int) deltaVector.getValidityBuffer().getByte(i)); + } + + // append child vectors + for (int i = 0; i < Byte.MAX_VALUE; i++) { + if (targetTypes.contains(i) || deltaTypes.contains(i)) { + ValueVector targetChild = targetUnionVector.getVectorByType(i); + if (!targetTypes.contains(i)) { + // if the vector type does not exist in the target, it must be newly created + // and we must make sure it has enough capacity. + while (targetChild.getValueCapacity() < newValueCount) { + targetChild.reAlloc(); + } + } + + if (deltaTypes.contains(i)) { + // append child vectors + ValueVector deltaChild = deltaVector.getVectorByType(i); + + targetChild.setValueCount(targetUnionVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + targetChild.setValueCount(newValueCount); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(DenseUnionVector left, Void value) { + throw new UnsupportedOperationException(); + } + + @Override + public ValueVector visit(NullVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + return targetVector; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java new file mode 100644 index 00000000000..570783d1070 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
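Note that `VectorAppender` is package-private; the public entry point is the `VectorBatchAppender` facade that follows. Underneath, each append is a single visitor dispatch (a fragment; `target` and `delta` assumed in scope):

```java
// The delta vector double-dispatches into the matching visit() overload,
// which grows the target in place and returns it.
VectorAppender appender = new VectorAppender(target);
ValueVector result = delta.accept(appender, null);
// result == target, now holding target's values followed by delta's
```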
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.util;
+
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Utility to add vector values in batch.
+ */
+public class VectorBatchAppender {
+
+  /**
+   * Add value vectors in batch.
+   * @param targetVector the target vector.
+   * @param vectorsToAppend the vectors to append.
+   * @param <V> the vector type.
+   */
+  public static <V extends ValueVector> void batchAppend(V targetVector, V... vectorsToAppend) {
+    VectorAppender appender = new VectorAppender(targetVector);
+    for (V delta : vectorsToAppend) {
+      delta.accept(appender, null);
+    }
+  }
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
index 6405e256b22..d5e4a00d599 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
@@ -18,16 +18,17 @@
 package org.apache.arrow.vector;
 
 import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
-import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.util.TransferPair;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
+import org.junit.jupiter.api.Assertions;
 
 public class TestSplitAndTransfer {
   private BufferAllocator allocator;
@@ -41,6 +42,17 @@ public void init() {
   public void terminate() throws Exception {
     allocator.close();
   }
+
+  private void populateVarcharVector(final VarCharVector vector, int valueCount, String[] compareArray) {
+    for (int i = 0; i < valueCount; i += 3) {
+      final String s = String.format("%010d", i);
+      vector.set(i, s.getBytes());
+      if (compareArray != null) {
+        compareArray[i] = s;
+      }
+    }
+    vector.setValueCount(valueCount);
+  }
 
   @Test /* VarCharVector */
   public void test() throws Exception {
@@ -50,12 +62,7 @@ public void test() throws Exception {
 
       final int valueCount = 500;
       final String[] compareArray = new String[valueCount];
-      for (int i = 0; i < valueCount; i += 3) {
-        final String s = String.format("%010d", i);
-        varCharVector.set(i, s.getBytes());
-        compareArray[i] = s;
-      }
-      varCharVector.setValueCount(valueCount);
+      populateVarcharVector(varCharVector, valueCount, compareArray);
 
       final TransferPair tp = varCharVector.getTransferPair(allocator);
       final VarCharVector newVarCharVector = (VarCharVector) tp.getTo();
@@ -89,11 +96,7 @@ public void testMemoryConstrainedTransfer() {
 
       final int valueCount = 1000;
-      for (int i = 0; i < valueCount; i += 3) {
-        final String s = String.format("%010d", i);
-        varCharVector.set(i, s.getBytes());
-      }
-      varCharVector.setValueCount(valueCount);
+      populateVarcharVector(varCharVector, valueCount, null);
 
       final TransferPair tp = varCharVector.getTransferPair(allocator);
       final VarCharVector newVarCharVector = (VarCharVector) tp.getTo();
@@ -107,4 +110,143 @@
       }
     }
   }
+
+  @Test
+  public void testTransfer() {
+    try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) {
+      varCharVector.allocateNew(10000, 1000);
+
+      final int valueCount = 500;
+      final String[] compareArray = new String[valueCount];
+      populateVarcharVector(varCharVector, valueCount, compareArray);
+
+      final
TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + tp.transfer(); + + assertEquals(0, varCharVector.valueCount); + assertEquals(valueCount, newVarCharVector.valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[i].getBytes(); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + + newVarCharVector.clear(); + } + } + + @Test + public void testCopyValueSafe() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. + for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(varCharVector.isNull(i)); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(varCharVector.get(i), newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, 0); + assertEquals(0, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testSplitAndTransferAll() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidStartIndex() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid startIndex: 500", e.getMessage()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + 
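The `testTransfer` and `testCopyValueSafe` cases above pin down the difference between the two paths: `transfer()` moves buffer ownership, leaving the source empty, while `copyValueSafe()` copies values one at a time and lets the destination grow on demand (presumably the reason for the "safe" suffix). In fragment form, under those assumed semantics, with `source`, `dest`, and `allocator` in scope and `source` freshly populated in each fragment:

```java
// Path 1: transfer() moves the buffers; the source ends up empty.
TransferPair tp = source.getTransferPair(allocator);
tp.transfer();
ValueVector moved = tp.getTo();  // source.getValueCount() is now 0

// Path 2: copyValueSafe() duplicates values; the source is untouched
// and the unallocated destination reallocates as needed.
TransferPair copier = source.makeTransferPair(dest);
for (int i = 0; i < source.getValueCount(); i++) {
  copier.copyValueSafe(i, i);
}
dest.setValueCount(source.getValueCount());
```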
varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid length: 1000", e.getMessage()); + + newVarCharVector.clear(); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 183c8abac18..c35fd4107a9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -282,6 +282,57 @@ public void testUnionVectorRangeEquals() { } } + /** + * Test comparing two union vectors. + * The two vectors are different in total, but have a range with equal values. + */ + @Test + public void testUnionVectorSubRangeEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, null, null); + final UnionVector vector2 = new UnionVector("union", allocator, null, null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + + vector1.setType(3, Types.MinorType.INT); + vector1.setSafe(3, intHolder); + + vector1.setValueCount(4); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + + vector2.setType(3, Types.MinorType.UINT4); + vector2.setSafe(3, uInt4Holder); + + vector2.setValueCount(4); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 4))); + assertTrue(visitor.rangeEquals(new Range(1, 1, 2))); + } + } + @Ignore @Test public void testEqualsWithOutTypeCheck() { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index ab3f81a6446..b50a3441322 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -17,10 +17,14 @@ package org.apache.arrow.vector.testing; +import static org.junit.Assert.assertEquals; + import java.nio.charset.StandardCharsets; +import java.util.List; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; @@ -51,7 +55,12 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import 
org.apache.arrow.vector.complex.FixedSizeListVector;
+import org.apache.arrow.vector.complex.ListVector;
 import org.apache.arrow.vector.holders.IntervalDayHolder;
+import org.apache.arrow.vector.types.Types;
+import org.apache.arrow.vector.types.pojo.FieldType;
 
 /**
  * Utility for populating {@link org.apache.arrow.vector.ValueVector}.
@@ -533,10 +542,72 @@ public static void setVector(VarCharVector vector, String... values) {
     vector.allocateNewSafe();
     for (int i = 0; i < length; i++) {
       if (values[i] != null) {
-        vector.set(i, values[i].getBytes(StandardCharsets.UTF_8));
+        vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8));
       }
     }
     vector.setValueCount(length);
   }
 
+  /**
+   * Populate values for {@link ListVector}.
+   */
+  public static void setVector(ListVector vector, List<Integer>... values) {
+    Types.MinorType type = Types.MinorType.INT;
+    vector.addOrGetVector(FieldType.nullable(type.getType()));
+
+    IntVector dataVector = (IntVector) vector.getDataVector();
+    dataVector.allocateNew();
+
+    // set underlying vectors
+    int curPos = 0;
+    vector.getOffsetBuffer().setInt(0, curPos);
+    for (int i = 0; i < values.length; i++) {
+      if (values[i] == null) {
+        BitVectorHelper.unsetBit(vector.getValidityBuffer(), i);
+      } else {
+        BitVectorHelper.setBit(vector.getValidityBuffer(), i);
+        for (int value : values[i]) {
+          dataVector.set(curPos, value);
+          curPos += 1;
+        }
+      }
+      vector.getOffsetBuffer().setInt((i + 1) * BaseRepeatedValueVector.OFFSET_WIDTH, curPos);
+    }
+    dataVector.setValueCount(curPos);
+    vector.setLastSet(values.length - 1);
+    vector.setValueCount(values.length);
+  }
+
+  /**
+   * Populate values for {@link FixedSizeListVector}.
+   */
+  public static void setVector(FixedSizeListVector vector, List<Integer>... values) {
+    for (int i = 0; i < values.length; i++) {
+      if (values[i] != null) {
+        assertEquals(vector.getListSize(), values[i].size());
+      }
+    }
+
+    Types.MinorType type = Types.MinorType.INT;
+    vector.addOrGetVector(FieldType.nullable(type.getType()));
+
+    IntVector dataVector = (IntVector) vector.getDataVector();
+    dataVector.allocateNew();
+
+    // set underlying vectors
+    int curPos = 0;
+    for (int i = 0; i < values.length; i++) {
+      if (values[i] == null) {
+        BitVectorHelper.unsetBit(vector.getValidityBuffer(), i);
+      } else {
+        BitVectorHelper.setBit(vector.getValidityBuffer(), i);
+        for (int value : values[i]) {
+          dataVector.set(curPos, value);
+          curPos += 1;
+        }
+      }
+    }
+    dataVector.setValueCount(curPos);
+    vector.setValueCount(values.length);
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java
new file mode 100644
index 00000000000..83fc24fe90a
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java
@@ -0,0 +1,346 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
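The `ListVector` populator above writes the validity, offset, and child data buffers directly; a null entry keeps the running offset, so the offset buffer always carries `valueCount + 1` monotonically non-decreasing entries. What that produces for `[[0, 1], [2, 3], null]` (an illustrative fragment, assuming the imports used by the surrounding tests):

```java
// Buffers built by setVector(vector, asList(0, 1), asList(2, 3), null):
//   validity: 1 1 0
//   offsets:  0 2 4 4   (the null list repeats the previous offset)
//   data:     0 1 2 3
try (BufferAllocator allocator = new RootAllocator(1 << 20);
     ListVector vector = ListVector.empty("lists", allocator)) {
  vector.allocateNew();
  ValueVectorDataPopulator.setVector(vector,
      Arrays.asList(0, 1), Arrays.asList(2, 3), null);
  System.out.println(vector.getObject(0)); // [0, 1]
  System.out.println(vector.isNull(2));    // true
}
```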
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorAppender}. + */ +public class TestVectorAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testAppendFixedWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (IntVector target = new IntVector("", allocator); + IntVector delta = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta.allocateNew(length2); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + ValueVectorDataPopulator.setVector(delta, null, 11, 12, 13, 14); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + + try (IntVector expected = new IntVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, 0, 1, 2, 3, 4, 5, 6, null, 8, 9, null, 11, 12, 13, 14); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendVariableWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector(target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + ValueVectorDataPopulator.setVector(delta, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, + "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9", "a10", "a11", "a12", "a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendListVector() { + final int length1 = 5; + final int length2 = 2; + try (ListVector target = ListVector.empty("target", allocator); + ListVector delta = 
ListVector.empty("delta", allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1), + Arrays.asList(2, 3), + null, + Arrays.asList(6, 7), + Arrays.asList(8, 9)); + assertEquals(length1, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(length2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(7, target.getValueCount()); + + List expected = Arrays.asList(0, 1); + assertEquals(expected, target.getObject(0)); + + expected = Arrays.asList(2, 3); + assertEquals(expected, target.getObject(1)); + + assertTrue(target.isNull(2)); + + expected = Arrays.asList(6, 7); + assertEquals(expected, target.getObject(3)); + + expected = Arrays.asList(8, 9); + assertEquals(expected, target.getObject(4)); + + expected = Arrays.asList(10, 11, 12, 13, 14); + assertEquals(expected, target.getObject(5)); + + expected = Arrays.asList(15, 16, 17, 18, 19); + assertEquals(expected, target.getObject(6)); + } + } + + @Test + public void testAppendFixedSizeListVector() { + try (FixedSizeListVector target = FixedSizeListVector.empty("target", 5, allocator); + FixedSizeListVector delta = FixedSizeListVector.empty("delta", 5, allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(4, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + assertEquals(Arrays.asList(10, 11, 12, 13, 14), target.getObject(2)); + assertEquals(Arrays.asList(15, 16, 17, 18, 19), target.getObject(3)); + } + } + + @Test + public void testAppendStructVector() { + final int length1 = 10; + final int length2 = 5; + try (final StructVector target = StructVector.empty("target", allocator); + final StructVector delta = StructVector.empty("delta", allocator)) { + + IntVector targetChild1 = target.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector targetChild2 = target.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + targetChild1.allocateNew(); + targetChild2.allocateNew(); + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(targetChild2, "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + target.setValueCount(length1); + + IntVector deltaChild1 = delta.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector deltaChild2 = delta.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + deltaChild1.allocateNew(); + deltaChild2.allocateNew(); + ValueVectorDataPopulator.setVector(deltaChild1, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(deltaChild2, "a10", "a11", "a12", "a13", "a14"); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + IntVector child1 = (IntVector) 
target.getVectorById(0); + VarCharVector child2 = (VarCharVector) target.getVectorById(1); + + try (IntVector expected1 = new IntVector("expected1", allocator); + VarCharVector expected2 = new VarCharVector("expected2", allocator)) { + expected1.allocateNew(); + expected2.allocateNew(); + + ValueVectorDataPopulator.setVector(expected1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(expected2, + "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9", "a10", "a11", "a12", "a13", "a14"); + + assertVectorsEqual(expected1, target.getChild("f0")); + assertVectorsEqual(expected2, target.getChild("f1")); + } + } + } + + @Test + public void testAppendUnionVector() { + final int length1 = 10; + final int length2 = 5; + + try (final UnionVector target = UnionVector.empty("target", allocator); + final UnionVector delta = UnionVector.empty("delta", allocator)) { + + // alternating ints and big ints + target.setType(0, Types.MinorType.INT); + target.setType(1, Types.MinorType.BIGINT); + target.setType(2, Types.MinorType.INT); + target.setType(3, Types.MinorType.BIGINT); + target.setType(4, Types.MinorType.INT); + target.setType(5, Types.MinorType.BIGINT); + target.setType(6, Types.MinorType.INT); + target.setType(7, Types.MinorType.BIGINT); + target.setType(8, Types.MinorType.INT); + target.setType(9, Types.MinorType.BIGINT); + target.setType(10, Types.MinorType.INT); + target.setType(11, Types.MinorType.BIGINT); + target.setType(12, Types.MinorType.INT); + target.setType(13, Types.MinorType.BIGINT); + target.setType(14, Types.MinorType.INT); + target.setType(15, Types.MinorType.BIGINT); + target.setType(16, Types.MinorType.INT); + target.setType(17, Types.MinorType.BIGINT); + target.setType(18, Types.MinorType.INT); + target.setType(19, Types.MinorType.BIGINT); + + IntVector targetIntVec = target.getIntVector(); + targetIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetIntVec, + 0, null, 1, null, 2, null, 3, null, 4, null, 5, null, 6, null, 7, null, 8, null, 9, null); + assertEquals(length1 * 2, targetIntVec.getValueCount()); + + BigIntVector targetBigIntVec = target.getBigIntVector(); + targetBigIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetBigIntVec, + null, 0L, null, 1L, null, 2L, null, 3L, null, 4L, null, 5L, null, 6L, null, 7L, null, 8L, null, 9L); + assertEquals(length1 * 2, targetBigIntVec.getValueCount()); + + target.setValueCount(length1 * 2); + + // populate the delta vector + delta.setType(0, Types.MinorType.FLOAT4); + delta.setType(1, Types.MinorType.FLOAT4); + delta.setType(2, Types.MinorType.FLOAT4); + delta.setType(3, Types.MinorType.FLOAT4); + delta.setType(4, Types.MinorType.FLOAT4); + + Float4Vector deltaFloatVector = delta.getFloat4Vector(); + deltaFloatVector.allocateNew(); + ValueVectorDataPopulator.setVector(deltaFloatVector, 10f, 11f, 12f, 13f, 14f); + assertEquals(length2, deltaFloatVector.getValueCount()); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 * 2 + length2, target.getValueCount()); + + for (int i = 0; i < length1; i++) { + Object intObj = target.getObject(i * 2); + assertTrue(intObj instanceof Integer); + assertEquals(i, ((Integer) intObj).intValue()); + + Object longObj = target.getObject(i * 2 + 1); + assertTrue(longObj instanceof Long); + assertEquals(i, ((Long) longObj).longValue()); + } + + for (int i = 0; i < length2; i++) { + Object floatObj = target.getObject(length1 * 2 + 
i); + assertTrue(floatObj instanceof Float); + assertEquals(i + length1, ((Float) floatObj).intValue()); + } + } + } + + @Test + public void testAppendVectorNegative() { + final int vectorLength = 10; + try (IntVector target = new IntVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(vectorLength); + delta.allocateNew(vectorLength); + + VectorAppender appender = new VectorAppender(target); + + assertThrows(IllegalArgumentException.class, + () -> delta.accept(appender, null)); + } + } + + public static void assertVectorsEqual(ValueVector vector1, ValueVector vector2) { + assertEquals(vector1.getValueCount(), vector2.getValueCount()); + + TypeEqualsVisitor typeEqualsVisitor = new TypeEqualsVisitor(vector1, false, false); + RangeEqualsVisitor equalsVisitor = + new RangeEqualsVisitor(vector1, vector2, (v1, v2) -> typeEqualsVisitor.equals(vector2)); + assertTrue(equalsVisitor.rangeEquals(new Range(0, 0, vector1.getValueCount()))); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java new file mode 100644 index 00000000000..799c25c0ad7 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorBatchAppender}. 
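`TestVectorBatchAppender` below exercises the public facade; the shape of a typical call, condensed from the test that follows (a sketch; vector names invented):

```java
try (BufferAllocator allocator = new RootAllocator(1 << 20);
     IntVector target = new IntVector("target", allocator);
     IntVector delta1 = new IntVector("delta1", allocator);
     IntVector delta2 = new IntVector("delta2", allocator)) {
  ValueVectorDataPopulator.setVector(target, 0, 1, 2);
  ValueVectorDataPopulator.setVector(delta1, 3, 4);
  ValueVectorDataPopulator.setVector(delta2, 5, null);

  // Deltas are appended in argument order.
  VectorBatchAppender.batchAppend(target, delta1, delta2);
  // target now reads: 0, 1, 2, 3, 4, 5, null
}
```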
+ */ +public class TestVectorBatchAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testBatchAppendIntVector() { + final int length1 = 10; + final int length2 = 5; + final int length3 = 7; + try (IntVector target = new IntVector("", allocator); + IntVector delta1 = new IntVector("", allocator); + IntVector delta2 = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta1.allocateNew(length2); + delta2.allocateNew(length3); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(delta1, 10, 11, 12, 13, 14); + ValueVectorDataPopulator.setVector(delta2, 15, 16, 17, 18, 19, 20, 21); + + VectorBatchAppender.batchAppend(target, delta1, delta2); + + assertEquals(length1 + length2 + length3, target.getValueCount()); + for (int i = 0; i < target.getValueCount(); i++) { + assertEquals(i, target.get(i)); + } + } + } +} diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 256c3cd2179..cc2b25da72b 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1304,14 +1304,15 @@ cdef class ParquetWriter: else: props.disallow_truncated_timestamps() - cdef void _set_version(self, WriterProperties.Builder* props): + cdef int _set_version(self, WriterProperties.Builder* props) except -1: if self.version is not None: if self.version == "1.0": props.version(ParquetVersion_V1) elif self.version == "2.0": props.version(ParquetVersion_V2) else: - raise ArrowException("Unsupported Parquet format version") + raise ValueError("Unsupported Parquet format version: {0}" + .format(self.version)) cdef void _set_compression_props(self, WriterProperties.Builder* props): if isinstance(self.compression, basestring): diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index ff31b64105c..fe0470f7e5d 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -60,7 +60,7 @@ def resource_root(): """Get the path to the test resources directory.""" if not os.environ.get("ARROW_TEST_DATA"): raise RuntimeError("Test resources not found; set " - "ARROW_TEST_DATA to /testing") + "ARROW_TEST_DATA to /testing/data") return pathlib.Path(os.environ["ARROW_TEST_DATA"]) / "flight" diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f106da3267c..e18075e4aab 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -177,6 +177,12 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): tm.assert_frame_equal(df, df_read) +def test_parquet_invalid_version(tempdir): + table = pa.table({'a': [1, 2, 3]}) + with pytest.raises(ValueError, match="Unsupported Parquet format version"): + _write_table(table, tempdir / 'test_version.parquet', version="2.2") + + def test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) diff --git a/python/requirements-wheel.txt b/python/requirements-wheel.txt index 99a2666dd42..18e79636e39 100644 --- a/python/requirements-wheel.txt +++ b/python/requirements-wheel.txt @@ -6,7 +6,6 @@ pandas setuptools_scm==3.2.0 six>=1.0.0 # TODO: TensorFlow doesn't support Python 3.8 yet. 
+# See https://github.com/tensorflow/tensorflow/issues/33374
 tensorflow; python_version >= "3.0" and python_version < "3.8"
-# pin wheel, because auditwheel is not compatible with wheel=0.32
-# TODO(kszucs): remove after auditwheel properly supports wheel>0.31
-wheel==0.31.1
+wheel
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 29829610889..57fc261985c 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -50,10 +50,12 @@ packed_simd = { version = "0.3.1", optional = true }
 chrono = "0.4"
 flatbuffers = "0.6.0"
 hex = "0.4"
+arrow-flight = { path = "../arrow-flight", optional = true }

 [features]
 simd = ["packed_simd"]
-default = ["simd"]
+flight = ["arrow-flight"]
+default = ["simd", "flight"]

 [dev-dependencies]
 criterion = "0.2"
diff --git a/rust/arrow/src/flight/mod.rs b/rust/arrow/src/flight/mod.rs
new file mode 100644
index 00000000000..6d09ca48c2a
--- /dev/null
+++ b/rust/arrow/src/flight/mod.rs
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities to assist with reading and writing Arrow data as Flight messages
+
+use std::convert::TryFrom;
+use std::sync::Arc;
+
+use flight::FlightData;
+
+use crate::datatypes::Schema;
+use crate::error::{ArrowError, Result};
+use crate::ipc::{convert, reader, writer};
+use crate::record_batch::RecordBatch;
+
+/// Convert a `RecordBatch` to `FlightData` by getting the header and body as bytes
+impl From<&RecordBatch> for FlightData {
+    fn from(batch: &RecordBatch) -> Self {
+        let (header, body) = writer::record_batch_to_bytes(batch);
+        Self {
+            flight_descriptor: None,
+            app_metadata: vec![],
+            data_header: header,
+            data_body: body,
+        }
+    }
+}
+
+/// Convert a `Schema` to `FlightData` by converting to an IPC message
+impl From<&Schema> for FlightData {
+    fn from(schema: &Schema) -> Self {
+        let schema = writer::schema_to_bytes(schema);
+        Self {
+            flight_descriptor: None,
+            app_metadata: vec![],
+            data_header: schema,
+            data_body: vec![],
+        }
+    }
+}
+
+/// Try convert `FlightData` into an Arrow Schema
+///
+/// Returns an error if the `FlightData` header is not a valid IPC schema
+impl TryFrom<&FlightData> for Schema {
+    type Error = ArrowError;
+    fn try_from(data: &FlightData) -> Result<Self> {
+        convert::schema_from_bytes(&data.data_header[..]).ok_or(ArrowError::ParseError(
+            "Unable to convert flight data to Arrow schema".to_string(),
+        ))
+    }
+}
+
+/// Convert a FlightData message to a RecordBatch
+pub fn flight_data_to_batch(
+    data: &FlightData,
+    schema: Arc<Schema>,
+) -> Result<Option<RecordBatch>> {
+    // check that the data_header is a record batch message
+    let message = crate::ipc::get_root_as_message(&data.data_header[..]);
+    let batch_header = message
+        .header_as_record_batch()
+        .ok_or(ArrowError::ParseError(
+            "Unable to convert flight data header to a record batch".to_string(),
+        ))?;
+    reader::read_record_batch(&data.data_body, batch_header, schema)
+}
+
+// TODO: add more explicit conversion that exposes flight descriptor and metadata options
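The conversions in `flight/mod.rs` compose into a natural round trip. Here is a minimal sketch of that flow (illustrative only, not part of the patch: the `round_trip` helper, its arguments, and the final assert are assumptions); it presumes the new `flight` feature is enabled so that `arrow::flight` is available:

```rust
use std::convert::TryFrom;
use std::sync::Arc;

use arrow::datatypes::Schema;
use arrow::error::Result;
use arrow::flight::flight_data_to_batch;
use arrow::record_batch::RecordBatch;
use flight::FlightData;

// Hypothetical helper: encode a schema and one batch as Flight messages,
// then decode them again the way a Flight client would.
fn round_trip(batch: &RecordBatch, schema: Arc<Schema>) -> Result<()> {
    // the schema message has to come first so the receiver can decode batches
    let schema_msg = FlightData::from(schema.as_ref());
    let batch_msg = FlightData::from(batch);

    let decoded_schema = Arc::new(Schema::try_from(&schema_msg)?);
    let decoded = flight_data_to_batch(&batch_msg, decoded_schema)?
        .expect("data_header was not a record batch message");

    assert_eq!(batch.num_rows(), decoded.num_rows());
    Ok(())
}
```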
diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs
index a38975fbf1c..f53914e31d2 100644
--- a/rust/arrow/src/ipc/convert.rs
+++ b/rust/arrow/src/ipc/convert.rs
@@ -152,6 +152,12 @@ pub(crate) fn fb_to_schema(fb: ipc::Schema) -> Schema {
     Schema::new_with_metadata(fields, metadata)
 }

+/// Deserialize an IPC message into a schema
+pub(crate) fn schema_from_bytes(bytes: &[u8]) -> Option<Schema> {
+    let ipc = ipc::get_root_as_message(bytes);
+    ipc.header_as_schema().map(|schema| fb_to_schema(schema))
+}
+
 /// Get the Arrow data type from the flatbuffer Field table
 pub(crate) fn get_data_type(field: ipc::Field) -> DataType {
     match field.type_type() {
diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs
index 99173641d32..25ba0cd3df1 100644
--- a/rust/arrow/src/ipc/reader.rs
+++ b/rust/arrow/src/ipc/reader.rs
@@ -348,7 +348,7 @@ fn create_list_array(
 }

 /// Creates a record batch from binary data using the `ipc::RecordBatch` indexes and the `Schema`
-fn read_record_batch(
+pub(crate) fn read_record_batch(
     buf: &Vec<u8>,
     batch: ipc::RecordBatch,
     schema: Arc<Schema>,
diff --git a/rust/arrow/src/ipc/writer.rs b/rust/arrow/src/ipc/writer.rs
index 36c89c7a666..c872c8286d7 100644
--- a/rust/arrow/src/ipc/writer.rs
+++ b/rust/arrow/src/ipc/writer.rs
@@ -209,8 +209,7 @@ impl<W: Write> Drop for StreamWriter<W> {
     }
 }

-/// Convert the schema to its IPC representation, and write it to the `writer`
-fn write_schema<W: Write>(writer: &mut BufWriter<W>, schema: &Schema) -> Result<usize> {
+pub(crate) fn schema_to_bytes(schema: &Schema) -> Vec<u8> {
     let mut fbb = FlatBufferBuilder::new();
     let schema = {
         let fb = ipc::convert::schema_to_fb_offset(&mut fbb, schema);
@@ -227,9 +226,13 @@ fn write_schema<W: Write>(writer: &mut BufWriter<W>, schema: &Schema) -> Result<
     fbb.finish(data, None);
     let data = fbb.finished_data();

-    let written = write_padded_data(writer, data, WriteDataType::Header);
+    data.to_vec()
+}

-    written
+/// Convert the schema to its IPC representation, and write it to the `writer`
+fn write_schema<W: Write>(writer: &mut BufWriter<W>, schema: &Schema) -> Result<usize> {
+    let data = schema_to_bytes(schema);
+    write_padded_data(writer, &data[..], WriteDataType::Header)
 }

 /// The message type being written. This determines whether to write the data length or not.
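Both new helpers are `pub(crate)`, so any round-trip check has to live inside the arrow crate itself. A unit-test sketch under that assumption (the module name and the one-column schema are illustrative, not part of the patch):

```rust
#[cfg(test)]
mod schema_bytes_tests {
    use crate::datatypes::{DataType, Field, Schema};
    use crate::ipc::{convert, writer};

    #[test]
    fn schema_survives_byte_round_trip() {
        // a one-column schema is enough to exercise both helpers
        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
        let bytes = writer::schema_to_bytes(&schema);
        let decoded = convert::schema_from_bytes(&bytes)
            .expect("bytes did not contain a schema message");
        assert_eq!(schema, decoded);
    }
}
```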
@@ -266,13 +269,8 @@ fn write_padded_data(
     Ok(total_len as usize)
 }

-/// Write a record batch to the writer, writing the message size before the message
-/// if the record batch is being written to a stream
-fn write_record_batch<W: Write>(
-    writer: &mut BufWriter<W>,
-    batch: &RecordBatch,
-    is_stream: bool,
-) -> Result<(usize, usize)> {
+/// Convert a `RecordBatch` into a tuple of byte vectors, one for the header (ipc::Message) and the other for the batch's data
+pub(crate) fn record_batch_to_bytes(batch: &RecordBatch) -> (Vec<u8>, Vec<u8>) {
     let mut fbb = FlatBufferBuilder::new();

     let mut nodes: Vec<ipc::FieldNode> = vec![];
@@ -313,13 +311,24 @@ fn write_record_batch<W: Write>(
     let root = message.finish();
     fbb.finish(root, None);
     let finished_data = fbb.finished_data();
+
+    (finished_data.to_vec(), arrow_data)
+}
+
+/// Write a record batch to the writer, writing the message size before the message
+/// if the record batch is being written to a stream
+fn write_record_batch<W: Write>(
+    writer: &mut BufWriter<W>,
+    batch: &RecordBatch,
+    is_stream: bool,
+) -> Result<(usize, usize)> {
+    let (meta_data, arrow_data) = record_batch_to_bytes(batch);
     // write the length of data if writing to stream
     if is_stream {
-        let total_len: u32 = finished_data.len() as u32;
+        let total_len: u32 = meta_data.len() as u32;
         writer.write(&total_len.to_le_bytes()[..])?;
     }
-    let meta_written =
-        write_padded_data(writer, fbb.finished_data(), WriteDataType::Body)?;
+    let meta_written = write_padded_data(writer, &meta_data[..], WriteDataType::Body)?;
     let arrow_data_written =
         write_padded_data(writer, &arrow_data[..], WriteDataType::Body)?;

     Ok((meta_written, arrow_data_written))
diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs
index 899bb62f08a..4383922be09 100644
--- a/rust/arrow/src/lib.rs
+++ b/rust/arrow/src/lib.rs
@@ -33,6 +33,8 @@ pub mod compute;
 pub mod csv;
 pub mod datatypes;
 pub mod error;
+#[cfg(feature = "flight")]
+pub mod flight;
 pub mod ipc;
 pub mod json;
 pub mod memory;
diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml
index 14fc4e54548..f489d2b0d6d 100644
--- a/rust/datafusion/Cargo.toml
+++ b/rust/datafusion/Cargo.toml
@@ -56,6 +56,12 @@ crossbeam = "0.7.1"
 [dev-dependencies]
 criterion = "0.2.0"
 tempdir = "0.3.7"
+futures = "0.3"
+prost = "0.6"
+tokio = { version = "0.2", features = ["macros"] }
+tonic = "0.1"
+flatbuffers = "0.6.0"
+arrow-flight = { path = "../arrow-flight", version = "0.16.0-SNAPSHOT" }

 [[bench]]
 name = "aggregate_query_sql"
diff --git a/rust/datafusion/examples/README.md b/rust/datafusion/examples/README.md
new file mode 100644
index 00000000000..163ef3d952b
--- /dev/null
+++ b/rust/datafusion/examples/README.md
@@ -0,0 +1,28 @@
+
+
+# DataFusion Examples
+
+## Single Process
+
+The examples `csv_sql.rs` and `parquet_sql.rs` demonstrate building a query plan from a SQL statement and then executing the query plan against local CSV and Parquet files, respectively.
+
+## Distributed
+
+The `flight-client.rs` and `flight-server.rs` examples demonstrate how to run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol.
\ No newline at end of file
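For orientation, the single-process flow that the README describes, and that the Flight server below walks through step by step, looks roughly like this. The sketch is illustrative only: the table name, file path, and the 1024 * 1024 batch size are assumptions borrowed from the server example, not a fixed API contract.

```rust
use datafusion::error::ExecutionError;
use datafusion::execution::context::ExecutionContext;

// Sketch only: run one SQL statement against a registered Parquet file.
fn run_query(sql: &str) -> Result<(), ExecutionError> {
    let mut ctx = ExecutionContext::new();
    ctx.register_parquet("alltypes_plain", "data/alltypes_plain.parquet")?;

    let plan = ctx.create_logical_plan(sql)?;
    let plan = ctx.optimize(&plan)?;
    let plan = ctx.create_physical_plan(&plan, 1024 * 1024)?;

    // collect() executes the plan and returns the result batches
    let results = ctx.collect(plan.as_ref())?;
    println!("query produced {} record batches", results.len());
    Ok(())
}
```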
diff --git a/rust/datafusion/examples/flight-client.rs b/rust/datafusion/examples/flight-client.rs
new file mode 100644
index 00000000000..97db7934620
--- /dev/null
+++ b/rust/datafusion/examples/flight-client.rs
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::convert::TryFrom;
+use std::sync::Arc;
+
+use arrow::array::Int32Array;
+use arrow::datatypes::Schema;
+use arrow::flight::flight_data_to_batch;
+use flight::flight_service_client::FlightServiceClient;
+use flight::Ticket;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;
+
+    let request = tonic::Request::new(Ticket {
+        ticket: "SELECT id FROM alltypes_plain".into(),
+    });
+
+    let mut stream = client.do_get(request).await?.into_inner();
+
+    // the schema should be the first message returned, else client should error
+    let flight_data = stream.message().await?.unwrap();
+    // convert the FlightData message to a Schema
+    let schema = Arc::new(Schema::try_from(&flight_data)?);
+    println!("Schema: {:?}", schema);
+
+    // all the remaining stream messages should be dictionary and record batches
+    while let Some(flight_data) = stream.message().await? {
+        // the unwrap is infallible and thus safe
+        let record_batch = flight_data_to_batch(&flight_data, schema.clone())?.unwrap();
+
+        println!(
+            "record_batch has {} columns and {} rows",
+            record_batch.num_columns(),
+            record_batch.num_rows()
+        );
+        let column = record_batch.column(0);
+        let column = column
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("Unable to get column");
+        println!("Column 1: {:?}", column);
+    }
+
+    Ok(())
+}
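The client's read loop generalizes into a small reusable helper. A sketch under the same assumptions as the example above (the `collect_stream` name and its error handling are illustrative, not part of the patch):

```rust
use std::convert::TryFrom;
use std::sync::Arc;

use arrow::datatypes::Schema;
use arrow::flight::flight_data_to_batch;
use arrow::record_batch::RecordBatch;
use flight::FlightData;
use tonic::Streaming;

// Hypothetical helper: drain a do_get response stream into record batches,
// assuming the server sends the schema as the first message.
async fn collect_stream(
    mut stream: Streaming<FlightData>,
) -> Result<Vec<RecordBatch>, Box<dyn std::error::Error>> {
    let first = stream.message().await?.ok_or("empty Flight stream")?;
    let schema = Arc::new(Schema::try_from(&first)?);

    let mut batches = vec![];
    while let Some(data) = stream.message().await? {
        if let Some(batch) = flight_data_to_batch(&data, schema.clone())? {
            batches.push(batch);
        }
    }
    Ok(batches)
}
```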
diff --git a/rust/datafusion/examples/flight-server.rs b/rust/datafusion/examples/flight-server.rs
new file mode 100644
index 00000000000..e6cc22ac503
--- /dev/null
+++ b/rust/datafusion/examples/flight-server.rs
@@ -0,0 +1,174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::pin::Pin;
+
+use futures::Stream;
+use tonic::transport::Server;
+use tonic::{Request, Response, Status, Streaming};
+
+use datafusion::execution::context::ExecutionContext;
+
+use flight::{
+    flight_service_server::FlightService, flight_service_server::FlightServiceServer,
+    Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo,
+    HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket,
+};
+
+#[derive(Clone)]
+pub struct FlightServiceImpl {}
+
+#[tonic::async_trait]
+impl FlightService for FlightServiceImpl {
+    type HandshakeStream = Pin<
+        Box<dyn Stream<Item = Result<HandshakeResponse, Status>> + Send + Sync + 'static>,
+    >;
+    type ListFlightsStream =
+        Pin<Box<dyn Stream<Item = Result<FlightInfo, Status>> + Send + Sync + 'static>>;
+    type DoGetStream =
+        Pin<Box<dyn Stream<Item = Result<FlightData, Status>> + Send + Sync + 'static>>;
+    type DoPutStream =
+        Pin<Box<dyn Stream<Item = Result<PutResult, Status>> + Send + Sync + 'static>>;
+    type DoActionStream = Pin<
+        Box<dyn Stream<Item = Result<flight::Result, Status>> + Send + Sync + 'static>,
+    >;
+    type ListActionsStream =
+        Pin<Box<dyn Stream<Item = Result<ActionType, Status>> + Send + Sync + 'static>>;
+
+    async fn do_get(
+        &self,
+        request: Request<Ticket>,
+    ) -> Result<Response<Self::DoGetStream>, Status> {
+        let ticket = request.into_inner();
+        match String::from_utf8(ticket.ticket.to_vec()) {
+            Ok(sql) => {
+                println!("do_get: {}", sql);
+
+                // create local execution context
+                let mut ctx = ExecutionContext::new();
+
+                let testdata = std::env::var("PARQUET_TEST_DATA")
+                    .expect("PARQUET_TEST_DATA not defined");
+
+                // register parquet file with the execution context
+                ctx.register_parquet(
+                    "alltypes_plain",
+                    &format!("{}/alltypes_plain.parquet", testdata),
+                )
+                .map_err(|e| to_tonic_err(&e))?;
+
+                // create the query plan
+                let plan = ctx
+                    .create_logical_plan(&sql)
+                    .and_then(|plan| ctx.optimize(&plan))
+                    .and_then(|plan| ctx.create_physical_plan(&plan, 1024 * 1024))
+                    .map_err(|e| to_tonic_err(&e))?;
+
+                // execute the query
+                let results = ctx.collect(plan.as_ref()).map_err(|e| to_tonic_err(&e))?;
+                if results.is_empty() {
+                    return Err(Status::internal("There were no results from ticket"));
+                }
+
+                // add an initial FlightData message that sends schema
+                let schema = plan.schema();
+                let mut flights: Vec<Result<FlightData, Status>> =
+                    vec![Ok(FlightData::from(schema.as_ref()))];
+
+                let mut batches: Vec<Result<FlightData, Status>> = results
+                    .iter()
+                    .map(|batch| Ok(FlightData::from(batch)))
+                    .collect();
+
+                // append batch vector to schema vector, so that the first message sent is the schema
+                flights.append(&mut batches);
+
+                let output = futures::stream::iter(flights);
+
+                Ok(Response::new(Box::pin(output) as Self::DoGetStream))
+            }
+            Err(e) => Err(Status::invalid_argument(format!("Invalid ticket: {:?}", e))),
+        }
+    }
+
+    async fn handshake(
+        &self,
+        _request: Request<Streaming<HandshakeRequest>>,
+    ) -> Result<Response<Self::HandshakeStream>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn list_flights(
+        &self,
+        _request: Request<Criteria>,
+    ) -> Result<Response<Self::ListFlightsStream>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn get_flight_info(
+        &self,
+        _request: Request<FlightDescriptor>,
+    ) -> Result<Response<FlightInfo>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn get_schema(
+        &self,
+        _request: Request<FlightDescriptor>,
+    ) -> Result<Response<SchemaResult>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn do_put(
+        &self,
+        _request: Request<Streaming<FlightData>>,
+    ) -> Result<Response<Self::DoPutStream>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn do_action(
+        &self,
+        _request: Request<Action>,
+    ) -> Result<Response<Self::DoActionStream>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+
+    async fn list_actions(
+        &self,
+        _request: Request<Empty>,
+    ) -> Result<Response<Self::ListActionsStream>, Status> {
+        Err(Status::unimplemented("Not yet implemented"))
+    }
+}
+
+fn to_tonic_err(e: &datafusion::error::ExecutionError) -> Status {
+    Status::internal(format!("{:?}", e))
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let addr = "0.0.0.0:50051".parse()?;
+    let service = FlightServiceImpl {};
+
+    let svc = FlightServiceServer::new(service);
+
+    println!("Listening on {:?}", addr);
+
+    Server::builder().add_service(svc).serve(addr).await?;
+
+    Ok(())
+}