diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index ac1b570534f..1744a626313 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -141,6 +141,7 @@ set(ARROW_SRCS array/validate.cc builder.cc buffer.cc + chunked_array.cc compare.cc datum.cc device.cc @@ -565,7 +566,12 @@ set_source_files_properties(public_api_test.cc add_arrow_test(scalar_test) add_arrow_test(type_test) -add_arrow_test(table_test SOURCES table_test.cc table_builder_test.cc) +add_arrow_test(table_test + SOURCES + chunked_array_test.cc + record_batch_test.cc + table_test.cc + table_builder_test.cc) add_arrow_test(tensor_test) add_arrow_test(sparse_tensor_test) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index b0f70e896f7..e357d9baaef 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -23,6 +23,7 @@ #include "arrow/array/concatenate.h" // IYWU pragma: export #include "arrow/buffer.h" // IYWU pragma: export #include "arrow/builder.h" // IYWU pragma: export +#include "arrow/chunked_array.h" // IYWU pragma: export #include "arrow/compare.h" // IYWU pragma: export #include "arrow/extension_type.h" // IYWU pragma: export #include "arrow/memory_pool.h" // IYWU pragma: export diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc new file mode 100644 index 00000000000..4ed4245d944 --- /dev/null +++ b/cpp/src/arrow/chunked_array.cc @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/chunked_array.h" + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_nested.h" +#include "arrow/array/validate.h" +#include "arrow/pretty_print.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::checked_cast; + +class MemoryPool; + +// ---------------------------------------------------------------------- +// ChunkedArray methods + +ChunkedArray::ChunkedArray(ArrayVector chunks) : chunks_(std::move(chunks)) { + length_ = 0; + null_count_ = 0; + + ARROW_CHECK_GT(chunks_.size(), 0) + << "cannot construct ChunkedArray from empty vector and omitted type"; + type_ = chunks_[0]->type(); + for (const std::shared_ptr& chunk : chunks_) { + length_ += chunk->length(); + null_count_ += chunk->null_count(); + } +} + +ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) + : chunks_(std::move(chunks)), type_(std::move(type)) { + length_ = 0; + null_count_ = 0; + for (const std::shared_ptr& chunk : chunks_) { + length_ += chunk->length(); + null_count_ += chunk->null_count(); + } +} + +bool ChunkedArray::Equals(const ChunkedArray& other) const { + if (length_ != other.length()) { + return false; + } + if (null_count_ != other.null_count()) { + return false; + } + // We cannot toggle check_metadata here yet, so we don't check it + if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { + return false; + } + + // Check contents of the underlying 
arrays. This checks for equality of + // the underlying data independently of the chunk size. + return internal::ApplyBinaryChunked( + *this, other, + [](const Array& left_piece, const Array& right_piece, + int64_t ARROW_ARG_UNUSED(position)) { + if (!left_piece.Equals(right_piece)) { + return Status::Invalid("Unequal piece"); + } + return Status::OK(); + }) + .ok(); +} + +bool ChunkedArray::Equals(const std::shared_ptr& other) const { + if (this == other.get()) { + return true; + } + if (!other) { + return false; + } + return Equals(*other.get()); +} + +std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { + ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length"; + bool offset_equals_length = offset == length_; + int curr_chunk = 0; + while (curr_chunk < num_chunks() && offset >= chunk(curr_chunk)->length()) { + offset -= chunk(curr_chunk)->length(); + curr_chunk++; + } + + ArrayVector new_chunks; + if (num_chunks() > 0 && (offset_equals_length || length == 0)) { + // Special case the zero-length slice to make sure there is at least 1 Array + // in the result. 
When there are zero chunks we return zero chunks + new_chunks.push_back(chunk(std::min(curr_chunk, num_chunks() - 1))->Slice(0, 0)); + } else { + while (curr_chunk < num_chunks() && length > 0) { + new_chunks.push_back(chunk(curr_chunk)->Slice(offset, length)); + length -= chunk(curr_chunk)->length() - offset; + offset = 0; + curr_chunk++; + } + } + + return std::make_shared(new_chunks, type_); +} + +std::shared_ptr ChunkedArray::Slice(int64_t offset) const { + return Slice(offset, length_); +} + +Result>> ChunkedArray::Flatten( + MemoryPool* pool) const { + if (type()->id() != Type::STRUCT) { + // Emulate nonexistent copy constructor + return std::vector>{ + std::make_shared(chunks_, type_)}; + } + + std::vector flattened_chunks(type()->num_fields()); + for (const auto& chunk : chunks_) { + ARROW_ASSIGN_OR_RAISE(auto arrays, + checked_cast(*chunk).Flatten(pool)); + DCHECK_EQ(arrays.size(), flattened_chunks.size()); + for (size_t i = 0; i < arrays.size(); ++i) { + flattened_chunks[i].push_back(arrays[i]); + } + } + + std::vector> flattened(type()->num_fields()); + for (size_t i = 0; i < flattened.size(); ++i) { + auto child_type = type()->field(static_cast(i))->type(); + flattened[i] = + std::make_shared(std::move(flattened_chunks[i]), child_type); + } + return flattened; +} + +Result> ChunkedArray::View( + const std::shared_ptr& type) const { + ArrayVector out_chunks(this->num_chunks()); + for (int i = 0; i < this->num_chunks(); ++i) { + ARROW_ASSIGN_OR_RAISE(out_chunks[i], chunks_[i]->View(type)); + } + return std::make_shared(out_chunks, type); +} + +std::string ChunkedArray::ToString() const { + std::stringstream ss; + ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss)); + return ss.str(); +} + +Status ChunkedArray::Validate() const { + if (chunks_.size() == 0) { + return Status::OK(); + } + + const auto& type = *chunks_[0]->type(); + // Make sure chunks all have the same type + for (size_t i = 1; i < chunks_.size(); ++i) { + const Array& chunk = *chunks_[i]; + if 
(!chunk.type()->Equals(type)) { + return Status::Invalid("In chunk ", i, " expected type ", type.ToString(), + " but saw ", chunk.type()->ToString()); + } + } + // Validate the chunks themselves + for (size_t i = 0; i < chunks_.size(); ++i) { + const Array& chunk = *chunks_[i]; + const Status st = internal::ValidateArray(chunk); + if (!st.ok()) { + return Status::Invalid("In chunk ", i, ": ", st.ToString()); + } + } + return Status::OK(); +} + +Status ChunkedArray::ValidateFull() const { + RETURN_NOT_OK(Validate()); + for (size_t i = 0; i < chunks_.size(); ++i) { + const Array& chunk = *chunks_[i]; + const Status st = internal::ValidateArrayData(chunk); + if (!st.ok()) { + return Status::Invalid("In chunk ", i, ": ", st.ToString()); + } + } + return Status::OK(); +} + +namespace internal { + +bool MultipleChunkIterator::Next(std::shared_ptr* next_left, + std::shared_ptr* next_right) { + if (pos_ == length_) return false; + + // Find non-empty chunk + std::shared_ptr chunk_left, chunk_right; + while (true) { + chunk_left = left_.chunk(chunk_idx_left_); + chunk_right = right_.chunk(chunk_idx_right_); + if (chunk_pos_left_ == chunk_left->length()) { + chunk_pos_left_ = 0; + ++chunk_idx_left_; + continue; + } + if (chunk_pos_right_ == chunk_right->length()) { + chunk_pos_right_ = 0; + ++chunk_idx_right_; + continue; + } + break; + } + // Determine how big of a section to return + int64_t iteration_size = std::min(chunk_left->length() - chunk_pos_left_, + chunk_right->length() - chunk_pos_right_); + + *next_left = chunk_left->Slice(chunk_pos_left_, iteration_size); + *next_right = chunk_right->Slice(chunk_pos_right_, iteration_size); + + pos_ += iteration_size; + chunk_pos_left_ += iteration_size; + chunk_pos_right_ += iteration_size; + return true; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h new file mode 100644 index 00000000000..7d920b65cdf --- /dev/null +++ 
b/cpp/src/arrow/chunked_array.h @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class MemoryPool; + +/// \class ChunkedArray +/// \brief A data structure managing a list of primitive Arrow arrays logically +/// as one large array +class ARROW_EXPORT ChunkedArray { + public: + /// \brief Construct a chunked array from a vector of arrays + /// + /// The vector must be non-empty and all its elements must have the same + /// data type. + explicit ChunkedArray(ArrayVector chunks); + + /// \brief Construct a chunked array from a single Array + explicit ChunkedArray(std::shared_ptr chunk) + : ChunkedArray(ArrayVector{std::move(chunk)}) {} + + /// \brief Construct a chunked array from a vector of arrays and a data type + /// + /// As the data type is passed explicitly, the vector may be empty. 
+ ChunkedArray(ArrayVector chunks, std::shared_ptr type); + + /// \return the total length of the chunked array; computed on construction + int64_t length() const { return length_; } + + /// \return the total number of nulls among all chunks + int64_t null_count() const { return null_count_; } + + int num_chunks() const { return static_cast(chunks_.size()); } + + /// \return the chunk at index i of the chunked array (i is not bounds-checked) + std::shared_ptr chunk(int i) const { return chunks_[i]; } + + const ArrayVector& chunks() const { return chunks_; } + + /// \brief Construct a zero-copy slice of the chunked array with the + /// indicated offset and length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. If there are not enough + /// elements in the chunked array, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// \brief Slice from offset until end of the chunked array + std::shared_ptr Slice(int64_t offset) const; + + /// \brief Flatten this chunked array as a vector of chunked arrays, one + /// for each struct field + /// + /// \param[in] pool The pool for buffer allocations, if any + Result>> Flatten( + MemoryPool* pool = default_memory_pool()) const; + + /// Construct a zero-copy view of this chunked array with the given + /// type. Calls Array::View on each constituent chunk. Always succeeds if + /// there are zero chunks + Result> View(const std::shared_ptr& type) const; + + std::shared_ptr type() const { return type_; } + + /// \brief Determine if two chunked arrays are equal. + /// + /// Two chunked arrays can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. + bool Equals(const ChunkedArray& other) const; + /// \brief Determine if two chunked arrays are equal. 
+ bool Equals(const std::shared_ptr& other) const; + + /// \return PrettyPrint representation suitable for debugging + std::string ToString() const; + + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the chunked array's internal data. + /// + /// This is O(k*m) where k is the number of array descendants, + /// and m is the number of chunks. + /// + /// \return Status + Status Validate() const; + + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the chunked array's internal data. + /// + /// This is O(k*n) where k is the number of array descendants, + /// and n is the length in elements. + /// + /// \return Status + Status ValidateFull() const; + + protected: + ArrayVector chunks_; + int64_t length_; + int64_t null_count_; + std::shared_ptr type_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); +}; + +namespace internal { + +/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous +/// pieces of potentially differently-chunked ChunkedArray objects +class ARROW_EXPORT MultipleChunkIterator { + public: + MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right) + : left_(left), + right_(right), + pos_(0), + length_(left.length()), + chunk_idx_left_(0), + chunk_idx_right_(0), + chunk_pos_left_(0), + chunk_pos_right_(0) {} + + bool Next(std::shared_ptr* next_left, std::shared_ptr* next_right); + + int64_t position() const { return pos_; } + + private: + const ChunkedArray& left_; + const ChunkedArray& right_; + + // Number of elements consumed so far; Next() advances it past the returned pieces + int64_t pos_; + + // Length of the left chunked array; the right one is assumed to be equally long + int64_t length_; + + // Index of the current left chunk + int chunk_idx_left_; + + // Index of the current right chunk + int chunk_idx_right_; + + // Offset into the current left chunk + int64_t chunk_pos_left_; + + // Offset into the current right chunk + int64_t chunk_pos_right_; +}; + +/// \brief Evaluate binary function on two ChunkedArray objects 
having possibly +/// different chunk layouts. The passed binary function / functor should have +/// the following signature. +/// +/// Status(const Array&, const Array&, int64_t) +/// +/// The third argument is the absolute position, relative to the start of each +/// ChunkedArray, of the first element of the segment pair. The function is executed +/// against each contiguous pair of array segments, slicing if necessary. +/// +/// For example, if two arrays have chunk sizes +/// +/// left: [10, 10, 20] +/// right: [15, 10, 15] +/// +/// Then the following invocations take place (pseudocode) +/// +/// func(left.chunk[0][0:10], right.chunk[0][0:10], 0) +/// func(left.chunk[1][0:5], right.chunk[0][10:15], 10) +/// func(left.chunk[1][5:10], right.chunk[1][0:5], 15) +/// func(left.chunk[2][0:5], right.chunk[1][5:10], 20) +/// func(left.chunk[2][5:20], right.chunk[2][:], 25) +template +Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right, + Action&& action) { + MultipleChunkIterator iterator(left, right); + std::shared_ptr lhs, rhs; + while (iterator.Next(&lhs, &rhs)) { // Next() moves position() past the pieces + ARROW_RETURN_NOT_OK(action(*lhs, *rhs, iterator.position() - lhs->length())); + } + return Status::OK(); +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc new file mode 100644 index 00000000000..5074eadf47a --- /dev/null +++ b/cpp/src/arrow/chunked_array_test.cc @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { + +class TestChunkedArray : public TestBase { + protected: + virtual void Construct() { + one_ = std::make_shared(arrays_one_); + if (!arrays_another_.empty()) { + another_ = std::make_shared(arrays_another_); + } + } + + ArrayVector arrays_one_; + ArrayVector arrays_another_; + + std::shared_ptr one_; + std::shared_ptr another_; +}; + +TEST_F(TestChunkedArray, BasicEquals) { + std::vector null_bitmap(100, true); + std::vector data(100, 1); + std::shared_ptr array; + ArrayFromVector(null_bitmap, data, &array); + arrays_one_.push_back(array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_TRUE(one_->Equals(one_)); + ASSERT_FALSE(one_->Equals(nullptr)); + ASSERT_TRUE(one_->Equals(another_)); + ASSERT_TRUE(one_->Equals(*another_.get())); +} + +TEST_F(TestChunkedArray, EqualsDifferingTypes) { + std::vector null_bitmap(100, true); + std::vector data32(100, 1); + std::vector data64(100, 1); + std::shared_ptr array; + ArrayFromVector(null_bitmap, data32, &array); + arrays_one_.push_back(array); + ArrayFromVector(null_bitmap, data64, &array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_FALSE(one_->Equals(another_)); + ASSERT_FALSE(one_->Equals(*another_.get())); +} + 
+TEST_F(TestChunkedArray, EqualsDifferingLengths) { + std::vector null_bitmap100(100, true); + std::vector null_bitmap101(101, true); + std::vector data100(100, 1); + std::vector data101(101, 1); + std::shared_ptr array; + ArrayFromVector(null_bitmap100, data100, &array); + arrays_one_.push_back(array); + ArrayFromVector(null_bitmap101, data101, &array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_FALSE(one_->Equals(another_)); + ASSERT_FALSE(one_->Equals(*another_.get())); + + std::vector null_bitmap1(1, true); + std::vector data1(1, 1); + ArrayFromVector(null_bitmap1, data1, &array); + arrays_one_.push_back(array); + + Construct(); + ASSERT_TRUE(one_->Equals(another_)); + ASSERT_TRUE(one_->Equals(*another_.get())); +} + +TEST_F(TestChunkedArray, EqualsDifferingMetadata) { + auto left_ty = list(field("item", int32())); + + auto metadata = key_value_metadata({"foo"}, {"bar"}); + auto right_ty = list(field("item", int32(), true, metadata)); + + std::vector> left_chunks = {ArrayFromJSON(left_ty, "[[]]")}; + std::vector> right_chunks = {ArrayFromJSON(right_ty, "[[]]")}; + + ChunkedArray left(left_chunks); + ChunkedArray right(right_chunks); + ASSERT_TRUE(left.Equals(right)); +} + +TEST_F(TestChunkedArray, SliceEquals) { + arrays_one_.push_back(MakeRandomArray(100)); + arrays_one_.push_back(MakeRandomArray(50)); + arrays_one_.push_back(MakeRandomArray(50)); + Construct(); + + std::shared_ptr slice = one_->Slice(125, 50); + ASSERT_EQ(slice->length(), 50); + AssertChunkedEqual(*one_->Slice(125, 50), *slice); + + std::shared_ptr slice2 = one_->Slice(75)->Slice(25)->Slice(25, 50); + ASSERT_EQ(slice2->length(), 50); + AssertChunkedEqual(*slice, *slice2); + + // Making empty slices of a ChunkedArray + std::shared_ptr slice3 = one_->Slice(one_->length(), 99); + ASSERT_EQ(slice3->length(), 0); + ASSERT_EQ(slice3->num_chunks(), 1); + ASSERT_TRUE(slice3->type()->Equals(one_->type())); + + std::shared_ptr slice4 = one_->Slice(10, 0); + 
ASSERT_EQ(slice4->length(), 0); + ASSERT_EQ(slice4->num_chunks(), 1); + ASSERT_TRUE(slice4->type()->Equals(one_->type())); + + // Slicing an empty ChunkedArray + std::shared_ptr slice5 = slice4->Slice(0, 10); + ASSERT_EQ(slice5->length(), 0); + ASSERT_EQ(slice5->num_chunks(), 1); + ASSERT_TRUE(slice5->type()->Equals(one_->type())); +} + +TEST_F(TestChunkedArray, ZeroChunksIssues) { + ArrayVector empty = {}; + auto no_chunks = std::make_shared(empty, int8()); + + // ARROW-8911, assert that slicing is a no-op when there are zero-chunks + auto sliced = no_chunks->Slice(0, 0); + auto sliced2 = no_chunks->Slice(0, 5); + AssertChunkedEqual(*no_chunks, *sliced); + AssertChunkedEqual(*no_chunks, *sliced2); +} + +TEST_F(TestChunkedArray, Validate) { + // Valid if empty + ArrayVector empty = {}; + auto no_chunks = std::make_shared(empty, utf8()); + ASSERT_OK(no_chunks->ValidateFull()); + + random::RandomArrayGenerator gen(0); + arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); + Construct(); + ASSERT_OK(one_->ValidateFull()); + + arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); + Construct(); + ASSERT_OK(one_->ValidateFull()); + + arrays_one_.push_back(gen.String(50, 0, 10, 0.1)); + Construct(); + ASSERT_RAISES(Invalid, one_->ValidateFull()); +} + +TEST_F(TestChunkedArray, View) { + auto in_ty = int32(); + auto out_ty = fixed_size_binary(4); +#if ARROW_LITTLE_ENDIAN + auto arr = ArrayFromJSON(in_ty, "[2020568934, 2054316386, null]"); + auto arr2 = ArrayFromJSON(in_ty, "[2020568934, 2054316386]"); +#else + auto arr = ArrayFromJSON(in_ty, "[1718579064, 1650553466, null]"); + auto arr2 = ArrayFromJSON(in_ty, "[1718579064, 1650553466]"); +#endif + auto ex = ArrayFromJSON(out_ty, R"(["foox", "barz", null])"); + auto ex2 = ArrayFromJSON(out_ty, R"(["foox", "barz"])"); + + ArrayVector chunks = {arr, arr2}; + ArrayVector ex_chunks = {ex, ex2}; + auto carr = std::make_shared(chunks); + auto expected = std::make_shared(ex_chunks); + + ASSERT_OK_AND_ASSIGN(auto result, 
carr->View(out_ty)); + AssertChunkedEqual(*expected, *result); + + // Zero length + ArrayVector empty = {}; + carr = std::make_shared(empty, in_ty); + expected = std::make_shared(empty, out_ty); + ASSERT_OK_AND_ASSIGN(result, carr->View(out_ty)); + AssertChunkedEqual(*expected, *result); +} + +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 6ee4c35aa2a..3d4cbafb227 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -29,6 +29,7 @@ #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" @@ -37,7 +38,6 @@ #include "arrow/datum.h" #include "arrow/scalar.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index d8615075054..75a2089b3dd 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -27,6 +27,7 @@ #include "arrow/array/array_base.h" #include "arrow/array/data.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" @@ -35,7 +36,6 @@ #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 9f92da9dd69..94e39123d6c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -23,10 +23,10 @@ #include #include "arrow/array.h" +#include "arrow/chunked_array.h" #include 
"arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/test_util.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/compute/kernels/common.h b/cpp/src/arrow/compute/kernels/common.h index b2e02cbec4e..21244320f38 100644 --- a/cpp/src/arrow/compute/kernels/common.h +++ b/cpp/src/arrow/compute/kernels/common.h @@ -28,6 +28,7 @@ #include "arrow/array/data.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/compute/exec.h" #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" @@ -36,7 +37,6 @@ #include "arrow/datum.h" #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc b/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc index cb3fdb06014..1609e5d61ef 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc @@ -22,9 +22,9 @@ #include +#include "arrow/chunked_array.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/test_util.h" -#include "arrow/table.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 4970c830f2b..a3a683c1af5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -26,10 +26,10 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/extension_type.h" #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_common.h" 
#include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 3350d29a9e7..920e577c108 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -31,12 +31,12 @@ #include "arrow/array/array_base.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" +#include "arrow/chunked_array.h" #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/testing/gtest_compat.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 4ac12299fdf..19340c923b7 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -22,10 +22,10 @@ #include #include "arrow/array.h" +#include "arrow/chunked_array.h" #include "arrow/compute/exec.h" #include "arrow/datum.h" #include "arrow/result.h" -#include "arrow/table.h" #include "arrow/testing/gtest_util.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index c7b584fea19..83e8240c3a3 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -30,9 +30,9 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 52c744ec8e1..cc141681afa 100644 --- 
a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -26,12 +26,14 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/concatenate.h" #include "arrow/buffer_builder.h" +#include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/result.h" +#include "arrow/table.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index b0a61b6efa0..a67c67c302b 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/chunked_array.h" #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/table.h" diff --git a/cpp/src/arrow/csv/column_builder.cc b/cpp/src/arrow/csv/column_builder.cc index ab61d014137..4d0e21313a8 100644 --- a/cpp/src/arrow/csv/column_builder.cc +++ b/cpp/src/arrow/csv/column_builder.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/chunked_array.h" #include "arrow/csv/column_builder.h" #include "arrow/csv/converter.h" #include "arrow/csv/inference_internal.h" @@ -33,7 +34,6 @@ #include "arrow/csv/parser.h" #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/logging.h" #include "arrow/util/task_group.h" diff --git a/cpp/src/arrow/datum.cc b/cpp/src/arrow/datum.cc index 42270fc7bad..5e48c01e35d 100644 --- a/cpp/src/arrow/datum.cc +++ b/cpp/src/arrow/datum.cc @@ -24,6 +24,7 @@ #include "arrow/array/array_base.h" #include "arrow/array/util.h" +#include 
"arrow/chunked_array.h" #include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/table.h" diff --git a/cpp/src/arrow/datum_test.cc b/cpp/src/arrow/datum_test.cc index e0b62c71312..3c8b22c9620 100644 --- a/cpp/src/arrow/datum_test.cc +++ b/cpp/src/arrow/datum_test.cc @@ -21,6 +21,7 @@ #include #include "arrow/array/array_base.h" +#include "arrow/chunked_array.h" #include "arrow/datum.h" #include "arrow/scalar.h" #include "arrow/table.h" diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index c1d61150d88..5f757b4e92c 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -31,6 +31,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/chunked_array.h" #include "arrow/io/interfaces.h" #include "arrow/ipc/metadata_internal.h" #include "arrow/ipc/options.h" diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 70b6fd0419b..bb8b09f1fd9 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -27,6 +27,7 @@ #include #include "arrow/array.h" +#include "arrow/chunked_array.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 4fd1dd4c8d2..6e23b7f9ece 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -30,8 +30,8 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/chunked_array.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc new file mode 100644 index 00000000000..33786bd443b --- /dev/null +++ b/cpp/src/arrow/record_batch_test.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/array/util.h" +#include "arrow/chunked_array.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { + +class TestRecordBatch : public TestBase {}; + +TEST_F(TestRecordBatch, Equals) { + const int length = 10; + + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); + + auto metadata = key_value_metadata({"foo"}, {"bar"}); + + std::vector> fields = {f0, f1, f2}; + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema2 = ::arrow::schema({f0, f1}); + auto schema3 = ::arrow::schema({f0, f1, f2}, metadata); + + auto a0 = MakeRandomArray(length); + auto a1 = MakeRandomArray(length); + auto a2 = MakeRandomArray(length); + + auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); + auto b2 = RecordBatch::Make(schema3, length, {a0, a1, a2}); + auto b3 = RecordBatch::Make(schema2, length, {a0, a1}); + auto b4 = RecordBatch::Make(schema, length, {a0, a1, a1}); + + ASSERT_TRUE(b1->Equals(*b1)); 
+ ASSERT_FALSE(b1->Equals(*b3)); + ASSERT_FALSE(b1->Equals(*b4)); + + // Different metadata + ASSERT_TRUE(b1->Equals(*b2)); + ASSERT_FALSE(b1->Equals(*b2, /*check_metadata=*/true)); +} + +TEST_F(TestRecordBatch, Validate) { + const int length = 10; + + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); + + auto schema = ::arrow::schema({f0, f1, f2}); + + auto a0 = MakeRandomArray(length); + auto a1 = MakeRandomArray(length); + auto a2 = MakeRandomArray(length); + auto a3 = MakeRandomArray(5); + + auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); + + ASSERT_OK(b1->ValidateFull()); + + // Length mismatch + auto b2 = RecordBatch::Make(schema, length, {a0, a1, a3}); + ASSERT_RAISES(Invalid, b2->ValidateFull()); + + // Type mismatch + auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); + ASSERT_RAISES(Invalid, b3->ValidateFull()); +} + +TEST_F(TestRecordBatch, Slice) { + const int length = 7; + + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int8()); + + std::vector> fields = {f0, f1, f2}; + auto schema = ::arrow::schema(fields); + + auto a0 = MakeRandomArray(length); + auto a1 = MakeRandomArray(length); + auto a2 = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6]"); + + auto batch = RecordBatch::Make(schema, length, {a0, a1, a2}); + + auto batch_slice = batch->Slice(2); + auto batch_slice2 = batch->Slice(1, 5); + + ASSERT_EQ(batch_slice->num_rows(), batch->num_rows() - 2); + + for (int i = 0; i < batch->num_columns(); ++i) { + ASSERT_EQ(2, batch_slice->column(i)->offset()); + ASSERT_EQ(length - 2, batch_slice->column(i)->length()); + + ASSERT_EQ(1, batch_slice2->column(i)->offset()); + ASSERT_EQ(5, batch_slice2->column(i)->length()); + } + + // ARROW-9143: RecordBatch::Slice was incorrectly setting a2's + // ArrayData::null_count to kUnknownNullCount + ASSERT_EQ(batch_slice->column(2)->data()->null_count, 0); + ASSERT_EQ(batch_slice2->column(2)->data()->null_count, 
0); +} + +TEST_F(TestRecordBatch, AddColumn) { + const int length = 10; + + auto field1 = field("f1", int32()); + auto field2 = field("f2", uint8()); + auto field3 = field("f3", int16()); + + auto schema1 = ::arrow::schema({field1, field2}); + auto schema2 = ::arrow::schema({field2, field3}); + auto schema3 = ::arrow::schema({field2}); + + auto array1 = MakeRandomArray(length); + auto array2 = MakeRandomArray(length); + auto array3 = MakeRandomArray(length); + + auto batch1 = RecordBatch::Make(schema1, length, {array1, array2}); + auto batch2 = RecordBatch::Make(schema2, length, {array2, array3}); + auto batch3 = RecordBatch::Make(schema3, length, {array2}); + + const RecordBatch& batch = *batch3; + + // Negative tests with invalid index + ASSERT_RAISES(Invalid, batch.AddColumn(5, field1, array1)); + ASSERT_RAISES(Invalid, batch.AddColumn(2, field1, array1)); + ASSERT_RAISES(Invalid, batch.AddColumn(-1, field1, array1)); + + // Negative test with wrong length + auto longer_col = MakeRandomArray(length + 1); + ASSERT_RAISES(Invalid, batch.AddColumn(0, field1, longer_col)); + + // Negative test with mismatch type + ASSERT_RAISES(Invalid, batch.AddColumn(0, field1, array2)); + + ASSERT_OK_AND_ASSIGN(auto new_batch, batch.AddColumn(0, field1, array1)); + AssertBatchesEqual(*new_batch, *batch1); + + ASSERT_OK_AND_ASSIGN(new_batch, batch.AddColumn(1, field3, array3)); + AssertBatchesEqual(*new_batch, *batch2); + + ASSERT_OK_AND_ASSIGN(auto new_batch2, batch.AddColumn(1, "f3", array3)); + AssertBatchesEqual(*new_batch2, *new_batch); + + ASSERT_TRUE(new_batch2->schema()->field(1)->nullable()); +} + +TEST_F(TestRecordBatch, RemoveColumn) { + const int length = 10; + + auto field1 = field("f1", int32()); + auto field2 = field("f2", uint8()); + auto field3 = field("f3", int16()); + + auto schema1 = ::arrow::schema({field1, field2, field3}); + auto schema2 = ::arrow::schema({field2, field3}); + auto schema3 = ::arrow::schema({field1, field3}); + auto schema4 = 
::arrow::schema({field1, field2}); + + auto array1 = MakeRandomArray(length); + auto array2 = MakeRandomArray(length); + auto array3 = MakeRandomArray(length); + + auto batch1 = RecordBatch::Make(schema1, length, {array1, array2, array3}); + auto batch2 = RecordBatch::Make(schema2, length, {array2, array3}); + auto batch3 = RecordBatch::Make(schema3, length, {array1, array3}); + auto batch4 = RecordBatch::Make(schema4, length, {array1, array2}); + + const RecordBatch& batch = *batch1; + std::shared_ptr result; + + // Negative tests with invalid index + ASSERT_RAISES(Invalid, batch.RemoveColumn(3)); + ASSERT_RAISES(Invalid, batch.RemoveColumn(-1)); + + ASSERT_OK_AND_ASSIGN(auto new_batch, batch.RemoveColumn(0)); + AssertBatchesEqual(*new_batch, *batch2); + + ASSERT_OK_AND_ASSIGN(new_batch, batch.RemoveColumn(1)); + AssertBatchesEqual(*new_batch, *batch3); + + ASSERT_OK_AND_ASSIGN(new_batch, batch.RemoveColumn(2)); + AssertBatchesEqual(*new_batch, *batch4); +} + +TEST_F(TestRecordBatch, RemoveColumnEmpty) { + const int length = 10; + + auto field1 = field("f1", int32()); + auto schema1 = ::arrow::schema({field1}); + auto array1 = MakeRandomArray(length); + auto batch1 = RecordBatch::Make(schema1, length, {array1}); + + ASSERT_OK_AND_ASSIGN(auto empty, batch1->RemoveColumn(0)); + ASSERT_EQ(batch1->num_rows(), empty->num_rows()); + + ASSERT_OK_AND_ASSIGN(auto added, empty->AddColumn(0, field1, array1)); + AssertBatchesEqual(*added, *batch1); +} + +TEST_F(TestRecordBatch, ToFromEmptyStructArray) { + auto batch1 = + RecordBatch::Make(::arrow::schema({}), 10, std::vector>{}); + ASSERT_OK_AND_ASSIGN(auto struct_array, batch1->ToStructArray()); + ASSERT_EQ(10, struct_array->length()); + ASSERT_OK_AND_ASSIGN(auto batch2, RecordBatch::FromStructArray(struct_array)); + ASSERT_TRUE(batch1->Equals(*batch2)); +} + +TEST_F(TestRecordBatch, FromStructArrayInvalidType) { + ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(MakeRandomArray(10))); +} + +TEST_F(TestRecordBatch, 
FromStructArrayInvalidNullCount) { + auto struct_array = + ArrayFromJSON(struct_({field("f1", int32())}), R"([{"f1": 1}, null])"); + ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(struct_array)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index 10e3a83dd74..5ba7a15072d 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -29,6 +29,7 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/chunked_array.h" #include "arrow/compute/api.h" #include "arrow/memory_pool.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 8aeb51d9126..30982ec2226 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -29,7 +29,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/concatenate.h" #include "arrow/array/util.h" -#include "arrow/array/validate.h" +#include "arrow/chunked_array.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -45,214 +45,9 @@ namespace arrow { using internal::checked_cast; -// ---------------------------------------------------------------------- -// ChunkedArray methods - -ChunkedArray::ChunkedArray(ArrayVector chunks) : chunks_(std::move(chunks)) { - length_ = 0; - null_count_ = 0; - - ARROW_CHECK_GT(chunks_.size(), 0) - << "cannot construct ChunkedArray from empty vector and omitted type"; - type_ = chunks_[0]->type(); - for (const std::shared_ptr& chunk : chunks_) { - length_ += chunk->length(); - null_count_ += chunk->null_count(); - } -} - -ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)) { - length_ = 0; - null_count_ = 0; - for (const std::shared_ptr& chunk : chunks_) { - length_ += chunk->length(); - null_count_ += chunk->null_count(); - } -} - -bool ChunkedArray::Equals(const ChunkedArray& other) const { - if (length_ != other.length()) { - return false; - } - if (null_count_ != other.null_count()) 
{ - return false; - } - // We cannot toggle check_metadata here yet, so we don't check it - if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { - return false; - } - - // Check contents of the underlying arrays. This checks for equality of - // the underlying data independently of the chunk size. - return internal::ApplyBinaryChunked( - *this, other, - [](const Array& left_piece, const Array& right_piece, - int64_t ARROW_ARG_UNUSED(position)) { - if (!left_piece.Equals(right_piece)) { - return Status::Invalid("Unequal piece"); - } - return Status::OK(); - }) - .ok(); -} - -bool ChunkedArray::Equals(const std::shared_ptr& other) const { - if (this == other.get()) { - return true; - } - if (!other) { - return false; - } - return Equals(*other.get()); -} - -std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { - ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length"; - bool offset_equals_length = offset == length_; - int curr_chunk = 0; - while (curr_chunk < num_chunks() && offset >= chunk(curr_chunk)->length()) { - offset -= chunk(curr_chunk)->length(); - curr_chunk++; - } - - ArrayVector new_chunks; - if (num_chunks() > 0 && (offset_equals_length || length == 0)) { - // Special case the zero-length slice to make sure there is at least 1 Array - // in the result. 
When there are zero chunks we return zero chunks - new_chunks.push_back(chunk(std::min(curr_chunk, num_chunks() - 1))->Slice(0, 0)); - } else { - while (curr_chunk < num_chunks() && length > 0) { - new_chunks.push_back(chunk(curr_chunk)->Slice(offset, length)); - length -= chunk(curr_chunk)->length() - offset; - offset = 0; - curr_chunk++; - } - } - - return std::make_shared(new_chunks, type_); -} - -std::shared_ptr ChunkedArray::Slice(int64_t offset) const { - return Slice(offset, length_); -} - -Result>> ChunkedArray::Flatten( - MemoryPool* pool) const { - if (type()->id() != Type::STRUCT) { - // Emulate nonexistent copy constructor - return std::vector>{ - std::make_shared(chunks_, type_)}; - } - - std::vector flattened_chunks(type()->num_fields()); - for (const auto& chunk : chunks_) { - ARROW_ASSIGN_OR_RAISE(auto arrays, - checked_cast(*chunk).Flatten(pool)); - DCHECK_EQ(arrays.size(), flattened_chunks.size()); - for (size_t i = 0; i < arrays.size(); ++i) { - flattened_chunks[i].push_back(arrays[i]); - } - } - - std::vector> flattened(type()->num_fields()); - for (size_t i = 0; i < flattened.size(); ++i) { - auto child_type = type()->field(static_cast(i))->type(); - flattened[i] = - std::make_shared(std::move(flattened_chunks[i]), child_type); - } - return flattened; -} - -Result> ChunkedArray::View( - const std::shared_ptr& type) const { - ArrayVector out_chunks(this->num_chunks()); - for (int i = 0; i < this->num_chunks(); ++i) { - ARROW_ASSIGN_OR_RAISE(out_chunks[i], chunks_[i]->View(type)); - } - return std::make_shared(out_chunks, type); -} - -std::string ChunkedArray::ToString() const { - std::stringstream ss; - ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss)); - return ss.str(); -} - -Status ChunkedArray::Validate() const { - if (chunks_.size() == 0) { - return Status::OK(); - } - - const auto& type = *chunks_[0]->type(); - // Make sure chunks all have the same type - for (size_t i = 1; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; - if 
(!chunk.type()->Equals(type)) { - return Status::Invalid("In chunk ", i, " expected type ", type.ToString(), - " but saw ", chunk.type()->ToString()); - } - } - // Validate the chunks themselves - for (size_t i = 0; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; - const Status st = internal::ValidateArray(chunk); - if (!st.ok()) { - return Status::Invalid("In chunk ", i, ": ", st.ToString()); - } - } - return Status::OK(); -} - -Status ChunkedArray::ValidateFull() const { - RETURN_NOT_OK(Validate()); - for (size_t i = 0; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; - const Status st = internal::ValidateArrayData(chunk); - if (!st.ok()) { - return Status::Invalid("In chunk ", i, ": ", st.ToString()); - } - } - return Status::OK(); -} - -namespace internal { - -bool MultipleChunkIterator::Next(std::shared_ptr* next_left, - std::shared_ptr* next_right) { - if (pos_ == length_) return false; - - // Find non-empty chunk - std::shared_ptr chunk_left, chunk_right; - while (true) { - chunk_left = left_.chunk(chunk_idx_left_); - chunk_right = right_.chunk(chunk_idx_right_); - if (chunk_pos_left_ == chunk_left->length()) { - chunk_pos_left_ = 0; - ++chunk_idx_left_; - continue; - } - if (chunk_pos_right_ == chunk_right->length()) { - chunk_pos_right_ = 0; - ++chunk_idx_right_; - continue; - } - break; - } - // Determine how big of a section to return - int64_t iteration_size = std::min(chunk_left->length() - chunk_pos_left_, - chunk_right->length() - chunk_pos_right_); - - *next_left = chunk_left->Slice(chunk_pos_left_, iteration_size); - *next_right = chunk_right->Slice(chunk_pos_right_, iteration_size); - - pos_ += iteration_size; - chunk_pos_left_ += iteration_size; - chunk_pos_right_ += iteration_size; - return true; -} - -} // namespace internal +class KeyValueMetadata; +class MemoryPool; +struct ArrayData; // ---------------------------------------------------------------------- // Table methods diff --git a/cpp/src/arrow/table.h 
b/cpp/src/arrow/table.h index 7e3602ee458..20f5c684a71 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -20,9 +20,9 @@ #include #include #include -#include #include +#include "arrow/chunked_array.h" // IWYU pragma: keep #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" @@ -32,183 +32,10 @@ namespace arrow { -/// \class ChunkedArray -/// \brief A data structure managing a list of primitive Arrow arrays logically -/// as one large array -class ARROW_EXPORT ChunkedArray { - public: - /// \brief Construct a chunked array from a vector of arrays - /// - /// The vector must be non-empty and all its elements must have the same - /// data type. - explicit ChunkedArray(ArrayVector chunks); - - /// \brief Construct a chunked array from a single Array - explicit ChunkedArray(std::shared_ptr chunk) - : ChunkedArray(ArrayVector{std::move(chunk)}) {} - - /// \brief Construct a chunked array from a vector of arrays and a data type - /// - /// As the data type is passed explicitly, the vector may be empty. - ChunkedArray(ArrayVector chunks, std::shared_ptr type); - - /// \return the total length of the chunked array; computed on construction - int64_t length() const { return length_; } - - /// \return the total number of nulls among all chunks - int64_t null_count() const { return null_count_; } - - int num_chunks() const { return static_cast(chunks_.size()); } - - /// \return chunk a particular chunk from the chunked array - std::shared_ptr chunk(int i) const { return chunks_[i]; } - - const ArrayVector& chunks() const { return chunks_; } - - /// \brief Construct a zero-copy slice of the chunked array with the - /// indicated offset and length - /// - /// \param[in] offset the position of the first element in the constructed - /// slice - /// \param[in] length the length of the slice. 
If there are not enough - /// elements in the chunked array, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr - std::shared_ptr Slice(int64_t offset, int64_t length) const; - - /// \brief Slice from offset until end of the chunked array - std::shared_ptr Slice(int64_t offset) const; - - /// \brief Flatten this chunked array as a vector of chunked arrays, one - /// for each struct field - /// - /// \param[in] pool The pool for buffer allocations, if any - Result>> Flatten( - MemoryPool* pool = default_memory_pool()) const; - - /// Construct a zero-copy view of this chunked array with the given - /// type. Calls Array::View on each constituent chunk. Always succeeds if - /// there are zero chunks - Result> View(const std::shared_ptr& type) const; - - std::shared_ptr type() const { return type_; } - - /// \brief Determine if two chunked arrays are equal. - /// - /// Two chunked arrays can be equal only if they have equal datatypes. - /// However, they may be equal even if they have different chunkings. - bool Equals(const ChunkedArray& other) const; - /// \brief Determine if two chunked arrays are equal. - bool Equals(const std::shared_ptr& other) const; - - /// \return PrettyPrint representation suitable for debugging - std::string ToString() const; - - /// \brief Perform cheap validation checks to determine obvious inconsistencies - /// within the chunk array's internal data. - /// - /// This is O(k*m) where k is the number of array descendents, - /// and m is the number of chunks. - /// - /// \return Status - Status Validate() const; - - /// \brief Perform extensive validation checks to determine inconsistencies - /// within the chunk array's internal data. - /// - /// This is O(k*n) where k is the number of array descendents, - /// and n is the length in elements. 
- /// - /// \return Status - Status ValidateFull() const; - - protected: - ArrayVector chunks_; - int64_t length_; - int64_t null_count_; - std::shared_ptr type_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); -}; - -namespace internal { - -/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous -/// pieces of potentially differently-chunked ChunkedArray objects -class ARROW_EXPORT MultipleChunkIterator { - public: - MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right) - : left_(left), - right_(right), - pos_(0), - length_(left.length()), - chunk_idx_left_(0), - chunk_idx_right_(0), - chunk_pos_left_(0), - chunk_pos_right_(0) {} - - bool Next(std::shared_ptr* next_left, std::shared_ptr* next_right); - - int64_t position() const { return pos_; } - - private: - const ChunkedArray& left_; - const ChunkedArray& right_; - - // The amount of the entire ChunkedArray consumed - int64_t pos_; - - // Length of the chunked array(s) - int64_t length_; - - // Current left chunk - int chunk_idx_left_; - - // Current right chunk - int chunk_idx_right_; - - // Offset into the current left chunk - int64_t chunk_pos_left_; - - // Offset into the current right chunk - int64_t chunk_pos_right_; -}; - -/// \brief Evaluate binary function on two ChunkedArray objects having possibly -/// different chunk layouts. The passed binary function / functor should have -/// the following signature. -/// -/// Status(const Array&, const Array&, int64_t) -/// -/// The third argument is the absolute position relative to the start of each -/// ChunkedArray. The function is executed against each contiguous pair of -/// array segments, slicing if necessary. 
-/// -/// For example, if two arrays have chunk sizes -/// -/// left: [10, 10, 20] -/// right: [15, 10, 15] -/// -/// Then the following invocations take place (pseudocode) -/// -/// func(left.chunk[0][0:10], right.chunk[0][0:10], 0) -/// func(left.chunk[1][0:5], right.chunk[0][10:15], 10) -/// func(left.chunk[1][5:10], right.chunk[1][0:5], 15) -/// func(left.chunk[2][0:5], right.chunk[1][5:10], 20) -/// func(left.chunk[2][5:20], right.chunk[2][:], 25) -template -Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right, - Action&& action) { - MultipleChunkIterator iterator(left, right); - std::shared_ptr left_piece, right_piece; - while (iterator.Next(&left_piece, &right_piece)) { - ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position())); - } - return Status::OK(); -} - -} // namespace internal +class Array; +class ChunkedArray; +class KeyValueMetadata; +class MemoryPool; /// \class Table /// \brief Logical table as sequence of chunked arrays diff --git a/cpp/src/arrow/table_builder.h b/cpp/src/arrow/table_builder.h index 88ff6e7b91d..db130d38950 100644 --- a/cpp/src/arrow/table_builder.h +++ b/cpp/src/arrow/table_builder.h @@ -22,8 +22,6 @@ #include #include "arrow/array/builder_base.h" -#include "arrow/memory_pool.h" -#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -32,6 +30,9 @@ namespace arrow { +class MemoryPool; +class RecordBatch; + /// \class RecordBatchBuilder /// \brief Helper class for creating record batches iteratively given a known /// schema diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index cb61d6f96d2..38bc2059755 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -22,196 +22,20 @@ #include #include +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" #include "arrow/array/util.h" +#include "arrow/chunked_array.h" #include "arrow/record_batch.h" #include 
"arrow/status.h" #include "arrow/table.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" -#include "arrow/testing/random.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" #include "arrow/util/key_value_metadata.h" namespace arrow { -class TestChunkedArray : public TestBase { - protected: - virtual void Construct() { - one_ = std::make_shared(arrays_one_); - if (!arrays_another_.empty()) { - another_ = std::make_shared(arrays_another_); - } - } - - ArrayVector arrays_one_; - ArrayVector arrays_another_; - - std::shared_ptr one_; - std::shared_ptr another_; -}; - -TEST_F(TestChunkedArray, BasicEquals) { - std::vector null_bitmap(100, true); - std::vector data(100, 1); - std::shared_ptr array; - ArrayFromVector(null_bitmap, data, &array); - arrays_one_.push_back(array); - arrays_another_.push_back(array); - - Construct(); - ASSERT_TRUE(one_->Equals(one_)); - ASSERT_FALSE(one_->Equals(nullptr)); - ASSERT_TRUE(one_->Equals(another_)); - ASSERT_TRUE(one_->Equals(*another_.get())); -} - -TEST_F(TestChunkedArray, EqualsDifferingTypes) { - std::vector null_bitmap(100, true); - std::vector data32(100, 1); - std::vector data64(100, 1); - std::shared_ptr array; - ArrayFromVector(null_bitmap, data32, &array); - arrays_one_.push_back(array); - ArrayFromVector(null_bitmap, data64, &array); - arrays_another_.push_back(array); - - Construct(); - ASSERT_FALSE(one_->Equals(another_)); - ASSERT_FALSE(one_->Equals(*another_.get())); -} - -TEST_F(TestChunkedArray, EqualsDifferingLengths) { - std::vector null_bitmap100(100, true); - std::vector null_bitmap101(101, true); - std::vector data100(100, 1); - std::vector data101(101, 1); - std::shared_ptr array; - ArrayFromVector(null_bitmap100, data100, &array); - arrays_one_.push_back(array); - ArrayFromVector(null_bitmap101, data101, &array); - arrays_another_.push_back(array); - - Construct(); - ASSERT_FALSE(one_->Equals(another_)); - ASSERT_FALSE(one_->Equals(*another_.get())); - - std::vector 
null_bitmap1(1, true); - std::vector data1(1, 1); - ArrayFromVector(null_bitmap1, data1, &array); - arrays_one_.push_back(array); - - Construct(); - ASSERT_TRUE(one_->Equals(another_)); - ASSERT_TRUE(one_->Equals(*another_.get())); -} - -TEST_F(TestChunkedArray, EqualsDifferingMetadata) { - auto left_ty = list(field("item", int32())); - - auto metadata = key_value_metadata({"foo"}, {"bar"}); - auto right_ty = list(field("item", int32(), true, metadata)); - - std::vector> left_chunks = {ArrayFromJSON(left_ty, "[[]]")}; - std::vector> right_chunks = {ArrayFromJSON(right_ty, "[[]]")}; - - ChunkedArray left(left_chunks); - ChunkedArray right(right_chunks); - ASSERT_TRUE(left.Equals(right)); -} - -TEST_F(TestChunkedArray, SliceEquals) { - arrays_one_.push_back(MakeRandomArray(100)); - arrays_one_.push_back(MakeRandomArray(50)); - arrays_one_.push_back(MakeRandomArray(50)); - Construct(); - - std::shared_ptr slice = one_->Slice(125, 50); - ASSERT_EQ(slice->length(), 50); - AssertChunkedEqual(*one_->Slice(125, 50), *slice); - - std::shared_ptr slice2 = one_->Slice(75)->Slice(25)->Slice(25, 50); - ASSERT_EQ(slice2->length(), 50); - AssertChunkedEqual(*slice, *slice2); - - // Making empty slices of a ChunkedArray - std::shared_ptr slice3 = one_->Slice(one_->length(), 99); - ASSERT_EQ(slice3->length(), 0); - ASSERT_EQ(slice3->num_chunks(), 1); - ASSERT_TRUE(slice3->type()->Equals(one_->type())); - - std::shared_ptr slice4 = one_->Slice(10, 0); - ASSERT_EQ(slice4->length(), 0); - ASSERT_EQ(slice4->num_chunks(), 1); - ASSERT_TRUE(slice4->type()->Equals(one_->type())); - - // Slicing an empty ChunkedArray - std::shared_ptr slice5 = slice4->Slice(0, 10); - ASSERT_EQ(slice5->length(), 0); - ASSERT_EQ(slice5->num_chunks(), 1); - ASSERT_TRUE(slice5->type()->Equals(one_->type())); -} - -TEST_F(TestChunkedArray, ZeroChunksIssues) { - ArrayVector empty = {}; - auto no_chunks = std::make_shared(empty, int8()); - - // ARROW-8911, assert that slicing is a no-op when there are zero-chunks 
- auto sliced = no_chunks->Slice(0, 0); - auto sliced2 = no_chunks->Slice(0, 5); - AssertChunkedEqual(*no_chunks, *sliced); - AssertChunkedEqual(*no_chunks, *sliced2); -} - -TEST_F(TestChunkedArray, Validate) { - // Valid if empty - ArrayVector empty = {}; - auto no_chunks = std::make_shared(empty, utf8()); - ASSERT_OK(no_chunks->ValidateFull()); - - random::RandomArrayGenerator gen(0); - arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); - Construct(); - ASSERT_OK(one_->ValidateFull()); - - arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); - Construct(); - ASSERT_OK(one_->ValidateFull()); - - arrays_one_.push_back(gen.String(50, 0, 10, 0.1)); - Construct(); - ASSERT_RAISES(Invalid, one_->ValidateFull()); -} - -TEST_F(TestChunkedArray, View) { - auto in_ty = int32(); - auto out_ty = fixed_size_binary(4); -#if ARROW_LITTLE_ENDIAN - auto arr = ArrayFromJSON(in_ty, "[2020568934, 2054316386, null]"); - auto arr2 = ArrayFromJSON(in_ty, "[2020568934, 2054316386]"); -#else - auto arr = ArrayFromJSON(in_ty, "[1718579064, 1650553466, null]"); - auto arr2 = ArrayFromJSON(in_ty, "[1718579064, 1650553466]"); -#endif - auto ex = ArrayFromJSON(out_ty, R"(["foox", "barz", null])"); - auto ex2 = ArrayFromJSON(out_ty, R"(["foox", "barz"])"); - - ArrayVector chunks = {arr, arr2}; - ArrayVector ex_chunks = {ex, ex2}; - auto carr = std::make_shared(chunks); - auto expected = std::make_shared(ex_chunks); - - ASSERT_OK_AND_ASSIGN(auto result, carr->View(out_ty)); - AssertChunkedEqual(*expected, *result); - - // Zero length - ArrayVector empty = {}; - carr = std::make_shared(empty, in_ty); - expected = std::make_shared(empty, out_ty); - ASSERT_OK_AND_ASSIGN(result, carr->View(out_ty)); - AssertChunkedEqual(*expected, *result); -} - class TestTable : public TestBase { public: void MakeExample1(int length) { @@ -804,219 +628,6 @@ TEST_F(TestTable, AddColumn) { ASSERT_TRUE(result->Equals(*expected)); } -class TestRecordBatch : public TestBase {}; - -TEST_F(TestRecordBatch, Equals) { - 
const int length = 10; - - auto f0 = field("f0", int32()); - auto f1 = field("f1", uint8()); - auto f2 = field("f2", int16()); - - auto metadata = key_value_metadata({"foo"}, {"bar"}); - - std::vector> fields = {f0, f1, f2}; - auto schema = ::arrow::schema({f0, f1, f2}); - auto schema2 = ::arrow::schema({f0, f1}); - auto schema3 = ::arrow::schema({f0, f1, f2}, metadata); - - auto a0 = MakeRandomArray(length); - auto a1 = MakeRandomArray(length); - auto a2 = MakeRandomArray(length); - - auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - auto b2 = RecordBatch::Make(schema3, length, {a0, a1, a2}); - auto b3 = RecordBatch::Make(schema2, length, {a0, a1}); - auto b4 = RecordBatch::Make(schema, length, {a0, a1, a1}); - - ASSERT_TRUE(b1->Equals(*b1)); - ASSERT_FALSE(b1->Equals(*b3)); - ASSERT_FALSE(b1->Equals(*b4)); - - // Different metadata - ASSERT_TRUE(b1->Equals(*b2)); - ASSERT_FALSE(b1->Equals(*b2, /*check_metadata=*/true)); -} - -TEST_F(TestRecordBatch, Validate) { - const int length = 10; - - auto f0 = field("f0", int32()); - auto f1 = field("f1", uint8()); - auto f2 = field("f2", int16()); - - auto schema = ::arrow::schema({f0, f1, f2}); - - auto a0 = MakeRandomArray(length); - auto a1 = MakeRandomArray(length); - auto a2 = MakeRandomArray(length); - auto a3 = MakeRandomArray(5); - - auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - - ASSERT_OK(b1->ValidateFull()); - - // Length mismatch - auto b2 = RecordBatch::Make(schema, length, {a0, a1, a3}); - ASSERT_RAISES(Invalid, b2->ValidateFull()); - - // Type mismatch - auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); - ASSERT_RAISES(Invalid, b3->ValidateFull()); -} - -TEST_F(TestRecordBatch, Slice) { - const int length = 7; - - auto f0 = field("f0", int32()); - auto f1 = field("f1", uint8()); - auto f2 = field("f2", int8()); - - std::vector> fields = {f0, f1, f2}; - auto schema = ::arrow::schema(fields); - - auto a0 = MakeRandomArray(length); - auto a1 = MakeRandomArray(length); - auto 
a2 = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6]"); - - auto batch = RecordBatch::Make(schema, length, {a0, a1, a2}); - - auto batch_slice = batch->Slice(2); - auto batch_slice2 = batch->Slice(1, 5); - - ASSERT_EQ(batch_slice->num_rows(), batch->num_rows() - 2); - - for (int i = 0; i < batch->num_columns(); ++i) { - ASSERT_EQ(2, batch_slice->column(i)->offset()); - ASSERT_EQ(length - 2, batch_slice->column(i)->length()); - - ASSERT_EQ(1, batch_slice2->column(i)->offset()); - ASSERT_EQ(5, batch_slice2->column(i)->length()); - } - - // ARROW-9143: RecordBatch::Slice was incorrectly setting a2's - // ArrayData::null_count to kUnknownNullCount - ASSERT_EQ(batch_slice->column(2)->data()->null_count, 0); - ASSERT_EQ(batch_slice2->column(2)->data()->null_count, 0); -} - -TEST_F(TestRecordBatch, AddColumn) { - const int length = 10; - - auto field1 = field("f1", int32()); - auto field2 = field("f2", uint8()); - auto field3 = field("f3", int16()); - - auto schema1 = ::arrow::schema({field1, field2}); - auto schema2 = ::arrow::schema({field2, field3}); - auto schema3 = ::arrow::schema({field2}); - - auto array1 = MakeRandomArray(length); - auto array2 = MakeRandomArray(length); - auto array3 = MakeRandomArray(length); - - auto batch1 = RecordBatch::Make(schema1, length, {array1, array2}); - auto batch2 = RecordBatch::Make(schema2, length, {array2, array3}); - auto batch3 = RecordBatch::Make(schema3, length, {array2}); - - const RecordBatch& batch = *batch3; - - // Negative tests with invalid index - ASSERT_RAISES(Invalid, batch.AddColumn(5, field1, array1)); - ASSERT_RAISES(Invalid, batch.AddColumn(2, field1, array1)); - ASSERT_RAISES(Invalid, batch.AddColumn(-1, field1, array1)); - - // Negative test with wrong length - auto longer_col = MakeRandomArray(length + 1); - ASSERT_RAISES(Invalid, batch.AddColumn(0, field1, longer_col)); - - // Negative test with mismatch type - ASSERT_RAISES(Invalid, batch.AddColumn(0, field1, array2)); - - ASSERT_OK_AND_ASSIGN(auto new_batch, 
batch.AddColumn(0, field1, array1)); - AssertBatchesEqual(*new_batch, *batch1); - - ASSERT_OK_AND_ASSIGN(new_batch, batch.AddColumn(1, field3, array3)); - AssertBatchesEqual(*new_batch, *batch2); - - ASSERT_OK_AND_ASSIGN(auto new_batch2, batch.AddColumn(1, "f3", array3)); - AssertBatchesEqual(*new_batch2, *new_batch); - - ASSERT_TRUE(new_batch2->schema()->field(1)->nullable()); -} - -TEST_F(TestRecordBatch, RemoveColumn) { - const int length = 10; - - auto field1 = field("f1", int32()); - auto field2 = field("f2", uint8()); - auto field3 = field("f3", int16()); - - auto schema1 = ::arrow::schema({field1, field2, field3}); - auto schema2 = ::arrow::schema({field2, field3}); - auto schema3 = ::arrow::schema({field1, field3}); - auto schema4 = ::arrow::schema({field1, field2}); - - auto array1 = MakeRandomArray(length); - auto array2 = MakeRandomArray(length); - auto array3 = MakeRandomArray(length); - - auto batch1 = RecordBatch::Make(schema1, length, {array1, array2, array3}); - auto batch2 = RecordBatch::Make(schema2, length, {array2, array3}); - auto batch3 = RecordBatch::Make(schema3, length, {array1, array3}); - auto batch4 = RecordBatch::Make(schema4, length, {array1, array2}); - - const RecordBatch& batch = *batch1; - std::shared_ptr result; - - // Negative tests with invalid index - ASSERT_RAISES(Invalid, batch.RemoveColumn(3)); - ASSERT_RAISES(Invalid, batch.RemoveColumn(-1)); - - ASSERT_OK_AND_ASSIGN(auto new_batch, batch.RemoveColumn(0)); - AssertBatchesEqual(*new_batch, *batch2); - - ASSERT_OK_AND_ASSIGN(new_batch, batch.RemoveColumn(1)); - AssertBatchesEqual(*new_batch, *batch3); - - ASSERT_OK_AND_ASSIGN(new_batch, batch.RemoveColumn(2)); - AssertBatchesEqual(*new_batch, *batch4); -} - -TEST_F(TestRecordBatch, RemoveColumnEmpty) { - const int length = 10; - - auto field1 = field("f1", int32()); - auto schema1 = ::arrow::schema({field1}); - auto array1 = MakeRandomArray(length); - auto batch1 = RecordBatch::Make(schema1, length, {array1}); - - 
ASSERT_OK_AND_ASSIGN(auto empty, batch1->RemoveColumn(0)); - ASSERT_EQ(batch1->num_rows(), empty->num_rows()); - - ASSERT_OK_AND_ASSIGN(auto added, empty->AddColumn(0, field1, array1)); - AssertBatchesEqual(*added, *batch1); -} - -TEST_F(TestRecordBatch, ToFromEmptyStructArray) { - auto batch1 = - RecordBatch::Make(::arrow::schema({}), 10, std::vector>{}); - ASSERT_OK_AND_ASSIGN(auto struct_array, batch1->ToStructArray()); - ASSERT_EQ(10, struct_array->length()); - ASSERT_OK_AND_ASSIGN(auto batch2, RecordBatch::FromStructArray(struct_array)); - ASSERT_TRUE(batch1->Equals(*batch2)); -} - -TEST_F(TestRecordBatch, FromStructArrayInvalidType) { - ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(MakeRandomArray(10))); -} - -TEST_F(TestRecordBatch, FromStructArrayInvalidNullCount) { - auto struct_array = - ArrayFromJSON(struct_({field("f1", int32())}), R"([{"f1": 1}, null])"); - ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(struct_array)); -} - class TestTableBatchReader : public TestBase {}; TEST_F(TestTableBatchReader, ReadNext) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 95c3a919897..1ebf7f54605 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -29,6 +29,7 @@ #include #include "arrow/array.h" +#include "arrow/chunked_array.h" #include "arrow/compare.h" #include "arrow/record_batch.h" #include "arrow/result.h" diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e218f8451ac..aee01e16b16 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -30,7 +30,7 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/table.h" +#include "arrow/chunked_array.h" #include "arrow/type.h" #include "arrow/util/bit_stream_utils.h" #include "arrow/util/checked_cast.h" diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 1ecd35c5f2b..b9effd5adad 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -23,7 +23,7 @@ 
#if defined(ARROW_R_WITH_ARROW) #include #include -#include +#include #include #include diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index eb0cca28296..babc67a1813 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -22,7 +22,7 @@ using Rcpp::wrap; #if defined(ARROW_R_WITH_ARROW) -#include +#include // [[arrow::export]] int ChunkedArray__length(const std::shared_ptr& chunked_array) {