From bc05614309a5493a0a34ba19673a510d7e5985bb Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 30 Dec 2021 17:33:18 -0500 Subject: [PATCH 01/25] add binary search for finding a chunk --- cpp/src/arrow/chunked_array.cc | 57 +++++++++++++++++++++++----- cpp/src/arrow/chunked_array.h | 1 + python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/table.pxi | 6 ++- 4 files changed, 54 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 30746902945..0a8896a6475 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,44 @@ namespace arrow { using internal::checked_cast; +namespace internal { + +Result> chunked_array_binary_search( + const ArrayVector& chunks, const std::vector& inds, int64_t index) { + auto bound_iter = std::upper_bound(inds.begin(), inds.end(), index); + if (bound_iter == inds.end()) { + return Status::Invalid("index out of bounds"); + } + auto chunk_idx = std::distance(inds.begin(), bound_iter); + auto chunk = chunks[chunk_idx]; + auto rel_index = index - (chunk_idx ? inds[chunk_idx - 1] : 0); + return chunk->GetScalar(rel_index); +} + +Result> chunked_array_binary_search2( + const ArrayVector& chunks, const std::vector& inds, int64_t index) { + auto bound_iter = std::upper_bound(inds.begin(), inds.end(), index); + if (bound_iter == inds.begin() || bound_iter == inds.end()) { + return Status::Invalid("index out of bounds"); + } + auto chunk_idx = std::distance(std::next(inds.begin()), bound_iter); + auto rel_index = index - inds[chunk_idx]; + return chunks[chunk_idx]->GetScalar(rel_index); +} + +Result> chunked_array_linear_search(const ArrayVector& chunks, + int64_t index) { + for (const auto& chunk : chunks) { + if (index < chunk->length()) { + return chunk->GetScalar(index); + } + index -= chunk->length(); + } + return Status::Invalid("index out of bounds"); +} + +} // namespace internal + class MemoryPool; // ---------------------------------------------------------------------- @@ -44,16 +83,18 @@ class MemoryPool; ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) : chunks_(std::move(chunks)), type_(std::move(type)) { - length_ = 0; - null_count_ = 0; - if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; type_ = chunks_[0]->type(); } + + length_ = 0; + null_count_ = 0; + inds_.push_back(length_); for (const auto& chunk : chunks_) { length_ += chunk->length(); + inds_.push_back(length_); null_count_ += chunk->null_count(); } } @@ -147,13 +188,9 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - for (const auto& chunk : chunks_) { - if (index < chunk->length()) { - return chunk->GetScalar(index); - } - index -= chunk->length(); - } - return Status::Invalid("index out of bounds"); + // return internal::chunked_array_linear_search(chunks_, index); + // return internal::chunked_array_binary_search(chunks_, inds_, index); + return internal::chunked_array_binary_search2(chunks_, inds_, index); } std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 36b2f38ed43..caa6f9af4a3 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -177,6 +177,7 @@ class ARROW_EXPORT ChunkedArray { protected: ArrayVector chunks_; + std::vector inds_; int64_t length_; int64_t null_count_; std::shared_ptr type_; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 55935feba67..c8e945dc14a 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -754,6 +754,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] chunk(int i) shared_ptr[CDataType] type() + CResult[shared_ptr[CScalar]] GetScalar(int64_t index) const shared_ptr[CChunkedArray] Slice(int64_t offset, int64_t length) const shared_ptr[CChunkedArray] Slice(int64_t offset) const diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index c4e83266529..ce61e80049d 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -285,10 +285,14 @@ cdef class ChunkedArray(_PandasConvertible): ------- value : Scalar (index) or ChunkedArray (slice) """ + cdef shared_ptr[CScalar] c_scalar + if isinstance(key, slice): return _normalize_slice(self, key) - return self.getitem(_normalize_index(key, self.chunked_array.length())) + # return self.getitem(_normalize_index(key, self.chunked_array.length())) + c_scalar = GetResultValue(self.chunked_array.GetScalar(_normalize_index(key, self.chunked_array.length()))) + return Scalar.wrap( c_scalar) cdef getitem(self, int64_t index): cdef int j From 8434d14ca8d26fcf7a7aad6ce95142c2f93ccd4b Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Sat, 26 Feb 2022 22:48:21 -0500 Subject: [PATCH 02/25] (unstable) use ChunkResolver --- cpp/src/arrow/chunked_array.cc | 48 ++----------------- cpp/src/arrow/chunked_array.h | 8 ++-- cpp/src/arrow/chunked_array_test.cc | 2 +- .../arrow/compute/kernels/chunked_internal.h | 2 +- cpp/src/arrow/public_api_test.cc | 6 +-- python/pyarrow/table.pxi | 1 - 6 files changed, 13 insertions(+), 54 deletions(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 0a8896a6475..af3d335f938 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -38,51 +37,13 @@ namespace arrow { using internal::checked_cast; -namespace internal { - -Result> chunked_array_binary_search( - const ArrayVector& chunks, const std::vector& inds, int64_t index) { - auto bound_iter = std::upper_bound(inds.begin(), inds.end(), index); - if (bound_iter == inds.end()) { - return Status::Invalid("index out of bounds"); - } - auto chunk_idx = std::distance(inds.begin(), bound_iter); - auto chunk = chunks[chunk_idx]; - auto rel_index = index - (chunk_idx ? inds[chunk_idx - 1] : 0); - return chunk->GetScalar(rel_index); -} - -Result> chunked_array_binary_search2( - const ArrayVector& chunks, const std::vector& inds, int64_t index) { - auto bound_iter = std::upper_bound(inds.begin(), inds.end(), index); - if (bound_iter == inds.begin() || bound_iter == inds.end()) { - return Status::Invalid("index out of bounds"); - } - auto chunk_idx = std::distance(std::next(inds.begin()), bound_iter); - auto rel_index = index - inds[chunk_idx]; - return chunks[chunk_idx]->GetScalar(rel_index); -} - -Result> chunked_array_linear_search(const ArrayVector& chunks, - int64_t index) { - for (const auto& chunk : chunks) { - if (index < chunk->length()) { - return chunk->GetScalar(index); - } - index -= chunk->length(); - } - return Status::Invalid("index out of bounds"); -} - -} // namespace internal - class MemoryPool; // ---------------------------------------------------------------------- // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)) { + : chunks_(std::move(chunks)), type_(std::move(type)), resolver_(compute::internal::ChunkedArrayResolver::MakeLengths(compute::internal::GetArrayPointers(chunks_))) { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; @@ -91,10 +52,8 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) length_ = 0; null_count_ = 0; - inds_.push_back(length_); for (const auto& chunk : chunks_) { length_ += chunk->length(); - inds_.push_back(length_); null_count_ += chunk->null_count(); } } @@ -188,9 +147,8 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - // return internal::chunked_array_linear_search(chunks_, index); - // return internal::chunked_array_binary_search(chunks_, inds_, index); - return internal::chunked_array_binary_search2(chunks_, inds_, index); + auto chunk_location = resolver_.Resolve(index); + return chunks_[chunk_location.chunk_index]->GetScalar(chunk_location.index_in_chunk); } std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index caa6f9af4a3..291aa5ee323 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -24,6 +24,7 @@ #include #include "arrow/compare.h" +#include "arrow/compute/kernels/chunked_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -67,8 +68,8 @@ class MemoryPool; /// inputs should not expect the chunk layout to be the same in each input. class ARROW_EXPORT ChunkedArray { public: - ChunkedArray(ChunkedArray&&) = default; - ChunkedArray& operator=(ChunkedArray&&) = default; + // ChunkedArray(ChunkedArray&&) = default; + // ChunkedArray& operator=(ChunkedArray&&) = default; /// \brief Construct a chunked array from a single Array explicit ChunkedArray(std::shared_ptr chunk) @@ -177,12 +178,13 @@ class ARROW_EXPORT ChunkedArray { protected: ArrayVector chunks_; - std::vector inds_; int64_t length_; int64_t null_count_; std::shared_ptr type_; private: + std::vector lengths_; + compute::internal::ChunkResolver resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index a623cd47065..d1dc69de274 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -278,7 +278,7 @@ TEST_F(TestChunkedArray, GetScalar) { check_scalar(carr, 4, **MakeScalar(ty, 3)); check_scalar(carr, 6, **MakeScalar(ty, 5)); - ASSERT_RAISES(Invalid, carr.GetScalar(7)); + ASSERT_RAISES(IndexError, carr.GetScalar(7)); } } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index b007d6cbfb8..b5a2a6473c4 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -144,7 +144,6 @@ struct ChunkedArrayResolver : protected ChunkResolver { checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk); } - protected: static std::vector MakeLengths(const std::vector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), @@ -152,6 +151,7 @@ struct ChunkedArrayResolver : protected ChunkResolver { return lengths; } + protected: const std::vector chunks_; }; diff --git a/cpp/src/arrow/public_api_test.cc b/cpp/src/arrow/public_api_test.cc index 45f3313c67f..dbb65c0d239 100644 --- a/cpp/src/arrow/public_api_test.cc +++ b/cpp/src/arrow/public_api_test.cc @@ -54,9 +54,9 @@ #include "arrow/python/api.h" // IWYU pragma: keep #endif -#ifdef DCHECK -#error "DCHECK should not be visible from Arrow public headers." -#endif +// #ifdef DCHECK +// #error "DCHECK should not be visible from Arrow public headers." +// #endif #ifdef ASSIGN_OR_RAISE #error "ASSIGN_OR_RAISE should not be visible from Arrow public headers." diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index ce61e80049d..7a9b9494ef1 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -290,7 +290,6 @@ cdef class ChunkedArray(_PandasConvertible): if isinstance(key, slice): return _normalize_slice(self, key) - # return self.getitem(_normalize_index(key, self.chunked_array.length())) c_scalar = GetResultValue(self.chunked_array.GetScalar(_normalize_index(key, self.chunked_array.length()))) return Scalar.wrap( c_scalar) From c2082087748431a442c662ec895211d6fba9fdc7 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 28 Feb 2022 02:31:22 -0500 Subject: [PATCH 03/25] split ChunkResolver/ChunkedArrayResolver files --- cpp/src/arrow/chunked_array.cc | 6 +- cpp/src/arrow/chunked_array.h | 7 +- cpp/src/arrow/chunked_array_internal.h | 113 ++++++++++++++++++ .../arrow/compute/kernels/chunked_internal.h | 80 +------------ .../compute/kernels/vector_sort_internal.h | 1 + cpp/src/arrow/public_api_test.cc | 6 +- 6 files changed, 127 insertions(+), 86 deletions(-) create mode 100644 cpp/src/arrow/chunked_array_internal.h diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index af3d335f938..99e72529000 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -43,7 +43,7 @@ class MemoryPool; // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)), resolver_(compute::internal::ChunkedArrayResolver::MakeLengths(compute::internal::GetArrayPointers(chunks_))) { + : chunks_(std::move(chunks)), type_(std::move(type)), resolver_{compute::internal::ChunkResolver::FromChunks(chunks_)} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; @@ -147,8 +147,8 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - auto chunk_location = resolver_.Resolve(index); - return chunks_[chunk_location.chunk_index]->GetScalar(chunk_location.index_in_chunk); + const auto loc = resolver_.Resolve(index); + return chunks_[loc.chunk_index]->GetScalar(loc.index_in_chunk); } std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 291aa5ee323..923116ef618 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -24,7 +24,7 @@ #include #include "arrow/compare.h" -#include "arrow/compute/kernels/chunked_internal.h" +#include "arrow/chunked_array_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -68,8 +68,8 @@ class MemoryPool; /// inputs should not expect the chunk layout to be the same in each input. class ARROW_EXPORT ChunkedArray { public: - // ChunkedArray(ChunkedArray&&) = default; - // ChunkedArray& operator=(ChunkedArray&&) = default; + ChunkedArray(ChunkedArray&&) = default; + ChunkedArray& operator=(ChunkedArray&&) = default; /// \brief Construct a chunked array from a single Array explicit ChunkedArray(std::shared_ptr chunk) @@ -183,7 +183,6 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - std::vector lengths_; compute::internal::ChunkResolver resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/chunked_array_internal.h b/cpp/src/arrow/chunked_array_internal.h new file mode 100644 index 00000000000..25039883abe --- /dev/null +++ b/cpp/src/arrow/chunked_array_internal.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/type.h" + +namespace arrow { +namespace compute { +namespace internal { + +struct ChunkLocation { + int64_t chunk_index, index_in_chunk; +}; + +// An object that resolves an array chunk depending on the index. +struct ChunkResolver { + explicit ChunkResolver(std::vector lengths) + : num_chunks_(static_cast(lengths.size())), + offsets_(MakeEndOffsets(std::move(lengths))), + cached_chunk_(0) {} + + ChunkLocation Resolve(int64_t index) const { + // It is common for the algorithms below to make consecutive accesses at + // a relatively small distance from each other, hence often falling in + // the same chunk. + // This is trivial when merging (assuming each side of the merge uses + // its own resolver), but also in the inner recursive invocations of + // partitioning. + const bool cache_hit = + (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); + if (ARROW_PREDICT_TRUE(cache_hit)) { + return {cached_chunk_, index - offsets_[cached_chunk_]}; + } else { + return ResolveMissBisect(index); + } + } + + static ChunkResolver FromChunks(const ArrayVector& chunks) { + std::vector lengths(chunks.size()); + std::transform(chunks.begin(), chunks.end(), lengths.begin(), + [](const std::shared_ptr& arr) { return arr->length(); }); + return ChunkResolver(std::move(lengths)); + } + + static ChunkResolver FromBatches(const RecordBatchVector& batches) { + std::vector lengths(batches.size()); + std::transform( + batches.begin(), batches.end(), lengths.begin(), + [](const std::shared_ptr& batch) { return batch->num_rows(); }); + return ChunkResolver(std::move(lengths)); + } + + protected: + ChunkLocation ResolveMissBisect(int64_t index) const { + // Like std::upper_bound(), but hand-written as it can help the compiler. + const int64_t* raw_offsets = offsets_.data(); + // Search [lo, lo + n) + int64_t lo = 0, n = num_chunks_; + while (n > 1) { + int64_t m = n >> 1; + int64_t mid = lo + m; + if (index >= raw_offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } + cached_chunk_ = lo; + return {lo, index - offsets_[lo]}; + } + + static std::vector MakeEndOffsets(std::vector lengths) { + int64_t offset = 0; + for (auto& v : lengths) { + const auto this_length = v; + v = offset; + offset += this_length; + } + lengths.push_back(offset); + return lengths; + } + + int64_t num_chunks_; + std::vector offsets_; + + mutable int64_t cached_chunk_; +}; + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index b5a2a6473c4..9db57a3371a 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -61,81 +61,9 @@ struct ResolvedChunk { bool IsNull() const { return array->IsNull(index); } }; -struct ChunkLocation { - int64_t chunk_index, index_in_chunk; -}; - -// An object that resolves an array chunk depending on the index. -struct ChunkResolver { - explicit ChunkResolver(std::vector lengths) - : num_chunks_(static_cast(lengths.size())), - offsets_(MakeEndOffsets(std::move(lengths))), - cached_chunk_(0) {} - - ChunkLocation Resolve(int64_t index) const { - // It is common for the algorithms below to make consecutive accesses at - // a relatively small distance from each other, hence often falling in - // the same chunk. - // This is trivial when merging (assuming each side of the merge uses - // its own resolver), but also in the inner recursive invocations of - // partitioning. - const bool cache_hit = - (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); - if (ARROW_PREDICT_TRUE(cache_hit)) { - return {cached_chunk_, index - offsets_[cached_chunk_]}; - } else { - return ResolveMissBisect(index); - } - } - - static ChunkResolver FromBatches(const RecordBatchVector& batches) { - std::vector lengths(batches.size()); - std::transform( - batches.begin(), batches.end(), lengths.begin(), - [](const std::shared_ptr& batch) { return batch->num_rows(); }); - return ChunkResolver(std::move(lengths)); - } - - protected: - ChunkLocation ResolveMissBisect(int64_t index) const { - // Like std::upper_bound(), but hand-written as it can help the compiler. - const int64_t* raw_offsets = offsets_.data(); - // Search [lo, lo + n) - int64_t lo = 0, n = num_chunks_; - while (n > 1) { - int64_t m = n >> 1; - int64_t mid = lo + m; - if (index >= raw_offsets[mid]) { - lo = mid; - n -= m; - } else { - n = m; - } - } - cached_chunk_ = lo; - return {lo, index - offsets_[lo]}; - } - - static std::vector MakeEndOffsets(std::vector lengths) { - int64_t offset = 0; - for (auto& v : lengths) { - const auto this_length = v; - v = offset; - offset += this_length; - } - lengths.push_back(offset); - return lengths; - } - - int64_t num_chunks_; - std::vector offsets_; - - mutable int64_t cached_chunk_; -}; - struct ChunkedArrayResolver : protected ChunkResolver { explicit ChunkedArrayResolver(const std::vector& chunks) - : ChunkResolver(MakeLengths(chunks)), chunks_(chunks) {} + : ChunkResolver{FromArrayPointers(chunks)}, chunks_(chunks) {} template ResolvedChunk Resolve(int64_t index) const { @@ -144,14 +72,14 @@ struct ChunkedArrayResolver : protected ChunkResolver { checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk); } - static std::vector MakeLengths(const std::vector& chunks) { + protected: + static ChunkResolver FromArrayPointers(const std::vector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), [](const Array* arr) { return arr->length(); }); - return lengths; + return ChunkResolver(std::move(lengths)); } - protected: const std::vector chunks_; }; diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index d8b024525c8..fc2eb0593cd 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -24,6 +24,7 @@ #include "arrow/array.h" #include "arrow/compute/api_vector.h" +#include "arrow/chunked_array_internal.h" #include "arrow/compute/kernels/chunked_internal.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/public_api_test.cc b/cpp/src/arrow/public_api_test.cc index dbb65c0d239..45f3313c67f 100644 --- a/cpp/src/arrow/public_api_test.cc +++ b/cpp/src/arrow/public_api_test.cc @@ -54,9 +54,9 @@ #include "arrow/python/api.h" // IWYU pragma: keep #endif -// #ifdef DCHECK -// #error "DCHECK should not be visible from Arrow public headers." -// #endif +#ifdef DCHECK +#error "DCHECK should not be visible from Arrow public headers." +#endif #ifdef ASSIGN_OR_RAISE #error "ASSIGN_OR_RAISE should not be visible from Arrow public headers." From 210b4fa89d90dc9e4d6a4648008f3044dbb9620c Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 28 Feb 2022 03:47:55 -0500 Subject: [PATCH 04/25] add source/header for ChunkResolver --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/chunk_resolver.cc | 96 +++++++++++++++ cpp/src/arrow/chunk_resolver.h | 60 ++++++++++ cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/chunked_array_internal.h | 113 ------------------ .../arrow/compute/kernels/chunked_internal.h | 4 +- .../compute/kernels/vector_sort_internal.h | 2 +- 7 files changed, 161 insertions(+), 117 deletions(-) create mode 100644 cpp/src/arrow/chunk_resolver.cc create mode 100644 cpp/src/arrow/chunk_resolver.h delete mode 100644 cpp/src/arrow/chunked_array_internal.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b6f1e2481fa..29334572874 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -158,6 +158,7 @@ set(ARROW_SRCS builder.cc buffer.cc chunked_array.cc + chunk_resolver.cc compare.cc config.cc datum.cc diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc new file mode 100644 index 00000000000..5bc501e40f7 --- /dev/null +++ b/cpp/src/arrow/chunk_resolver.cc @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "chunk_resolver.h" + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" + +namespace arrow { +namespace compute { +namespace internal { + +ChunkLocation ChunkResolver::Resolve(int64_t index) const { + // It is common for the algorithms below to make consecutive accesses at + // a relatively small distance from each other, hence often falling in + // the same chunk. + // This is trivial when merging (assuming each side of the merge uses + // its own resolver), but also in the inner recursive invocations of + // partitioning. + const bool cache_hit = + (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); + if (ARROW_PREDICT_TRUE(cache_hit)) { + return {cached_chunk_, index - offsets_[cached_chunk_]}; + } else { + return ResolveMissBisect(index); + } +} + +ChunkResolver ChunkResolver::FromChunks(const ArrayVector& chunks) { + std::vector lengths(chunks.size()); + std::transform(chunks.begin(), chunks.end(), lengths.begin(), + [](const std::shared_ptr& arr) { return arr->length(); }); + return ChunkResolver(std::move(lengths)); +} + +ChunkResolver ChunkResolver::FromBatches(const RecordBatchVector& batches) { + std::vector lengths(batches.size()); + std::transform( + batches.begin(), batches.end(), lengths.begin(), + [](const std::shared_ptr& batch) { return batch->num_rows(); }); + return ChunkResolver(std::move(lengths)); +} + +ChunkLocation ChunkResolver::ResolveMissBisect(int64_t index) const { + // Like std::upper_bound(), but hand-written as it can help the compiler. + const int64_t* raw_offsets = offsets_.data(); + // Search [lo, lo + n) + int64_t lo = 0, n = num_chunks_; + while (n > 1) { + int64_t m = n >> 1; + int64_t mid = lo + m; + if (index >= raw_offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } + cached_chunk_ = lo; + return {lo, index - offsets_[lo]}; +} + +std::vector ChunkResolver::MakeEndOffsets(std::vector lengths) { + int64_t offset = 0; + for (auto& v : lengths) { + const auto this_length = v; + v = offset; + offset += this_length; + } + lengths.push_back(offset); + return lengths; +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h new file mode 100644 index 00000000000..d98cec4c8fe --- /dev/null +++ b/cpp/src/arrow/chunk_resolver.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/type_fwd.h" + +namespace arrow { +namespace compute { +namespace internal { + +struct ChunkLocation { + int64_t chunk_index, index_in_chunk; +}; + +// An object that resolves an array chunk depending on the index. +struct ChunkResolver { + explicit ChunkResolver(std::vector lengths) + : num_chunks_(static_cast(lengths.size())), + offsets_(MakeEndOffsets(std::move(lengths))), + cached_chunk_(0) {} + + ChunkLocation Resolve(int64_t index) const; + + static ChunkResolver FromChunks(const ArrayVector& chunks); + + static ChunkResolver FromBatches(const RecordBatchVector& batches); + + protected: + ChunkLocation ResolveMissBisect(int64_t index) const; + + static std::vector MakeEndOffsets(std::vector lengths); + + int64_t num_chunks_; + std::vector offsets_; + + mutable int64_t cached_chunk_; +}; + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 923116ef618..9531d6f3ade 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -23,8 +23,8 @@ #include #include +#include "arrow/chunk_resolver.h" #include "arrow/compare.h" -#include "arrow/chunked_array_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/chunked_array_internal.h b/cpp/src/arrow/chunked_array_internal.h deleted file mode 100644 index 25039883abe..00000000000 --- a/cpp/src/arrow/chunked_array_internal.h +++ /dev/null @@ -1,113 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/type.h" - -namespace arrow { -namespace compute { -namespace internal { - -struct ChunkLocation { - int64_t chunk_index, index_in_chunk; -}; - -// An object that resolves an array chunk depending on the index. -struct ChunkResolver { - explicit ChunkResolver(std::vector lengths) - : num_chunks_(static_cast(lengths.size())), - offsets_(MakeEndOffsets(std::move(lengths))), - cached_chunk_(0) {} - - ChunkLocation Resolve(int64_t index) const { - // It is common for the algorithms below to make consecutive accesses at - // a relatively small distance from each other, hence often falling in - // the same chunk. - // This is trivial when merging (assuming each side of the merge uses - // its own resolver), but also in the inner recursive invocations of - // partitioning. - const bool cache_hit = - (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); - if (ARROW_PREDICT_TRUE(cache_hit)) { - return {cached_chunk_, index - offsets_[cached_chunk_]}; - } else { - return ResolveMissBisect(index); - } - } - - static ChunkResolver FromChunks(const ArrayVector& chunks) { - std::vector lengths(chunks.size()); - std::transform(chunks.begin(), chunks.end(), lengths.begin(), - [](const std::shared_ptr& arr) { return arr->length(); }); - return ChunkResolver(std::move(lengths)); - } - - static ChunkResolver FromBatches(const RecordBatchVector& batches) { - std::vector lengths(batches.size()); - std::transform( - batches.begin(), batches.end(), lengths.begin(), - [](const std::shared_ptr& batch) { return batch->num_rows(); }); - return ChunkResolver(std::move(lengths)); - } - - protected: - ChunkLocation ResolveMissBisect(int64_t index) const { - // Like std::upper_bound(), but hand-written as it can help the compiler. - const int64_t* raw_offsets = offsets_.data(); - // Search [lo, lo + n) - int64_t lo = 0, n = num_chunks_; - while (n > 1) { - int64_t m = n >> 1; - int64_t mid = lo + m; - if (index >= raw_offsets[mid]) { - lo = mid; - n -= m; - } else { - n = m; - } - } - cached_chunk_ = lo; - return {lo, index - offsets_[lo]}; - } - - static std::vector MakeEndOffsets(std::vector lengths) { - int64_t offset = 0; - for (auto& v : lengths) { - const auto this_length = v; - v = offset; - offset += this_length; - } - lengths.push_back(offset); - return lengths; - } - - int64_t num_chunks_; - std::vector offsets_; - - mutable int64_t cached_chunk_; -}; - -} // namespace internal -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 9db57a3371a..98066b09744 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -19,12 +19,12 @@ #include #include +#include +#include #include #include "arrow/array.h" #include "arrow/compute/kernels/codegen_internal.h" -#include "arrow/record_batch.h" -#include "arrow/type.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index fc2eb0593cd..63ded6a858a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -23,8 +23,8 @@ #include #include "arrow/array.h" +#include "arrow/chunk_resolver.h" #include "arrow/compute/api_vector.h" -#include "arrow/chunked_array_internal.h" #include "arrow/compute/kernels/chunked_internal.h" #include "arrow/type.h" #include "arrow/type_traits.h" From 8e82ce4fc55c1d74ffb4131cd86113170ea7b210 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 28 Feb 2022 22:47:24 -0500 Subject: [PATCH 05/25] fix lint errors --- cpp/src/arrow/chunk_resolver.cc | 2 +- cpp/src/arrow/chunked_array.cc | 4 +++- python/pyarrow/table.pxi | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 5bc501e40f7..d5259a5710a 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "chunk_resolver.h" +#include "arrow/chunk_resolver.h" #include #include diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 99e72529000..9b30c611f44 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -43,7 +43,9 @@ class MemoryPool; // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)), resolver_{compute::internal::ChunkResolver::FromChunks(chunks_)} { + : chunks_(std::move(chunks)), + type_(std::move(type)), + resolver_{compute::internal::ChunkResolver::FromChunks(chunks_)} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 7a9b9494ef1..ac87d081ac5 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -290,7 +290,8 @@ cdef class ChunkedArray(_PandasConvertible): if isinstance(key, slice): return _normalize_slice(self, key) - c_scalar = GetResultValue(self.chunked_array.GetScalar(_normalize_index(key, self.chunked_array.length()))) + c_scalar = GetResultValue(self.chunked_array.GetScalar( + _normalize_index(key, self.chunked_array.length()))) return Scalar.wrap( c_scalar) cdef getitem(self, int64_t index): From 469522528e086c00ec1bd48b40fdae1f78c5230b Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 11 Mar 2022 21:47:51 -0500 Subject: [PATCH 06/25] use arrow::internal, inline ChunkResolver fast path --- cpp/src/arrow/chunk_resolver.cc | 48 ------------------ cpp/src/arrow/chunk_resolver.h | 49 ++++++++++++++++--- cpp/src/arrow/chunked_array.cc | 2 +- cpp/src/arrow/chunked_array.h | 2 +- .../arrow/compute/kernels/chunked_internal.h | 10 ++-- cpp/src/arrow/compute/kernels/vector_sort.cc | 10 ++-- 6 files changed, 55 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index d5259a5710a..1a9324ec339 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -27,25 +27,8 @@ #include "arrow/record_batch.h" namespace arrow { -namespace compute { namespace internal { -ChunkLocation ChunkResolver::Resolve(int64_t index) const { - // It is common for the algorithms below to make consecutive accesses at - // a relatively small distance from each other, hence often falling in - // the same chunk. - // This is trivial when merging (assuming each side of the merge uses - // its own resolver), but also in the inner recursive invocations of - // partitioning. - const bool cache_hit = - (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); - if (ARROW_PREDICT_TRUE(cache_hit)) { - return {cached_chunk_, index - offsets_[cached_chunk_]}; - } else { - return ResolveMissBisect(index); - } -} - ChunkResolver ChunkResolver::FromChunks(const ArrayVector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), @@ -61,36 +44,5 @@ ChunkResolver ChunkResolver::FromBatches(const RecordBatchVector& batches) { return ChunkResolver(std::move(lengths)); } -ChunkLocation ChunkResolver::ResolveMissBisect(int64_t index) const { - // Like std::upper_bound(), but hand-written as it can help the compiler. - const int64_t* raw_offsets = offsets_.data(); - // Search [lo, lo + n) - int64_t lo = 0, n = num_chunks_; - while (n > 1) { - int64_t m = n >> 1; - int64_t mid = lo + m; - if (index >= raw_offsets[mid]) { - lo = mid; - n -= m; - } else { - n = m; - } - } - cached_chunk_ = lo; - return {lo, index - offsets_[lo]}; -} - -std::vector ChunkResolver::MakeEndOffsets(std::vector lengths) { - int64_t offset = 0; - for (auto& v : lengths) { - const auto this_length = v; - v = offset; - offset += this_length; - } - lengths.push_back(offset); - return lengths; -} - } // namespace internal -} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index d98cec4c8fe..15ab609976e 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -24,7 +24,6 @@ #include "arrow/type_fwd.h" namespace arrow { -namespace compute { namespace internal { struct ChunkLocation { @@ -38,23 +37,61 @@ struct ChunkResolver { offsets_(MakeEndOffsets(std::move(lengths))), cached_chunk_(0) {} - ChunkLocation Resolve(int64_t index) const; + inline ChunkLocation Resolve(int64_t index) const { + // It is common for the algorithms below to make consecutive accesses at + // a relatively small distance from each other, hence often falling in + // the same chunk. + // This is trivial when merging (assuming each side of the merge uses + // its own resolver), but also in the inner recursive invocations of + // partitioning. + const bool cache_hit = + (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); + if (ARROW_PREDICT_TRUE(cache_hit)) { + return {cached_chunk_, index - offsets_[cached_chunk_]}; + } else { + return ResolveMissBisect(index); + } + } static ChunkResolver FromChunks(const ArrayVector& chunks); static ChunkResolver FromBatches(const RecordBatchVector& batches); protected: - ChunkLocation ResolveMissBisect(int64_t index) const; + inline ChunkLocation ResolveMissBisect(int64_t index) const { + // Like std::upper_bound(), but hand-written as it can help the compiler. + const int64_t* raw_offsets = offsets_.data(); + // Search [lo, lo + n) + int64_t lo = 0, n = num_chunks_; + while (n > 1) { + int64_t m = n >> 1; + int64_t mid = lo + m; + if (index >= raw_offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } + cached_chunk_ = lo; + return {lo, index - offsets_[lo]}; + } - static std::vector MakeEndOffsets(std::vector lengths); + static inline std::vector MakeEndOffsets(std::vector lengths) { + int64_t offset = 0; + for (auto& v : lengths) { + const auto this_length = v; + v = offset; + offset += this_length; + } + lengths.push_back(offset); + return lengths; + } int64_t num_chunks_; std::vector offsets_; - mutable int64_t cached_chunk_; }; } // namespace internal -} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 9b30c611f44..6c95230ec32 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -45,7 +45,7 @@ class MemoryPool; ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) : chunks_(std::move(chunks)), type_(std::move(type)), - resolver_{compute::internal::ChunkResolver::FromChunks(chunks_)} { + resolver_{internal::ChunkResolver::FromChunks(chunks_)} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 9531d6f3ade..e343ef3bee0 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - compute::internal::ChunkResolver resolver_; + internal::ChunkResolver resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 98066b09744..94c4f29d380 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -61,23 +61,23 @@ struct ResolvedChunk { bool IsNull() const { return array->IsNull(index); } }; -struct ChunkedArrayResolver : protected ChunkResolver { +struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver { explicit ChunkedArrayResolver(const std::vector& chunks) - : ChunkResolver{FromArrayPointers(chunks)}, chunks_(chunks) {} + : ::arrow::internal::ChunkResolver{FromArrayPointers(chunks)}, chunks_(chunks) {} template ResolvedChunk Resolve(int64_t index) const { - const auto loc = ChunkResolver::Resolve(index); + const auto loc = ::arrow::internal::ChunkResolver::Resolve(index); return ResolvedChunk( checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk); } protected: - static ChunkResolver FromArrayPointers(const std::vector& chunks) { + static ::arrow::internal::ChunkResolver FromArrayPointers(const std::vector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), [](const Array* arr) { return arr->length(); }); - return ChunkResolver(std::move(lengths)); + return ::arrow::internal::ChunkResolver(std::move(lengths)); } const std::vector chunks_; diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 1f62ce5b42a..d9c3939c0a3 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -849,10 +849,10 @@ class TableSorter { order(order), null_count(null_count) {} - using LocationType = ChunkLocation; + using LocationType = ::arrow::internal::ChunkLocation; template - ResolvedChunk GetChunk(ChunkLocation loc) const { + ResolvedChunk GetChunk(::arrow::internal::ChunkLocation loc) const { return {checked_cast(chunks[loc.chunk_index]), loc.index_in_chunk}; } @@ -895,8 +895,8 @@ class TableSorter { batches_(MakeBatches(table, &status_)), options_(options), null_placement_(options.null_placement), - left_resolver_(ChunkResolver::FromBatches(batches_)), - right_resolver_(ChunkResolver::FromBatches(batches_)), + left_resolver_(::arrow::internal::ChunkResolver::FromBatches(batches_)), + right_resolver_(::arrow::internal::ChunkResolver::FromBatches(batches_)), sort_keys_(ResolveSortKeys(table, batches_, options.sort_keys, &status_)), indices_begin_(indices_begin), indices_end_(indices_end), @@ -1137,7 +1137,7 @@ class TableSorter { const RecordBatchVector batches_; const SortOptions& options_; const NullPlacement null_placement_; - const ChunkResolver left_resolver_, right_resolver_; + const ::arrow::internal::ChunkResolver left_resolver_, right_resolver_; const std::vector sort_keys_; uint64_t* indices_begin_; uint64_t* indices_end_; From 98108f1cb38ad8101c0193e4e63a38903a020a48 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Sat, 12 Mar 2022 00:39:26 -0500 Subject: [PATCH 07/25] follow convention for getitem methods and PySlice_Checks --- python/pyarrow/table.pxi | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index ac87d081ac5..dcf2fc98880 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -222,7 +222,7 @@ cdef class ChunkedArray(_PandasConvertible): If buffers are shared between arrays then the shared portion will only be counted multiple times. - The dictionary of dictionary arrays will always be counted in their + The dictionary of dictionary arrays will always be counted in their entirety even if the array only references a portion of the dictionary. Examples @@ -285,23 +285,14 @@ cdef class ChunkedArray(_PandasConvertible): ------- value : Scalar (index) or ChunkedArray (slice) """ - cdef shared_ptr[CScalar] c_scalar - if isinstance(key, slice): + if PySlice_Check(key): return _normalize_slice(self, key) - c_scalar = GetResultValue(self.chunked_array.GetScalar( - _normalize_index(key, self.chunked_array.length()))) - return Scalar.wrap( c_scalar) - - cdef getitem(self, int64_t index): - cdef int j + return self.getitem(_normalize_index(key, self.chunked_array.length())) - for j in range(self.num_chunks): - if index < self.chunked_array.chunk(j).get().length(): - return self.chunk(j)[index] - else: - index -= self.chunked_array.chunk(j).get().length() + cdef getitem(self, int64_t i): + return Scalar.wrap(GetResultValue(self.chunked_array.GetScalar(i))) def is_null(self, *, nan_is_null=False): """ @@ -1993,10 +1984,10 @@ cdef class RecordBatch(_PandasConvertible): ------- value : Array (index/column) or RecordBatch (slice) """ - if isinstance(key, slice): + if PySlice_Check(key): return _normalize_slice(self, key) - else: - return self.column(key) + + return self.column(key) def serialize(self, memory_pool=None): """ @@ -2821,10 +2812,10 @@ cdef class Table(_PandasConvertible): ------- ChunkedArray (index/column) or Table (slice) """ - if isinstance(key, slice): + if PySlice_Check(key): return _normalize_slice(self, key) - else: - return self.column(key) + + return self.column(key) def slice(self, offset=0, length=None): """ From 43f67b473024c3b622e6d3d99698ba3befa7dec3 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Sat, 12 Mar 2022 01:02:16 -0500 Subject: [PATCH 08/25] fix lint error --- cpp/src/arrow/compute/kernels/chunked_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 94c4f29d380..1d42acf04cd 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -73,7 +73,8 @@ struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver { } protected: - static ::arrow::internal::ChunkResolver FromArrayPointers(const std::vector& chunks) { + static ::arrow::internal::ChunkResolver FromArrayPointers( + const std::vector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), [](const Array* arr) { return arr->length(); }); From 2fd223b12679e2e6860b002316c560d4e124699c Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Sat, 12 Mar 2022 02:42:16 -0500 Subject: [PATCH 09/25] add constructors --- cpp/src/arrow/chunk_resolver.cc | 10 ++++++---- cpp/src/arrow/chunk_resolver.h | 13 ++++++------- cpp/src/arrow/chunked_array.cc | 10 ++++++---- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/compute/kernels/vector_sort.cc | 4 ++-- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 1a9324ec339..a20464072f9 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -29,19 +29,21 @@ namespace arrow { namespace internal { -ChunkResolver ChunkResolver::FromChunks(const ArrayVector& chunks) { +ChunkResolver::ChunkResolver(const ArrayVector& chunks) { std::vector lengths(chunks.size()); std::transform(chunks.begin(), chunks.end(), lengths.begin(), [](const std::shared_ptr& arr) { return arr->length(); }); - return ChunkResolver(std::move(lengths)); + num_chunks_ = static_cast(lengths.size()); + offsets_ = MakeEndOffsets(std::move(lengths)); } -ChunkResolver ChunkResolver::FromBatches(const RecordBatchVector& batches) { +ChunkResolver::ChunkResolver(const RecordBatchVector& batches) { std::vector lengths(batches.size()); std::transform( batches.begin(), batches.end(), lengths.begin(), [](const std::shared_ptr& batch) { return batch->num_rows(); }); - return ChunkResolver(std::move(lengths)); + num_chunks_ = static_cast(lengths.size()); + offsets_ = MakeEndOffsets(std::move(lengths)); } } // namespace internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 15ab609976e..4164498a1dc 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -34,8 +34,11 @@ struct ChunkLocation { struct ChunkResolver { explicit ChunkResolver(std::vector lengths) : num_chunks_(static_cast(lengths.size())), - offsets_(MakeEndOffsets(std::move(lengths))), - cached_chunk_(0) {} + offsets_(MakeEndOffsets(std::move(lengths))) {} + + explicit ChunkResolver(const ArrayVector& chunks); + + explicit ChunkResolver(const RecordBatchVector& batches); inline ChunkLocation Resolve(int64_t index) const { // It is common for the algorithms below to make consecutive accesses at @@ -53,10 +56,6 @@ struct ChunkResolver { } } - static ChunkResolver FromChunks(const ArrayVector& chunks); - - static ChunkResolver FromBatches(const RecordBatchVector& batches); - protected: inline ChunkLocation ResolveMissBisect(int64_t index) const { // Like std::upper_bound(), but hand-written as it can help the compiler. @@ -90,7 +89,7 @@ struct ChunkResolver { int64_t num_chunks_; std::vector offsets_; - mutable int64_t cached_chunk_; + mutable int64_t cached_chunk_ = 0; }; } // namespace internal diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 6c95230ec32..fde3801f472 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -32,6 +32,7 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" namespace arrow { @@ -43,9 +44,7 @@ class MemoryPool; // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), - type_(std::move(type)), - resolver_{internal::ChunkResolver::FromChunks(chunks_)} { + : chunks_(std::move(chunks)), type_(std::move(type)) { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; @@ -149,7 +148,10 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - const auto loc = resolver_.Resolve(index); + if (!resolver_) { + resolver_ = internal::make_unique(chunks_); + } + const auto loc = resolver_->Resolve(index); return chunks_[loc.chunk_index]->GetScalar(loc.index_in_chunk); } diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index e343ef3bee0..9f027542bb7 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - internal::ChunkResolver resolver_; + mutable std::unique_ptr resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index d9c3939c0a3..4e8f49cf25b 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -895,8 +895,8 @@ class TableSorter { batches_(MakeBatches(table, &status_)), options_(options), null_placement_(options.null_placement), - left_resolver_(::arrow::internal::ChunkResolver::FromBatches(batches_)), - right_resolver_(::arrow::internal::ChunkResolver::FromBatches(batches_)), + left_resolver_(batches_), + right_resolver_(batches_), sort_keys_(ResolveSortKeys(table, batches_, options.sort_keys, &status_)), indices_begin_(indices_begin), indices_end_(indices_end), From 863040678cd6048471c79d8b248218d2976c2f72 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Sun, 13 Mar 2022 00:24:43 -0500 Subject: [PATCH 10/25] add bisect algorithm to utils and group common code --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/chunk_resolver.cc | 50 -------------- cpp/src/arrow/chunk_resolver.h | 66 ++++++++---------- cpp/src/arrow/chunked_array.cc | 4 ++ .../arrow/compute/kernels/chunked_internal.h | 24 +++---- cpp/src/arrow/util/bisect.h | 67 +++++++++++++++++++ 6 files changed, 110 insertions(+), 102 deletions(-) delete mode 100644 cpp/src/arrow/chunk_resolver.cc create mode 100644 cpp/src/arrow/util/bisect.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 29334572874..b6f1e2481fa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -158,7 +158,6 @@ set(ARROW_SRCS builder.cc buffer.cc chunked_array.cc - chunk_resolver.cc compare.cc config.cc datum.cc diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc deleted file mode 100644 index a20464072f9..00000000000 --- a/cpp/src/arrow/chunk_resolver.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/chunk_resolver.h" - -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/record_batch.h" - -namespace arrow { -namespace internal { - -ChunkResolver::ChunkResolver(const ArrayVector& chunks) { - std::vector lengths(chunks.size()); - std::transform(chunks.begin(), chunks.end(), lengths.begin(), - [](const std::shared_ptr& arr) { return arr->length(); }); - num_chunks_ = static_cast(lengths.size()); - offsets_ = MakeEndOffsets(std::move(lengths)); -} - -ChunkResolver::ChunkResolver(const RecordBatchVector& batches) { - std::vector lengths(batches.size()); - std::transform( - batches.begin(), batches.end(), lengths.begin(), - [](const std::shared_ptr& batch) { return batch->num_rows(); }); - num_chunks_ = static_cast(lengths.size()); - offsets_ = MakeEndOffsets(std::move(lengths)); -} - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 4164498a1dc..b40103217bf 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -17,11 +17,15 @@ #pragma once +#include #include +#include #include #include -#include "arrow/type_fwd.h" +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/util/bisect.h" namespace arrow { namespace internal { @@ -33,12 +37,29 @@ struct ChunkLocation { // An object that resolves an array chunk depending on the index. struct ChunkResolver { explicit ChunkResolver(std::vector lengths) - : num_chunks_(static_cast(lengths.size())), - offsets_(MakeEndOffsets(std::move(lengths))) {} + : offsets_(ConvertLengthsToOffsets(std::move(lengths))) {} - explicit ChunkResolver(const ArrayVector& chunks); + explicit ChunkResolver(const ArrayVector& chunks) { + std::vector lengths(chunks.size()); + std::transform(chunks.begin(), chunks.end(), lengths.begin(), + [](const std::shared_ptr& arr) { return arr->length(); }); + offsets_ = ConvertLengthsToOffsets(std::move(lengths)); + } - explicit ChunkResolver(const RecordBatchVector& batches); + explicit ChunkResolver(const std::vector& chunks) { + std::vector lengths(chunks.size()); + std::transform(chunks.begin(), chunks.end(), lengths.begin(), + [](const Array* arr) { return arr->length(); }); + offsets_ = ConvertLengthsToOffsets(std::move(lengths)); + } + + explicit ChunkResolver(const RecordBatchVector& batches) { + std::vector lengths(batches.size()); + std::transform( + batches.begin(), batches.end(), lengths.begin(), + [](const std::shared_ptr& batch) { return batch->num_rows(); }); + offsets_ = ConvertLengthsToOffsets(std::move(lengths)); + } inline ChunkLocation Resolve(int64_t index) const { // It is common for the algorithms below to make consecutive accesses at @@ -52,42 +73,13 @@ struct ChunkResolver { if (ARROW_PREDICT_TRUE(cache_hit)) { return {cached_chunk_, index - offsets_[cached_chunk_]}; } else { - return ResolveMissBisect(index); + auto chunk_index = Bisect(index, offsets_); + cached_chunk_ = chunk_index; + return {chunk_index, index - offsets_[chunk_index]}; } } protected: - inline ChunkLocation ResolveMissBisect(int64_t index) const { - // Like std::upper_bound(), but hand-written as it can help the compiler. - const int64_t* raw_offsets = offsets_.data(); - // Search [lo, lo + n) - int64_t lo = 0, n = num_chunks_; - while (n > 1) { - int64_t m = n >> 1; - int64_t mid = lo + m; - if (index >= raw_offsets[mid]) { - lo = mid; - n -= m; - } else { - n = m; - } - } - cached_chunk_ = lo; - return {lo, index - offsets_[lo]}; - } - - static inline std::vector MakeEndOffsets(std::vector lengths) { - int64_t offset = 0; - for (auto& v : lengths) { - const auto this_length = v; - v = offset; - offset += this_length; - } - lengths.push_back(offset); - return lengths; - } - - int64_t num_chunks_; std::vector offsets_; mutable int64_t cached_chunk_ = 0; }; diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index fde3801f472..86adb1bd48c 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -152,6 +152,10 @@ Result> ChunkedArray::GetScalar(int64_t index) const { resolver_ = internal::make_unique(chunks_); } const auto loc = resolver_->Resolve(index); + if (loc.chunk_index >= static_cast(chunks_.size())) { + return Status::IndexError("tried to refer to chunk ", loc.chunk_index, + " but ChunkedArray is only ", chunks_.size(), " long"); + } return chunks_[loc.chunk_index]->GetScalar(loc.index_in_chunk); } diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 1d42acf04cd..4fc1f780942 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -24,6 +24,7 @@ #include #include "arrow/array.h" +#include "arrow/chunk_resolver.h" #include "arrow/compute/kernels/codegen_internal.h" namespace arrow { @@ -33,8 +34,8 @@ namespace internal { // The target chunk in a chunked array. template struct ResolvedChunk { - using V = GetViewType; - using LogicalValueType = typename V::T; + using ViewType = GetViewType; + using LogicalValueType = typename ViewType::T; // The target array in chunked array. const ArrayType* array; @@ -45,7 +46,7 @@ struct ResolvedChunk { bool IsNull() const { return array->IsNull(index); } - LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); } + LogicalValueType Value() const { return ViewType::LogicalValue(array->GetView(index)); } }; // ResolvedChunk specialization for untyped arrays when all is needed is null lookup @@ -63,24 +64,19 @@ struct ResolvedChunk { struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver { explicit ChunkedArrayResolver(const std::vector& chunks) - : ::arrow::internal::ChunkResolver{FromArrayPointers(chunks)}, chunks_(chunks) {} + : ::arrow::internal::ChunkResolver(chunks), chunks_(chunks) {} template ResolvedChunk Resolve(int64_t index) const { const auto loc = ::arrow::internal::ChunkResolver::Resolve(index); - return ResolvedChunk( - checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk); + // if (loc.chunk_index >= static_cast(chunks_.size())) { + // return Status::IndexError("tried to refer to chunk ", loc.chunk_index, + // " but ChunkedArray is only ", chunks_.size(), " long"); + // } + return {checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk}; } protected: - static ::arrow::internal::ChunkResolver FromArrayPointers( - const std::vector& chunks) { - std::vector lengths(chunks.size()); - std::transform(chunks.begin(), chunks.end(), lengths.begin(), - [](const Array* arr) { return arr->length(); }); - return ::arrow::internal::ChunkResolver(std::move(lengths)); - } - const std::vector chunks_; }; diff --git a/cpp/src/arrow/util/bisect.h b/cpp/src/arrow/util/bisect.h new file mode 100644 index 00000000000..429c9fd46dd --- /dev/null +++ b/cpp/src/arrow/util/bisect.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/type_traits.h" + +#include +#include + +namespace arrow { +namespace internal { + +template +using enable_if_c_integer = + ::arrow::enable_if_t::value>::value, + R>; + +template +inline enable_if_c_integer Bisect(const int64_t index, const std::vector& offsets) { + // Like std::upper_bound(), but hand-written as it can help the compiler. + // Search [lo, lo + n) + T lo = 0; + T n = static_cast(offsets.size()); + while (n > 1) { + const T m = n >> 1; + const T mid = lo + m; + if (static_cast(index) >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } + return lo; +} + +template +inline enable_if_c_integer> ConvertLengthsToOffsets( + std::vector lengths) { + // Offsets are stored in-place + T offset = 0; + for (auto& v : lengths) { + const auto this_length = v; + v = offset; + offset += this_length; + } + lengths.push_back(offset); + return lengths; +} + +} // namespace internal +} // namespace arrow From aaef1adb15db03a38ae263d325dde00faba01f5a Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 14 Mar 2022 02:56:44 -0400 Subject: [PATCH 11/25] refactor ChunkResolver --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/chunk_resolver.cc | 84 +++++++++++++++++++++++++++++++++ cpp/src/arrow/chunk_resolver.h | 76 +++++++++++++++-------------- cpp/src/arrow/chunked_array.cc | 6 +-- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/util/bisect.h | 67 -------------------------- 6 files changed, 130 insertions(+), 106 deletions(-) create mode 100644 cpp/src/arrow/chunk_resolver.cc delete mode 100644 cpp/src/arrow/util/bisect.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b6f1e2481fa..29334572874 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -158,6 +158,7 @@ set(ARROW_SRCS builder.cc buffer.cc chunked_array.cc + chunk_resolver.cc compare.cc config.cc datum.cc diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc new file mode 100644 index 00000000000..9dc078d3d09 --- /dev/null +++ b/cpp/src/arrow/chunk_resolver.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/chunk_resolver.h" + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" + +namespace arrow { +namespace internal { + +namespace { +static inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { + std::vector offsets(chunks.size()); + int64_t offset = 0; + std::transform(chunks.begin(), chunks.end(), offsets.begin(), + [&offset](const std::shared_ptr& arr) { + auto curr_offset = offset; + offset += arr->length(); + return curr_offset; + }); + offsets.push_back(offset); + return offsets; +} + +static inline std::vector MakeArrayPointersOffsets( + const std::vector& chunks) { + std::vector offsets(chunks.size()); + int64_t offset = 0; + std::transform(chunks.begin(), chunks.end(), offsets.begin(), + [&offset](const Array* arr) { + auto curr_offset = offset; + offset += arr->length(); + return curr_offset; + }); + offsets.push_back(offset); + return offsets; +} + +static inline std::vector MakeRecordBatchesOffsets( + const RecordBatchVector& batches) { + std::vector offsets(batches.size()); + int64_t offset = 0; + std::transform(batches.begin(), batches.end(), offsets.begin(), + [&offset](const std::shared_ptr& batch) { + auto curr_offset = offset; + offset += batch->num_rows(); + return curr_offset; + }); + offsets.push_back(offset); + return offsets; +} +} // namespace + +ChunkResolver::ChunkResolver(const ArrayVector& chunks) + : offsets_(MakeArraysOffsets(chunks)) {} + +ChunkResolver::ChunkResolver(const std::vector& chunks) + : offsets_(MakeArrayPointersOffsets(chunks)) {} + +ChunkResolver::ChunkResolver(const RecordBatchVector& batches) + : offsets_(MakeRecordBatchesOffsets(batches)) {} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index b40103217bf..235e614cb60 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -17,15 +17,11 @@ #pragma once -#include #include -#include -#include #include -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/util/bisect.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" namespace arrow { namespace internal { @@ -34,53 +30,63 @@ struct ChunkLocation { int64_t chunk_index, index_in_chunk; }; -// An object that resolves an array chunk depending on the index. +/// \class ChunkResolver +/// \brief An object that resolves an array chunk depending on the index struct ChunkResolver { - explicit ChunkResolver(std::vector lengths) - : offsets_(ConvertLengthsToOffsets(std::move(lengths))) {} + /// \brief Construct a ChunkResolver from a vector of Arrays + explicit ChunkResolver(const ArrayVector& chunks); - explicit ChunkResolver(const ArrayVector& chunks) { - std::vector lengths(chunks.size()); - std::transform(chunks.begin(), chunks.end(), lengths.begin(), - [](const std::shared_ptr& arr) { return arr->length(); }); - offsets_ = ConvertLengthsToOffsets(std::move(lengths)); - } - - explicit ChunkResolver(const std::vector& chunks) { - std::vector lengths(chunks.size()); - std::transform(chunks.begin(), chunks.end(), lengths.begin(), - [](const Array* arr) { return arr->length(); }); - offsets_ = ConvertLengthsToOffsets(std::move(lengths)); - } + /// \brief Construct a ChunkResolver from a vector of C-style Array pointers + explicit ChunkResolver(const std::vector& chunks); - explicit ChunkResolver(const RecordBatchVector& batches) { - std::vector lengths(batches.size()); - std::transform( - batches.begin(), batches.end(), lengths.begin(), - [](const std::shared_ptr& batch) { return batch->num_rows(); }); - offsets_ = ConvertLengthsToOffsets(std::move(lengths)); - } + /// \brief Construct a ChunkResolver from a vector of RecordBatches + explicit ChunkResolver(const RecordBatchVector& batches); - inline ChunkLocation Resolve(int64_t index) const { + /// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of + /// the chunked array at logical index + inline ChunkLocation Resolve(const int64_t index) const { // It is common for the algorithms below to make consecutive accesses at // a relatively small distance from each other, hence often falling in // the same chunk. // This is trivial when merging (assuming each side of the merge uses // its own resolver), but also in the inner recursive invocations of // partitioning. + if (offsets_.size() <= 1) { + return {0, index}; + } const bool cache_hit = (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); if (ARROW_PREDICT_TRUE(cache_hit)) { return {cached_chunk_, index - offsets_[cached_chunk_]}; - } else { - auto chunk_index = Bisect(index, offsets_); - cached_chunk_ = chunk_index; - return {chunk_index, index - offsets_[chunk_index]}; } + auto chunk_index = Bisect(index); + cached_chunk_ = chunk_index; + return {cached_chunk_, index - offsets_[cached_chunk_]}; } protected: - std::vector offsets_; + /// \brief Find the chunk index corresponding to a value index using binary search + /// (bisect) algorithm + inline int64_t Bisect(const int64_t index) const { + // Like std::upper_bound(), but hand-written as it can help the compiler. + // Search [lo, lo + n) + int64_t lo = 0; + auto n = static_cast(offsets_.size()); + while (n > 1) { + const int64_t m = n >> 1; + const int64_t mid = lo + m; + if (static_cast(index) >= offsets_[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } + return lo; + } + + private: + const std::vector offsets_; mutable int64_t cached_chunk_ = 0; }; diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 86adb1bd48c..f9b5dcf53fc 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -148,10 +148,10 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - if (!resolver_) { - resolver_ = internal::make_unique(chunks_); + if (!chunk_resolver_) { + chunk_resolver_ = internal::make_unique(chunks_); } - const auto loc = resolver_->Resolve(index); + const auto loc = chunk_resolver_->Resolve(index); if (loc.chunk_index >= static_cast(chunks_.size())) { return Status::IndexError("tried to refer to chunk ", loc.chunk_index, " but ChunkedArray is only ", chunks_.size(), " long"); diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 9f027542bb7..baadf9e58dc 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - mutable std::unique_ptr resolver_; + mutable std::unique_ptr chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/util/bisect.h b/cpp/src/arrow/util/bisect.h deleted file mode 100644 index 429c9fd46dd..00000000000 --- a/cpp/src/arrow/util/bisect.h +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/type_traits.h" - -#include -#include - -namespace arrow { -namespace internal { - -template -using enable_if_c_integer = - ::arrow::enable_if_t::value>::value, - R>; - -template -inline enable_if_c_integer Bisect(const int64_t index, const std::vector& offsets) { - // Like std::upper_bound(), but hand-written as it can help the compiler. - // Search [lo, lo + n) - T lo = 0; - T n = static_cast(offsets.size()); - while (n > 1) { - const T m = n >> 1; - const T mid = lo + m; - if (static_cast(index) >= offsets[mid]) { - lo = mid; - n -= m; - } else { - n = m; - } - } - return lo; -} - -template -inline enable_if_c_integer> ConvertLengthsToOffsets( - std::vector lengths) { - // Offsets are stored in-place - T offset = 0; - for (auto& v : lengths) { - const auto this_length = v; - v = offset; - offset += this_length; - } - lengths.push_back(offset); - return lengths; -} - -} // namespace internal -} // namespace arrow From bc29cc92740c9df935973db2f78107e23780bcbd Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 14 Mar 2022 14:12:54 -0400 Subject: [PATCH 12/25] remove static keyword for anon namespaces, remove internal index check --- cpp/src/arrow/chunk_resolver.cc | 7 +++---- cpp/src/arrow/compute/kernels/chunked_internal.h | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 9dc078d3d09..ae072083d63 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -29,7 +29,7 @@ namespace arrow { namespace internal { namespace { -static inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { +inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { std::vector offsets(chunks.size()); int64_t offset = 0; std::transform(chunks.begin(), chunks.end(), offsets.begin(), @@ -42,7 +42,7 @@ static inline std::vector MakeArraysOffsets(const ArrayVector& chunks) return offsets; } -static inline std::vector MakeArrayPointersOffsets( +inline std::vector MakeArrayPointersOffsets( const std::vector& chunks) { std::vector offsets(chunks.size()); int64_t offset = 0; @@ -56,8 +56,7 @@ static inline std::vector MakeArrayPointersOffsets( return offsets; } -static inline std::vector MakeRecordBatchesOffsets( - const RecordBatchVector& batches) { +inline std::vector MakeRecordBatchesOffsets(const RecordBatchVector& batches) { std::vector offsets(batches.size()); int64_t offset = 0; std::transform(batches.begin(), batches.end(), offsets.begin(), diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 4fc1f780942..b14f27810f4 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -69,10 +69,6 @@ struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver { template ResolvedChunk Resolve(int64_t index) const { const auto loc = ::arrow::internal::ChunkResolver::Resolve(index); - // if (loc.chunk_index >= static_cast(chunks_.size())) { - // return Status::IndexError("tried to refer to chunk ", loc.chunk_index, - // " but ChunkedArray is only ", chunks_.size(), " long"); - // } return {checked_cast(chunks_[loc.chunk_index]), loc.index_in_chunk}; } From 7126456d4e006116f6e619b77e9e3492f7c317cd Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 14 Mar 2022 18:30:22 -0400 Subject: [PATCH 13/25] fully allocate offsets during construction --- cpp/src/arrow/chunk_resolver.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index ae072083d63..9c7a71ad6ce 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -30,7 +30,7 @@ namespace internal { namespace { inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { - std::vector offsets(chunks.size()); + std::vector offsets(chunks.size() + 1); int64_t offset = 0; std::transform(chunks.begin(), chunks.end(), offsets.begin(), [&offset](const std::shared_ptr& arr) { @@ -38,13 +38,13 @@ inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { offset += arr->length(); return curr_offset; }); - offsets.push_back(offset); + offsets[chunks.size()] = offset; return offsets; } inline std::vector MakeArrayPointersOffsets( const std::vector& chunks) { - std::vector offsets(chunks.size()); + std::vector offsets(chunks.size() + 1); int64_t offset = 0; std::transform(chunks.begin(), chunks.end(), offsets.begin(), [&offset](const Array* arr) { @@ -52,12 +52,12 @@ inline std::vector MakeArrayPointersOffsets( offset += arr->length(); return curr_offset; }); - offsets.push_back(offset); + offsets[chunks.size()] = offset; return offsets; } inline std::vector MakeRecordBatchesOffsets(const RecordBatchVector& batches) { - std::vector offsets(batches.size()); + std::vector offsets(batches.size() + 1); int64_t offset = 0; std::transform(batches.begin(), batches.end(), offsets.begin(), [&offset](const std::shared_ptr& batch) { @@ -65,7 +65,7 @@ inline std::vector MakeRecordBatchesOffsets(const RecordBatchVector& ba offset += batch->num_rows(); return curr_offset; }); - offsets.push_back(offset); + offsets[batches.size()] = offset; return offsets; } } // namespace From bb4a2527aade2cd4e413f4443b5d659f1bc0a0c3 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 14 Mar 2022 22:01:31 -0400 Subject: [PATCH 14/25] simplify ChunkResolver construction --- cpp/src/arrow/chunk_resolver.cc | 48 ++++++++++++--------------------- cpp/src/arrow/chunk_resolver.h | 9 ++----- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 9c7a71ad6ce..b7cd0a07ca4 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -29,55 +29,41 @@ namespace arrow { namespace internal { namespace { -inline std::vector MakeArraysOffsets(const ArrayVector& chunks) { - std::vector offsets(chunks.size() + 1); - int64_t offset = 0; - std::transform(chunks.begin(), chunks.end(), offsets.begin(), - [&offset](const std::shared_ptr& arr) { - auto curr_offset = offset; - offset += arr->length(); - return curr_offset; - }); - offsets[chunks.size()] = offset; - return offsets; +template +constexpr int64_t GetLength(const T& array) { + // General case assumes argument is an Array pointer + return array->length(); } -inline std::vector MakeArrayPointersOffsets( - const std::vector& chunks) { +template <> +constexpr int64_t GetLength>( + const std::shared_ptr& batch) { + return batch->num_rows(); +} + +template +inline std::vector MakeChunksOffsets(const std::vector& chunks) { std::vector offsets(chunks.size() + 1); int64_t offset = 0; std::transform(chunks.begin(), chunks.end(), offsets.begin(), - [&offset](const Array* arr) { + [&offset](const T& chunk) { auto curr_offset = offset; - offset += arr->length(); + offset += GetLength(chunk); return curr_offset; }); offsets[chunks.size()] = offset; return offsets; } - -inline std::vector MakeRecordBatchesOffsets(const RecordBatchVector& batches) { - std::vector offsets(batches.size() + 1); - int64_t offset = 0; - std::transform(batches.begin(), batches.end(), offsets.begin(), - [&offset](const std::shared_ptr& batch) { - auto curr_offset = offset; - offset += batch->num_rows(); - return curr_offset; - }); - offsets[batches.size()] = offset; - return offsets; -} } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) - : offsets_(MakeArraysOffsets(chunks)) {} + : offsets_(MakeChunksOffsets(chunks)) {} ChunkResolver::ChunkResolver(const std::vector& chunks) - : offsets_(MakeArrayPointersOffsets(chunks)) {} + : offsets_(MakeChunksOffsets(chunks)) {} ChunkResolver::ChunkResolver(const RecordBatchVector& batches) - : offsets_(MakeRecordBatchesOffsets(batches)) {} + : offsets_(MakeChunksOffsets(batches)) {} } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 235e614cb60..118e0292d5a 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -30,16 +30,12 @@ struct ChunkLocation { int64_t chunk_index, index_in_chunk; }; -/// \class ChunkResolver -/// \brief An object that resolves an array chunk depending on the index +// An object that resolves an array chunk depending on the index struct ChunkResolver { - /// \brief Construct a ChunkResolver from a vector of Arrays explicit ChunkResolver(const ArrayVector& chunks); - /// \brief Construct a ChunkResolver from a vector of C-style Array pointers explicit ChunkResolver(const std::vector& chunks); - /// \brief Construct a ChunkResolver from a vector of RecordBatches explicit ChunkResolver(const RecordBatchVector& batches); /// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of @@ -65,8 +61,7 @@ struct ChunkResolver { } protected: - /// \brief Find the chunk index corresponding to a value index using binary search - /// (bisect) algorithm + // Find the chunk index corresponding to a value index using binary search inline int64_t Bisect(const int64_t index) const { // Like std::upper_bound(), but hand-written as it can help the compiler. // Search [lo, lo + n) From 458ab211197cc806c34ab9f3a55cdad6ab9d29aa Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 14 Mar 2022 23:21:59 -0400 Subject: [PATCH 15/25] add atomic and shared_ptr --- cpp/src/arrow/chunk_resolver.cc | 4 ++++ cpp/src/arrow/chunk_resolver.h | 14 +++++++++----- cpp/src/arrow/chunked_array.cc | 3 +-- cpp/src/arrow/chunked_array.h | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index b7cd0a07ca4..e96c36241b6 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -56,6 +56,10 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { } } // namespace +ChunkResolver::ChunkResolver(const ChunkResolver& chunks) : offsets_(chunks.offsets_) { + cached_chunk_.store(chunks.cached_chunk_); +} + ChunkResolver::ChunkResolver(const ArrayVector& chunks) : offsets_(MakeChunksOffsets(chunks)) {} diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 118e0292d5a..5c0acc60ff1 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -32,6 +33,8 @@ struct ChunkLocation { // An object that resolves an array chunk depending on the index struct ChunkResolver { + ChunkResolver(const ChunkResolver& chunks); + explicit ChunkResolver(const ArrayVector& chunks); explicit ChunkResolver(const std::vector& chunks); @@ -50,14 +53,15 @@ struct ChunkResolver { if (offsets_.size() <= 1) { return {0, index}; } + const auto cached_chunk = cached_chunk_.load(); const bool cache_hit = - (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]); + (index >= offsets_[cached_chunk] && index < offsets_[cached_chunk + 1]); if (ARROW_PREDICT_TRUE(cache_hit)) { - return {cached_chunk_, index - offsets_[cached_chunk_]}; + return {cached_chunk, index - offsets_[cached_chunk]}; } auto chunk_index = Bisect(index); - cached_chunk_ = chunk_index; - return {cached_chunk_, index - offsets_[cached_chunk_]}; + cached_chunk_.store(chunk_index); + return {chunk_index, index - offsets_[chunk_index]}; } protected: @@ -82,7 +86,7 @@ struct ChunkResolver { private: const std::vector offsets_; - mutable int64_t cached_chunk_ = 0; + mutable std::atomic cached_chunk_{0}; }; } // namespace internal diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index f9b5dcf53fc..87616a110b2 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -32,7 +32,6 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" namespace arrow { @@ -149,7 +148,7 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, Result> ChunkedArray::GetScalar(int64_t index) const { if (!chunk_resolver_) { - chunk_resolver_ = internal::make_unique(chunks_); + chunk_resolver_ = std::make_shared(chunks_); } const auto loc = chunk_resolver_->Resolve(index); if (loc.chunk_index >= static_cast(chunks_.size())) { diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index baadf9e58dc..1788feba776 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - mutable std::unique_ptr chunk_resolver_; + mutable std::shared_ptr chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; From 37c2892066029ce472382b77ac3db00d5c6dc0b6 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 31 Mar 2022 12:42:28 -0400 Subject: [PATCH 16/25] Update error message and slice checks --- cpp/src/arrow/array/array_base.cc | 4 ++-- cpp/src/arrow/chunked_array.cc | 4 ++-- python/pyarrow/array.pxi | 2 +- python/pyarrow/io.pxi | 2 +- python/pyarrow/table.pxi | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 11b6b1630c5..a23802d6132 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -165,8 +165,8 @@ struct ScalarFromArraySlotImpl { Result> Finish() && { if (index_ >= array_.length()) { - return Status::IndexError("tried to refer to element ", index_, - " but array is only ", array_.length(), " long"); + return Status::IndexError("index with value of ", index_, " is out-of-bounds " + "for array of length ", array_.length()); } if (array_.IsNull(index_)) { diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 87616a110b2..c5d8fd85324 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -152,8 +152,8 @@ Result> ChunkedArray::GetScalar(int64_t index) const { } const auto loc = chunk_resolver_->Resolve(index); if (loc.chunk_index >= static_cast(chunks_.size())) { - return Status::IndexError("tried to refer to chunk ", loc.chunk_index, - " but ChunkedArray is only ", chunks_.size(), " long"); + return Status::IndexError("index with value of ", index, " is out-of-bounds " + "for chunked array of length ", length_); } return chunks_[loc.chunk_index]->GetScalar(loc.index_in_chunk); } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 05aeff13251..9c5069c2d20 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1278,7 +1278,7 @@ cdef class Array(_PandasConvertible): ------- value : Scalar (index) or Array (slice) """ - if PySlice_Check(key): + if isinstance(key, slice): return _normalize_slice(self, key) return self.getitem(_normalize_index(key, self.length())) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index c6bf11b2dc4..c39fc78d2ad 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1050,7 +1050,7 @@ cdef class Buffer(_Weakrefable): return pyarrow_wrap_buffer(parent_buf) def __getitem__(self, key): - if PySlice_Check(key): + if isinstance(key, slice): if (key.step or 1) != 1: raise IndexError('only slices with step 1 supported') return _normalize_slice(self, key) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index dcf2fc98880..9bd05536836 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -286,7 +286,7 @@ cdef class ChunkedArray(_PandasConvertible): value : Scalar (index) or ChunkedArray (slice) """ - if PySlice_Check(key): + if isinstance(key, slice): return _normalize_slice(self, key) return self.getitem(_normalize_index(key, self.chunked_array.length())) @@ -1984,7 +1984,7 @@ cdef class RecordBatch(_PandasConvertible): ------- value : Array (index/column) or RecordBatch (slice) """ - if PySlice_Check(key): + if isinstance(key, slice): return _normalize_slice(self, key) return self.column(key) @@ -2812,7 +2812,7 @@ cdef class Table(_PandasConvertible): ------- ChunkedArray (index/column) or Table (slice) """ - if PySlice_Check(key): + if isinstance(key, slice): return _normalize_slice(self, key) return self.column(key) From 0b43f2910c32faee0927924463a38d6be147ee5f Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 31 Mar 2022 17:16:47 -0400 Subject: [PATCH 17/25] remove constexpr --- cpp/src/arrow/chunk_resolver.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index e96c36241b6..4083cbd7e22 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -30,13 +30,13 @@ namespace internal { namespace { template -constexpr int64_t GetLength(const T& array) { +int64_t GetLength(const T& array) { // General case assumes argument is an Array pointer return array->length(); } template <> -constexpr int64_t GetLength>( +int64_t GetLength>( const std::shared_ptr& batch) { return batch->num_rows(); } From b37332a1f74ddb120885b1b889c2ff4fa2460537 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 1 Apr 2022 11:27:22 -0400 Subject: [PATCH 18/25] Initialize ChunkResolver during construction --- cpp/src/arrow/array/array_base.cc | 5 +++-- cpp/src/arrow/chunked_array.cc | 10 +++++----- cpp/src/arrow/chunked_array.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index a23802d6132..b36fb0fb94a 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -165,8 +165,9 @@ struct ScalarFromArraySlotImpl { Result> Finish() && { if (index_ >= array_.length()) { - return Status::IndexError("index with value of ", index_, " is out-of-bounds " - "for array of length ", array_.length()); + return Status::IndexError("index with value of ", index_, + " is out-of-bounds for array of length ", + array_.length()); } if (array_.IsNull(index_)) { diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index c5d8fd85324..cd4b560b918 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -32,6 +32,7 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" namespace arrow { @@ -56,6 +57,8 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) length_ += chunk->length(); null_count_ += chunk->null_count(); } + + chunk_resolver_ = internal::make_unique(chunks_); } Result> ChunkedArray::Make(ArrayVector chunks, @@ -147,13 +150,10 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - if (!chunk_resolver_) { - chunk_resolver_ = std::make_shared(chunks_); - } const auto loc = chunk_resolver_->Resolve(index); if (loc.chunk_index >= static_cast(chunks_.size())) { - return Status::IndexError("index with value of ", index, " is out-of-bounds " - "for chunked array of length ", length_); + return Status::IndexError("index with value of ", index, + " is out-of-bounds for chunked array of length ", length_); } return chunks_[loc.chunk_index]->GetScalar(loc.index_in_chunk); } diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 1788feba776..d2a9ef9e08a 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type_; private: - mutable std::shared_ptr chunk_resolver_; + std::unique_ptr chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; From ced0b1a7931527a61590605264855486d0346656 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 12:43:32 -0400 Subject: [PATCH 19/25] use list initialization --- cpp/src/arrow/chunk_resolver.cc | 6 +++--- cpp/src/arrow/chunk_resolver.h | 2 +- cpp/src/arrow/chunked_array.cc | 4 +--- cpp/src/arrow/chunked_array.h | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 4083cbd7e22..4a021aee57d 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -61,13 +61,13 @@ ChunkResolver::ChunkResolver(const ChunkResolver& chunks) : offsets_(chunks.offs } ChunkResolver::ChunkResolver(const ArrayVector& chunks) - : offsets_(MakeChunksOffsets(chunks)) {} + : offsets_(MakeChunksOffsets(chunks)), cached_chunk_{0} {} ChunkResolver::ChunkResolver(const std::vector& chunks) - : offsets_(MakeChunksOffsets(chunks)) {} + : offsets_(MakeChunksOffsets(chunks)), cached_chunk_{0} {} ChunkResolver::ChunkResolver(const RecordBatchVector& batches) - : offsets_(MakeChunksOffsets(batches)) {} + : offsets_(MakeChunksOffsets(batches)), cached_chunk_{0} {} } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 5c0acc60ff1..f7477c3ec9b 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -86,7 +86,7 @@ struct ChunkResolver { private: const std::vector offsets_; - mutable std::atomic cached_chunk_{0}; + mutable std::atomic cached_chunk_; }; } // namespace internal diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index cd4b560b918..b8c6e41e690 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -44,15 +44,13 @@ class MemoryPool; // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)) { + : chunks_(std::move(chunks)), type_(std::move(type)), length_(0), null_count_(0) { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; type_ = chunks_[0]->type(); } - length_ = 0; - null_count_ = 0; for (const auto& chunk : chunks_) { length_ += chunk->length(); null_count_ += chunk->null_count(); diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index d2a9ef9e08a..070b227adb8 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -178,9 +178,9 @@ class ARROW_EXPORT ChunkedArray { protected: ArrayVector chunks_; + std::shared_ptr type_; int64_t length_; int64_t null_count_; - std::shared_ptr type_; private: std::unique_ptr chunk_resolver_; From 6515bdc506513721727f0c29a59c12c4af5d0d0d Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 14:20:17 -0400 Subject: [PATCH 20/25] use brace list initialization, add comment on hacky copy constructor --- cpp/src/arrow/chunk_resolver.cc | 10 +++------- cpp/src/arrow/chunk_resolver.h | 8 +++++++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 4a021aee57d..fa949a20c68 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -56,18 +56,14 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { } } // namespace -ChunkResolver::ChunkResolver(const ChunkResolver& chunks) : offsets_(chunks.offsets_) { - cached_chunk_.store(chunks.cached_chunk_); -} - ChunkResolver::ChunkResolver(const ArrayVector& chunks) - : offsets_(MakeChunksOffsets(chunks)), cached_chunk_{0} {} + : offsets_{MakeChunksOffsets(chunks)}, cached_chunk_{0} {} ChunkResolver::ChunkResolver(const std::vector& chunks) - : offsets_(MakeChunksOffsets(chunks)), cached_chunk_{0} {} + : offsets_{MakeChunksOffsets(chunks)}, cached_chunk_{0} {} ChunkResolver::ChunkResolver(const RecordBatchVector& batches) - : offsets_(MakeChunksOffsets(batches)), cached_chunk_{0} {} + : offsets_{MakeChunksOffsets(batches)}, cached_chunk_{0} {} } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index f7477c3ec9b..f1b2d0b7266 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -33,7 +33,13 @@ struct ChunkLocation { // An object that resolves an array chunk depending on the index struct ChunkResolver { - ChunkResolver(const ChunkResolver& chunks); + // TODO: This copy constructor is a hack and not semantically correct because this + // class contains a `std::atomic` member (non copyable/movable), + // but this explicit copy constructor allows compilation even if copies are performed + // such as in TableSorter/ResolveSortKey in vector_sort.cc. + ChunkResolver(const ChunkResolver& chunks) : offsets_{chunks.offsets_} { + cached_chunk_.store(chunks.cached_chunk_); + } explicit ChunkResolver(const ArrayVector& chunks); From 557529072bb3d950734675f558d747088c17b2cb Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 16:11:19 -0400 Subject: [PATCH 21/25] add move/assigment constructor to ChunkResolver --- cpp/src/arrow/chunk_resolver.cc | 12 +++++++++--- cpp/src/arrow/chunk_resolver.h | 18 +++++++++--------- cpp/src/arrow/chunked_array.cc | 10 ++++++---- cpp/src/arrow/chunked_array.h | 2 +- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index fa949a20c68..1b91e068a63 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -57,13 +57,19 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) - : offsets_{MakeChunksOffsets(chunks)}, cached_chunk_{0} {} + : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {} ChunkResolver::ChunkResolver(const std::vector& chunks) - : offsets_{MakeChunksOffsets(chunks)}, cached_chunk_{0} {} + : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {} ChunkResolver::ChunkResolver(const RecordBatchVector& batches) - : offsets_{MakeChunksOffsets(batches)}, cached_chunk_{0} {} + : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {} + +ChunkResolver& ChunkResolver::operator=(const ChunkResolver&& other) { + offsets_ = std::move(other.offsets_); + cached_chunk_.store(other.cached_chunk_.load()); + return *this; +} } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index f1b2d0b7266..5dab96428ea 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -33,20 +33,20 @@ struct ChunkLocation { // An object that resolves an array chunk depending on the index struct ChunkResolver { - // TODO: This copy constructor is a hack and not semantically correct because this - // class contains a `std::atomic` member (non copyable/movable), - // but this explicit copy constructor allows compilation even if copies are performed - // such as in TableSorter/ResolveSortKey in vector_sort.cc. - ChunkResolver(const ChunkResolver& chunks) : offsets_{chunks.offsets_} { - cached_chunk_.store(chunks.cached_chunk_); - } - explicit ChunkResolver(const ArrayVector& chunks); explicit ChunkResolver(const std::vector& chunks); explicit ChunkResolver(const RecordBatchVector& batches); + ChunkResolver(const ChunkResolver& other) + : offsets_(other.offsets_), cached_chunk_(other.cached_chunk_.load()) {} + + ChunkResolver(ChunkResolver&& other) + : offsets_(std::move(other.offsets_)), cached_chunk_(other.cached_chunk_.load()) {} + + ChunkResolver& operator=(const ChunkResolver&& other); + /// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of /// the chunked array at logical index inline ChunkLocation Resolve(const int64_t index) const { @@ -91,7 +91,7 @@ struct ChunkResolver { } private: - const std::vector offsets_; + std::vector offsets_; mutable std::atomic cached_chunk_; }; diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index b8c6e41e690..b316935d15e 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -44,7 +44,11 @@ class MemoryPool; // ChunkedArray methods ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) - : chunks_(std::move(chunks)), type_(std::move(type)), length_(0), null_count_(0) { + : chunks_(std::move(chunks)), + type_(std::move(type)), + length_(0), + null_count_(0), + chunk_resolver_{chunks_} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; @@ -55,8 +59,6 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) length_ += chunk->length(); null_count_ += chunk->null_count(); } - - chunk_resolver_ = internal::make_unique(chunks_); } Result> ChunkedArray::Make(ArrayVector chunks, @@ -148,7 +150,7 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, } Result> ChunkedArray::GetScalar(int64_t index) const { - const auto loc = chunk_resolver_->Resolve(index); + const auto loc = chunk_resolver_.Resolve(index); if (loc.chunk_index >= static_cast(chunks_.size())) { return Status::IndexError("index with value of ", index, " is out-of-bounds for chunked array of length ", length_); diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 070b227adb8..e130a99bbe2 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -183,7 +183,7 @@ class ARROW_EXPORT ChunkedArray { int64_t null_count_; private: - std::unique_ptr chunk_resolver_; + internal::ChunkResolver chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; From a0ab9ff07b399163555f1836d709a210c52f80bc Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 17:30:46 -0400 Subject: [PATCH 22/25] moved copy constructor to ChunkedArrayResolver --- cpp/src/arrow/chunk_resolver.h | 3 --- cpp/src/arrow/compute/kernels/chunked_internal.h | 3 +++ cpp/src/arrow/compute/kernels/vector_sort.cc | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 5dab96428ea..07013982c0d 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -39,9 +39,6 @@ struct ChunkResolver { explicit ChunkResolver(const RecordBatchVector& batches); - ChunkResolver(const ChunkResolver& other) - : offsets_(other.offsets_), cached_chunk_(other.cached_chunk_.load()) {} - ChunkResolver(ChunkResolver&& other) : offsets_(std::move(other.offsets_)), cached_chunk_(other.cached_chunk_.load()) {} diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index b14f27810f4..c2d7a65d23a 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -63,6 +63,9 @@ struct ResolvedChunk { }; struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver { + ChunkedArrayResolver(const ChunkedArrayResolver& other) + : ::arrow::internal::ChunkResolver(other.chunks_), chunks_(other.chunks_) {} + explicit ChunkedArrayResolver(const std::vector& chunks) : ::arrow::internal::ChunkResolver(chunks), chunks_(chunks) {} diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 4e8f49cf25b..0e0e7be0923 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -1671,9 +1671,8 @@ class TableSelecter : public TypeVisitor { : order(order), type(GetPhysicalType(chunked_array->type())), chunks(GetPhysicalChunks(*chunked_array, type)), - chunk_pointers(GetArrayPointers(chunks)), null_count(chunked_array->null_count()), - resolver(chunk_pointers) {} + resolver(GetArrayPointers(chunks)) {} using LocationType = int64_t; @@ -1687,7 +1686,6 @@ class TableSelecter : public TypeVisitor { const SortOrder order; const std::shared_ptr type; const ArrayVector chunks; - const std::vector chunk_pointers; const int64_t null_count; const ChunkedArrayResolver resolver; }; From 69093c17407574d6175c2e8bb2b2cdbb9bcb4439 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 17:36:45 -0400 Subject: [PATCH 23/25] document member variables --- cpp/src/arrow/chunk_resolver.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 07013982c0d..aaf18c65da6 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -31,7 +31,7 @@ struct ChunkLocation { int64_t chunk_index, index_in_chunk; }; -// An object that resolves an array chunk depending on the index +// An object that resolves an array chunk depending on a logical index struct ChunkResolver { explicit ChunkResolver(const ArrayVector& chunks); @@ -88,7 +88,11 @@ struct ChunkResolver { } private: + // Collection of starting offsets used for binary search std::vector offsets_; + + // Tracks the most recently used chunk index to allow fast + // access for consecutive indices corresponding to the same chunk mutable std::atomic cached_chunk_; }; From 76b25ffa0fc41ee548cbfb53005fbba88586fd5e Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 13 Apr 2022 18:03:06 -0400 Subject: [PATCH 24/25] IWYU --- cpp/src/arrow/chunk_resolver.h | 2 +- cpp/src/arrow/chunked_array.cc | 1 - cpp/src/arrow/compute/kernels/chunked_internal.h | 1 - cpp/src/arrow/compute/kernels/vector_sort.cc | 2 ++ cpp/src/arrow/compute/kernels/vector_sort_internal.h | 1 - 5 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index aaf18c65da6..072df481f9d 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -88,7 +88,7 @@ struct ChunkResolver { } private: - // Collection of starting offsets used for binary search + // Collection of starting offsets used for binary search std::vector offsets_; // Tracks the most recently used chunk index to allow fast diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index b316935d15e..840dd04a5ad 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -32,7 +32,6 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index c2d7a65d23a..69f439fccf0 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "arrow/array.h" diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 0e0e7be0923..ad3323199ad 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -27,7 +27,9 @@ #include "arrow/array/concatenate.h" #include "arrow/array/data.h" +#include "arrow/chunk_resolver.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/kernels/chunked_internal.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/compute/kernels/vector_sort_internal.h" diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index 63ded6a858a..d8b024525c8 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -23,7 +23,6 @@ #include #include "arrow/array.h" -#include "arrow/chunk_resolver.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/chunked_internal.h" #include "arrow/type.h" From 5d2922c85f80725fb04000b2d54ec1b130b82830 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 14 Apr 2022 09:09:08 +0200 Subject: [PATCH 25/25] Fix move assignment operator argument constness, make inline --- cpp/src/arrow/chunk_resolver.cc | 6 ------ cpp/src/arrow/chunk_resolver.h | 6 +++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 1b91e068a63..4a1ba6d0a32 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -65,11 +65,5 @@ ChunkResolver::ChunkResolver(const std::vector& chunks) ChunkResolver::ChunkResolver(const RecordBatchVector& batches) : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {} -ChunkResolver& ChunkResolver::operator=(const ChunkResolver&& other) { - offsets_ = std::move(other.offsets_); - cached_chunk_.store(other.cached_chunk_.load()); - return *this; -} - } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 072df481f9d..f69253b1617 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -42,7 +42,11 @@ struct ChunkResolver { ChunkResolver(ChunkResolver&& other) : offsets_(std::move(other.offsets_)), cached_chunk_(other.cached_chunk_.load()) {} - ChunkResolver& operator=(const ChunkResolver&& other); + ChunkResolver& operator=(ChunkResolver&& other) { + offsets_ = std::move(other.offsets_); + cached_chunk_.store(other.cached_chunk_.load()); + return *this; + } /// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of /// the chunked array at logical index