From 83bd54e258bc69d5549cbfb5c3805d301a2dff81 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 21 Dec 2020 11:03:26 -1000 Subject: [PATCH 1/4] Added concatenate implementation for dictionaries --- cpp/src/arrow/array/array_dict.cc | 41 +++++++++++++ cpp/src/arrow/array/array_dict_test.cc | 2 +- cpp/src/arrow/array/concatenate.cc | 62 ++++++++++++++++++- cpp/src/arrow/array/concatenate_test.cc | 80 +++++++++++++++++++++++++ cpp/src/arrow/type.h | 6 ++ 5 files changed, 189 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 614c81d8f40..041efefb6f0 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -44,6 +44,29 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; +struct MaxIndexAccessor { + MaxIndexAccessor() {} + + template + enable_if_t::value, Status> Visit(const T&) { + return Status::Invalid("Dictionary index types must be integer types"); + } + + template + enable_if_integer Visit(const T&) { + max_index_value_ = std::numeric_limits::max(); + return Status::OK(); + } + + int64_t max_index_value_ = 0; +}; + +Result DictionaryIndexMaxValue(std::shared_ptr index_type) { + MaxIndexAccessor index_accessor; + RETURN_NOT_OK(VisitTypeInline(*index_type, &index_accessor)); + return index_accessor.max_index_value_; +} + // ---------------------------------------------------------------------- // DictionaryArray @@ -205,6 +228,24 @@ class DictionaryUnifierImpl : public DictionaryUnifier { return Status::OK(); } + Status GetResultWithIndexType(std::shared_ptr index_type, + std::shared_ptr* out_dict) override { + int64_t dict_length = memo_table_.size(); + ARROW_ASSIGN_OR_RAISE(auto max_value, DictionaryIndexMaxValue(index_type)); + if (dict_length > max_value) { + return Status::Invalid( + "These dictionaries cannot be combined. The unified dictionary requires a " + "larger index type."); + } + + // Build unified dictionary array + std::shared_ptr data; + RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_, + 0 /* start_offset */, &data)); + *out_dict = MakeArray(data); + return Status::OK(); + } + private: MemoryPool* pool_; std::shared_ptr value_type_; diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index fca442b2567..a00fe822287 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -109,7 +109,6 @@ TYPED_TEST(TestDictionaryBuilder, MakeBuilder) { using c_type = typename TypeParam::c_type; auto value_type = std::make_shared(); - auto dict_array = ArrayFromJSON(value_type, "[1, 2]"); auto dict_type = dictionary(int8(), value_type); std::unique_ptr boxed_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); @@ -128,6 +127,7 @@ TYPED_TEST(TestDictionaryBuilder, MakeBuilder) { std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); + auto dict_array = ArrayFromJSON(value_type, "[1, 2]"); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); DictionaryArray expected(dict_type, int_array, dict_array); diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index a22b28be21a..64d0e00790d 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -163,6 +163,46 @@ static Status PutOffsets(const std::shared_ptr& src, Offset first_offset return Status::OK(); } +struct DictionaryConcatenate { + DictionaryConcatenate(BufferVector& index_buffers, + std::vector>& index_lookup, + MemoryPool* pool) + : out_(nullptr), + index_buffers_(index_buffers), + index_lookup_(index_lookup), + pool_(pool) {} + + template + enable_if_t::value, Status> Visit(const T& t) { + return Status::Invalid("Dictionary indices must be integral types"); + } + + template + enable_if_integer Visit(const T& index_type) { + int64_t out_length = 0; + for (const auto& buffer : index_buffers_) { + out_length += buffer->size(); + } + ARROW_ASSIGN_OR_RAISE(out_, AllocateBuffer(out_length, pool_)); + auto out_data = out_->mutable_data(); + for (size_t i = 0; i < index_buffers_.size(); i++) { + auto buffer = index_buffers_[i]; + auto old_indices = reinterpret_cast(buffer->data()); + auto indices_map = reinterpret_cast(index_lookup_[i]->mutable_data()); + for (int64_t j = 0; j < buffer->size(); j++) { + out_data[j] = indices_map[old_indices[j]]; + } + out_data += buffer->size(); + } + return Status::OK(); + } + + std::shared_ptr out_; + BufferVector& index_buffers_; + std::vector>& index_lookup_; + MemoryPool* pool_; +}; + class ConcatenateImpl { public: ConcatenateImpl(const std::vector>& in, @@ -255,6 +295,21 @@ class ConcatenateImpl { return Status::OK(); } + Result UnifyDictionaries(const DictionaryType& d) { + BufferVector new_index_lookup; + ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type())); + new_index_lookup.resize(in_.size()); + for (size_t i = 0; i < in_.size(); i++) { + auto item = in_[i]; + auto dictionary_array = MakeArray(item->dictionary); + RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i])); + } + std::shared_ptr out_dictionary; + RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary)); + out_->dictionary = out_dictionary->data(); + return new_index_lookup; + } + Status Visit(const DictionaryType& d) { auto fixed = internal::checked_cast(d.index_type().get()); @@ -274,7 +329,12 @@ class ConcatenateImpl { ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed)); return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]); } else { - return Status::NotImplemented("Concat with dictionary unification NYI"); + ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed)); + ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d)); + DictionaryConcatenate concatenate(index_buffers, index_lookup, pool_); + RETURN_NOT_OK(VisitTypeInline(*d.index_type(), &concatenate)); + out_->buffers[1] = std::move(concatenate.out_); + return Status::OK(); } } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 428967889ec..fb161fe1fe9 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -225,6 +225,86 @@ TEST_F(ConcatenateTest, DictionaryType) { }); } +TEST_F(ConcatenateTest, DictionaryTypeDifferentDictionaries) { + auto dict_type = dictionary(uint8(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]", + "[\"A0\", \"A1\", \"A2\", \"A3\"]"); + auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]", + "[\"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); + auto concat_expected = DictArrayFromJSON( + dict_type, "[1, 2, null, 3, 0, null, 8, 6, 5]", + "[\"A0\", \"A1\", \"A2\", \"A3\", \"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); + ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two})); + AssertArraysEqual(*concat_expected, *concat_actual); +} + +TEST_F(ConcatenateTest, DictionaryTypePartialOverlapDictionaries) { + auto dict_type = dictionary(uint8(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]", + "[\"A0\", \"A1\", \"C2\", \"C3\"]"); + auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]", + "[\"B0\", \"B1\", \"C2\", \"C3\", \"B4\"]"); + auto concat_expected = + DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0, null, 6, 2, 5]", + "[\"A0\", \"A1\", \"C2\", \"C3\", \"B0\", \"B1\", \"B4\"]"); + ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two})); + AssertArraysEqual(*concat_expected, *concat_actual); +} + +TEST_F(ConcatenateTest, DictionaryTypeDifferentSizeIndex) { + auto dict_type = dictionary(uint8(), utf8()); + auto bigger_dict_type = dictionary(uint16(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[0]", "[\"A0\"]"); + auto dict_two = DictArrayFromJSON(bigger_dict_type, "[0]", "[\"B0\"]"); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); +} + +TEST_F(ConcatenateTest, DictionaryTypeCantUnifyNullInDictionary) { + auto dict_type = dictionary(uint8(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"A\"]"); + auto dict_two = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"B\"]"); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); +} + +TEST_F(ConcatenateTest, DictionaryTypeEnlargedIndices) { + auto size = std::numeric_limits::max() + 1; + auto dict_type = dictionary(uint8(), uint16()); + + UInt8Builder index_builder; + ASSERT_OK(index_builder.Reserve(size)); + for (auto i = 0; i < size; i++) { + index_builder.UnsafeAppend(i); + } + ASSERT_OK_AND_ASSIGN(auto indices, index_builder.Finish()); + + UInt16Builder values_builder; + ASSERT_OK(values_builder.Reserve(size)); + for (auto i = 0; i < size; i++) { + values_builder.UnsafeAppend(i); + } + ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_builder.Finish()); + + UInt16Builder values_builder_two; + ASSERT_OK(values_builder_two.Reserve(size)); + for (auto i = size; i < 2 * size; i++) { + values_builder_two.UnsafeAppend(i); + } + ASSERT_OK_AND_ASSIGN(auto dictionary_two, values_builder_two.Finish()); + + auto dict_one = std::make_shared(dict_type, indices, dictionary_one); + auto dict_two = std::make_shared(dict_type, indices, dictionary_two); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); + + auto bigger_dict_type = dictionary(uint16(), uint16()); + + auto bigger_one = + std::make_shared(bigger_dict_type, dictionary_one, dictionary_one); + auto bigger_two = + std::make_shared(bigger_dict_type, dictionary_one, dictionary_two); + ASSERT_OK_AND_ASSIGN(auto combined, Concatenate({bigger_one, bigger_two})); + ASSERT_EQ(size * 2, combined->length()); +} + TEST_F(ConcatenateTest, DISABLED_UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a1071c4c86e..6726a7f36e2 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1367,6 +1367,12 @@ class ARROW_EXPORT DictionaryUnifier { /// after this is called virtual Status GetResult(std::shared_ptr* out_type, std::shared_ptr* out_dict) = 0; + + /// \brief Return a result DictionaryType with the given index type. If + /// the index type is not large enough then an invalid status will be returned. + /// The unifier cannot be used after this is called + virtual Status GetResultWithIndexType(std::shared_ptr index_type, + std::shared_ptr* out_dict) = 0; }; // ---------------------------------------------------------------------- From 7075b20cda225384d0d48fbf349cf3ff20facb63 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 21 Dec 2020 11:30:01 -1000 Subject: [PATCH 2/4] Fixed a test line I had accidentally moved --- cpp/src/arrow/array/array_dict_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index a00fe822287..fca442b2567 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -109,6 +109,7 @@ TYPED_TEST(TestDictionaryBuilder, MakeBuilder) { using c_type = typename TypeParam::c_type; auto value_type = std::make_shared(); + auto dict_array = ArrayFromJSON(value_type, "[1, 2]"); auto dict_type = dictionary(int8(), value_type); std::unique_ptr boxed_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); @@ -127,7 +128,6 @@ TYPED_TEST(TestDictionaryBuilder, MakeBuilder) { std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); - auto dict_array = ArrayFromJSON(value_type, "[1, 2]"); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); DictionaryArray expected(dict_type, int_array, dict_array); From 9fd5dbd09f79712aae0fac9bd942a68247a4a877 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 8 Jan 2021 11:56:00 -1000 Subject: [PATCH 3/4] Addressing PR comments. Added test case for 2 byte indices which revealed issue with the index mapping. Reusing IntUtil functions for transpose and identifying the max index value. Cleaned up some comments and styling --- cpp/src/arrow/array/array_dict.cc | 27 +-------- cpp/src/arrow/array/concatenate.cc | 45 ++++++++------- cpp/src/arrow/array/concatenate_test.cc | 76 +++++++++++++++++++++---- cpp/src/arrow/type.h | 2 +- 4 files changed, 92 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 041efefb6f0..dc9c23cdd80 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -29,6 +29,7 @@ #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/datum.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -44,29 +45,6 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; -struct MaxIndexAccessor { - MaxIndexAccessor() {} - - template - enable_if_t::value, Status> Visit(const T&) { - return Status::Invalid("Dictionary index types must be integer types"); - } - - template - enable_if_integer Visit(const T&) { - max_index_value_ = std::numeric_limits::max(); - return Status::OK(); - } - - int64_t max_index_value_ = 0; -}; - -Result DictionaryIndexMaxValue(std::shared_ptr index_type) { - MaxIndexAccessor index_accessor; - RETURN_NOT_OK(VisitTypeInline(*index_type, &index_accessor)); - return index_accessor.max_index_value_; -} - // ---------------------------------------------------------------------- // DictionaryArray @@ -231,8 +209,7 @@ class DictionaryUnifierImpl : public DictionaryUnifier { Status GetResultWithIndexType(std::shared_ptr index_type, std::shared_ptr* out_dict) override { int64_t dict_length = memo_table_.size(); - ARROW_ASSIGN_OR_RAISE(auto max_value, DictionaryIndexMaxValue(index_type)); - if (dict_length > max_value) { + if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) { return Status::Invalid( "These dictionaries cannot be combined. The unified dictionary requires a " "larger index type."); diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 64d0e00790d..1aa33c05f7c 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -36,6 +36,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -44,6 +45,7 @@ namespace arrow { using internal::SafeSignedAdd; +namespace { /// offset, length pair for representing a Range of a buffer or array struct Range { int64_t offset = -1, length = 0; @@ -66,8 +68,8 @@ struct Bitmap { }; // Allocate a buffer and concatenate bitmaps into it. -static Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, - std::shared_ptr* out) { +Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, + std::shared_ptr* out) { int64_t out_length = 0; for (const auto& bitmap : bitmaps) { if (internal::AddWithOverflow(out_length, bitmap.range.length, &out_length)) { @@ -94,15 +96,15 @@ static Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -static Status PutOffsets(const std::shared_ptr& src, Offset first_offset, - Offset* dst, Range* values_range); +Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, + Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, // also computing the ranges of values spanned by each buffer of offsets. template -static Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out, - std::vector* values_ranges) { +Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, + std::shared_ptr* out, + std::vector* values_ranges) { values_ranges->resize(buffers.size()); // allocate output buffer @@ -130,8 +132,8 @@ static Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, } template -static Status PutOffsets(const std::shared_ptr& src, Offset first_offset, - Offset* dst, Range* values_range) { +Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, + Range* values_range) { if (src->size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) @@ -164,8 +166,7 @@ static Status PutOffsets(const std::shared_ptr& src, Offset first_offset } struct DictionaryConcatenate { - DictionaryConcatenate(BufferVector& index_buffers, - std::vector>& index_lookup, + DictionaryConcatenate(BufferVector& index_buffers, BufferVector& index_lookup, MemoryPool* pool) : out_(nullptr), index_buffers_(index_buffers), @@ -184,22 +185,21 @@ struct DictionaryConcatenate { out_length += buffer->size(); } ARROW_ASSIGN_OR_RAISE(out_, AllocateBuffer(out_length, pool_)); - auto out_data = out_->mutable_data(); + CType* out_data = reinterpret_cast(out_->mutable_data()); for (size_t i = 0; i < index_buffers_.size(); i++) { - auto buffer = index_buffers_[i]; + const auto& buffer = index_buffers_[i]; + auto size = buffer->size() / sizeof(CType); auto old_indices = reinterpret_cast(buffer->data()); - auto indices_map = reinterpret_cast(index_lookup_[i]->mutable_data()); - for (int64_t j = 0; j < buffer->size(); j++) { - out_data[j] = indices_map[old_indices[j]]; - } - out_data += buffer->size(); + auto indices_map = reinterpret_cast(index_lookup_[i]->data()); + internal::TransposeInts(old_indices, out_data, size, indices_map); + out_data += size; } return Status::OK(); } std::shared_ptr out_; - BufferVector& index_buffers_; - std::vector>& index_lookup_; + const BufferVector& index_buffers_; + const BufferVector& index_lookup_; MemoryPool* pool_; }; @@ -324,12 +324,11 @@ class ConcatenateImpl { } } + ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed)); if (dictionaries_same) { out_->dictionary = in_[0]->dictionary; - ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed)); return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]); } else { - ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed)); ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d)); DictionaryConcatenate concatenate(index_buffers, index_lookup, pool_); RETURN_NOT_OK(VisitTypeInline(*d.index_type(), &concatenate)); @@ -476,6 +475,8 @@ class ConcatenateImpl { std::shared_ptr out_; }; +} // namespace + Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { if (arrays.size() == 0) { return Status::Invalid("Must pass at least one array"); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index fb161fe1fe9..b5ae8cabcf6 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -226,16 +226,72 @@ TEST_F(ConcatenateTest, DictionaryType) { } TEST_F(ConcatenateTest, DictionaryTypeDifferentDictionaries) { - auto dict_type = dictionary(uint8(), utf8()); - auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]", - "[\"A0\", \"A1\", \"A2\", \"A3\"]"); - auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]", - "[\"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); - auto concat_expected = DictArrayFromJSON( - dict_type, "[1, 2, null, 3, 0, null, 8, 6, 5]", - "[\"A0\", \"A1\", \"A2\", \"A3\", \"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); - ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two})); - AssertArraysEqual(*concat_expected, *concat_actual); + { + auto dict_type = dictionary(uint8(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]", + "[\"A0\", \"A1\", \"A2\", \"A3\"]"); + auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]", + "[\"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); + auto concat_expected = DictArrayFromJSON( + dict_type, "[1, 2, null, 3, 0, null, 8, 6, 5]", + "[\"A0\", \"A1\", \"A2\", \"A3\", \"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]"); + ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two})); + AssertArraysEqual(*concat_expected, *concat_actual); + } + { + const int SIZE = 500; + auto dict_type = dictionary(uint16(), utf8()); + + UInt16Builder index_builder; + UInt16Builder expected_index_builder; + ASSERT_OK(index_builder.Reserve(SIZE)); + ASSERT_OK(expected_index_builder.Reserve(SIZE * 2)); + for (auto i = 0; i < SIZE; i++) { + index_builder.UnsafeAppend(i); + expected_index_builder.UnsafeAppend(i); + } + for (auto i = SIZE; i < 2 * SIZE; i++) { + expected_index_builder.UnsafeAppend(i); + } + ASSERT_OK_AND_ASSIGN(auto indices, index_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto expected_indices, expected_index_builder.Finish()); + + // Creates three dictionaries. The first maps i->"{i}" the second maps i->"{500+i}", + // each for 500 values and the third maps i->"{i}" but for 1000 values. + // The first and second concatenated should end up equaling the third. All strings + // are padded to length 8 so we can know the size ahead of time. + StringBuilder values_one_builder; + StringBuilder values_two_builder; + StringBuilder expected_values_builder; + ASSERT_OK(values_one_builder.Resize(SIZE)); + ASSERT_OK(values_two_builder.Resize(SIZE)); + ASSERT_OK(expected_values_builder.Resize(SIZE * 2)); + ASSERT_OK(values_one_builder.ReserveData(8 * SIZE)); + ASSERT_OK(values_two_builder.ReserveData(8 * SIZE)); + ASSERT_OK(expected_values_builder.ReserveData(8 * SIZE * 2)); + for (auto i = 0; i < SIZE; i++) { + auto i_str = std::to_string(i); + auto padded = i_str.insert(0, 8 - i_str.length(), '0'); + values_one_builder.UnsafeAppend(padded); + expected_values_builder.UnsafeAppend(padded); + } + for (auto i = SIZE; i < 2 * SIZE; i++) { + auto i_str = std::to_string(i); + auto padded = i_str.insert(0, 8 - i_str.length(), '0'); + values_two_builder.UnsafeAppend(padded); + expected_values_builder.UnsafeAppend(padded); + } + ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_one_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto dictionary_two, values_two_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto expected_dictionary, expected_values_builder.Finish()); + + auto one = std::make_shared(dict_type, indices, dictionary_one); + auto two = std::make_shared(dict_type, indices, dictionary_two); + auto expected = std::make_shared(dict_type, expected_indices, + expected_dictionary); + ASSERT_OK_AND_ASSIGN(auto combined, Concatenate({one, two})); + AssertArraysEqual(*combined, *expected); + } } TEST_F(ConcatenateTest, DictionaryTypePartialOverlapDictionaries) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 6726a7f36e2..127ed598399 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1368,7 +1368,7 @@ class ARROW_EXPORT DictionaryUnifier { virtual Status GetResult(std::shared_ptr* out_type, std::shared_ptr* out_dict) = 0; - /// \brief Return a result DictionaryType with the given index type. If + /// \brief Return a unified dictionary with the given index type. If /// the index type is not large enough then an invalid status will be returned. /// The unifier cannot be used after this is called virtual Status GetResultWithIndexType(std::shared_ptr index_type, From a3a478ffd43bb3f8cccecbee9360c2a77511b301 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 8 Jan 2021 12:06:54 -1000 Subject: [PATCH 4/4] Reworked the double loops a bit to make it clearer what is being created --- cpp/src/arrow/array/concatenate_test.cc | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index b5ae8cabcf6..c6e6e5bcc57 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -262,28 +262,22 @@ TEST_F(ConcatenateTest, DictionaryTypeDifferentDictionaries) { // are padded to length 8 so we can know the size ahead of time. StringBuilder values_one_builder; StringBuilder values_two_builder; - StringBuilder expected_values_builder; ASSERT_OK(values_one_builder.Resize(SIZE)); ASSERT_OK(values_two_builder.Resize(SIZE)); - ASSERT_OK(expected_values_builder.Resize(SIZE * 2)); ASSERT_OK(values_one_builder.ReserveData(8 * SIZE)); ASSERT_OK(values_two_builder.ReserveData(8 * SIZE)); - ASSERT_OK(expected_values_builder.ReserveData(8 * SIZE * 2)); for (auto i = 0; i < SIZE; i++) { auto i_str = std::to_string(i); auto padded = i_str.insert(0, 8 - i_str.length(), '0'); values_one_builder.UnsafeAppend(padded); - expected_values_builder.UnsafeAppend(padded); - } - for (auto i = SIZE; i < 2 * SIZE; i++) { - auto i_str = std::to_string(i); - auto padded = i_str.insert(0, 8 - i_str.length(), '0'); - values_two_builder.UnsafeAppend(padded); - expected_values_builder.UnsafeAppend(padded); + auto upper_i_str = std::to_string(i + SIZE); + auto upper_padded = upper_i_str.insert(0, 8 - i_str.length(), '0'); + values_two_builder.UnsafeAppend(upper_padded); } ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_one_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto dictionary_two, values_two_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto expected_dictionary, expected_values_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto expected_dictionary, + Concatenate({dictionary_one, dictionary_two})) auto one = std::make_shared(dict_type, indices, dictionary_one); auto two = std::make_shared(dict_type, indices, dictionary_two); @@ -335,16 +329,13 @@ TEST_F(ConcatenateTest, DictionaryTypeEnlargedIndices) { UInt16Builder values_builder; ASSERT_OK(values_builder.Reserve(size)); + UInt16Builder values_builder_two; + ASSERT_OK(values_builder_two.Reserve(size)); for (auto i = 0; i < size; i++) { values_builder.UnsafeAppend(i); + values_builder_two.UnsafeAppend(i + size); } ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_builder.Finish()); - - UInt16Builder values_builder_two; - ASSERT_OK(values_builder_two.Reserve(size)); - for (auto i = size; i < 2 * size; i++) { - values_builder_two.UnsafeAppend(i); - } ASSERT_OK_AND_ASSIGN(auto dictionary_two, values_builder_two.Finish()); auto dict_one = std::make_shared(dict_type, indices, dictionary_one);