diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 665867a2a93..7b85f943520 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -239,12 +239,6 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); capacity = std::max(capacity, kMinBuilderCapacity); - - if (capacity_ == 0) { - // Initialize hash table - // XXX should we let the user pass additional size heuristics? - delta_offset_ = 0; - } ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); capacity_ = indices_builder_.capacity(); return Status::OK(); @@ -266,6 +260,10 @@ class DictionaryBuilderBase : public ArrayBuilder { // Update internals for further uses of this DictionaryBuilder delta_offset_ = memo_table_->size(); + + // ARROW-6861: Manually set capacity/length/null count until we can fix up + // Reset to not reset the memo table in ARROW-6869 + capacity_ = length_ = null_count_ = 0; indices_builder_.Reset(); return Status::OK(); @@ -380,6 +378,7 @@ class DictionaryBuilder : public internal::DictionaryBuilderBaseindices_builder_.null_count(); ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); this->length_ += length; this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); @@ -402,6 +401,7 @@ class Dictionary32Builder : public internal::DictionaryBuilderBaseindices_builder_.null_count(); ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); this->length_ += length; this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); diff --git a/cpp/src/arrow/array_dict_test.cc b/cpp/src/arrow/array_dict_test.cc index 2c5e58a65b3..6728d48f97c 100644 --- a/cpp/src/arrow/array_dict_test.cc +++ b/cpp/src/arrow/array_dict_test.cc @@ -304,6 +304,57 @@ TYPED_TEST(TestDictionaryBuilder, Dictionary32_BasicPrimitive) { ASSERT_TRUE(expected.Equals(result)); } +TYPED_TEST(TestDictionaryBuilder, FinishResetBehavior) { + // ARROW-6861 + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + + Dictionary32Builder builder; + + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + + // Properties from indices_builder propagated + ASSERT_LT(0, builder.capacity()); + ASSERT_LT(0, builder.null_count()); + ASSERT_EQ(4, builder.length()); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + // Everything reset + ASSERT_EQ(0, builder.capacity()); + ASSERT_EQ(0, builder.length()); + ASSERT_EQ(0, builder.null_count()); + + // Use the builder again + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Append(static_cast(4))); + + ASSERT_OK(builder.Finish(&result)); + + // Dictionary has 4 elements because the dictionary memo was not reset. This + // behavior will change after ARROW-6869 + ASSERT_EQ(2, static_cast(*result).dictionary()->length()); +} + +TEST(TestDictionaryBuilderAdHoc, AppendIndicesUpdateCapacity) { + DictionaryBuilder builder; + Dictionary32Builder builder32; + + std::vector indices_i32 = {0, 1, 2}; + std::vector indices_i64 = {0, 1, 2}; + + ASSERT_OK(builder.AppendIndices(indices_i64.data(), 3)); + ASSERT_OK(builder32.AppendIndices(indices_i32.data(), 3)); + + ASSERT_LT(0, builder.capacity()); + ASSERT_LT(0, builder32.capacity()); +} + TEST(TestStringDictionaryBuilder, Basic) { // Build the dictionary Array StringDictionaryBuilder builder;