From 452f6331ee8b5c1bbad494e6ad03ac7ccc8bbd51 Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 3 Aug 2020 17:08:53 +0800 Subject: [PATCH 1/9] ARROW-9304: [C++] Add "AppendEmpty" builder APIs for use inside StructBuilder::AppendNull Change-Id: I2d04e0e102b5c92418fb1a691ca8b9b9f653f9fc --- cpp/src/arrow/array/array_struct_test.cc | 6 +-- cpp/src/arrow/array/builder_adaptive.h | 12 ++++++ cpp/src/arrow/array/builder_base.h | 3 ++ cpp/src/arrow/array/builder_binary.cc | 18 ++++++++ cpp/src/arrow/array/builder_binary.h | 27 ++++++++++++ cpp/src/arrow/array/builder_dict.h | 14 +++++++ cpp/src/arrow/array/builder_nested.cc | 35 +++++++++++++++- cpp/src/arrow/array/builder_nested.h | 53 ++++++++++++++++++++++-- cpp/src/arrow/array/builder_primitive.h | 46 ++++++++++++++++++++ cpp/src/arrow/array/builder_union.h | 34 +++++++++++++++ cpp/src/arrow/buffer_builder.h | 5 +++ cpp/src/arrow/pretty_print.cc | 15 ++++++- 12 files changed, 258 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 0afadcf9285..b3878db63e6 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -266,10 +266,8 @@ TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_EQ(2, result_->field(1)->length()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_TRUE(result_->field(0)->IsNull(0)); - ASSERT_TRUE(result_->field(0)->IsNull(1)); - ASSERT_TRUE(result_->field(1)->IsNull(0)); - ASSERT_TRUE(result_->field(1)->IsNull(1)); + ASSERT_EQ(2, result_->field(0)->null_count()); + ASSERT_EQ(2, result_->field(1)->null_count()); ASSERT_EQ(Type::LIST, result_->field(0)->type_id()); ASSERT_EQ(Type::INT32, result_->field(1)->type_id()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 1895f8ddaee..0a1ab4ec417 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -64,6 +64,18 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } + Status AppendEmpties(int64_t length) { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + length_ += length; + null_count_ += length; + null_bitmap_builder_.Forward(length); + return Status::OK(); + } + + Status AppendEmpty() { return AppendEmpties(1); } + void Reset() override; Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index d73681756ba..78818ff8eaf 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -100,6 +100,9 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendNull() = 0; virtual Status AppendNulls(int64_t length) = 0; + virtual Status AppendEmpty() = 0; + virtual Status AppendEmpties(int64_t length) = 0; + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index ecb0e95fb44..6631617d0b8 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -73,6 +73,24 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status FixedSizeBinaryBuilder::AppendEmpty() { + RETURN_NOT_OK(Reserve(1)); + byte_builder_.UnsafeAppend(byte_width_, 0); + null_bitmap_builder_.Forward(1); + ++length_; + ++null_count_; + return Status::OK(); +} + +Status FixedSizeBinaryBuilder::AppendEmpties(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0); + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return Status::OK(); +} + void FixedSizeBinaryBuilder::Reset() { ArrayBuilder::Reset(); byte_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 70fde2a2e64..f95fa1f4158 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -94,6 +94,30 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmpty() final { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + null_bitmap_builder_.Forward(1); + ++null_count_; + ++length_; + return Status::OK(); + } + + Status AppendEmpties(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { + return AppendOverflow(num_bytes); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + null_bitmap_builder_.Forward(length); + null_count_ += length; + length_ += length; + return Status::OK(); + } + /// \brief Append without checking capacity /// /// Offsets and data should have been presized using Reserve() and @@ -440,6 +464,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; + Status AppendEmpty() final; + Status AppendEmpties(int64_t length) final; + void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 7f920bb8cd6..393277d3720 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -261,6 +261,20 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmpty() { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendEmpty(); + } + + Status AppendEmpties(int64_t length) { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendEmpties(length); + } + /// \brief Insert values into the dictionary's memo, but do not append any /// indices. Can be used to initialize a new builder with known dictionary /// values diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index c3786a8fab4..20a19bab454 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -123,6 +123,24 @@ Status MapBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status MapBuilder::AppendEmpty() { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendEmpty()); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + +Status MapBuilder::AppendEmpties(int64_t length) { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendNulls(length)); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + Status MapBuilder::AdjustStructBuilderLength() { // If key/item builders have been appended, adjust struct builder length // to match. Struct and key are non-nullable, append all valid values. @@ -193,6 +211,21 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { " elements, have ", new_elements); } return Status::OK(); + +Status FixedSizeListBuilder::AppendEmpty() { + RETURN_NOT_OK(Reserve(1)); + null_bitmap_builder_.Forward(1); + ++length_; + ++null_count_; + return value_builder_->AppendEmpties(list_size_); +} + +Status FixedSizeListBuilder::AppendEmpties(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return value_builder_->AppendEmpties(list_size_ * length); } Status FixedSizeListBuilder::Resize(int64_t capacity) { @@ -234,7 +267,7 @@ void StructBuilder::Reset() { Status StructBuilder::AppendNulls(int64_t length) { for (const auto& field : children_) { - RETURN_NOT_OK(field->AppendNulls(length)); + RETURN_NOT_OK(field->AppendEmpties(length)); } ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index b8948403acc..d1a1575ae94 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -109,6 +109,25 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmpty() final { + null_bitmap_builder_.Forward(1); + ++length_; + ++null_count_; + return AppendNextOffset(); + } + + Status AppendEmpties(int64_t length) final { + ARROW_RETURN_NOT_OK(CheckNextOffset()); + const int64_t num_values = value_builder_->length(); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_values)); + } + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(AppendNextOffset()); @@ -258,6 +277,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; + Status AppendEmpty() final; + + Status AppendEmpties(int64_t length) final; + /// \brief Get builder to append keys. /// /// Append a key with this builder should be followed by appending @@ -353,6 +376,10 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status ValidateOverflow(int64_t new_elements); + Status AppendEmpty() final; + + Status AppendEmpties(int64_t length) final; + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { @@ -410,19 +437,39 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a null value. Automatically appends a null to each child + /// \brief Append a null value. Automatically appends a empty to each child /// builder. Status AppendNull() final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendNull()); + ARROW_RETURN_NOT_OK(field->AppendEmpty()); } return Append(false); } - /// \brief Append multiple null values. Automatically appends nulls to each + /// \brief Append multiple null values. Automatically appends empties to each /// child builder. Status AppendNulls(int64_t length) final; + Status AppendEmpty() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmpty()); + } + null_bitmap_builder_.Forward(1); + length_++; + null_count_++; + return Status::OK(); + } + + Status AppendEmpties(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmpties(length)); + } + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return Status::OK(); + } + void Reset() override; ArrayBuilder* field_builder(int i) const { return children_[i].get(); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index cc907fb6b8a..5db1bf39156 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -46,6 +46,14 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { return AppendNulls(1); } + Status AppendEmpties(int64_t length) final { + if (length < 0) return Status::Invalid("length must be positive"); + length_ += length; + return Status::OK(); + } + + Status AppendEmpty() final { return AppendEmpties(1); } + Status Append(std::nullptr_t) { return AppendNull(); } Status FinishInternal(std::shared_ptr* out) override; @@ -100,6 +108,26 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a empty element + Status AppendEmpty() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + null_bitmap_builder_.Forward(1); + ++length_; + ++null_count_; + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmpties(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return Status::OK(); + } + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } void Reset() override { data_builder_.Reset(); } @@ -297,6 +325,24 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmpty() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(false); + null_bitmap_builder_.Forward(1); + ++length_; + ++null_count_; + return Status::OK(); + } + + Status AppendEmpties(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + null_bitmap_builder_.Forward(length); + length_ += length; + null_count_ += length; + return Status::OK(); + } + /// Scalar append Status Append(const bool val) { ARROW_RETURN_NOT_OK(Reserve(1)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 1ccc7ef159f..db1ece2e22e 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -117,6 +117,24 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return child_builder->AppendNull(); } + Status AppendEmpty() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + return child_builder->AppendEmpty(); + } + + Status AppendEmpties(int64_t length) { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + return child_builder->AppendEmpty(); + } + /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// @@ -180,6 +198,22 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { return Status::OK(); } + Status AppendEmpty() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmpty()); + } + return Status::OK(); + } + + Status AppendEmpties(int64_t length) { + ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmpties(length)); + } + return Status::OK(); + } + /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 41a47c91729..ff4a1482b6b 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -292,6 +292,11 @@ class TypedBufferBuilder { return Status::OK(); } + void Forward(int64_t num_elements) { + false_count_ += num_elements; + bit_length_ += num_elements; + } + void UnsafeAppend(bool value) { BitUtil::SetBitTo(mutable_data(), bit_length_, value); if (!value) { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index f88230ffd64..050d14547ac 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/pretty_print.h" + #include #include #include @@ -28,7 +30,6 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" -#include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" @@ -327,7 +328,6 @@ class ArrayPrinter : public PrettyPrinter { if (offset != 0) { field = field->Slice(offset, length); } - RETURN_NOT_OK(PrettyPrint(*field, indent_ + options_.indent_size, sink_)); } return Status::OK(); @@ -339,6 +339,17 @@ class ArrayPrinter : public PrettyPrinter { children.reserve(array.num_fields()); for (int i = 0; i < array.num_fields(); ++i) { children.emplace_back(array.field(i)); + // set field array null at index which parent struct is null, + // because StructBuilder#AppendNull dosen't set child null bitmap. + auto field_null_buf = array.field(i)->null_bitmap(); + if (field_null_buf != nullptr) { + uint8_t* field_null_data = field_null_buf->mutable_data(); + for (int k = 0; k < array.length(); k++) { + if (array.IsNull(k)) { + BitUtil::SetBitTo(field_null_data, k, false); + } + } + } } return PrintChildren(children, 0, array.length()); } From 342cc00ebf3284bd2f6a061226f765c210f6323a Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 17 Aug 2020 11:26:09 +0800 Subject: [PATCH 2/9] fix build Change-Id: I9183b580f293d74cd7d083f4a3e879c0ae51d70d --- cpp/src/arrow/array/builder_adaptive.h | 4 ++-- cpp/src/arrow/array/builder_dict.h | 4 ++-- cpp/src/arrow/array/builder_union.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 0a1ab4ec417..e5813ff9d00 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -64,7 +64,7 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendEmpties(int64_t length) { + Status AppendEmpties(int64_t length) final { ARROW_RETURN_NOT_OK(CommitPendingData()); ARROW_RETURN_NOT_OK(Reserve(length)); memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); @@ -74,7 +74,7 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendEmpty() { return AppendEmpties(1); } + Status AppendEmpty() final { return AppendEmpties(1); } void Reset() override; Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 393277d3720..7b10e3265fc 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -261,14 +261,14 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } - Status AppendEmpty() { + Status AppendEmpty() final { length_ += 1; null_count_ += 1; return indices_builder_.AppendEmpty(); } - Status AppendEmpties(int64_t length) { + Status AppendEmpties(int64_t length) final { length_ += length; null_count_ += length; diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index db1ece2e22e..e77d4c5b6f2 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -126,7 +126,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return child_builder->AppendEmpty(); } - Status AppendEmpties(int64_t length) { + Status AppendEmpties(int64_t length) final { const int8_t first_child_code = type_codes_[0]; ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); @@ -206,7 +206,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { return Status::OK(); } - Status AppendEmpties(int64_t length) { + Status AppendEmpties(int64_t length) final { ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); for (int8_t code : type_codes_) { ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmpties(length)); From fecc0fd9fa81dbd5397e3b7cc238d0b9f4a3582a Mon Sep 17 00:00:00 2001 From: tianchen Date: Mon, 7 Sep 2020 12:32:00 +0800 Subject: [PATCH 3/9] minor fix Change-Id: I745c2171929d4039d059496e6b80fc4e381cc97b --- cpp/src/arrow/array/array_struct_test.cc | 8 ++-- cpp/src/arrow/array/builder_adaptive.h | 8 ++-- cpp/src/arrow/array/builder_base.h | 4 +- cpp/src/arrow/array/builder_binary.cc | 14 +++---- cpp/src/arrow/array/builder_binary.h | 16 +++---- cpp/src/arrow/array/builder_dict.h | 22 ++++++++-- cpp/src/arrow/array/builder_nested.cc | 26 +++++------- cpp/src/arrow/array/builder_nested.h | 42 +++++++------------ cpp/src/arrow/array/builder_primitive.h | 28 +++++-------- cpp/src/arrow/array/builder_union.h | 18 ++++---- cpp/src/arrow/buffer_builder.h | 5 --- cpp/src/arrow/json/parser_test.cc | 13 +++--- cpp/src/arrow/pretty_print.cc | 11 ----- cpp/src/arrow/pretty_print_test.cc | 11 ++--- python/pyarrow/tests/test_array.py | 4 +- .../test/test-struct-array-builder.rb | 16 +++---- ruby/red-arrow/test/test-struct-array.rb | 4 +- 17 files changed, 110 insertions(+), 140 deletions(-) diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index b3878db63e6..39ad61b4562 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include + #include #include #include #include -#include - #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/status.h" @@ -266,8 +266,8 @@ TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_EQ(2, result_->field(1)->length()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_EQ(2, result_->field(0)->null_count()); - ASSERT_EQ(2, result_->field(1)->null_count()); + ASSERT_EQ(0, result_->field(0)->null_count()); + ASSERT_EQ(0, result_->field(1)->null_count()); ASSERT_EQ(Type::LIST, result_->field(0)->type_id()); ASSERT_EQ(Type::INT32, result_->field(1)->type_id()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index e5813ff9d00..500c99153a3 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -64,17 +64,15 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(CommitPendingData()); ARROW_RETURN_NOT_OK(Reserve(length)); memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - length_ += length; - null_count_ += length; - null_bitmap_builder_.Forward(length); + UnsafeSetNotNull(length); return Status::OK(); } - Status AppendEmpty() final { return AppendEmpties(1); } + Status AppendEmptyValue() final { return AppendEmptyValues(1); } void Reset() override; Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 78818ff8eaf..1190354d50b 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -100,8 +100,8 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendNull() = 0; virtual Status AppendNulls(int64_t length) = 0; - virtual Status AppendEmpty() = 0; - virtual Status AppendEmpties(int64_t length) = 0; + virtual Status AppendEmptyValue() = 0; + virtual Status AppendEmptyValues(int64_t length) = 0; /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 6631617d0b8..6822dc89903 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -73,21 +73,17 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) { return Status::OK(); } -Status FixedSizeBinaryBuilder::AppendEmpty() { +Status FixedSizeBinaryBuilder::AppendEmptyValue() { RETURN_NOT_OK(Reserve(1)); - byte_builder_.UnsafeAppend(byte_width_, 0); - null_bitmap_builder_.Forward(1); - ++length_; - ++null_count_; + UnsafeAppendToBitmap(true); + byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); return Status::OK(); } -Status FixedSizeBinaryBuilder::AppendEmpties(int64_t length) { +Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) { RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0); - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index f95fa1f4158..f52a3dca2bb 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -94,16 +94,14 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendEmpty() final { + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(AppendNextOffset()); ARROW_RETURN_NOT_OK(Reserve(1)); - null_bitmap_builder_.Forward(1); - ++null_count_; - ++length_; + UnsafeAppendToBitmap(true); return Status::OK(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { return AppendOverflow(num_bytes); @@ -112,9 +110,7 @@ class BaseBinaryBuilder : public ArrayBuilder { for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); } - null_bitmap_builder_.Forward(length); - null_count_ += length; - length_ += length; + UnsafeAppendToBitmap(length, true); return Status::OK(); } @@ -464,8 +460,8 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; - Status AppendEmpty() final; - Status AppendEmpties(int64_t length) final; + Status AppendEmptyValue() final; + Status AppendEmptyValues(int64_t length) final; void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 7b10e3265fc..e7e715f8696 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -261,18 +261,18 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } - Status AppendEmpty() final { + Status AppendEmptyValue() final { length_ += 1; null_count_ += 1; - return indices_builder_.AppendEmpty(); + return indices_builder_.AppendEmptyValue(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { length_ += length; null_count_ += length; - return indices_builder_.AppendEmpties(length); + return indices_builder_.AppendEmptyValues(length); } /// \brief Insert values into the dictionary's memo, but do not append any @@ -451,6 +451,20 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmptyValue() final { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + /// \brief Append a whole dense array to the builder Status AppendArray(const Array& array) { #ifndef NDEBUG diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 20a19bab454..fd6b9005736 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -123,19 +123,19 @@ Status MapBuilder::AppendNulls(int64_t length) { return Status::OK(); } -Status MapBuilder::AppendEmpty() { +Status MapBuilder::AppendEmptyValue() { DCHECK_EQ(item_builder_->length(), key_builder_->length()); RETURN_NOT_OK(AdjustStructBuilderLength()); - RETURN_NOT_OK(list_builder_->AppendEmpty()); + RETURN_NOT_OK(list_builder_->AppendEmptyValue()); length_ = list_builder_->length(); null_count_ = list_builder_->null_count(); return Status::OK(); } -Status MapBuilder::AppendEmpties(int64_t length) { +Status MapBuilder::AppendEmptyValues(int64_t length) { DCHECK_EQ(item_builder_->length(), key_builder_->length()); RETURN_NOT_OK(AdjustStructBuilderLength()); - RETURN_NOT_OK(list_builder_->AppendNulls(length)); + RETURN_NOT_OK(list_builder_->AppendEmptyValues(length)); length_ = list_builder_->length(); null_count_ = list_builder_->null_count(); return Status::OK(); @@ -212,20 +212,16 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { } return Status::OK(); -Status FixedSizeListBuilder::AppendEmpty() { +Status FixedSizeListBuilder::AppendEmptyValue() { RETURN_NOT_OK(Reserve(1)); - null_bitmap_builder_.Forward(1); - ++length_; - ++null_count_; - return value_builder_->AppendEmpties(list_size_); + UnsafeAppendToBitmap(true); + return value_builder_->AppendEmptyValues(list_size_); } -Status FixedSizeListBuilder::AppendEmpties(int64_t length) { +Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) { RETURN_NOT_OK(Reserve(length)); - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; - return value_builder_->AppendEmpties(list_size_ * length); + UnsafeAppendToBitmap(length, true); + return value_builder_->AppendEmptyValues(list_size_ * length); } Status FixedSizeListBuilder::Resize(int64_t capacity) { @@ -267,7 +263,7 @@ void StructBuilder::Reset() { Status StructBuilder::AppendNulls(int64_t length) { for (const auto& field : children_) { - RETURN_NOT_OK(field->AppendEmpties(length)); + RETURN_NOT_OK(field->AppendEmptyValues(length)); } ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d1a1575ae94..b2b632748c5 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -109,22 +109,16 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendEmpty() final { - null_bitmap_builder_.Forward(1); - ++length_; - ++null_count_; - return AppendNextOffset(); - } + Status AppendEmptyValue() final { return Append(true); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); ARROW_RETURN_NOT_OK(CheckNextOffset()); + UnsafeAppendToBitmap(length, true); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_values)); } - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; return Status::OK(); } @@ -277,9 +271,9 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; - Status AppendEmpty() final; + Status AppendEmptyValue() final; - Status AppendEmpties(int64_t length) final; + Status AppendEmptyValues(int64_t length) final; /// \brief Get builder to append keys. /// @@ -376,9 +370,9 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status ValidateOverflow(int64_t new_elements); - Status AppendEmpty() final; + Status AppendEmptyValue() final; - Status AppendEmpties(int64_t length) final; + Status AppendEmptyValues(int64_t length) final; ArrayBuilder* value_builder() const { return value_builder_.get(); } @@ -441,7 +435,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { /// builder. Status AppendNull() final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendEmpty()); + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); } return Append(false); } @@ -450,23 +444,19 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { /// child builder. Status AppendNulls(int64_t length) final; - Status AppendEmpty() final { + Status AppendEmptyValue() final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendEmpty()); + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); } - null_bitmap_builder_.Forward(1); - length_++; - null_count_++; - return Status::OK(); + return Append(true); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendEmpties(length)); + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); } - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 5db1bf39156..d66f1308422 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -46,13 +46,13 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { return AppendNulls(1); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { if (length < 0) return Status::Invalid("length must be positive"); length_ += length; return Status::OK(); } - Status AppendEmpty() final { return AppendEmpties(1); } + Status AppendEmptyValue() final { return AppendEmptyValues(1); } Status Append(std::nullptr_t) { return AppendNull(); } @@ -109,22 +109,18 @@ class NumericBuilder : public ArrayBuilder { } /// \brief Append a empty element - Status AppendEmpty() final { + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(Reserve(1)); data_builder_.UnsafeAppend(value_type{}); // zero - null_bitmap_builder_.Forward(1); - ++length_; - ++null_count_; + UnsafeAppendToBitmap(true); return Status::OK(); } /// \brief Append several empty elements - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(length, value_type{}); // zero - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; + UnsafeSetNotNull(length); return Status::OK(); } @@ -325,21 +321,17 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendEmpty() final { + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeSetNotNull(1); data_builder_.UnsafeAppend(false); - null_bitmap_builder_.Forward(1); - ++length_; - ++null_count_; return Status::OK(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(length, false); - null_bitmap_builder_.Forward(length); - length_ += length; - null_count_ += length; + UnsafeSetNotNull(length); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index e77d4c5b6f2..542432807d3 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -117,22 +117,23 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return child_builder->AppendNull(); } - Status AppendEmpty() final { + Status AppendEmptyValue() final { const int8_t first_child_code = type_codes_[0]; ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); ARROW_RETURN_NOT_OK( offsets_builder_.Append(static_cast(child_builder->length()))); - return child_builder->AppendEmpty(); + // Append a empty value to the first child + return child_builder->AppendEmptyValue(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { const int8_t first_child_code = type_codes_[0]; ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); ARROW_RETURN_NOT_OK( offsets_builder_.Append(length, static_cast(child_builder->length()))); - return child_builder->AppendEmpty(); + return child_builder->AppendEmptyValues(length); } /// \brief Append an element to the UnionArray. This must be followed @@ -198,18 +199,19 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { return Status::OK(); } - Status AppendEmpty() final { + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmpty()); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue()); } return Status::OK(); } - Status AppendEmpties(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); + // Append empties to children for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmpties(length)); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); } return Status::OK(); } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index ff4a1482b6b..41a47c91729 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -292,11 +292,6 @@ class TypedBufferBuilder { return Status::OK(); } - void Forward(int64_t num_elements) { - false_count_ += num_elements; - bit_length_ += num_elements; - } - void UnsafeAppend(bool value) { BitUtil::SetBitTo(mutable_data(), bit_length_, value); if (!value) { diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index 49332990b2e..e22648e84b3 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -15,15 +15,16 @@ // specific language governing permissions and limitations // under the License. -#include -#include -#include +#include "arrow/json/parser.h" #include #include +#include +#include +#include + #include "arrow/json/options.h" -#include "arrow/json/parser.h" #include "arrow/json/test_common.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" @@ -188,7 +189,7 @@ TEST(BlockParserWithSchema, Nested) { field("nuf", struct_({field("ps", utf8())}))}, {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", R"([["1", "2", "3"], ["2"], [], null])", - R"([{"ps":null}, null, {"ps":"78"}, {"ps":"90"}])"}); + R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); } TEST(BlockParserWithSchema, FailOnIncompleteJson) { @@ -217,7 +218,7 @@ TEST(BlockParser, Nested) { field("nuf", struct_({field("ps", utf8())}))}, {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", R"([["1", "2", "3"], ["2"], [], null])", - R"([{"ps":null}, null, {"ps":"78"}, {"ps":"90"}])"}); + R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); } TEST(BlockParser, AdHoc) { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 050d14547ac..9223ce7fba6 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -339,17 +339,6 @@ class ArrayPrinter : public PrettyPrinter { children.reserve(array.num_fields()); for (int i = 0; i < array.num_fields(); ++i) { children.emplace_back(array.field(i)); - // set field array null at index which parent struct is null, - // because StructBuilder#AppendNull dosen't set child null bitmap. - auto field_null_buf = array.field(i)->null_bitmap(); - if (field_null_buf != nullptr) { - uint8_t* field_null_data = field_null_buf->mutable_data(); - for (int k = 0; k < array.length(); k++) { - if (array.IsNull(k)) { - BitUtil::SetBitTo(field_null_data, k, false); - } - } - } } return PrintChildren(children, 0, array.length()); } diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 6124b8f2ddc..9e58e46fe94 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/pretty_print.h" + +#include + #include #include #include @@ -22,11 +26,8 @@ #include #include -#include - #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -324,13 +325,13 @@ TEST_F(TestPrettyPrint, StructTypeAdvanced) { -- child 0 type: int32 [ 11, - null, + 0, null ] -- child 1 type: int32 [ 22, - null, + 0, 33 ])expected"; CheckStream(*array, {0, 10}, ex); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c28a9995cf2..5c49dcd4937 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2146,12 +2146,12 @@ def test_buffers_nested(): assert bytearray(null_bitmap)[0] == 0b00000101 # The child buffers: 'a' null_bitmap = buffers[1].to_pybytes() - assert bytearray(null_bitmap)[0] == 0b00000001 + assert bytearray(null_bitmap)[0] == 0b00000011 values = buffers[2].to_pybytes() assert struct.unpack('bxx', values) == (42,) # The child buffers: 'b' null_bitmap = buffers[3].to_pybytes() - assert bytearray(null_bitmap)[0] == 0b00000100 + assert bytearray(null_bitmap)[0] == 0b00000110 values = buffers[4].to_pybytes() assert struct.unpack('4xh', values) == (43,) diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb index 0101719f14e..ab0aa5edffa 100644 --- a/ruby/red-arrow/test/test-struct-array-builder.rb +++ b/ruby/red-arrow/test/test-struct-array-builder.rb @@ -27,8 +27,8 @@ def setup @builder.append_value(nil) array = @builder.finish assert_equal([ - [nil], - [nil], + [false], + [0], ], [ array.find_field(0).to_a, @@ -87,8 +87,8 @@ def setup @builder.append_values([nil]) array = @builder.finish assert_equal([ - [nil], - [nil], + [false], + [0], ], [ array.find_field(0).to_a, @@ -130,8 +130,8 @@ def setup ]) array = @builder.finish assert_equal([ - [nil, true, false], - [nil, 1, 2], + [false, true, false], + [0, 1, 2], ], [ array.find_field(0).to_a, @@ -152,8 +152,8 @@ def setup ]) array = @builder.finish assert_equal([ - [true, nil, true], - [1, nil, 3], + [true, false, true], + [1, 0, 3], ], [ array.find_field(0).to_a, diff --git a/ruby/red-arrow/test/test-struct-array.rb b/ruby/red-arrow/test/test-struct-array.rb index 9ecdd98f4d4..2c01f33ef8d 100644 --- a/ruby/red-arrow/test/test-struct-array.rb +++ b/ruby/red-arrow/test/test-struct-array.rb @@ -27,8 +27,8 @@ class StructArrayTest < Test::Unit::TestCase ] array = Arrow::StructArray.new(data_type, values) assert_equal([ - [true, nil, false], - [1, nil, 2], + [true, false, false], + [1, 0, 2], ], [ array.find_field(0).to_a, From 65a195e9f5e3ecb6cc08af35f45d8fa86408880d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 9 Sep 2020 15:37:38 +0200 Subject: [PATCH 4/9] Disable failing Parquet test Add some simple JSON tests --- cpp/src/arrow/ipc/json_simple_test.cc | 29 ++++++++++++++++++- .../parquet/arrow/arrow_reader_writer_test.cc | 23 +++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index f6a6a92c5f7..639be263fa4 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -973,7 +973,34 @@ TEST(TestStruct, SimpleStruct) { ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); - // With nulls + // With nulls only in parent + ArrayFromVector({4, 5, 6}, &a); + ArrayFromVector({true, false, false}, &b); + children.assign({a, b}); + BitmapFromVector({true, false, true}, &null_bitmap); + expected = std::make_shared(type, 3, children, null_bitmap, 1); + + ASSERT_OK(ArrayFromJSON(type, "[[4, true], null, [6, false]]", &actual)); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + ASSERT_OK(ArrayFromJSON( + type, "[{\"a\": 4, \"b\": true}, null, {\"b\": false, \"a\": 6}]", &actual)); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + + // With nulls only in children + is_valid = {false, true, true}; + ArrayFromVector(is_valid, {0, 5, 6}, &a); + is_valid = {true, false, true}; + ArrayFromVector(is_valid, {true, true, true}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 3, children); + + ASSERT_OK(ArrayFromJSON(type, "[[null, true], [5, null], [6, true]]", &actual)); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + + // With nulls in both is_valid = {false, true, false, false}; ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); is_valid = {false, false, true, false}; diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index da79a7d4baa..f4cd1a8dbc5 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -2345,7 +2345,8 @@ TEST(ArrowReadWrite, SimpleStructRoundTrip) { 2); } -TEST(ArrowReadWrite, SingleColumnNullableStruct) { +// Roundtrip bug similar to NestedOptionalRequired in reconstruct_internal_test.cc +TEST(ArrowReadWrite, DISABLED_SingleColumnNullableStruct1) { auto links = field("Links", ::arrow::struct_({field("Backward", ::arrow::int64(), /*nullable=*/true)})); @@ -2395,7 +2396,25 @@ TEST(ArrowReadWrite, NestedNullableField) { /*row_group_size=*/8); } -TEST(TestArrowReadWrite, CanonicalNestedRoundTrip) { +TEST(ArrowReadWrite, SingleColumnNullableStruct2) { + auto links = + field("Links", + ::arrow::struct_({field("Backward", ::arrow::int64(), /*nullable=*/true)})); + + auto links_id_array = ::arrow::ArrayFromJSON(links->type(), + "[null, " + "{\"Backward\": null}" + "]"); + + CheckSimpleRoundtrip( + ::arrow::Table::Make(std::make_shared<::arrow::Schema>( + std::vector>{links}), + {links_id_array}), + 3); +} + +// Disabled until implementation can be finished. +TEST(TestArrowReadWrite, DISABLED_CanonicalNestedRoundTrip) { auto doc_id = field("DocId", ::arrow::int64(), /*nullable=*/false); auto links = field( "Links", From 24da1c3d7ed749e87092159a415b9715e988fe1e Mon Sep 17 00:00:00 2001 From: tianchen Date: Thu, 10 Sep 2020 10:30:34 +0800 Subject: [PATCH 5/9] fix some comments, do not increase null_count Change-Id: I94b27df7b2db33f8625713818bad7c2055b9b93a --- cpp/src/arrow/array/builder_adaptive.h | 13 ++++++++++++- cpp/src/arrow/array/builder_dict.h | 4 ---- cpp/src/arrow/array/builder_nested.h | 4 ++-- cpp/src/arrow/array/builder_primitive.h | 6 +----- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 500c99153a3..9220cd61018 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -72,7 +72,18 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status AppendEmptyValue() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 1; + pending_has_nulls_ = false; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } void Reset() override; Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index e7e715f8696..d15855b0a89 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -263,14 +263,12 @@ class DictionaryBuilderBase : public ArrayBuilder { Status AppendEmptyValue() final { length_ += 1; - null_count_ += 1; return indices_builder_.AppendEmptyValue(); } Status AppendEmptyValues(int64_t length) final { length_ += length; - null_count_ += length; return indices_builder_.AppendEmptyValues(length); } @@ -453,14 +451,12 @@ class DictionaryBuilderBase : public ArrayBuilder { Status AppendEmptyValue() final { length_ += 1; - null_count_ += 1; return indices_builder_.AppendEmptyValue(); } Status AppendEmptyValues(int64_t length) final { length_ += length; - null_count_ += length; return indices_builder_.AppendEmptyValues(length); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index b2b632748c5..6371e6b8e16 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -431,7 +431,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a null value. Automatically appends a empty to each child + /// \brief Append a null value. Automatically appends an empty value to each child /// builder. Status AppendNull() final { for (const auto& field : children_) { @@ -440,7 +440,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Append(false); } - /// \brief Append multiple null values. Automatically appends empties to each + /// \brief Append multiple null values. Automatically appends empty values to each /// child builder. Status AppendNulls(int64_t length) final; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index d66f1308422..ef0981f112a 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -46,11 +46,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { return AppendNulls(1); } - Status AppendEmptyValues(int64_t length) final { - if (length < 0) return Status::Invalid("length must be positive"); - length_ += length; - return Status::OK(); - } + Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); } Status AppendEmptyValue() final { return AppendEmptyValues(1); } From 5df0eebd58a3467a930b1a553de448b34d75ddc0 Mon Sep 17 00:00:00 2001 From: tianchen Date: Fri, 11 Sep 2020 10:22:40 +0800 Subject: [PATCH 6/9] add documents Change-Id: Icf1c2781fc6d738338eea3854d10d4430660e478 --- cpp/src/arrow/array/builder_base.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 1190354d50b..18c027b7b4c 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -100,6 +100,11 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendNull() = 0; virtual Status AppendNulls(int64_t length) = 0; + /// Append an empty value to builder with null bitmap slot set to true. + /// Now it's mainly used in StructBuilder#AppendNull. + /// For Struct [{"joe", 1}, {null, 2}, null, {"mark", 4}] constructed with + /// StructBuilder, it's child arrays are: utf8 ["joe", null, "", "mark"], int32 [1, 2, + /// 0, 4] virtual Status AppendEmptyValue() = 0; virtual Status AppendEmptyValues(int64_t length) = 0; From d37d7d6acbd2f07a46d54ce747d9b009f63afa95 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sat, 10 Oct 2020 15:59:31 +0800 Subject: [PATCH 7/9] Revert "Disable failing Parquet test" --- cpp/src/arrow/ipc/json_simple_test.cc | 29 +------------------ .../parquet/arrow/arrow_reader_writer_test.cc | 23 ++------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 639be263fa4..f6a6a92c5f7 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -973,34 +973,7 @@ TEST(TestStruct, SimpleStruct) { ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); - // With nulls only in parent - ArrayFromVector({4, 5, 6}, &a); - ArrayFromVector({true, false, false}, &b); - children.assign({a, b}); - BitmapFromVector({true, false, true}, &null_bitmap); - expected = std::make_shared(type, 3, children, null_bitmap, 1); - - ASSERT_OK(ArrayFromJSON(type, "[[4, true], null, [6, false]]", &actual)); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - ASSERT_OK(ArrayFromJSON( - type, "[{\"a\": 4, \"b\": true}, null, {\"b\": false, \"a\": 6}]", &actual)); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - - // With nulls only in children - is_valid = {false, true, true}; - ArrayFromVector(is_valid, {0, 5, 6}, &a); - is_valid = {true, false, true}; - ArrayFromVector(is_valid, {true, true, true}, &b); - children.assign({a, b}); - expected = std::make_shared(type, 3, children); - - ASSERT_OK(ArrayFromJSON(type, "[[null, true], [5, null], [6, true]]", &actual)); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - - // With nulls in both + // With nulls is_valid = {false, true, false, false}; ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); is_valid = {false, false, true, false}; diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f4cd1a8dbc5..da79a7d4baa 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -2345,8 +2345,7 @@ TEST(ArrowReadWrite, SimpleStructRoundTrip) { 2); } -// Roundtrip bug similar to NestedOptionalRequired in reconstruct_internal_test.cc -TEST(ArrowReadWrite, DISABLED_SingleColumnNullableStruct1) { +TEST(ArrowReadWrite, SingleColumnNullableStruct) { auto links = field("Links", ::arrow::struct_({field("Backward", ::arrow::int64(), /*nullable=*/true)})); @@ -2396,25 +2395,7 @@ TEST(ArrowReadWrite, NestedNullableField) { /*row_group_size=*/8); } -TEST(ArrowReadWrite, SingleColumnNullableStruct2) { - auto links = - field("Links", - ::arrow::struct_({field("Backward", ::arrow::int64(), /*nullable=*/true)})); - - auto links_id_array = ::arrow::ArrayFromJSON(links->type(), - "[null, " - "{\"Backward\": null}" - "]"); - - CheckSimpleRoundtrip( - ::arrow::Table::Make(std::make_shared<::arrow::Schema>( - std::vector>{links}), - {links_id_array}), - 3); -} - -// Disabled until implementation can be finished. -TEST(TestArrowReadWrite, DISABLED_CanonicalNestedRoundTrip) { +TEST(TestArrowReadWrite, CanonicalNestedRoundTrip) { auto doc_id = field("DocId", ::arrow::int64(), /*nullable=*/false); auto links = field( "Links", From ee9d62c89ff7b07cbed64e8bd6ab938d71ade1a0 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sat, 10 Oct 2020 17:01:23 +0800 Subject: [PATCH 8/9] minor fix Change-Id: I3743b056585fe2a746638c3466b15681dc66e7c5 --- cpp/src/arrow/array/builder_binary.h | 4 +--- cpp/src/arrow/array/builder_nested.cc | 1 + cpp/src/arrow/array/builder_nested.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index f52a3dca2bb..b8380ac27d2 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -103,9 +103,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index fd6b9005736..e1243f29d37 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -211,6 +211,7 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { " elements, have ", new_elements); } return Status::OK(); +} Status FixedSizeListBuilder::AppendEmptyValue() { RETURN_NOT_OK(Reserve(1)); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 6371e6b8e16..2c67d205f13 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -113,7 +113,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, true); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { From 173230d87be06048e66798e838de5c4db461cc92 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 22 Oct 2020 15:47:21 +0200 Subject: [PATCH 9/9] Tests and fixes --- cpp/src/arrow/array/array_test.cc | 58 ++++++++++++++++- cpp/src/arrow/array/array_union_test.cc | 86 +++++++++++++++++++++++-- cpp/src/arrow/array/builder_adaptive.h | 1 - cpp/src/arrow/array/builder_base.h | 18 ++++-- cpp/src/arrow/array/builder_binary.h | 5 +- cpp/src/arrow/array/builder_nested.cc | 9 --- cpp/src/arrow/array/builder_nested.h | 9 ++- cpp/src/arrow/array/builder_primitive.h | 2 +- cpp/src/arrow/array/builder_union.h | 36 ++++++----- 9 files changed, 179 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index ed0fe4ac8bf..c256d26d88a 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -70,6 +70,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; class TestArray : public ::testing::Test { public: @@ -641,7 +642,7 @@ class TestPrimitiveBuilder : public TestBuilder { std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -766,7 +767,7 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); ASSERT_EQ(ex_null_count, result->null_count()); ASSERT_EQ(size, result->length()); @@ -883,7 +884,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { std::shared_ptr out; FinishAndCheckPadding(this->builder_.get(), &out); - auto result = std::dynamic_pointer_cast(out); + auto result = checked_pointer_cast(out); for (int64_t i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -917,6 +918,33 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) { } } +TYPED_TEST(TestPrimitiveBuilder, TestAppendEmptyValue) { + ASSERT_OK(this->builder_->AppendNull()); + ASSERT_OK(this->builder_->AppendEmptyValue()); + ASSERT_OK(this->builder_->AppendNulls(2)); + ASSERT_OK(this->builder_->AppendEmptyValues(2)); + + std::shared_ptr out; + FinishAndCheckPadding(this->builder_.get(), &out); + ASSERT_OK(out->ValidateFull()); + + auto result = checked_pointer_cast(out); + ASSERT_EQ(result->length(), 6); + ASSERT_EQ(result->null_count(), 3); + + ASSERT_TRUE(result->IsNull(0)); + ASSERT_FALSE(result->IsNull(1)); + ASSERT_TRUE(result->IsNull(2)); + ASSERT_TRUE(result->IsNull(3)); + ASSERT_FALSE(result->IsNull(4)); + ASSERT_FALSE(result->IsNull(5)); + + // implementation detail: the value slots are 0-initialized + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(result->Value(i), 0); + } +} + TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { DECL_T(); @@ -2104,6 +2132,18 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(int8(), "[null, null, 0, 42, 0, 0]")); +} + TEST(TestAdaptiveIntBuilderWithStartIntSize, TestReset) { auto builder = std::make_shared( static_cast(sizeof(int16_t)), default_memory_pool()); @@ -2322,6 +2362,18 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveUIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(uint8(), "[null, null, 0, 42, 0, 0]")); +} + TEST(TestAdaptiveUIntBuilderWithStartIntSize, TestReset) { auto builder = std::make_shared( static_cast(sizeof(uint16_t)), default_memory_pool()); diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index a32b8b868de..6409de17c6b 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -307,7 +307,24 @@ class UnionBuilderTest : public ::testing::Test { AppendString("def"); AppendInt(-10); AppendDouble(0.5); + + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector(expected_types_vector, &expected_types); + } + + void AppendNullsAndEmptyValues() { + AppendString("abc"); + ASSERT_OK(union_builder->AppendNull()); + ASSERT_OK(union_builder->AppendEmptyValue()); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + AppendInt(42); + ASSERT_OK(union_builder->AppendNulls(2)); + ASSERT_OK(union_builder->AppendEmptyValues(2)); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -329,7 +346,9 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(1.0); AppendDouble(-1.0); AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); ASSERT_EQ(I8, 0); @@ -357,6 +376,7 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(0.5); ASSERT_OK(list_builder.Finish(actual)); + ASSERT_OK((*actual)->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -376,20 +396,20 @@ class SparseUnionBuilderTest : public UnionBuilderTest { void AppendInt(int8_t i) override { Base::AppendInt(i); - ASSERT_OK(str_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(str_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendString(const std::string& str) override { Base::AppendString(str); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendDouble(double dbl) override { Base::AppendDouble(dbl); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(str_builder->AppendEmptyValue()); } }; @@ -415,6 +435,34 @@ TEST_F(DenseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(DenseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new DenseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // Four null / empty values (the latter implementation-defined) were appended to I8 + auto expected_i8 = ArrayFromJSON(int8(), "[null, 0, 42, null, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[]"); + + // "abc", null, 0, 42, null, null, 0, 0 + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 2, 3, 3, 4, 4]"); + + ASSERT_OK_AND_ASSIGN(auto expected, + DenseUnionArray::Make(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(DenseUnionBuilderTest, InferredType) { AppendInferred(); @@ -467,6 +515,32 @@ TEST_F(SparseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(SparseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new SparseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + sparse_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // "abc", null, 0, 42, null, null, 0, 0 + // (note that getting 0 for empty values is implementation-defined) + auto expected_i8 = ArrayFromJSON(int8(), "[0, null, 0, 42, null, null, 0, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "", "", "", "", "", ""])"); + auto expected_dbl = ArrayFromJSON(float64(), "[0, 0, 0, 0, 0, 0, 0, 0]"); + + ASSERT_OK_AND_ASSIGN( + auto expected, + SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(SparseUnionBuilderTest, InferredType) { AppendInferred(); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 9220cd61018..c0df797256d 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -75,7 +75,6 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { Status AppendEmptyValue() final { pending_data_[pending_pos_] = 0; pending_valid_[pending_pos_] = 1; - pending_has_nulls_ = false; ++pending_pos_; ++length_; diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 18c027b7b4c..b500c653f25 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -97,15 +97,23 @@ class ARROW_EXPORT ArrayBuilder { /// Reset the builder. virtual void Reset(); + /// \brief Append a null value to builder virtual Status AppendNull() = 0; + /// \brief Append a number of null values to builder virtual Status AppendNulls(int64_t length) = 0; - /// Append an empty value to builder with null bitmap slot set to true. - /// Now it's mainly used in StructBuilder#AppendNull. - /// For Struct [{"joe", 1}, {null, 2}, null, {"mark", 4}] constructed with - /// StructBuilder, it's child arrays are: utf8 ["joe", null, "", "mark"], int32 [1, 2, - /// 0, 4] + /// \brief Append a non-null value to builder + /// + /// The appended value is an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending a null value to a parent nested type. virtual Status AppendEmptyValue() = 0; + + /// \brief Append a number of non-null values to builder + /// + /// The appended values are an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; /// For cases where raw data was memcpy'd into the internal buffers, allows us diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b8380ac27d2..bc49c7d6787 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -61,6 +61,7 @@ class BaseBinaryBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(AppendNextOffset()); // Safety check for UBSAN. if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); } @@ -78,7 +79,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -103,7 +103,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -341,7 +340,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); return offsets_builder_.Append(static_cast(num_bytes)); } @@ -455,7 +453,6 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { const uint8_t* valid_bytes = NULLPTR); Status AppendNull() final; - Status AppendNulls(int64_t length) final; Status AppendEmptyValue() final; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index e1243f29d37..a3bcde0381a 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -262,15 +262,6 @@ void StructBuilder::Reset() { } } -Status StructBuilder::AppendNulls(int64_t length) { - for (const auto& field : children_) { - RETURN_NOT_OK(field->AppendEmptyValues(length)); - } - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(length, false); - return Status::OK(); -} - Status StructBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 2c67d205f13..12b999b786e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -442,7 +442,14 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { /// \brief Append multiple null values. Automatically appends empty values to each /// child builder. - Status AppendNulls(int64_t length) final; + Status AppendNulls(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } Status AppendEmptyValue() final { for (const auto& field : children_) { diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index ef0981f112a..e10f11fdd6c 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -319,8 +319,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeSetNotNull(1); data_builder_.UnsafeAppend(false); + UnsafeSetNotNull(1); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 542432807d3..060be474fb8 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -123,7 +123,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); ARROW_RETURN_NOT_OK( offsets_builder_.Append(static_cast(child_builder->length()))); - // Append a empty value to the first child + // Append an empty value arbitrarily to the first child return child_builder->AppendEmptyValue(); } @@ -133,7 +133,8 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); ARROW_RETURN_NOT_OK( offsets_builder_.Append(length, static_cast(child_builder->length()))); - return child_builder->AppendEmptyValues(length); + // Append just a single empty value to the first child + return child_builder->AppendEmptyValue(); } /// \brief Append an element to the UnionArray. This must be followed @@ -178,23 +179,29 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { const std::shared_ptr& type) : BasicUnionBuilder(pool, children, type) {} - /// \brief Append a null value. A null is added automatically to all the - /// children but the type id in the slot will be 0 + /// \brief Append a null value. + /// + /// A null is appended to the first child, empty values to the other children. Status AppendNull() final { - ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); - for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNull()); + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull()); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue()); } return Status::OK(); } - /// \brief Append multiple null values. Nulls will be automatically appended - /// to all the children but the type ids will be all 0. + /// \brief Append multiple null values. + /// + /// Nulls are appended to the first child, empty values to the other children. Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); - // Append nulls to children - for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNulls(length)); + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length)); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK( + type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length)); } return Status::OK(); } @@ -209,7 +216,6 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); - // Append empties to children for (int8_t code : type_codes_) { ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); } @@ -222,7 +228,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// \param[in] next_type type_id of the child to which the next value will be appended. /// /// The corresponding child builder must be appended to independently after this method - /// is called, and all other child builders must have null appended + /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } };