diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 0afadcf9285..39ad61b4562 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include + #include #include #include #include -#include - #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/status.h" @@ -266,10 +266,8 @@ TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_EQ(2, result_->field(1)->length()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_TRUE(result_->field(0)->IsNull(0)); - ASSERT_TRUE(result_->field(0)->IsNull(1)); - ASSERT_TRUE(result_->field(1)->IsNull(0)); - ASSERT_TRUE(result_->field(1)->IsNull(1)); + ASSERT_EQ(0, result_->field(0)->null_count()); + ASSERT_EQ(0, result_->field(1)->null_count()); ASSERT_EQ(Type::LIST, result_->field(0)->type_id()); ASSERT_EQ(Type::INT32, result_->field(1)->type_id()); diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index ed0fe4ac8bf..c256d26d88a 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -70,6 +70,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; class TestArray : public ::testing::Test { public: @@ -641,7 +642,7 @@ class TestPrimitiveBuilder : public TestBuilder { std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -766,7 +767,7 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); ASSERT_EQ(ex_null_count, result->null_count()); ASSERT_EQ(size, result->length()); @@ -883,7 +884,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { std::shared_ptr out; FinishAndCheckPadding(this->builder_.get(), &out); - auto result = std::dynamic_pointer_cast(out); + auto result = checked_pointer_cast(out); for (int64_t i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -917,6 +918,33 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) { } } +TYPED_TEST(TestPrimitiveBuilder, TestAppendEmptyValue) { + ASSERT_OK(this->builder_->AppendNull()); + ASSERT_OK(this->builder_->AppendEmptyValue()); + ASSERT_OK(this->builder_->AppendNulls(2)); + ASSERT_OK(this->builder_->AppendEmptyValues(2)); + + std::shared_ptr out; + FinishAndCheckPadding(this->builder_.get(), &out); + ASSERT_OK(out->ValidateFull()); + + auto result = checked_pointer_cast(out); + ASSERT_EQ(result->length(), 6); + ASSERT_EQ(result->null_count(), 3); + + ASSERT_TRUE(result->IsNull(0)); + ASSERT_FALSE(result->IsNull(1)); + ASSERT_TRUE(result->IsNull(2)); + ASSERT_TRUE(result->IsNull(3)); + ASSERT_FALSE(result->IsNull(4)); + ASSERT_FALSE(result->IsNull(5)); + + // implementation detail: the value slots are 0-initialized + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(result->Value(i), 0); + } +} + TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { DECL_T(); @@ -2104,6 +2132,18 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(int8(), "[null, null, 0, 42, 0, 0]")); +} + TEST(TestAdaptiveIntBuilderWithStartIntSize, TestReset) { auto builder = std::make_shared( static_cast(sizeof(int16_t)), default_memory_pool()); @@ -2322,6 +2362,18 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveUIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(uint8(), "[null, null, 0, 42, 0, 0]")); +} + TEST(TestAdaptiveUIntBuilderWithStartIntSize, TestReset) { auto builder = std::make_shared( static_cast(sizeof(uint16_t)), default_memory_pool()); diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index a32b8b868de..6409de17c6b 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -307,7 +307,24 @@ class UnionBuilderTest : public ::testing::Test { AppendString("def"); AppendInt(-10); AppendDouble(0.5); + + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector(expected_types_vector, &expected_types); + } + + void AppendNullsAndEmptyValues() { + AppendString("abc"); + ASSERT_OK(union_builder->AppendNull()); + ASSERT_OK(union_builder->AppendEmptyValue()); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + AppendInt(42); + ASSERT_OK(union_builder->AppendNulls(2)); + ASSERT_OK(union_builder->AppendEmptyValues(2)); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -329,7 +346,9 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(1.0); AppendDouble(-1.0); AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); ASSERT_EQ(I8, 0); @@ -357,6 +376,7 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(0.5); ASSERT_OK(list_builder.Finish(actual)); + ASSERT_OK((*actual)->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -376,20 +396,20 @@ class SparseUnionBuilderTest : public UnionBuilderTest { void AppendInt(int8_t i) override { Base::AppendInt(i); - ASSERT_OK(str_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(str_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendString(const std::string& str) override { Base::AppendString(str); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendDouble(double dbl) override { Base::AppendDouble(dbl); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(str_builder->AppendEmptyValue()); } }; @@ -415,6 +435,34 @@ TEST_F(DenseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(DenseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new DenseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // Four null / empty values (the latter implementation-defined) were appended to I8 + auto expected_i8 = ArrayFromJSON(int8(), "[null, 0, 42, null, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[]"); + + // "abc", null, 0, 42, null, null, 0, 0 + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 2, 3, 3, 4, 4]"); + + ASSERT_OK_AND_ASSIGN(auto expected, + DenseUnionArray::Make(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(DenseUnionBuilderTest, InferredType) { AppendInferred(); @@ -467,6 +515,32 @@ TEST_F(SparseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(SparseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new SparseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + sparse_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // "abc", null, 0, 42, null, null, 0, 0 + // (note that getting 0 for empty values is implementation-defined) + auto expected_i8 = ArrayFromJSON(int8(), "[0, null, 0, 42, null, null, 0, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "", "", "", "", "", ""])"); + auto expected_dbl = ArrayFromJSON(float64(), "[0, 0, 0, 0, 0, 0, 0, 0]"); + + ASSERT_OK_AND_ASSIGN( + auto expected, + SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(SparseUnionBuilderTest, InferredType) { AppendInferred(); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 1895f8ddaee..c0df797256d 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -64,6 +64,26 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNotNull(length); + return Status::OK(); + } + + Status AppendEmptyValue() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + void Reset() override; Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index d73681756ba..b500c653f25 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -97,9 +97,25 @@ class ARROW_EXPORT ArrayBuilder { /// Reset the builder. virtual void Reset(); + /// \brief Append a null value to builder virtual Status AppendNull() = 0; + /// \brief Append a number of null values to builder virtual Status AppendNulls(int64_t length) = 0; + /// \brief Append a non-null value to builder + /// + /// The appended value is an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending a null value to a parent nested type. + virtual Status AppendEmptyValue() = 0; + + /// \brief Append a number of non-null values to builder + /// + /// The appended values are an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending null values to a parent nested type. + virtual Status AppendEmptyValues(int64_t length) = 0; + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index ecb0e95fb44..6822dc89903 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -73,6 +73,20 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status FixedSizeBinaryBuilder::AppendEmptyValue() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); + return Status::OK(); +} + +Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0); + return Status::OK(); +} + void FixedSizeBinaryBuilder::Reset() { ArrayBuilder::Reset(); byte_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 70fde2a2e64..bc49c7d6787 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -61,6 +61,7 @@ class BaseBinaryBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(AppendNextOffset()); // Safety check for UBSAN. if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); } @@ -78,7 +79,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -94,6 +94,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + /// \brief Append without checking capacity /// /// Offsets and data should have been presized using Reserve() and @@ -323,7 +340,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); return offsets_builder_.Append(static_cast(num_bytes)); } @@ -437,9 +453,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { const uint8_t* valid_bytes = NULLPTR); Status AppendNull() final; - Status AppendNulls(int64_t length) final; + Status AppendEmptyValue() final; + Status AppendEmptyValues(int64_t length) final; + void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 7f920bb8cd6..d15855b0a89 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -261,6 +261,18 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + /// \brief Insert values into the dictionary's memo, but do not append any /// indices. Can be used to initialize a new builder with known dictionary /// values @@ -437,6 +449,18 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + /// \brief Append a whole dense array to the builder Status AppendArray(const Array& array) { #ifndef NDEBUG diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index c3786a8fab4..a3bcde0381a 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -123,6 +123,24 @@ Status MapBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status MapBuilder::AppendEmptyValue() { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendEmptyValue()); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + +Status MapBuilder::AppendEmptyValues(int64_t length) { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendEmptyValues(length)); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + Status MapBuilder::AdjustStructBuilderLength() { // If key/item builders have been appended, adjust struct builder length // to match. Struct and key are non-nullable, append all valid values. @@ -195,6 +213,18 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { return Status::OK(); } +Status FixedSizeListBuilder::AppendEmptyValue() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return value_builder_->AppendEmptyValues(list_size_); +} + +Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return value_builder_->AppendEmptyValues(list_size_ * length); +} + Status FixedSizeListBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity)); return ArrayBuilder::Resize(capacity); @@ -232,15 +262,6 @@ void StructBuilder::Reset() { } } -Status StructBuilder::AppendNulls(int64_t length) { - for (const auto& field : children_) { - RETURN_NOT_OK(field->AppendNulls(length)); - } - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(length, false); - return Status::OK(); -} - Status StructBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index b8948403acc..12b999b786e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -109,6 +109,19 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { return Append(true); } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); + UnsafeAppendToBitmap(length, true); + const int64_t num_values = value_builder_->length(); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_values)); + } + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(AppendNextOffset()); @@ -258,6 +271,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + /// \brief Get builder to append keys. /// /// Append a key with this builder should be followed by appending @@ -353,6 +370,10 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status ValidateOverflow(int64_t new_elements); + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { @@ -410,18 +431,41 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a null value. Automatically appends a null to each child + /// \brief Append a null value. Automatically appends an empty value to each child /// builder. Status AppendNull() final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendNull()); + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); } return Append(false); } - /// \brief Append multiple null values. Automatically appends nulls to each + /// \brief Append multiple null values. Automatically appends empty values to each /// child builder. - Status AppendNulls(int64_t length) final; + Status AppendNulls(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(true); + } + + Status AppendEmptyValues(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } void Reset() override; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index cc907fb6b8a..e10f11fdd6c 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -46,6 +46,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { return AppendNulls(1); } + Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); } + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status Append(std::nullptr_t) { return AppendNull(); } Status FinishInternal(std::shared_ptr* out) override; @@ -100,6 +104,22 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a empty element + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + UnsafeSetNotNull(length); + return Status::OK(); + } + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } void Reset() override { data_builder_.Reset(); } @@ -297,6 +317,20 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(false); + UnsafeSetNotNull(1); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNotNull(length); + return Status::OK(); + } + /// Scalar append Status Append(const bool val) { ARROW_RETURN_NOT_OK(Reserve(1)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 1ccc7ef159f..060be474fb8 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -117,6 +117,26 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return child_builder->AppendNull(); } + Status AppendEmptyValue() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append an empty value arbitrarily to the first child + return child_builder->AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single empty value to the first child + return child_builder->AppendEmptyValue(); + } + /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// @@ -159,23 +179,45 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { const std::shared_ptr& type) : BasicUnionBuilder(pool, children, type) {} - /// \brief Append a null value. A null is added automatically to all the - /// children but the type id in the slot will be 0 + /// \brief Append a null value. + /// + /// A null is appended to the first child, empty values to the other children. Status AppendNull() final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull()); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue()); + } + return Status::OK(); + } + + /// \brief Append multiple null values. + /// + /// Nulls are appended to the first child, empty values to the other children. + Status AppendNulls(int64_t length) final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length)); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK( + type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNull()); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue()); } return Status::OK(); } - /// \brief Append multiple null values. Nulls will be automatically appended - /// to all the children but the type ids will be all 0. - Status AppendNulls(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); - // Append nulls to children for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNulls(length)); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); } return Status::OK(); } @@ -186,7 +228,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// \param[in] next_type type_id of the child to which the next value will be appended. /// /// The corresponding child builder must be appended to independently after this method - /// is called, and all other child builders must have null appended + /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } }; diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index 49332990b2e..e22648e84b3 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -15,15 +15,16 @@ // specific language governing permissions and limitations // under the License. -#include -#include -#include +#include "arrow/json/parser.h" #include #include +#include +#include +#include + #include "arrow/json/options.h" -#include "arrow/json/parser.h" #include "arrow/json/test_common.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" @@ -188,7 +189,7 @@ TEST(BlockParserWithSchema, Nested) { field("nuf", struct_({field("ps", utf8())}))}, {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", R"([["1", "2", "3"], ["2"], [], null])", - R"([{"ps":null}, null, {"ps":"78"}, {"ps":"90"}])"}); + R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); } TEST(BlockParserWithSchema, FailOnIncompleteJson) { @@ -217,7 +218,7 @@ TEST(BlockParser, Nested) { field("nuf", struct_({field("ps", utf8())}))}, {"[\"thing\", null, \"\xe5\xbf\x8d\", null]", R"([["1", "2", "3"], ["2"], [], null])", - R"([{"ps":null}, null, {"ps":"78"}, {"ps":"90"}])"}); + R"([{"ps":null}, {}, {"ps":"78"}, {"ps":"90"}])"}); } TEST(BlockParser, AdHoc) { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index f88230ffd64..9223ce7fba6 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/pretty_print.h" + #include #include #include @@ -28,7 +30,6 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" -#include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" @@ -327,7 +328,6 @@ class ArrayPrinter : public PrettyPrinter { if (offset != 0) { field = field->Slice(offset, length); } - RETURN_NOT_OK(PrettyPrint(*field, indent_ + options_.indent_size, sink_)); } return Status::OK(); diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 6124b8f2ddc..9e58e46fe94 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/pretty_print.h" + +#include + #include #include #include @@ -22,11 +26,8 @@ #include #include -#include - #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -324,13 +325,13 @@ TEST_F(TestPrettyPrint, StructTypeAdvanced) { -- child 0 type: int32 [ 11, - null, + 0, null ] -- child 1 type: int32 [ 22, - null, + 0, 33 ])expected"; CheckStream(*array, {0, 10}, ex); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c28a9995cf2..5c49dcd4937 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2146,12 +2146,12 @@ def test_buffers_nested(): assert bytearray(null_bitmap)[0] == 0b00000101 # The child buffers: 'a' null_bitmap = buffers[1].to_pybytes() - assert bytearray(null_bitmap)[0] == 0b00000001 + assert bytearray(null_bitmap)[0] == 0b00000011 values = buffers[2].to_pybytes() assert struct.unpack('bxx', values) == (42,) # The child buffers: 'b' null_bitmap = buffers[3].to_pybytes() - assert bytearray(null_bitmap)[0] == 0b00000100 + assert bytearray(null_bitmap)[0] == 0b00000110 values = buffers[4].to_pybytes() assert struct.unpack('4xh', values) == (43,) diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb index 0101719f14e..ab0aa5edffa 100644 --- a/ruby/red-arrow/test/test-struct-array-builder.rb +++ b/ruby/red-arrow/test/test-struct-array-builder.rb @@ -27,8 +27,8 @@ def setup @builder.append_value(nil) array = @builder.finish assert_equal([ - [nil], - [nil], + [false], + [0], ], [ array.find_field(0).to_a, @@ -87,8 +87,8 @@ def setup @builder.append_values([nil]) array = @builder.finish assert_equal([ - [nil], - [nil], + [false], + [0], ], [ array.find_field(0).to_a, @@ -130,8 +130,8 @@ def setup ]) array = @builder.finish assert_equal([ - [nil, true, false], - [nil, 1, 2], + [false, true, false], + [0, 1, 2], ], [ array.find_field(0).to_a, @@ -152,8 +152,8 @@ def setup ]) array = @builder.finish assert_equal([ - [true, nil, true], - [1, nil, 3], + [true, false, true], + [1, 0, 3], ], [ array.find_field(0).to_a, diff --git a/ruby/red-arrow/test/test-struct-array.rb b/ruby/red-arrow/test/test-struct-array.rb index 9ecdd98f4d4..2c01f33ef8d 100644 --- a/ruby/red-arrow/test/test-struct-array.rb +++ b/ruby/red-arrow/test/test-struct-array.rb @@ -27,8 +27,8 @@ class StructArrayTest < Test::Unit::TestCase ] array = Arrow::StructArray.new(data_type, values) assert_equal([ - [true, nil, false], - [1, nil, 2], + [true, false, false], + [1, 0, 2], ], [ array.find_field(0).to_a,