diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 99279f3a8bb..636d97f9d05 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -685,7 +685,7 @@ class TestStringArray : public ::testing::Test { TEST_F(TestStringArray, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); - ASSERT_OK(strings_->Validate()); + ASSERT_OK(ValidateArray(*strings_)); } TEST_F(TestStringArray, TestType) { @@ -801,7 +801,7 @@ class TestStringBuilder : public TestBuilder { EXPECT_OK(builder_->Finish(&out)); result_ = std::dynamic_pointer_cast(out); - result_->Validate(); + ASSERT_OK(ValidateArray(*result_)); } protected: @@ -899,7 +899,7 @@ class TestBinaryArray : public ::testing::Test { TEST_F(TestBinaryArray, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); - ASSERT_OK(strings_->Validate()); + ASSERT_OK(ValidateArray(*strings_)); } TEST_F(TestBinaryArray, TestType) { @@ -969,7 +969,7 @@ class TestBinaryBuilder : public TestBuilder { EXPECT_OK(builder_->Finish(&out)); result_ = std::dynamic_pointer_cast(out); - result_->Validate(); + ASSERT_OK(ValidateArray(*result_)); } protected: @@ -994,7 +994,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } } Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(ValidateArray(*result_)); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); ASSERT_EQ(reps * 6, result_->data()->size()); @@ -1354,7 +1354,7 @@ TEST_F(TestListBuilder, TestAppendNull) { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(ValidateArray(*result_)); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); @@ -1368,7 +1368,7 @@ TEST_F(TestListBuilder, TestAppendNull) { void ValidateBasicListArray(const ListArray* result, const vector& values, const vector& is_valid) { - ASSERT_OK(result->Validate()); + ASSERT_OK(ValidateArray(*result)); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -1446,13 +1446,13 @@ TEST_F(TestListBuilder, BulkAppendInvalid) { } Done(); - ASSERT_RAISES(Invalid, result_->Validate()); + ASSERT_RAISES(Invalid, ValidateArray(*result_)); } TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(ValidateArray(*result_)); } // ---------------------------------------------------------------------- @@ -1569,9 +1569,9 @@ TEST(TestDictionary, Validate) { std::shared_ptr arr3 = std::make_shared(dict_type, indices3); // Only checking index type for now - ASSERT_OK(arr->Validate()); - ASSERT_RAISES(Invalid, arr2->Validate()); - ASSERT_OK(arr3->Validate()); + ASSERT_OK(ValidateArray(*arr)); + ASSERT_RAISES(Invalid, ValidateArray(*arr2)); + ASSERT_OK(ValidateArray(*arr3)); } // ---------------------------------------------------------------------- @@ -1582,7 +1582,7 @@ void ValidateBasicStructArray(const StructArray* result, const vector& list_is_valid, const vector& list_lengths, const vector& list_offsets, const vector& int_values) { ASSERT_EQ(4, result->length()); - ASSERT_OK(result->Validate()); + ASSERT_OK(ValidateArray(*result)); auto list_char_arr = static_cast(result->field(0).get()); auto char_arr = static_cast(list_char_arr->values().get()); @@ -1666,7 +1666,7 @@ TEST_F(TestStructBuilder, TestAppendNull) { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(ValidateArray(*result_)); ASSERT_EQ(2, static_cast(result_->fields().size())); ASSERT_EQ(2, result_->length()); @@ -1777,8 +1777,8 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { } Done(); - // Even null bitmap of the parent Struct is not valid, Validate() will ignore it. - ASSERT_OK(result_->Validate()); + // Even null bitmap of the parent Struct is not valid, validate will ignore it. + ASSERT_OK(ValidateArray(*result_)); } TEST_F(TestStructBuilder, TestEquality) { @@ -1924,7 +1924,7 @@ TEST_F(TestStructBuilder, TestEquality) { TEST_F(TestStructBuilder, TestZeroLength) { // All buffers are null Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(ValidateArray(*result_)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 76dda2ca7b9..c5acf3ee6aa 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -105,10 +105,6 @@ bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, return are_equal; } -Status Array::Validate() const { - return Status::OK(); -} - // Last two parameters are in-out parameters static inline void ConformSliceParams( int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) { @@ -166,58 +162,6 @@ std::shared_ptr BooleanArray::Slice(int64_t offset, int64_t length) const // ---------------------------------------------------------------------- // ListArray -Status ListArray::Validate() const { - if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (length_ && !value_offsets_) { return Status::Invalid("value_offsets_ was null"); } - if (value_offsets_->size() / static_cast(sizeof(int32_t)) < length_) { - std::stringstream ss; - ss << "offset buffer size (bytes): " << value_offsets_->size() - << " isn't large enough for length: " << length_; - return Status::Invalid(ss.str()); - } - const int32_t last_offset = this->value_offset(length_); - if (last_offset > 0) { - if (!values_) { - return Status::Invalid("last offset was non-zero and values was null"); - } - if (values_->length() != last_offset) { - std::stringstream ss; - ss << "Final offset invariant not equal to values length: " << last_offset - << "!=" << values_->length(); - return Status::Invalid(ss.str()); - } - - const Status child_valid = values_->Validate(); - if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); - return Status::Invalid(ss.str()); - } - } - - int32_t prev_offset = this->value_offset(0); - if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } - for (int64_t i = 1; i <= length_; ++i) { - int32_t current_offset = this->value_offset(i); - if (IsNull(i - 1) && current_offset != prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure at: " << i - << " inconsistent value_offsets for null slot" << current_offset - << "!=" << prev_offset; - return Status::Invalid(ss.str()); - } - if (current_offset < prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure: " << i - << " inconsistent offset for non-null slot: " << current_offset << "<" - << prev_offset; - return Status::Invalid(ss.str()); - } - prev_offset = current_offset; - } - return Status::OK(); -} - std::shared_ptr ListArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( @@ -250,11 +194,6 @@ BinaryArray::BinaryArray(const std::shared_ptr& type, int64_t length, if (data_ != nullptr) { raw_data_ = data_->data(); } } -Status BinaryArray::Validate() const { - // TODO(wesm): what to do here? - return Status::OK(); -} - std::shared_ptr BinaryArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( @@ -267,11 +206,6 @@ StringArray::StringArray(int64_t length, const std::shared_ptr& value_of : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) { } -Status StringArray::Validate() const { - // TODO(emkornfield) Validate proper UTF8 code points? - return BinaryArray::Validate(); -} - std::shared_ptr StringArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( @@ -361,42 +295,6 @@ std::shared_ptr StructArray::field(int pos) const { return children_[pos]; } -Status StructArray::Validate() const { - if (length_ < 0) { return Status::Invalid("Length was negative"); } - - if (null_count() > length_) { - return Status::Invalid("Null count exceeds the length of this struct"); - } - - if (children_.size() > 0) { - // Validate fields - int64_t array_length = children_[0]->length(); - size_t idx = 0; - for (auto it : children_) { - if (it->length() != array_length) { - std::stringstream ss; - ss << "Length is not equal from field " << it->type()->ToString() - << " at position {" << idx << "}"; - return Status::Invalid(ss.str()); - } - - const Status child_valid = it->Validate(); - if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx - << "}"; - return Status::Invalid(ss.str()); - } - ++idx; - } - - if (array_length > 0 && array_length != length_) { - return Status::Invalid("Struct's length is not equal to its child arrays"); - } - } - return Status::OK(); -} - std::shared_ptr StructArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( @@ -427,17 +325,6 @@ std::shared_ptr UnionArray::child(int pos) const { return children_[pos]; } -Status UnionArray::Validate() const { - if (length_ < 0) { return Status::Invalid("Length was negative"); } - - if (null_count() > length_) { - return Status::Invalid("Null count exceeds the length of this struct"); - } - - DCHECK(false) << "Validate not yet implemented"; - return Status::OK(); -} - std::shared_ptr UnionArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared(type_, length, children_, type_ids_, value_offsets_, @@ -456,14 +343,6 @@ DictionaryArray::DictionaryArray( DCHECK_EQ(type->id(), Type::DICTIONARY); } -Status DictionaryArray::Validate() const { - Type::type index_type_id = indices_->type()->id(); - if (!is_integer(index_type_id)) { - return Status::Invalid("Dictionary indices must be integer type"); - } - return Status::OK(); -} - std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); } @@ -476,20 +355,136 @@ std::shared_ptr DictionaryArray::Slice(int64_t offset, int64_t length) co // ---------------------------------------------------------------------- // Implement Array::Accept as inline visitor -struct AcceptVirtualVisitor { - explicit AcceptVirtualVisitor(ArrayVisitor* visitor) : visitor(visitor) {} +Status Array::Accept(ArrayVisitor* visitor) const { + return VisitArrayInline(*this, visitor); +} + +// ---------------------------------------------------------------------- +// Implement Array::Validate as inline visitor + +struct ValidateVisitor { + Status Visit(const NullArray& array) { return Status::OK(); } + + Status Visit(const PrimitiveArray& array) { return Status::OK(); } + + Status Visit(const BinaryArray& array) { + // TODO(wesm): what to do here? + return Status::OK(); + } + + Status Visit(const ListArray& array) { + if (array.length() < 0) { return Status::Invalid("Length was negative"); } + + auto value_offsets = array.value_offsets(); + if (array.length() && !value_offsets) { + return Status::Invalid("value_offsets_ was null"); + } + if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { + std::stringstream ss; + ss << "offset buffer size (bytes): " << value_offsets->size() + << " isn't large enough for length: " << array.length(); + return Status::Invalid(ss.str()); + } + const int32_t last_offset = array.value_offset(array.length()); + if (last_offset > 0) { + if (!array.values()) { + return Status::Invalid("last offset was non-zero and values was null"); + } + if (array.values()->length() != last_offset) { + std::stringstream ss; + ss << "Final offset invariant not equal to values length: " << last_offset + << "!=" << array.values()->length(); + return Status::Invalid(ss.str()); + } + + const Status child_valid = ValidateArray(*array.values()); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); + } + } + + int32_t prev_offset = array.value_offset(0); + if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } + for (int64_t i = 1; i <= array.length(); ++i) { + int32_t current_offset = array.value_offset(i); + if (array.IsNull(i - 1) && current_offset != prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure at: " << i + << " inconsistent value_offsets for null slot" << current_offset + << "!=" << prev_offset; + return Status::Invalid(ss.str()); + } + if (current_offset < prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure: " << i + << " inconsistent offset for non-null slot: " << current_offset << "<" + << prev_offset; + return Status::Invalid(ss.str()); + } + prev_offset = current_offset; + } + return Status::OK(); + } - ArrayVisitor* visitor; + Status Visit(const StructArray& array) { + if (array.length() < 0) { return Status::Invalid("Length was negative"); } - template - Status Visit(const T& array) { - return visitor->Visit(array); + if (array.null_count() > array.length()) { + return Status::Invalid("Null count exceeds the length of this struct"); + } + + if (array.fields().size() > 0) { + // Validate fields + int64_t array_length = array.fields()[0]->length(); + size_t idx = 0; + for (auto it : array.fields()) { + if (it->length() != array_length) { + std::stringstream ss; + ss << "Length is not equal from field " << it->type()->ToString() + << " at position {" << idx << "}"; + return Status::Invalid(ss.str()); + } + + const Status child_valid = ValidateArray(*it); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString() << " at position {" + << idx << "}"; + return Status::Invalid(ss.str()); + } + ++idx; + } + + if (array_length > 0 && array_length != array.length()) { + return Status::Invalid("Struct's length is not equal to its child arrays"); + } + } + return Status::OK(); + } + + Status Visit(const UnionArray& array) { + if (array.length() < 0) { return Status::Invalid("Length was negative"); } + + if (array.null_count() > array.length()) { + return Status::Invalid("Null count exceeds the length of this struct"); + } + return Status::OK(); + } + + Status Visit(const DictionaryArray& array) { + Type::type index_type_id = array.indices()->type()->id(); + if (!is_integer(index_type_id)) { + return Status::Invalid("Dictionary indices must be integer type"); + } + return Status::OK(); } }; -Status Array::Accept(ArrayVisitor* visitor) const { - AcceptVirtualVisitor inline_visitor(visitor); - return VisitArrayInline(*this, visitor); +Status ValidateArray(const Array& array) { + ValidateVisitor validate_visitor; + return VisitArrayInline(array, &validate_visitor); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 071d4e30f52..331c6c3e7fa 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -108,11 +108,6 @@ class ARROW_EXPORT Array { bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, int64_t other_start_idx) const; - /// Determines if the array is internally consistent. - /// - /// Defaults to always returning Status::OK. This can be an expensive check. - virtual Status Validate() const; - Status Accept(ArrayVisitor* visitor) const; /// Construct a zero-copy slice of the array with the indicated offset and @@ -238,8 +233,6 @@ class ARROW_EXPORT ListArray : public Array { values_ = values; } - Status Validate() const override; - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr values() const { return values_; } @@ -306,8 +299,6 @@ class ARROW_EXPORT BinaryArray : public Array { return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } - Status Validate() const override; - std::shared_ptr Slice(int64_t offset, int64_t length) const override; protected: @@ -342,8 +333,6 @@ class ARROW_EXPORT StringArray : public BinaryArray { return std::string(reinterpret_cast(str), nchars); } - Status Validate() const override; - std::shared_ptr Slice(int64_t offset, int64_t length) const override; }; @@ -406,8 +395,6 @@ class ARROW_EXPORT StructArray : public Array { std::shared_ptr null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0); - Status Validate() const override; - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr field(int pos) const; @@ -436,8 +423,6 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0); - Status Validate() const override; - /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return type_ids_; } @@ -490,8 +475,6 @@ class ARROW_EXPORT DictionaryArray : public Array { DictionaryArray( const std::shared_ptr& type, const std::shared_ptr& indices); - Status Validate() const override; - std::shared_ptr indices() const { return indices_; } std::shared_ptr dictionary() const; @@ -525,6 +508,16 @@ ARROW_EXTERN_TEMPLATE NumericArray; ARROW_EXTERN_TEMPLATE NumericArray; ARROW_EXTERN_TEMPLATE NumericArray; + +/// \brief Perform any validation checks to determine obvious inconsistencies +/// with the array's internal data +/// +/// This can be an expensive check. +/// +/// \param array an Array instance +/// \return Status +Status ValidateArray(const Array& array); + } // namespace arrow #endif diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index c8ca21cb8f1..5caa3a9808c 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -133,7 +133,7 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li ListBuilder builder(pool, child_array); RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); RETURN_NOT_OK(builder.Finish(out)); - return (*out)->Validate(); + return ValidateArray(**out); } typedef Status MakeRecordBatch(std::shared_ptr* out);