diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 423ed5825ee..621d2707cca 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -436,7 +436,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, * * Since: 0.12.0 */ -guint8 * +gint8 * garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes) { @@ -446,7 +446,7 @@ garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, const auto arrow_type_codes = arrow_union_data_type->type_codes(); const auto n = arrow_type_codes.size(); - auto type_codes = static_cast(g_new(guint8, n)); + auto type_codes = static_cast(g_new(gint8, n)); for (size_t i = 0; i < n; ++i) { type_codes[i] = arrow_type_codes[i]; } @@ -479,7 +479,7 @@ garrow_sparse_union_data_type_class_init(GArrowSparseUnionDataTypeClass *klass) */ GArrowSparseUnionDataType * garrow_sparse_union_data_type_new(GList *fields, - guint8 *type_codes, + gint8 *type_codes, gsize n_type_codes) { std::vector> arrow_fields; @@ -489,7 +489,7 @@ garrow_sparse_union_data_type_new(GList *fields, arrow_fields.push_back(arrow_field); } - std::vector arrow_type_codes; + std::vector arrow_type_codes; for (gsize i = 0; i < n_type_codes; ++i) { arrow_type_codes.push_back(type_codes[i]); } @@ -529,7 +529,7 @@ garrow_dense_union_data_type_class_init(GArrowDenseUnionDataTypeClass *klass) */ GArrowDenseUnionDataType * garrow_dense_union_data_type_new(GList *fields, - guint8 *type_codes, + gint8 *type_codes, gsize n_type_codes) { std::vector> arrow_fields; @@ -539,7 +539,7 @@ garrow_dense_union_data_type_new(GList *fields, arrow_fields.push_back(arrow_field); } - std::vector arrow_type_codes; + std::vector arrow_type_codes; for (gsize i = 0; i < n_type_codes; ++i) { arrow_type_codes.push_back(type_codes[i]); } diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index 3aa75ae6e65..6b27f0b0a35 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -108,7 +108,7 @@ garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type); GArrowField * garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, gint i); -guint8 * +gint8 * garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes); @@ -127,7 +127,7 @@ struct _GArrowSparseUnionDataTypeClass GArrowSparseUnionDataType * garrow_sparse_union_data_type_new(GList *fields, - guint8 *type_codes, + gint8 *type_codes, gsize n_type_codes); @@ -145,7 +145,7 @@ struct _GArrowDenseUnionDataTypeClass GArrowDenseUnionDataType * garrow_dense_union_data_type_new(GList *fields, - guint8 *type_codes, + gint8 *type_codes, gsize n_type_codes); diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f45db2b66b8..591621392fc 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -93,6 +93,7 @@ set(ARROW_SRCS array/concatenate.cc array/dict_internal.cc array/diff.cc + array/validate.cc buffer.cc compare.cc extension_type.cc diff --git a/cpp/src/arrow/adapters/orc/adapter_util.cc b/cpp/src/arrow/adapters/orc/adapter_util.cc index ec1914528d4..383383c7451 100644 --- a/cpp/src/arrow/adapters/orc/adapter_util.cc +++ b/cpp/src/arrow/adapters/orc/adapter_util.cc @@ -408,12 +408,12 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { } case liborc::UNION: { std::vector> fields; - std::vector type_codes; + std::vector type_codes; for (int child = 0; child < subtype_count; ++child) { std::shared_ptr elemtype; RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype)); fields.push_back(field("_union_" + std::to_string(child), elemtype)); - type_codes.push_back(static_cast(child)); + type_codes.push_back(static_cast(child)); } *out = union_(fields, type_codes); break; diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d3edd0995bd..9363c96aa26 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -25,6 +25,7 @@ #include #include +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/buffer_builder.h" #include "arrow/compare.h" @@ -702,7 +703,7 @@ void UnionArray::SetData(const std::shared_ptr& data) { auto type_ids = data_->buffers[1]; auto value_offsets = data_->buffers[2]; raw_type_ids_ = - type_ids == nullptr ? nullptr : reinterpret_cast(type_ids->data()); + type_ids == nullptr ? nullptr : reinterpret_cast(type_ids->data()); raw_value_offsets_ = value_offsets == nullptr ? nullptr : reinterpret_cast(value_offsets->data()); @@ -728,7 +729,7 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, const std::vector& field_names, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out) { if (value_offsets.length() == 0) { return Status::Invalid("UnionArray offsets must have non-zero length"); @@ -772,7 +773,7 @@ Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, Status UnionArray::MakeSparse(const Array& type_ids, const std::vector>& children, const std::vector& field_names, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out) { if (type_ids.type_id() != Type::INT8) { return Status::TypeError("UnionArray type_ids must be signed int8"); @@ -1146,265 +1147,13 @@ Status Array::Accept(ArrayVisitor* visitor) const { return VisitArrayInline(*this, visitor); } -// ---------------------------------------------------------------------- -// Implement Array::Validate as inline visitor - -namespace internal { - -struct ValidateVisitor { - Status Visit(const NullArray& array) { - ARROW_RETURN_IF(array.null_count() != array.length(), - Status::Invalid("null_count was invalid")); - return Status::OK(); - } - - Status Visit(const PrimitiveArray& array) { - ARROW_RETURN_IF(array.data()->buffers.size() != 2, - Status::Invalid("number of buffers was != 2")); - - if (array.length() > 0 && array.values() == nullptr) { - return Status::Invalid("values was null"); - } - return Status::OK(); - } - - Status Visit(const Decimal128Array& array) { - if (array.data()->buffers.size() != 2) { - return Status::Invalid("number of buffers was != 2"); - } - if (array.length() > 0 && array.values() == nullptr) { - return Status::Invalid("values was null"); - } - return Status::OK(); - } - - Status Visit(const BinaryArray& array) { - if (array.data()->buffers.size() != 3) { - return Status::Invalid("number of buffers was != 3"); - } - return ValidateOffsets(array); - } - - Status Visit(const LargeBinaryArray& array) { - if (array.data()->buffers.size() != 3) { - return Status::Invalid("number of buffers was != 3"); - } - return ValidateOffsets(array); - } - - Status Visit(const ListArray& array) { - RETURN_NOT_OK(ValidateListArray(array)); - return ValidateOffsets(array); - } - - Status Visit(const LargeListArray& array) { - RETURN_NOT_OK(ValidateListArray(array)); - return ValidateOffsets(array); - } - - Status Visit(const MapArray& array) { - if (!array.keys()) { - return Status::Invalid("keys was null"); - } - const Status key_valid = array.keys()->Validate(); - if (!key_valid.ok()) { - return Status::Invalid("key array invalid: ", key_valid.ToString()); - } - - if (array.length() > 0 && !array.values()) { - return Status::Invalid("values was null"); - } - const Status values_valid = array.values()->Validate(); - if (!values_valid.ok()) { - return Status::Invalid("values array invalid: ", values_valid.ToString()); - } +Status Array::Validate() const { return internal::ValidateArray(*this); } - const int32_t last_offset = array.value_offset(array.length()); - if (array.values()->length() != last_offset) { - return Status::Invalid("Final offset invariant not equal to values length: ", - last_offset, "!=", array.values()->length()); - } - if (array.keys()->length() != last_offset) { - return Status::Invalid("Final offset invariant not equal to keys length: ", - last_offset, "!=", array.keys()->length()); - } - - return ValidateOffsets(array); - } - - Status Visit(const FixedSizeListArray& array) { - if (array.length() > 0 && !array.values()) { - return Status::Invalid("values was null"); - } - if (array.values()->length() != array.length() * array.value_length()) { - return Status::Invalid( - "Values Length (", array.values()->length(), ") was not equal to the length (", - array.length(), ") multiplied by the list size (", array.value_length(), ")"); - } - - return Status::OK(); - } - - Status Visit(const StructArray& array) { - const auto& struct_type = checked_cast(*array.type()); - if (array.num_fields() > 0) { - // Validate fields - int64_t array_length = array.field(0)->length(); - size_t idx = 0; - for (int i = 0; i < array.num_fields(); ++i) { - auto it = array.field(i); - if (it->length() != array_length) { - return Status::Invalid("Length is not equal from field ", - it->type()->ToString(), " at position [", idx, "]"); - } - - auto it_type = struct_type.child(i)->type(); - if (!it->type()->Equals(it_type)) { - return Status::Invalid("Child array at position [", idx, - "] does not match type field: ", it->type()->ToString(), - " vs ", it_type->ToString()); - } - - const Status child_valid = it->Validate(); - if (!child_valid.ok()) { - return Status::Invalid("Child array invalid: ", child_valid.ToString(), - " at position [", idx, "]"); - } - ++idx; - } - - if (array_length > 0 && array_length != array.length()) { - return Status::Invalid("Struct's length is not equal to its child arrays"); - } - } - return Status::OK(); - } - - Status Visit(const UnionArray& array) { return Status::OK(); } - - Status Visit(const DictionaryArray& array) { - Type::type index_type_id = array.indices()->type()->id(); - if (!is_integer(index_type_id)) { - return Status::Invalid("Dictionary indices must be integer type"); - } - if (!array.data()->dictionary) { - return Status::Invalid("Dictionary values must be non-null"); - } - return Status::OK(); - } - - Status Visit(const ExtensionArray& array) { - const auto& ext_type = checked_cast(*array.type()); - - if (!array.storage()->type()->Equals(*ext_type.storage_type())) { - return Status::Invalid("Extension array of type '", array.type()->ToString(), - "' has storage array of incompatible type '", - array.storage()->type()->ToString(), "'"); - } - return array.storage()->Validate(); - } - - protected: - template - Status ValidateListArray(const ListArrayType& array) { - const auto last_offset = array.value_offset(array.length()); - const auto data_extent = last_offset - array.value_offset(0); - if (data_extent > 0 && !array.values()) { - return Status::Invalid("values was null"); - } - if (array.values()->length() < last_offset) { - return Status::Invalid("Final offset larger than values array: ", last_offset, ">", - array.values()->length()); - } - - const Status child_valid = array.values()->Validate(); - if (!child_valid.ok()) { - return Status::Invalid("Child array invalid: ", child_valid.ToString()); - } - - return ValidateOffsets(array); - } - - template - Status ValidateOffsets(ArrayType& array) { - using offset_type = typename ArrayType::offset_type; - - auto value_offsets = array.value_offsets(); - if (array.length() && !value_offsets) { - return Status::Invalid("value_offsets_ was null"); - } - if (value_offsets->size() / static_cast(sizeof(offset_type)) < array.length()) { - return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), - " isn't large enough for length: ", array.length()); - } - - auto prev_offset = array.value_offset(0); - if (array.offset() == 0 && prev_offset != 0) { - return Status::Invalid("The first offset wasn't zero"); - } - for (int64_t i = 1; i <= array.length(); ++i) { - auto current_offset = array.value_offset(i); - if (array.IsNull(i - 1) && current_offset != prev_offset) { - return Status::Invalid("Offset invariant failure at: ", i, - " inconsistent value_offsets for null slot", - current_offset, "!=", prev_offset); - } - if (current_offset < prev_offset) { - return Status::Invalid("Offset invariant failure: ", i, - " inconsistent offset for non-null slot: ", current_offset, - "<", prev_offset); - } - prev_offset = current_offset; - } - return Status::OK(); - } -}; - -} // namespace internal - -Status Array::Validate() const { - // First check the array layout conforms to the spec - const DataType& type = *this->type(); - const auto layout = type.layout(); - const ArrayData& data = *this->data(); - - if (length() < 0) { - return Status::Invalid("Array length is negative"); - } - - if (null_count() > length()) { - return Status::Invalid("Null count exceeds array length"); - } - - if (data.buffers.size() != layout.bit_widths.size()) { - return Status::Invalid("Expected ", layout.bit_widths.size(), - " buffers in array " - "of type ", - type.ToString(), ", got ", data.buffers.size()); - } - if (type.id() != Type::EXTENSION) { - if (data.child_data.size() != static_cast(type.num_children())) { - return Status::Invalid("Expected ", type.num_children(), - " child arrays in array " - "of type ", - type.ToString(), ", got ", data.child_data.size()); - } - } - if (layout.has_dictionary && !data.dictionary) { - return Status::Invalid("Array of type ", type.ToString(), - " must have dictionary values"); - } - if (!layout.has_dictionary && data.dictionary) { - return Status::Invalid("Unexpected dictionary values in array of type ", - type.ToString()); - } - - internal::ValidateVisitor validate_visitor; - return VisitArrayInline(*this, &validate_visitor); +Status Array::ValidateFull() const { + RETURN_NOT_OK(internal::ValidateArray(*this)); + return internal::ValidateArrayData(*this); } -Status ValidateArray(const Array& array) { return array.Validate(); } - // ---------------------------------------------------------------------- // Loading from ArrayData diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 8038e1e53f3..5eb75a0736d 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -379,14 +379,23 @@ class ARROW_EXPORT Array { /// \return PrettyPrint representation of array suitable for debugging std::string ToString() const; - /// \brief Perform any validation checks to determine obvious inconsistencies + /// \brief Perform cheap validation checks to determine obvious inconsistencies /// within the array's internal data. /// - /// This can be an expensive check, potentially O(length). + /// This is O(k) where k is the number of descendents. /// /// \return Status Status Validate() const; + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the array's internal data. + /// + /// This is potentially O(k*n) where k is the number of descendents and n + /// is the array length. + /// + /// \return Status + Status ValidateFull() const; + protected: Array() : null_bitmap_data_(NULLPTR) {} @@ -1003,7 +1012,7 @@ class ARROW_EXPORT StructArray : public Array { class ARROW_EXPORT UnionArray : public Array { public: using TypeClass = UnionType; - using type_id_t = uint8_t; + using type_id_t = int8_t; explicit UnionArray(const std::shared_ptr& data); @@ -1019,8 +1028,7 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. @@ -1031,7 +1039,7 @@ class ARROW_EXPORT UnionArray : public Array { static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, const std::vector& field_names, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out); /// \brief Construct Dense UnionArray from types_ids, value_offsets and children @@ -1039,8 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. @@ -1059,8 +1066,7 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. @@ -1069,7 +1075,7 @@ class ARROW_EXPORT UnionArray : public Array { /// \param[out] out Will have length equal to value_offsets.length() static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out) { return MakeDense(type_ids, value_offsets, children, {}, type_codes, out); } @@ -1081,8 +1087,7 @@ class ARROW_EXPORT UnionArray : public Array { /// /// The name of each field is filled by the index of the field. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. @@ -1099,8 +1104,7 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. @@ -1108,7 +1112,7 @@ class ARROW_EXPORT UnionArray : public Array { static Status MakeSparse(const Array& type_ids, const std::vector>& children, const std::vector& field_names, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out); /// \brief Construct Sparse UnionArray from type_ids and children @@ -1116,8 +1120,7 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[out] out Will have length equal to type_ids.length() @@ -1133,14 +1136,13 @@ class ARROW_EXPORT UnionArray : public Array { /// This function does the bare minimum of validation of the offsets and /// input types. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] type_codes Vector of type codes. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, const std::vector>& children, - const std::vector& type_codes, + const std::vector& type_codes, std::shared_ptr* out) { return MakeSparse(type_ids, children, {}, type_codes, out); } @@ -1152,8 +1154,7 @@ class ARROW_EXPORT UnionArray : public Array { /// /// The name of each field is filled by the index of the field. /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. + /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, @@ -1165,12 +1166,21 @@ class ARROW_EXPORT UnionArray : public Array { /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } + + /// The physical child id containing value at index. + int child_id(int64_t i) const { + return union_type_->child_ids()[raw_type_ids_[i + data_->offset]]; + } + + /// For dense arrays only. /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } + /// For dense arrays only. int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } + /// For dense arrays only. const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } const UnionType* union_type() const { return union_type_; } @@ -1275,12 +1285,4 @@ class ARROW_EXPORT DictionaryArray : public Array { std::shared_ptr indices_; }; -/// \brief Alias of Array::Validate(). -/// -/// \param array an Array instance -/// \return Status -ARROW_DEPRECATED("Use Array::Validate instead") -ARROW_EXPORT -Status ValidateArray(const Array& array); - } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 238fb404d06..cc9eb292c18 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -60,9 +60,9 @@ BasicUnionBuilder::BasicUnionBuilder( children_ = children; type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr); - DCHECK_LT(type_id_to_children_.size(), - static_cast( - std::numeric_limits::max())); + DCHECK_LT( + type_id_to_children_.size(), + static_cast(UnionType::kMaxTypeId)); for (size_t i = 0; i < children.size(); ++i) { child_fields_[i] = union_type.child(static_cast(i)); @@ -73,7 +73,7 @@ BasicUnionBuilder::BasicUnionBuilder( } BasicUnionBuilder::BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) - : BasicUnionBuilder(pool, mode, {}, union_({}, mode)) {} + : BasicUnionBuilder(pool, mode, {}, union_(mode)) {} int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_child, const std::string& field_name) { @@ -85,7 +85,7 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c child_fields_.push_back(field(field_name, nullptr)); - type_codes_.push_back(static_cast(new_type_id)); + type_codes_.push_back(static_cast(new_type_id)); return new_type_id; } @@ -109,9 +109,9 @@ int8_t BasicUnionBuilder::NextTypeId() { } } - DCHECK_LT(type_id_to_children_.size(), - static_cast( - std::numeric_limits::max())); + DCHECK_LT( + type_id_to_children_.size(), + static_cast(UnionType::kMaxTypeId)); // type_id_to_children_ is already densely packed, so just append the new child type_id_to_children_.resize(type_id_to_children_.size() + 1); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 7bbb2656125..49ff18f7bec 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -66,7 +66,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { int8_t NextTypeId(); std::vector> child_fields_; - std::vector type_codes_; + std::vector type_codes_; UnionMode::type mode_; std::vector type_id_to_children_; diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index f41d7822018..1ec7e5fd11f 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -149,7 +149,7 @@ TYPED_TEST(PrimitiveConcatenateTest, Primitives) { TEST_F(ConcatenateTest, StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); - ASSERT_OK((**out).Validate()); + ASSERT_OK((**out).ValidateFull()); }); } @@ -157,7 +157,7 @@ TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); - ASSERT_OK((**out).Validate()); + ASSERT_OK((**out).ValidateFull()); }); } @@ -172,7 +172,7 @@ TEST_F(ConcatenateTest, ListType) { std::shared_ptr offsets; ArrayFromVector(offsets_vector, &offsets); ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); - ASSERT_OK((**out).Validate()); + ASSERT_OK((**out).ValidateFull()); }); } @@ -187,7 +187,7 @@ TEST_F(ConcatenateTest, LargeListType) { std::shared_ptr offsets; ArrayFromVector(offsets_vector, &offsets); ASSERT_OK(LargeListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); - ASSERT_OK((**out).Validate()); + ASSERT_OK((**out).ValidateFull()); }); } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 32b916f63ab..1a538593478 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -608,13 +608,12 @@ class MakeFormatterImpl { Status Visit(const UnionType& t) { struct UnionImpl { - UnionImpl(std::vector f, std::vector c) - : field_formatters_(std::move(f)), type_id_to_child_index_(std::move(c)) {} + explicit UnionImpl(std::vector f) : field_formatters_(std::move(f)) {} void DoFormat(const UnionArray& array, int64_t index, int64_t child_index, std::ostream* os) { auto type_id = array.raw_type_ids()[index]; - const auto& child = *array.child(type_id_to_child_index_[type_id]); + const auto& child = *array.child(array.child_id(index)); *os << "{" << static_cast(type_id) << ": "; if (child.IsNull(child_index)) { @@ -626,7 +625,6 @@ class MakeFormatterImpl { } std::vector field_formatters_; - std::vector type_id_to_child_index_; }; struct SparseImpl : UnionImpl { @@ -648,18 +646,16 @@ class MakeFormatterImpl { }; std::vector field_formatters(t.max_type_code() + 1); - std::vector type_id_to_child_index(field_formatters.size()); for (int i = 0; i < t.num_children(); ++i) { auto type_id = t.type_codes()[i]; - type_id_to_child_index[type_id] = i; ARROW_ASSIGN_OR_RAISE(field_formatters[type_id], MakeFormatter(*t.child(i)->type())); } if (t.mode() == UnionMode::SPARSE) { - impl_ = SparseImpl(std::move(field_formatters), std::move(type_id_to_child_index)); + impl_ = SparseImpl(std::move(field_formatters)); } else { - impl_ = DenseImpl(std::move(field_formatters), std::move(type_id_to_child_index)); + impl_ = DenseImpl(std::move(field_formatters)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 8a39512cb28..0ad321158ee 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -81,7 +81,7 @@ class DiffTest : public ::testing::Test { auto edits = Diff(*base_, *target_, default_memory_pool()); ASSERT_OK(edits.status()); edits_ = edits.ValueOrDie(); - ASSERT_OK(edits_->Validate()); + ASSERT_OK(edits_->ValidateFull()); ASSERT_TRUE(edits_->type()->Equals(edits_type)); insert_ = checked_pointer_cast(edits_->field(0)); run_lengths_ = checked_pointer_cast(edits_->field(1)); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc new file mode 100644 index 00000000000..e8c107bc08b --- /dev/null +++ b/cpp/src/arrow/array/validate.cc @@ -0,0 +1,440 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/array/validate.h" + +#include "arrow/array.h" +#include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +/////////////////////////////////////////////////////////////////////////// +// ValidateArray: cheap validation checks + +namespace { + +struct ValidateArrayVisitor { + Status Visit(const NullArray& array) { + ARROW_RETURN_IF(array.null_count() != array.length(), + Status::Invalid("null_count is invalid")); + return Status::OK(); + } + + Status Visit(const PrimitiveArray& array) { + ARROW_RETURN_IF(array.data()->buffers.size() != 2, + Status::Invalid("number of buffers is != 2")); + + if (array.length() > 0 && array.data()->buffers[1] == nullptr) { + return Status::Invalid("values buffer is null"); + } + if (array.length() > 0 && array.values() == nullptr) { + return Status::Invalid("values is null"); + } + return Status::OK(); + } + + Status Visit(const Decimal128Array& array) { + if (array.data()->buffers.size() != 2) { + return Status::Invalid("number of buffers is != 2"); + } + if (array.length() > 0 && array.values() == nullptr) { + return Status::Invalid("values is null"); + } + return Status::OK(); + } + + Status Visit(const BinaryArray& array) { + if (array.data()->buffers.size() != 3) { + return Status::Invalid("number of buffers is != 3"); + } + return ValidateOffsets(array); + } + + Status Visit(const LargeBinaryArray& array) { + if (array.data()->buffers.size() != 3) { + return Status::Invalid("number of buffers is != 3"); + } + return ValidateOffsets(array); + } + + Status Visit(const ListArray& array) { return ValidateListArray(array); } + + Status Visit(const LargeListArray& array) { return ValidateListArray(array); } + + Status Visit(const MapArray& array) { + if (!array.keys()) { + return Status::Invalid("keys is null"); + } + const Status key_valid = ValidateArray(*array.keys()); + if (!key_valid.ok()) { + return Status::Invalid("key array invalid: ", key_valid.ToString()); + } + + if (array.length() > 0 && !array.values()) { + return Status::Invalid("values is null"); + } + const Status values_valid = ValidateArray(*array.values()); + if (!values_valid.ok()) { + return Status::Invalid("values array invalid: ", values_valid.ToString()); + } + + const int32_t last_offset = array.value_offset(array.length()); + if (array.values()->length() != last_offset) { + return Status::Invalid("Final offset invariant not equal to values length: ", + last_offset, "!=", array.values()->length()); + } + if (array.keys()->length() != last_offset) { + return Status::Invalid("Final offset invariant not equal to keys length: ", + last_offset, "!=", array.keys()->length()); + } + + return ValidateOffsets(array); + } + + Status Visit(const FixedSizeListArray& array) { + if (array.length() > 0 && !array.values()) { + return Status::Invalid("values is null"); + } + if (array.values()->length() != array.length() * array.value_length()) { + return Status::Invalid( + "Values Length (", array.values()->length(), ") is not equal to the length (", + array.length(), ") multiplied by the list size (", array.value_length(), ")"); + } + + return Status::OK(); + } + + Status Visit(const StructArray& array) { + const auto& struct_type = checked_cast(*array.type()); + // Validate fields + for (int i = 0; i < array.num_fields(); ++i) { + auto it = array.field(i); + if (it->length() != array.length()) { + return Status::Invalid("Struct child array #", i, + " has length different from struct array (", it->length(), + " != ", array.length(), ")"); + } + + auto it_type = struct_type.child(i)->type(); + if (!it->type()->Equals(it_type)) { + return Status::Invalid("Struct child array #", i, + " does not match type field: ", it->type()->ToString(), + " vs ", it_type->ToString()); + } + + const Status child_valid = ValidateArray(*it); + if (!child_valid.ok()) { + return Status::Invalid("Struct child array #", i, + " invalid: ", child_valid.ToString()); + } + } + return Status::OK(); + } + + Status Visit(const UnionArray& array) { + const auto& union_type = *array.union_type(); + // Validate fields + for (int i = 0; i < array.num_fields(); ++i) { + auto it = array.child(i); + if (union_type.mode() == UnionMode::SPARSE && it->length() != array.length()) { + return Status::Invalid("Sparse union child array #", i, + " has length different from union array (", it->length(), + " != ", array.length(), ")"); + } + + auto it_type = union_type.child(i)->type(); + if (!it->type()->Equals(it_type)) { + return Status::Invalid("Union child array #", i, + " does not match type field: ", it->type()->ToString(), + " vs ", it_type->ToString()); + } + + const Status child_valid = ValidateArray(*it); + if (!child_valid.ok()) { + return Status::Invalid("Union child array #", i, + " invalid: ", child_valid.ToString()); + } + } + return Status::OK(); + } + + Status Visit(const DictionaryArray& array) { + Type::type index_type_id = array.indices()->type()->id(); + if (!is_integer(index_type_id)) { + return Status::Invalid("Dictionary indices must be integer type"); + } + if (!array.data()->dictionary) { + return Status::Invalid("Dictionary values must be non-null"); + } + const Status dict_valid = ValidateArray(*array.data()->dictionary); + if (!dict_valid.ok()) { + return Status::Invalid("Dictionary array invalid: ", dict_valid.ToString()); + } + return Status::OK(); + } + + Status Visit(const ExtensionArray& array) { + const auto& ext_type = checked_cast(*array.type()); + + if (!array.storage()->type()->Equals(*ext_type.storage_type())) { + return Status::Invalid("Extension array of type '", array.type()->ToString(), + "' has storage array of incompatible type '", + array.storage()->type()->ToString(), "'"); + } + return ValidateArray(*array.storage()); + } + + protected: + template + Status ValidateListArray(const ListArrayType& array) { + const auto first_offset = array.value_offset(0); + const auto last_offset = array.value_offset(array.length()); + const auto data_extent = last_offset - first_offset; + if (data_extent > 0 && !array.values()) { + return Status::Invalid("values is null"); + } + const auto values_length = array.values()->length(); + if (values_length < data_extent) { + return Status::Invalid("Length spanned by list offsets (", data_extent, + ") larger than values array (length ", values_length, ")"); + } + + const Status child_valid = ValidateArray(*array.values()); + if (!child_valid.ok()) { + return Status::Invalid("List child array invalid: ", child_valid.ToString()); + } + + return ValidateOffsets(array); + } + + template + Status ValidateOffsets(const ArrayType& array) { + using offset_type = typename ArrayType::offset_type; + + auto value_offsets = array.value_offsets(); + if (value_offsets == nullptr) { + if (array.length() != 0) { + return Status::Invalid("non-empty array but value_offsets_ is null"); + } + return Status::OK(); + } + if (value_offsets->size() / static_cast(sizeof(offset_type)) < array.length()) { + return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); + } + + auto first_offset = array.value_offset(0); + if (array.offset() == 0 && first_offset != 0) { + return Status::Invalid("The first offset isn't zero"); + } + return Status::OK(); + } +}; + +} // namespace + +ARROW_EXPORT +Status ValidateArray(const Array& array) { + // First check the array layout conforms to the spec + const DataType& type = *array.type(); + const auto layout = type.layout(); + const ArrayData& data = *array.data(); + + if (array.length() < 0) { + return Status::Invalid("Array length is negative"); + } + + if (array.null_count() > array.length()) { + return Status::Invalid("Null count exceeds array length"); + } + + if (data.buffers.size() != layout.bit_widths.size()) { + return Status::Invalid("Expected ", layout.bit_widths.size(), + " buffers in array " + "of type ", + type.ToString(), ", got ", data.buffers.size()); + } + if (type.id() != Type::EXTENSION) { + if (data.child_data.size() != static_cast(type.num_children())) { + return Status::Invalid("Expected ", type.num_children(), + " child arrays in array " + "of type ", + type.ToString(), ", got ", data.child_data.size()); + } + } + if (layout.has_dictionary && !data.dictionary) { + return Status::Invalid("Array of type ", type.ToString(), + " must have dictionary values"); + } + if (!layout.has_dictionary && data.dictionary) { + return Status::Invalid("Unexpected dictionary values in array of type ", + type.ToString()); + } + + ValidateArrayVisitor visitor; + return VisitArrayInline(array, &visitor); +} + +/////////////////////////////////////////////////////////////////////////// +// ValidateArrayData: expensive validation checks + +namespace { + +struct BoundsCheckVisitor { + int64_t min_value_; + int64_t max_value_; + + Status Visit(const Array& array) { + // Default, should be unreachable + return Status::NotImplemented(""); + } + + template + Status Visit(const NumericArray& array) { + for (int64_t i = 0; i < array.length(); ++i) { + if (!array.IsNull(i)) { + const auto v = static_cast(array.Value(i)); + if (v < min_value_ || v > max_value_) { + return Status::Invalid("Value at position ", i, " out of bounds: ", v, + " (should be in [", min_value_, ", ", max_value_, "])"); + } + } + } + return Status::OK(); + } +}; + +struct ValidateArrayDataVisitor { + // Fallback + Status Visit(const Array& array) { return Status::OK(); } + + Status Visit(const ListArray& array) { return ValidateListArray(array); } + + Status Visit(const LargeListArray& array) { return ValidateListArray(array); } + + Status Visit(const MapArray& array) { + // TODO check keys and items individually? + return ValidateListArray(array); + } + + Status Visit(const UnionArray& array) { + const auto& type_codes = array.union_type()->type_codes(); + const auto& child_ids = array.union_type()->child_ids(); + + const int8_t* type_ids = array.raw_type_ids(); + for (int64_t i = 0; i < array.length(); ++i) { + if (array.IsNull(i)) { + continue; + } + const int32_t id = type_ids[i]; + if (id < 0 || child_ids[id] == UnionType::kInvalidChildId) { + return Status::Invalid("Union value at position ", i, " has invalid type id ", + id); + } + } + + if (array.mode() == UnionMode::DENSE) { + // Map logical type id to child length + std::vector child_lengths(256); + for (int child_id = 0; child_id < static_cast(type_codes.size()); ++child_id) { + child_lengths[type_codes[child_id]] = array.child(child_id)->length(); + } + + // Check offsets + const int32_t* offsets = array.raw_value_offsets(); + for (int64_t i = 0; i < array.length(); ++i) { + if (array.IsNull(i)) { + continue; + } + const int32_t id = type_ids[i]; + const int32_t offset = offsets[i]; + if (offset < 0) { + return Status::Invalid("Union value at position ", i, " has negative offset ", + offset); + } + if (offset >= child_lengths[id]) { + return Status::Invalid("Union value at position ", i, + " has offset larger " + "than child length (", + offset, " >= ", child_lengths[id], ")"); + } + } + } + return Status::OK(); + } + + Status Visit(const DictionaryArray& array) { + const Status indices_status = + CheckBounds(*array.indices(), 0, array.dictionary()->length() - 1); + if (!indices_status.ok()) { + return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString()); + } + return Status::OK(); + } + + Status Visit(const ExtensionArray& array) { + return ValidateArrayData(*array.storage()); + } + + protected: + template + Status ValidateListArray(const ListArrayType& array) { + const Status child_valid = ValidateArrayData(*array.values()); + if (!child_valid.ok()) { + return Status::Invalid("List child array invalid: ", child_valid.ToString()); + } + return ValidateOffsets(array); + } + + template + Status ValidateOffsets(const ArrayType& array) { + auto prev_offset = array.value_offset(0); + for (int64_t i = 1; i <= array.length(); ++i) { + auto current_offset = array.value_offset(i); + if (array.IsNull(i - 1) && current_offset != prev_offset) { + return Status::Invalid("Offset invariant failure at: ", i, + " inconsistent value_offsets for null slot", + current_offset, "!=", prev_offset); + } + if (current_offset < prev_offset) { + return Status::Invalid("Offset invariant failure: ", i, + " inconsistent offset for non-null slot: ", current_offset, + "<", prev_offset); + } + prev_offset = current_offset; + } + return Status::OK(); + } + + Status CheckBounds(const Array& array, int64_t min_value, int64_t max_value) { + BoundsCheckVisitor visitor{min_value, max_value}; + return VisitArrayInline(array, &visitor); + } +}; + +} // namespace + +ARROW_EXPORT +Status ValidateArrayData(const Array& array) { + ValidateArrayDataVisitor visitor; + return VisitArrayInline(array, &visitor); +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/array/validate.h b/cpp/src/arrow/array/validate.h new file mode 100644 index 00000000000..6d441b89d00 --- /dev/null +++ b/cpp/src/arrow/array/validate.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// Internal functions implementing Array::Validate() and friends. + +ARROW_EXPORT +Status ValidateArray(const Array& array); + +ARROW_EXPORT +Status ValidateArrayData(const Array& array); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/array_binary_test.cc b/cpp/src/arrow/array_binary_test.cc index 40c639362ae..a1c8f7ee307 100644 --- a/cpp/src/arrow/array_binary_test.cc +++ b/cpp/src/arrow/array_binary_test.cc @@ -101,7 +101,7 @@ class TestStringArray : public ::testing::Test { void TestArrayBasics() { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); - ASSERT_OK(strings_->Validate()); + ASSERT_OK(strings_->ValidateFull()); TestInitialized(*strings_); AssertZeroPadded(*strings_); } @@ -276,7 +276,7 @@ class TestStringBuilder : public TestBuilder { FinishAndCheckPadding(builder_.get(), &out); result_ = std::dynamic_pointer_cast(out); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); } void TestScalarAppend() { @@ -330,7 +330,7 @@ class TestStringBuilder : public TestBuilder { ASSERT_EQ(builder_->value_data_length(), total_length * reps); Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); ASSERT_EQ(reps * total_length, result_->value_data()->size()); diff --git a/cpp/src/arrow/array_dict_test.cc b/cpp/src/arrow/array_dict_test.cc index 4d5885242c4..e67c647c703 100644 --- a/cpp/src/arrow/array_dict_test.cc +++ b/cpp/src/arrow/array_dict_test.cc @@ -978,7 +978,7 @@ TEST(TestDictionary, Validate) { std::make_shared(dict_type, indices, dict); // Only checking index type for now - ASSERT_OK(arr->Validate()); + ASSERT_OK(arr->ValidateFull()); ASSERT_DEATH( { @@ -1019,7 +1019,7 @@ static void CheckTranspose(const std::shared_ptr& input, std::shared_ptr out, expected; ASSERT_OK(internal::checked_cast(*input).Transpose( default_memory_pool(), out_dict_type, out_dict, transpose_map, &out)); - ASSERT_OK(out->Validate()); + ASSERT_OK(out->ValidateFull()); ASSERT_OK( DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, &expected)); @@ -1167,7 +1167,7 @@ TEST(TestDictionary, ListOfDictionary) { std::shared_ptr array; ASSERT_OK(root_builder->Finish(&array)); - ASSERT_OK(array->Validate()); + ASSERT_OK(array->ValidateFull()); auto expected_type = list(dictionary(int16(), utf8())); ASSERT_EQ(array->type()->ToString(), expected_type->ToString()); @@ -1231,7 +1231,7 @@ TEST(TestDictionary, IndicesArray) { ASSERT_EQ(arr->indices()->data()->dictionary, nullptr); // Validate the indices array - ASSERT_OK(arr->indices()->Validate()); + ASSERT_OK(arr->indices()->ValidateFull()); } } // namespace arrow diff --git a/cpp/src/arrow/array_list_test.cc b/cpp/src/arrow/array_list_test.cc index 13f6423ea06..d9cf51595cb 100644 --- a/cpp/src/arrow/array_list_test.cc +++ b/cpp/src/arrow/array_list_test.cc @@ -72,7 +72,7 @@ class TestListArray : public TestBuilder { void ValidateBasicListArray(const ArrayType* result, const std::vector& values, const std::vector& is_valid) { - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -213,9 +213,9 @@ class TestListArray : public TestBuilder { ASSERT_OK(ArrayType::FromArrays(*offsets1, *values, pool_, &list1)); ASSERT_OK(ArrayType::FromArrays(*offsets3, *values, pool_, &list3)); ASSERT_OK(ArrayType::FromArrays(*offsets4, *values, pool_, &list4)); - ASSERT_OK(list1->Validate()); - ASSERT_OK(list3->Validate()); - ASSERT_OK(list4->Validate()); + ASSERT_OK(list1->ValidateFull()); + ASSERT_OK(list3->ValidateFull()); + ASSERT_OK(list4->ValidateFull()); ArrayType expected1(list_type, length, offsets1->data()->buffers[1], values, offsets1->data()->buffers[0], 0); @@ -251,7 +251,7 @@ class TestListArray : public TestBuilder { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); @@ -270,7 +270,7 @@ class TestListArray : public TestBuilder { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_EQ(result_->length(), 3); ASSERT_EQ(result_->null_count(), 3); ASSERT_TRUE(result_->IsNull(0)); @@ -321,13 +321,13 @@ class TestListArray : public TestBuilder { } Done(); - ASSERT_RAISES(Invalid, result_->Validate()); + ASSERT_RAISES(Invalid, result_->ValidateFull()); } void TestZeroLength() { // All buffers are null Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); } void TestBuilderPreserveFieldName() { @@ -485,7 +485,7 @@ TEST_F(TestMapArray, BuildingIntToInt) { ASSERT_OK(item_builder->AppendValues({-1, -1, 0, 1, -1, 2}, {0, 0, 1, 1, 0, 1})); ASSERT_OK(map_builder.Append()); ASSERT_OK(map_builder.Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_ARRAYS_EQUAL(*actual, expected); } @@ -518,7 +518,7 @@ TEST_F(TestMapArray, BuildingStringToInt) { ASSERT_OK(item_builder->Append(8)); ASSERT_OK(map_builder.Append()); ASSERT_OK(map_builder.Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_ARRAYS_EQUAL(*actual, expected); } @@ -594,7 +594,7 @@ TEST_F(TestFixedSizeListArray, TestAppendNull) { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); @@ -610,7 +610,7 @@ TEST_F(TestFixedSizeListArray, TestAppendNulls) { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_EQ(result_->length(), 3); ASSERT_EQ(result_->null_count(), 3); ASSERT_TRUE(result_->IsNull(0)); @@ -628,7 +628,7 @@ TEST_F(TestFixedSizeListArray, TestAppendNulls) { void ValidateBasicFixedSizeListArray(const FixedSizeListArray* result, const std::vector& values, const std::vector& is_valid) { - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_LE(result->values()->null_count(), 2); @@ -704,13 +704,13 @@ TEST_F(TestFixedSizeListArray, BulkAppendInvalid) { } Done(); - ASSERT_RAISES(Invalid, result_->Validate()); + ASSERT_RAISES(Invalid, result_->ValidateFull()); } TEST_F(TestFixedSizeListArray, TestZeroLength) { // All buffers are null Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); } TEST_F(TestFixedSizeListArray, TestBuilderPreserveFieldName) { diff --git a/cpp/src/arrow/array_struct_test.cc b/cpp/src/arrow/array_struct_test.cc index cfeb416d9e1..133ccc9542a 100644 --- a/cpp/src/arrow/array_struct_test.cc +++ b/cpp/src/arrow/array_struct_test.cc @@ -45,7 +45,7 @@ void ValidateBasicStructArray(const StructArray* result, const std::vector& list_offsets, const std::vector& int_values) { ASSERT_EQ(4, result->length()); - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); auto list_char_arr = std::dynamic_pointer_cast(result->field(0)); auto char_arr = std::dynamic_pointer_cast(list_char_arr->values()); @@ -225,7 +225,7 @@ TEST_F(TestStructBuilder, TestAppendNull) { Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); ASSERT_EQ(2, static_cast(result_->num_fields())); ASSERT_EQ(2, result_->length()); @@ -339,7 +339,7 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { Done(); // Even null bitmap of the parent Struct is not valid, validate will ignore it. - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); } TEST_F(TestStructBuilder, TestEquality) { @@ -475,7 +475,7 @@ TEST_F(TestStructBuilder, TestEquality) { TEST_F(TestStructBuilder, TestZeroLength) { // All buffers are null Done(); - ASSERT_OK(result_->Validate()); + ASSERT_OK(result_->ValidateFull()); } TEST_F(TestStructBuilder, TestSlice) { diff --git a/cpp/src/arrow/array_test.cc b/cpp/src/arrow/array_test.cc index ea41e242041..4dbcde5f5b6 100644 --- a/cpp/src/arrow/array_test.cc +++ b/cpp/src/arrow/array_test.cc @@ -275,7 +275,7 @@ TEST_F(TestArray, TestMakeArrayOfNull) { for (auto type : types) { std::shared_ptr array; ASSERT_OK(MakeArrayOfNull(type, length, &array)); - ASSERT_OK(array->Validate()); + ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), length); } @@ -303,7 +303,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { for (auto scalar : scalars) { std::shared_ptr array; ASSERT_OK(MakeArrayFromScalar(*scalar, length, &array)); - ASSERT_OK(array->Validate()); + ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); } diff --git a/cpp/src/arrow/array_union_test.cc b/cpp/src/arrow/array_union_test.cc index e70fcc25a22..b2f256e726e 100644 --- a/cpp/src/arrow/array_union_test.cc +++ b/cpp/src/arrow/array_union_test.cc @@ -73,15 +73,23 @@ class TestUnionArrayFactories : public ::testing::Test { public: void SetUp() { pool_ = default_memory_pool(); + type_codes_ = {1, 2, 4, 8}; ArrayFromVector({0, 1, 2, 0, 1, 3, 2, 0, 2, 1}, &type_ids_); + ArrayFromVector({1, 2, 4, 1, 2, 8, 4, 1, 4, 2}, &logical_type_ids_); + ArrayFromVector({1, 2, 4, 1, -2, 8, 4, 1, 4, 2}, &invalid_type_ids1_); + ArrayFromVector({1, 2, 4, 1, 3, 8, 4, 1, 4, 2}, &invalid_type_ids2_); } void CheckUnionArray(const UnionArray& array, UnionMode::type mode, const std::vector& field_names, - const std::vector& type_codes) { + const std::vector& type_codes) { ASSERT_EQ(mode, array.mode()); CheckFieldNames(array, field_names); CheckTypeCodes(array, type_codes); + const auto& type_ids = checked_cast(*type_ids_); + for (int64_t i = 0; i < type_ids.length(); ++i) { + ASSERT_EQ(array.child_id(i), type_ids.Value(i)); + } } void CheckFieldNames(const UnionArray& array, const std::vector& names) { @@ -92,19 +100,23 @@ class TestUnionArrayFactories : public ::testing::Test { } } - void CheckTypeCodes(const UnionArray& array, const std::vector& codes) { + void CheckTypeCodes(const UnionArray& array, const std::vector& codes) { const auto& type = checked_cast(*array.type()); ASSERT_EQ(codes, type.type_codes()); } protected: MemoryPool* pool_; + std::vector type_codes_; std::shared_ptr type_ids_; + std::shared_ptr logical_type_ids_; + std::shared_ptr invalid_type_ids1_; + std::shared_ptr invalid_type_ids2_; }; TEST_F(TestUnionArrayFactories, TestMakeDense) { std::shared_ptr value_offsets; - ArrayFromVector({0, 0, 0, 1, 1, 0, 1, 2, 1, 2}, &value_offsets); + ArrayFromVector({1, 0, 0, 0, 1, 0, 1, 2, 1, 2}, &value_offsets); auto children = std::vector>(4); ArrayFromVector({"abc", "def", "xyz"}, &children[0]); @@ -113,38 +125,60 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { ArrayFromVector({-12}, &children[3]); std::vector field_names = {"str", "int1", "real", "int2"}; - std::vector type_codes = {1, 2, 4, 8}; std::shared_ptr result; + const UnionArray* union_array; // without field names and type codes ASSERT_OK(UnionArray::MakeDense(*type_ids_, *value_offsets, children, &result)); - CheckUnionArray(checked_cast(*result), UnionMode::DENSE, - {"0", "1", "2", "3"}, {0, 1, 2, 3}); + ASSERT_OK(result->ValidateFull()); + union_array = checked_cast(result.get()); + CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, {0, 1, 2, 3}); // with field name ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children, {"one"}, &result)); ASSERT_OK( UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names, &result)); - CheckUnionArray(checked_cast(*result), UnionMode::DENSE, field_names, - {0, 1, 2, 3}); + ASSERT_OK(result->ValidateFull()); + union_array = checked_cast(result.get()); + CheckUnionArray(*union_array, UnionMode::DENSE, field_names, {0, 1, 2, 3}); // with type codes - ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children, - std::vector{0}, &result)); - ASSERT_OK( - UnionArray::MakeDense(*type_ids_, *value_offsets, children, type_codes, &result)); - CheckUnionArray(checked_cast(*result), UnionMode::DENSE, - {"0", "1", "2", "3"}, type_codes); + ASSERT_RAISES(Invalid, + UnionArray::MakeDense(*logical_type_ids_, *value_offsets, children, + std::vector{0}, &result)); + ASSERT_OK(UnionArray::MakeDense(*logical_type_ids_, *value_offsets, children, + type_codes_, &result)); + ASSERT_OK(result->ValidateFull()); + union_array = checked_cast(result.get()); + CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, type_codes_); // with field names and type codes - ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children, - {"one"}, type_codes, &result)); - ASSERT_OK(UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names, - type_codes, &result)); - CheckUnionArray(checked_cast(*result), UnionMode::DENSE, field_names, - type_codes); + ASSERT_RAISES(Invalid, UnionArray::MakeDense(*logical_type_ids_, *value_offsets, + children, {"one"}, type_codes_, &result)); + ASSERT_OK(UnionArray::MakeDense(*logical_type_ids_, *value_offsets, children, + field_names, type_codes_, &result)); + ASSERT_OK(result->ValidateFull()); + union_array = checked_cast(result.get()); + CheckUnionArray(*union_array, UnionMode::DENSE, field_names, type_codes_); + + // Invalid type codes + ASSERT_OK(UnionArray::MakeDense(*invalid_type_ids1_, *value_offsets, children, + type_codes_, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + ASSERT_OK(UnionArray::MakeDense(*invalid_type_ids2_, *value_offsets, children, + type_codes_, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + + // Invalid offsets + std::shared_ptr invalid_offsets; + ArrayFromVector({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets); + ASSERT_OK(UnionArray::MakeDense(*type_ids_, *invalid_offsets, children, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + ArrayFromVector({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets); + ASSERT_OK(UnionArray::MakeDense(*type_ids_, *invalid_offsets, children, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); } TEST_F(TestUnionArrayFactories, TestMakeSparse) { @@ -157,41 +191,54 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { ArrayFromVector({0, 0, 0, 0, 0, -12, 0, 0, 0, 0}, &children[3]); std::vector field_names = {"str", "int1", "real", "int2"}; - std::vector type_codes = {1, 2, 4, 8}; std::shared_ptr result; // without field names and type codes ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, &result)); + ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, {"0", "1", "2", "3"}, {0, 1, 2, 3}); // with field names ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"}, &result)); ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, field_names, &result)); + ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, field_names, {0, 1, 2, 3}); // with type codes - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, - std::vector{0}, &result)); - ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, type_codes, &result)); + ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children, + std::vector{0}, &result)); + ASSERT_OK(UnionArray::MakeSparse(*logical_type_ids_, children, type_codes_, &result)); + ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, - {"0", "1", "2", "3"}, type_codes); + {"0", "1", "2", "3"}, type_codes_); // with field names and type codes - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"}, type_codes, - &result)); - ASSERT_OK( - UnionArray::MakeSparse(*type_ids_, children, field_names, type_codes, &result)); + ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children, {"one"}, + type_codes_, &result)); + ASSERT_OK(UnionArray::MakeSparse(*logical_type_ids_, children, field_names, type_codes_, + &result)); + ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, field_names, - type_codes); + type_codes_); + + // Invalid type codes + ASSERT_OK(UnionArray::MakeSparse(*invalid_type_ids1_, children, type_codes_, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + ASSERT_OK(UnionArray::MakeSparse(*invalid_type_ids2_, children, type_codes_, &result)); + ASSERT_RAISES(Invalid, result->ValidateFull()); + + // Invalid child length + ArrayFromVector({0, 0, 0, 0, 0, -12, 0, 0, 0}, &children[3]); + ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, &result)); } template class UnionBuilderTest : public ::testing::Test { public: - uint8_t I8 = 8, STR = 13, DBL = 7; + int8_t I8 = 8, STR = 13, DBL = 7; virtual void AppendInt(int8_t i) { expected_types_vector.push_back(I8); @@ -402,8 +449,8 @@ TEST_F(SparseUnionBuilderTest, InferredType) { TEST_F(SparseUnionBuilderTest, StructWithUnion) { auto union_builder = std::make_shared(default_memory_pool()); - StructBuilder builder(struct_({field("u", union_({}))}), default_memory_pool(), - {union_builder}); + StructBuilder builder(struct_({field("u", union_builder->type())}), + default_memory_pool(), {union_builder}); ASSERT_EQ(union_builder->AppendChild(std::make_shared(), "i"), 0); ASSERT_TRUE( builder.type()->Equals(struct_({field("u", union_({field("i", int32())}, {0}))}))); diff --git a/cpp/src/arrow/array_view_test.cc b/cpp/src/arrow/array_view_test.cc index 1c2ed2bcc54..56570771b11 100644 --- a/cpp/src/arrow/array_view_test.cc +++ b/cpp/src/arrow/array_view_test.cc @@ -33,7 +33,7 @@ void CheckView(const std::shared_ptr& input, const std::shared_ptr& expected) { std::shared_ptr result; ASSERT_OK(input->View(view_type, &result)); - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*expected, *result); } @@ -323,7 +323,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { auto indices = ArrayFromJSON(int8(), "[0, 0, 1]"); std::shared_ptr arr; ASSERT_OK(UnionArray::MakeSparse(*indices, {child1, child2}, &arr)); - ASSERT_OK(arr->Validate()); + ASSERT_OK(arr->ValidateFull()); auto ty1 = struct_({field("a", int8()), field("b", uint16()), field("c", float32())}); auto expected = ArrayFromJSON(ty1, "[[0, 0, 0], [0, 65535, 1.5], [1, 42, -2.5]]"); @@ -333,7 +333,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { // With nulls indices = ArrayFromJSON(int8(), "[null, 0, 1]"); ASSERT_OK(UnionArray::MakeSparse(*indices, {child1, child2}, &arr)); - ASSERT_OK(arr->Validate()); + ASSERT_OK(arr->ValidateFull()); expected = ArrayFromJSON(ty1, "[null, [0, 65535, 1.5], [1, 42, -2.5]]"); CheckView(arr, expected); // CheckView(expected, arr); // XXX currently fails @@ -342,7 +342,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { child1 = ArrayFromJSON(int16(), "[0, -1, null]"); child2 = ArrayFromJSON(int32(), "[0, null, -1071644672]"); ASSERT_OK(UnionArray::MakeSparse(*indices, {child1, child2}, &arr)); - ASSERT_OK(arr->Validate()); + ASSERT_OK(arr->ValidateFull()); expected = ArrayFromJSON(ty1, "[null, [0, 65535, null], [1, null, -2.5]]"); CheckView(arr, expected); // CheckView(expected, arr); // XXX currently fails @@ -355,9 +355,9 @@ TEST(TestArrayView, DecimalRoundTrip) { auto ty2 = fixed_size_binary(16); std::shared_ptr v, w; ASSERT_OK(arr->View(ty2, &v)); - ASSERT_OK(v->Validate()); + ASSERT_OK(v->ValidateFull()); ASSERT_OK(v->View(ty1, &w)); - ASSERT_OK(w->Validate()); + ASSERT_OK(w->ValidateFull()); AssertArraysEqual(*arr, *w); } @@ -424,7 +424,7 @@ TEST(TestArrayView, NonZeroNestedOffset) { default_memory_pool(), &arr)); ASSERT_OK(ListArray::FromArrays(*list_offsets, *view_values->Slice(2), default_memory_pool(), &expected)); - ASSERT_OK(arr->Validate()); + ASSERT_OK(arr->ValidateFull()); CheckView(arr->Slice(1), expected->Slice(1)); // Be extra paranoid about checking offsets diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index eae54e7092f..29dc4bc464d 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -237,15 +237,10 @@ class RangeEqualsVisitor { const auto& left_type = checked_cast(*left.type()); - // Define a mapping from the type id to child number - const std::vector& type_codes = left_type.type_codes(); - std::vector type_id_to_child_num(left.union_type()->max_type_code() + 1, 0); - for (uint8_t i = 0; i < type_codes.size(); ++i) { - type_id_to_child_num[type_codes[i]] = i; - } + const std::vector& child_ids = left_type.child_ids(); - const uint8_t* left_ids = left.raw_type_ids(); - const uint8_t* right_ids = right.raw_type_ids(); + const int8_t* left_ids = left.raw_type_ids(); + const int8_t* right_ids = right.raw_type_ids(); for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { @@ -257,7 +252,7 @@ class RangeEqualsVisitor { return false; } - auto child_num = type_id_to_child_num[left_ids[i]]; + auto child_num = child_ids[left_ids[i]]; // TODO(wesm): really we should be comparing stretches of non-null data // rather than looking at one value at a time. diff --git a/cpp/src/arrow/compute/kernels/add-test.cc b/cpp/src/arrow/compute/kernels/add-test.cc index 89d2c90882d..d60cd566016 100644 --- a/cpp/src/arrow/compute/kernels/add-test.cc +++ b/cpp/src/arrow/compute/kernels/add-test.cc @@ -45,7 +45,7 @@ class TestArithmeticKernel : public ComputeFixture, public TestBase { const std::shared_ptr expected) { std::shared_ptr actual; ASSERT_OK(arrow::compute::Add(&this->ctx_, *lhs, *rhs, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } diff --git a/cpp/src/arrow/compute/kernels/boolean_test.cc b/cpp/src/arrow/compute/kernels/boolean_test.cc index 04d7e500843..77b7e6473d7 100644 --- a/cpp/src/arrow/compute/kernels/boolean_test.cc +++ b/cpp/src/arrow/compute/kernels/boolean_test.cc @@ -48,11 +48,13 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); + ASSERT_OK(result_array->ValidateFull()); ASSERT_ARRAYS_EQUAL(*expected, *result_array); ASSERT_OK(kernel(&this->ctx_, right, left, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); result_array = result.make_array(); + ASSERT_OK(result_array->ValidateFull()); ASSERT_ARRAYS_EQUAL(*expected, *result_array); } diff --git a/cpp/src/arrow/compute/kernels/cast_test.cc b/cpp/src/arrow/compute/kernels/cast_test.cc index 8549e22dd4c..18bf7589dcd 100644 --- a/cpp/src/arrow/compute/kernels/cast_test.cc +++ b/cpp/src/arrow/compute/kernels/cast_test.cc @@ -67,7 +67,7 @@ class TestCast : public ComputeFixture, public TestBase { const std::shared_ptr& out_type, const CastOptions& options) { std::shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); ASSERT_ARRAYS_EQUAL(expected, *result); } @@ -87,7 +87,7 @@ class TestCast : public ComputeFixture, public TestBase { void CheckZeroCopy(const Array& input, const std::shared_ptr& out_type) { std::shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size()); for (size_t i = 0; i < input.data()->buffers.size(); ++i) { AssertBufferSame(input, *result, static_cast(i)); @@ -1394,7 +1394,7 @@ TYPED_TEST(TestNullCast, FromNull) { std::shared_ptr result; ASSERT_OK(Cast(&this->ctx_, arr, out_type, {}, &result)); - ASSERT_OK(result->Validate()); + ASSERT_OK(result->ValidateFull()); ASSERT_TRUE(result->type()->Equals(*out_type)); ASSERT_EQ(length, result->length()); @@ -1448,7 +1448,7 @@ TYPED_TEST(TestDictionaryCast, NoNulls) { data->buffers[0] = nullptr; data->null_count = 0; std::shared_ptr dict_array = std::make_shared(data); - ASSERT_OK(dict_array->Validate()); + ASSERT_OK(dict_array->ValidateFull()); this->CheckPass(*dict_array, *plain_array, plain_array->type(), options); } diff --git a/cpp/src/arrow/compute/kernels/filter_test.cc b/cpp/src/arrow/compute/kernels/filter_test.cc index 0c00ce4ca1e..1895a66f82f 100644 --- a/cpp/src/arrow/compute/kernels/filter_test.cc +++ b/cpp/src/arrow/compute/kernels/filter_test.cc @@ -46,7 +46,7 @@ class TestFilterKernel : public ComputeFixture, public TestBase { const std::shared_ptr& expected) { std::shared_ptr actual; ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } @@ -54,7 +54,7 @@ class TestFilterKernel : public ComputeFixture, public TestBase { const std::string& filter, const std::string& expected) { std::shared_ptr actual; ASSERT_OK(this->Filter(type, values, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); } @@ -68,7 +68,7 @@ class TestFilterKernel : public ComputeFixture, public TestBase { const std::shared_ptr& filter_boxed) { std::shared_ptr filtered; ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter_boxed, &filtered)); - ASSERT_OK(filtered->Validate()); + ASSERT_OK(filtered->ValidateFull()); auto filter = checked_pointer_cast(filter_boxed); int64_t values_i = 0, filtered_i = 0; @@ -228,7 +228,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) { &selection)); ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); auto filtered_array = filtered.make_array(); - ASSERT_OK(filtered_array->Validate()); + ASSERT_OK(filtered_array->ValidateFull()); auto expected = CompareAndFilter(array->raw_values(), array->length(), c_fifty, op); ASSERT_ARRAYS_EQUAL(*filtered_array, *expected); @@ -253,7 +253,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) { &selection)); ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(lhs), selection, &filtered)); auto filtered_array = filtered.make_array(); - ASSERT_OK(filtered_array->Validate()); + ASSERT_OK(filtered_array->ValidateFull()); auto expected = CompareAndFilter(lhs->raw_values(), lhs->length(), rhs->raw_values(), op); ASSERT_ARRAYS_EQUAL(*filtered_array, *expected); @@ -283,7 +283,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) { &selection)); ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered)); auto filtered_array = filtered.make_array(); - ASSERT_OK(filtered_array->Validate()); + ASSERT_OK(filtered_array->ValidateFull()); auto expected = CompareAndFilter( array->raw_values(), array->length(), [&](CType e) { return (e > c_fifty) && (e < c_hundred); }); @@ -475,7 +475,7 @@ class TestFilterKernelWithRecordBatch : public TestFilterKernel { std::shared_ptr actual; ASSERT_OK(this->Filter(schm, batch_json, selection, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_BATCHES_EQUAL(*RecordBatchFromJSON(schm, expected_batch), *actual); } @@ -517,7 +517,7 @@ class TestFilterKernelWithChunkedArray : public TestFilterKernel { const std::vector& expected) { std::shared_ptr actual; ASSERT_OK(this->FilterWithArray(type, values, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); } @@ -527,7 +527,7 @@ class TestFilterKernelWithChunkedArray : public TestFilterKernel { const std::vector& expected) { std::shared_ptr actual; ASSERT_OK(this->FilterWithChunkedArray(type, values, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); } @@ -570,7 +570,7 @@ class TestFilterKernelWithTable : public TestFilterKernel { std::shared_ptr
actual; ASSERT_OK(this->FilterWithArray(schm, table_json, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); } @@ -581,7 +581,7 @@ class TestFilterKernelWithTable : public TestFilterKernel
{ std::shared_ptr
actual; ASSERT_OK(this->FilterWithChunkedArray(schm, table_json, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); } diff --git a/cpp/src/arrow/compute/kernels/hash_test.cc b/cpp/src/arrow/compute/kernels/hash_test.cc index 58972fdb4fe..5a92e8d8869 100644 --- a/cpp/src/arrow/compute/kernels/hash_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_test.cc @@ -63,6 +63,7 @@ void CheckUnique(FunctionContext* ctx, const std::shared_ptr& type, std::shared_ptr result; ASSERT_OK(Unique(ctx, input, &result)); + ASSERT_OK(result->ValidateFull()); // TODO: We probably shouldn't rely on array ordering. ASSERT_ARRAYS_EQUAL(*expected, *result); } @@ -79,6 +80,7 @@ void CheckValueCountsNull(FunctionContext* ctx, const std::shared_ptr& std::shared_ptr result; ASSERT_OK(ValueCounts(ctx, input, &result)); + ASSERT_OK(result->ValidateFull()); auto result_struct = std::dynamic_pointer_cast(result); ASSERT_NE(result_struct->GetFieldByName(kValuesFieldName), nullptr); // TODO: We probably shouldn't rely on value ordering. @@ -101,6 +103,7 @@ void CheckValueCounts(FunctionContext* ctx, const std::shared_ptr& typ std::shared_ptr result; ASSERT_OK(ValueCounts(ctx, input, &result)); + ASSERT_OK(result->ValidateFull()); auto result_struct = std::dynamic_pointer_cast(result); // TODO: We probably shouldn't rely on value ordering. ASSERT_ARRAYS_EQUAL(*ex_values, *result_struct->field(kValuesFieldIndex)); @@ -124,6 +127,7 @@ void CheckDictEncode(FunctionContext* ctx, const std::shared_ptr& type Datum datum_out; ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); std::shared_ptr result = MakeArray(datum_out.array()); + ASSERT_OK(result->ValidateFull()); ASSERT_ARRAYS_EQUAL(expected, *result); } diff --git a/cpp/src/arrow/compute/kernels/isin_test.cc b/cpp/src/arrow/compute/kernels/isin_test.cc index 64652767b59..b9ad3591f84 100644 --- a/cpp/src/arrow/compute/kernels/isin_test.cc +++ b/cpp/src/arrow/compute/kernels/isin_test.cc @@ -65,6 +65,7 @@ void CheckIsIn(FunctionContext* ctx, const std::shared_ptr& type, Datum datum_out; ASSERT_OK(IsIn(ctx, input, member_set, &datum_out)); std::shared_ptr result = datum_out.make_array(); + ASSERT_OK(result->ValidateFull()); ASSERT_ARRAYS_EQUAL(*expected, *result); } diff --git a/cpp/src/arrow/compute/kernels/sort_to_indices_test.cc b/cpp/src/arrow/compute/kernels/sort_to_indices_test.cc index 41c7f5db078..27fae72a52a 100644 --- a/cpp/src/arrow/compute/kernels/sort_to_indices_test.cc +++ b/cpp/src/arrow/compute/kernels/sort_to_indices_test.cc @@ -39,7 +39,7 @@ class TestSortToIndicesKernel : public ComputeFixture, public TestBase { const std::shared_ptr expected) { std::shared_ptr actual; ASSERT_OK(arrow::compute::SortToIndices(&this->ctx_, *values, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } diff --git a/cpp/src/arrow/compute/kernels/take_test.cc b/cpp/src/arrow/compute/kernels/take_test.cc index c886a00ead9..afff2fc6ab9 100644 --- a/cpp/src/arrow/compute/kernels/take_test.cc +++ b/cpp/src/arrow/compute/kernels/take_test.cc @@ -47,7 +47,7 @@ class TestTakeKernel : public ComputeFixture, public TestBase { std::shared_ptr actual; TakeOptions options; ASSERT_OK(arrow::compute::Take(&this->ctx_, *values, *indices, options, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } @@ -57,7 +57,7 @@ class TestTakeKernel : public ComputeFixture, public TestBase { for (auto index_type : {int8(), uint32()}) { ASSERT_OK(this->Take(type, values, index_type, indices, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*ArrayFromJSON(type, expected), *actual); } } @@ -134,7 +134,7 @@ class TestTakeKernelWithNumeric : public TestTakeKernel { TakeOptions options; ASSERT_OK( arrow::compute::Take(&this->ctx_, *values, *indices_boxed, options, &taken)); - ASSERT_OK(taken->Validate()); + ASSERT_OK(taken->ValidateFull()); ASSERT_EQ(indices_boxed->length(), taken->length()); ASSERT_EQ(indices_boxed->type_id(), Type::INT32); @@ -418,7 +418,7 @@ class TestPermutationsWithTake : public ComputeFixture, public TestBase { TakeOptions options; std::shared_ptr boxed_out; ASSERT_OK(arrow::compute::Take(&this->ctx_, values, indices, options, &boxed_out)); - ASSERT_OK(boxed_out->Validate()); + ASSERT_OK(boxed_out->ValidateFull()); *out = checked_pointer_cast(std::move(boxed_out)); } @@ -535,7 +535,7 @@ class TestTakeKernelWithRecordBatch : public TestTakeKernel { for (auto index_type : {int8(), uint32()}) { ASSERT_OK(this->Take(schm, batch_json, index_type, indices, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_BATCHES_EQUAL(*RecordBatchFromJSON(schm, expected_batch), *actual); } } @@ -592,7 +592,7 @@ class TestTakeKernelWithChunkedArray : public TestTakeKernel { const std::vector& expected) { std::shared_ptr actual; ASSERT_OK(this->TakeWithArray(type, values, indices, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); } @@ -602,7 +602,7 @@ class TestTakeKernelWithChunkedArray : public TestTakeKernel { const std::vector& expected) { std::shared_ptr actual; ASSERT_OK(this->TakeWithChunkedArray(type, values, indices, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); } @@ -648,7 +648,7 @@ class TestTakeKernelWithTable : public TestTakeKernel
{ std::shared_ptr
actual; ASSERT_OK(this->TakeWithArray(schm, table_json, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); } @@ -659,7 +659,7 @@ class TestTakeKernelWithTable : public TestTakeKernel
{ std::shared_ptr
actual; ASSERT_OK(this->TakeWithChunkedArray(schm, table_json, filter, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); } diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc index 1d895893d68..94ff85f1727 100644 --- a/cpp/src/arrow/csv/column_builder_test.cc +++ b/cpp/src/arrow/csv/column_builder_test.cc @@ -52,7 +52,7 @@ void AssertBuilding(const std::shared_ptr& builder, } ASSERT_OK(builder->task_group()->Finish()); ASSERT_OK(builder->Finish(out)); - ASSERT_OK((*out)->Validate()); + ASSERT_OK((*out)->ValidateFull()); } void CheckInferred(const std::shared_ptr& tg, const ChunkData& csv_data, @@ -126,7 +126,7 @@ TEST(NullColumnBuilder, InsertNull) { builder->Insert(0, parser); ASSERT_OK(builder->task_group()->Finish()); ASSERT_OK(builder->Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); auto chunks = ArrayVector{std::make_shared(1), std::make_shared(2)}; @@ -151,7 +151,7 @@ TEST(NullColumnBuilder, InsertTyped) { builder->Insert(0, parser); ASSERT_OK(builder->task_group()->Finish()); ASSERT_OK(builder->Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); auto chunks = ArrayVector{ArrayFromJSON(type, "[null]"), ArrayFromJSON(type, "[null, null, null]")}; @@ -177,7 +177,7 @@ TEST(NullColumnBuilder, EmptyChunks) { builder->Insert(2, parser); ASSERT_OK(builder->task_group()->Finish()); ASSERT_OK(builder->Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); auto chunks = ArrayVector{ArrayFromJSON(type, "[]"), ArrayFromJSON(type, "[null, null]"), @@ -227,7 +227,7 @@ TEST(ColumnBuilder, Insert) { builder->Insert(0, parser); ASSERT_OK(builder->task_group()->Finish()); ASSERT_OK(builder->Finish(&actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ChunkedArrayFromVector({{123}, {456}}, &expected); AssertChunkedEqual(*actual, *expected); diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index a597bd545bc..44bb7adab5f 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -62,6 +62,7 @@ void AssertConversion(const std::shared_ptr& type, for (int32_t col_index = 0; col_index < static_cast(expected.size()); ++col_index) { ASSERT_OK(converter->Convert(*parser, col_index, &array)); + ASSERT_OK(array->ValidateFull()); ArrayFromVector(type, expected[col_index], &expected_array); AssertArraysEqual(*expected_array, *array); } @@ -83,6 +84,7 @@ void AssertConversion(const std::shared_ptr& type, for (int32_t col_index = 0; col_index < static_cast(expected.size()); ++col_index) { ASSERT_OK(converter->Convert(*parser, col_index, &array)); + ASSERT_OK(array->ValidateFull()); ArrayFromVector(type, is_valid[col_index], expected[col_index], &expected_array); AssertArraysEqual(*expected_array, *array); @@ -122,6 +124,7 @@ void AssertDictConversion(const std::string& csv_string, ASSERT_OK_AND_ASSIGN( array, DictConversion(expected_dict->type(), csv_string, max_cardinality, options)); + ASSERT_OK(array->ValidateFull()); expected_type = dictionary(expected_indices->type(), expected_dict->type()); ASSERT_TRUE(array->type()->Equals(*expected_type)); const auto& dict_array = internal::checked_cast(*array); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 6365a093ece..eb2047a9781 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -394,9 +394,9 @@ TEST_F(TestExtensionType, ValidateExtensionArray) { auto ext_arr2 = ExampleParametric(p1_type, "[null, 1, 2, 3]"); auto ext_arr3 = ExampleStruct(); - ASSERT_OK(ext_arr1->Validate()); - ASSERT_OK(ext_arr2->Validate()); - ASSERT_OK(ext_arr3->Validate()); + ASSERT_OK(ext_arr1->ValidateFull()); + ASSERT_OK(ext_arr2->ValidateFull()); + ASSERT_OK(ext_arr3->ValidateFull()); } } // namespace arrow diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index 559880508f0..29d5e9059ac 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -173,6 +173,16 @@ static Status ValidateArrowVsJson(const std::string& arrow_path, for (int i = 0; i < json_nbatches; ++i) { RETURN_NOT_OK(json_reader->ReadRecordBatch(i, &json_batch)); RETURN_NOT_OK(arrow_reader->ReadRecordBatch(i, &arrow_batch)); + Status valid_st = json_batch->ValidateFull(); + if (!valid_st.ok()) { + return Status::Invalid("JSON record batch ", i, " did not validate:\n", + valid_st.ToString()); + } + valid_st = arrow_batch->ValidateFull(); + if (!valid_st.ok()) { + return Status::Invalid("Arrow record batch ", i, " did not validate:\n", + valid_st.ToString()); + } if (!json_batch->ApproxEquals(*arrow_batch)) { std::stringstream ss; diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 1f2c44e7224..ec75e6ad09f 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -264,7 +264,7 @@ class SchemaWriter { writer_->Key("typeIds"); writer_->StartArray(); for (size_t i = 0; i < type.type_codes().size(); ++i) { - writer_->Uint(type.type_codes()[i]); + writer_->Int(type.type_codes()[i]); } writer_->EndArray(); } @@ -901,11 +901,11 @@ static Status GetUnion(const RjObject& json_type, const auto& it_type_codes = json_type.FindMember("typeIds"); RETURN_NOT_ARRAY("typeIds", it_type_codes, json_type); - std::vector type_codes; + std::vector type_codes; const auto& id_array = it_type_codes->value.GetArray(); for (const rj::Value& val : id_array) { - DCHECK(val.IsUint()); - type_codes.push_back(static_cast(val.GetUint())); + DCHECK(val.IsInt()); + type_codes.push_back(static_cast(val.GetInt())); } *type = union_(children, type_codes, mode); diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 20ff75e6ffa..45b0b4c0449 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -94,7 +94,7 @@ void AssertJSONArray(const std::shared_ptr& type, const std::string& j std::shared_ptr actual, expected; ASSERT_OK(ArrayFromJSON(type, json, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(type, values, &expected); AssertArraysEqual(*expected, *actual); } @@ -106,7 +106,7 @@ void AssertJSONArray(const std::shared_ptr& type, const std::string& j std::shared_ptr actual, expected; ASSERT_OK(ArrayFromJSON(type, json, &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(type, is_valid, values, &expected); AssertArraysEqual(*expected, *actual); } @@ -260,7 +260,7 @@ TEST(TestFloat, Basics) { // Check NaN separately as AssertArraysEqual simply memcmp's array contents // and NaNs can have many bit representations. ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); float value = checked_cast(*actual).Value(0); ASSERT_TRUE(std::isnan(value)); } @@ -282,7 +282,7 @@ TEST(TestDouble, Basics) { {-0.0, INFINITY, -INFINITY, 0.0}); ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); double value = checked_cast(*actual).Value(0); ASSERT_TRUE(std::isnan(value)); } @@ -389,7 +389,7 @@ TEST(TestDecimal, Basics) { std::shared_ptr expected, actual; ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { Decimal128Builder builder(type); ASSERT_OK(builder.Finish(&expected)); @@ -397,7 +397,7 @@ TEST(TestDecimal, Basics) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", \"-78.9000\"]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { Decimal128Builder builder(type); ASSERT_OK(builder.Append(Decimal128(1234567))); @@ -407,7 +407,7 @@ TEST(TestDecimal, Basics) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { Decimal128Builder builder(type); ASSERT_OK(builder.Append(Decimal128(1234567))); @@ -434,21 +434,21 @@ TEST(TestList, IntegerList) { std::shared_ptr offsets, values, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0}, &offsets); ArrayFromVector({}, &values); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[4, 5], [], [6]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 2, 2, 3}, &offsets); ArrayFromVector({4, 5, 6}, &values); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 3}, &offsets); auto is_valid = std::vector{false, true, false}; ArrayFromVector(is_valid, {0, 6, 0}, &values); @@ -456,7 +456,7 @@ TEST(TestList, IntegerList) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -484,21 +484,21 @@ TEST(TestList, NullList) { std::shared_ptr offsets, values, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0}, &offsets); values = std::make_shared(0); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[], [null], [null, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 3}, &offsets); values = std::make_shared(3); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -517,7 +517,7 @@ TEST(TestList, IntegerListList) { std::shared_ptr offsets, values, nested, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 1, 3, 6}, &offsets); ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); @@ -527,7 +527,7 @@ TEST(TestList, IntegerListList) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); @@ -537,7 +537,7 @@ TEST(TestList, IntegerListList) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [null], [[null]]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -559,7 +559,7 @@ TEST(TestLargeList, Basics) { std::shared_ptr offsets, values, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 3}, &offsets); auto is_valid = std::vector{false, true, false}; ArrayFromVector(is_valid, {0, 6, 0}, &values); @@ -708,26 +708,26 @@ TEST(TestFixedSizeList, IntegerList) { std::shared_ptr values, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({}, &values); expected = std::make_shared(type, 0, values); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[4, 5], [0, 0], [6, 7]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({4, 5, 0, 0, 6, 7}, &values); expected = std::make_shared(type, 3, values); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[null, null], [0, null], [6, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); auto is_valid = std::vector{false, false, true, false, true, false}; ArrayFromVector(is_valid, {0, 0, 0, 0, 6, 0}, &values); expected = std::make_shared(type, 3, values); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [null, null], null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -759,19 +759,19 @@ TEST(TestFixedSizeList, NullList) { std::shared_ptr values, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); values = std::make_shared(0); expected = std::make_shared(type, 0, values); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[null, null], [null, null], [null, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); values = std::make_shared(6); expected = std::make_shared(type, 3, values); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [null, null], null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -794,14 +794,14 @@ TEST(TestFixedSizeList, IntegerListList) { std::shared_ptr values, nested, expected, actual; ASSERT_OK(ArrayFromJSON(type, "[[[1, 4]], [[2, 5]], [[3, 6]]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({1, 4, 2, 5, 3, 6}, &values); nested = std::make_shared(nested_type, 3, values); expected = std::make_shared(type, 3, nested); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[[1, null]], [null], null]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool, type, &builder)); @@ -836,7 +836,7 @@ TEST(TestStruct, SimpleStruct) { // Trivial ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({}, &a); ArrayFromVector({}, &b); children.assign({a, b}); @@ -850,11 +850,11 @@ TEST(TestStruct, SimpleStruct) { expected = std::make_shared(type, 2, children); ASSERT_OK(ArrayFromJSON(type, "[[5, true], [6, false]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); // With nulls @@ -868,12 +868,12 @@ TEST(TestStruct, SimpleStruct) { ASSERT_OK( ArrayFromJSON(type, "[null, [5, null], [null, false], [null, null]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); // When using object notation, null members can be omitted ASSERT_OK(ArrayFromJSON(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } @@ -890,7 +890,7 @@ TEST(TestStruct, NestedStruct) { std::vector> children(2); ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({}, &children[0]); ArrayFromVector({}, &children[1]); children[0] = std::make_shared(nested_type, 0, children); @@ -899,7 +899,7 @@ TEST(TestStruct, NestedStruct) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[[[5, true], 1.5], [[6, false], -3e2]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector({5, 6}, &children[0]); ArrayFromVector({true, false}, &children[1]); children[0] = std::make_shared(nested_type, 2, children); @@ -908,7 +908,7 @@ TEST(TestStruct, NestedStruct) { AssertArraysEqual(*expected, *actual); ASSERT_OK(ArrayFromJSON(type, "[null, [[5, null], null], [null, -3e2]]", &actual)); - ASSERT_OK(actual->Validate()); + ASSERT_OK(actual->ValidateFull()); is_valid = {false, true, false}; ArrayFromVector(is_valid, {0, 5, 0}, &children[0]); is_valid = {false, false, false}; diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 75c86f53230..7af8f12295a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -157,17 +157,18 @@ Status UnionFromFlatbuffer(const flatbuf::Union* union_data, (union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE : UnionMode::DENSE); - std::vector type_codes; + std::vector type_codes; const flatbuffers::Vector* fb_type_ids = union_data->typeIds(); if (fb_type_ids == nullptr) { - for (uint8_t i = 0; i < children.size(); ++i) { + // TODO validate that children.size() <= 127? + for (int8_t i = 0; i < static_cast(children.size()); ++i) { type_codes.push_back(i); } } else { for (int32_t id : (*fb_type_ids)) { - // TODO(wesm): can these values exceed 255? - type_codes.push_back(static_cast(id)); + // TODO(wesm): can these values exceed 127? + type_codes.push_back(static_cast(id)); } } diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index de464c42ddf..370e997bcac 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -319,7 +319,7 @@ class IpcTestFixture : public io::MemoryMapFixture { } void CheckReadResult(const RecordBatch& result, const RecordBatch& expected) { - ASSERT_OK(result.Validate()); + ASSERT_OK(result.ValidateFull()); EXPECT_EQ(expected.num_rows(), result.num_rows()); ASSERT_TRUE(expected.schema()->Equals(*result.schema())); @@ -870,7 +870,7 @@ class ReaderWriterMixin { RETURN_NOT_OK(writer_helper.Finish()); RETURN_NOT_OK(writer_helper.ReadBatches(out_batches)); for (const auto& batch : *out_batches) { - RETURN_NOT_OK(batch->Validate()); + RETURN_NOT_OK(batch->ValidateFull()); } return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 781863f4aee..e5635fbbd2f 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -434,7 +434,7 @@ Status MakeUnion(std::shared_ptr* out) { std::vector> union_types( {field("u0", int32()), field("u1", uint8())}); - std::vector type_codes = {5, 10}; + std::vector type_codes = {5, 10}; auto sparse_type = std::make_shared(union_types, type_codes, UnionMode::SPARSE); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 1753c4ef870..47dfbb1b958 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -404,8 +404,8 @@ class RecordBatchSerializer : public ArrayVisitor { pool_, &value_offsets)); // The Union type codes are not necessary 0-indexed - uint8_t max_code = 0; - for (uint8_t code : type.type_codes()) { + int8_t max_code = 0; + for (int8_t code : type.type_codes()) { if (code > max_code) { max_code = code; } @@ -422,7 +422,7 @@ class RecordBatchSerializer : public ArrayVisitor { // the value_offsets for each array const int32_t* unshifted_offsets = array.raw_value_offsets(); - const uint8_t* type_ids = array.raw_type_ids(); + const int8_t* type_ids = array.raw_type_ids(); // Allocate the shifted offsets std::shared_ptr shifted_offsets_buffer; @@ -444,7 +444,7 @@ class RecordBatchSerializer : public ArrayVisitor { // Now compute shifted offsets by subtracting child offset for (int64_t i = 0; i < length; ++i) { - const uint8_t code = type_ids[i]; + const int8_t code = type_ids[i]; shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code]; // Update the child length to account for observed value child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1); @@ -462,7 +462,7 @@ class RecordBatchSerializer : public ArrayVisitor { // truncate the children. For now, we are truncating the children to be // no longer than the parent union. if (offset != 0) { - const uint8_t code = type.type_codes()[i]; + const int8_t code = type.type_codes()[i]; const int64_t child_offset = child_offsets[code]; const int64_t child_length = child_lengths[code]; diff --git a/cpp/src/arrow/json/chunked_builder_test.cc b/cpp/src/arrow/json/chunked_builder_test.cc index aa412a52667..5c57b4963bc 100644 --- a/cpp/src/arrow/json/chunked_builder_test.cc +++ b/cpp/src/arrow/json/chunked_builder_test.cc @@ -58,6 +58,7 @@ void AssertBuilding(const std::unique_ptr& builder, ++i; } ASSERT_OK(builder->Finish(out)); + ASSERT_OK((*out)->ValidateFull()); } std::shared_ptr ExtractField(const std::string& name, diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 78f5774b481..25bb314acd1 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -86,34 +86,13 @@ class NullConverter : public PrimitiveConverter { } }; -Status PrimitiveFromNull(MemoryPool* pool, const std::shared_ptr& type, - const Array& null, std::shared_ptr* out) { - auto data = ArrayData::Make(type, null.length(), {nullptr, nullptr}, null.length()); - RETURN_NOT_OK(AllocateBitmap(pool, null.length(), &data->buffers[0])); - std::memset(data->buffers[0]->mutable_data(), 0, data->buffers[0]->size()); - *out = MakeArray(data); - return Status::OK(); -} - -Status BinaryFromNull(MemoryPool* pool, const std::shared_ptr& type, - const Array& null, std::shared_ptr* out) { - auto data = - ArrayData::Make(type, null.length(), {nullptr, nullptr, nullptr}, null.length()); - RETURN_NOT_OK(AllocateBitmap(pool, null.length(), &data->buffers[0])); - std::memset(data->buffers[0]->mutable_data(), 0, data->buffers[0]->size()); - RETURN_NOT_OK(AllocateBuffer(pool, sizeof(int32_t), &data->buffers[1])); - data->GetMutableValues(1)[0] = 0; - *out = MakeArray(data); - return Status::OK(); -} - class BooleanConverter : public PrimitiveConverter { public: using PrimitiveConverter::PrimitiveConverter; Status Convert(const std::shared_ptr& in, std::shared_ptr* out) override { if (in->type_id() == Type::NA) { - return PrimitiveFromNull(pool_, boolean(), *in, out); + return MakeArrayOfNull(pool_, boolean(), in->length(), out); } if (in->type_id() != Type::BOOL) { return GenericConversionError(*out_type_, " from ", *in->type()); @@ -133,7 +112,7 @@ class NumericConverter : public PrimitiveConverter { Status Convert(const std::shared_ptr& in, std::shared_ptr* out) override { if (in->type_id() == Type::NA) { - return PrimitiveFromNull(pool_, out_type_, *in, out); + return MakeArrayOfNull(pool_, out_type_, in->length(), out); } const auto& dict_array = GetDictionaryArray(in); @@ -171,7 +150,7 @@ class DateTimeConverter : public PrimitiveConverter { Status Convert(const std::shared_ptr& in, std::shared_ptr* out) override { if (in->type_id() == Type::NA) { - return PrimitiveFromNull(pool_, out_type_, *in, out); + return MakeArrayOfNull(pool_, out_type_, in->length(), out); } std::shared_ptr repr; @@ -199,7 +178,7 @@ class BinaryConverter : public PrimitiveConverter { Status Convert(const std::shared_ptr& in, std::shared_ptr* out) override { if (in->type_id() == Type::NA) { - return BinaryFromNull(pool_, out_type_, *in, out); + return MakeArrayOfNull(pool_, out_type_, in->length(), out); } const auto& dict_array = GetDictionaryArray(in); diff --git a/cpp/src/arrow/json/converter_test.cc b/cpp/src/arrow/json/converter_test.cc index 223cd56275f..6d787db0bbd 100644 --- a/cpp/src/arrow/json/converter_test.cc +++ b/cpp/src/arrow/json/converter_test.cc @@ -54,6 +54,7 @@ void AssertConvert(const std::shared_ptr& expected_type, std::shared_ptr converter; ASSERT_OK(MakeConverter(expected_type, default_memory_pool(), &converter)); ASSERT_OK(converter->Convert(unconverted, &converted)); + ASSERT_OK(converted->ValidateFull()); // assert equality auto expected = ArrayFromJSON(expected_type, expected_json); diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index f748d2f5806..5e10aed988e 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -239,7 +239,7 @@ Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_ const auto& data = checked_cast(array); OwnedRef result(create_sequence(stop_idx - start_idx)); RETURN_IF_PYERROR(); - const uint8_t* type_ids = data.raw_type_ids(); + const int8_t* type_ids = data.raw_type_ids(); const int32_t* value_offsets = data.raw_value_offsets(); std::vector python_types; RETURN_NOT_OK(GetPythonTypes(data, &python_types)); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 5aa02c3c85e..4883c458823 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -285,10 +285,9 @@ class SequenceBuilder { class DictBuilder { public: explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) { - builder_.reset( - new StructBuilder(struct_({field("keys", union_({}, UnionMode::DENSE)), - field("vals", union_({}, UnionMode::DENSE))}), - pool, {keys_.builder(), vals_.builder()})); + builder_.reset(new StructBuilder(struct_({field("keys", union_(UnionMode::DENSE)), + field("vals", union_(UnionMode::DENSE))}), + pool, {keys_.builder(), vals_.builder()})); } // Builder for the keys of the dictionary diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index fadb2719ddd..80e775e2234 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -25,6 +25,7 @@ #include #include "arrow/array.h" +#include "arrow/array/validate.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -259,7 +260,16 @@ Status RecordBatch::Validate() const { " type not match schema: ", array.type()->ToString(), " vs ", schema_type.ToString()); } - RETURN_NOT_OK(array.Validate()); + RETURN_NOT_OK(internal::ValidateArray(array)); + } + return Status::OK(); +} + +Status RecordBatch::ValidateFull() const { + RETURN_NOT_OK(Validate()); + for (int i = 0; i < num_columns(); ++i) { + const auto& array = *this->column(i); + RETURN_NOT_OK(internal::ValidateArrayData(array)); } return Status::OK(); } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 90350bed6fc..92943640e17 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -152,10 +152,22 @@ class ARROW_EXPORT RecordBatch { /// \return new record batch virtual std::shared_ptr Slice(int64_t offset, int64_t length) const = 0; - /// \brief Check for schema or length inconsistencies + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the record batch's schema and internal data. + /// + /// This is O(k) where k is the total number of fields and array descendents. + /// /// \return Status virtual Status Validate() const; + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the record batch's schema and internal data. + /// + /// This is potentially O(k*n) where n is the number of rows. + /// + /// \return Status + virtual Status ValidateFull() const; + protected: RecordBatch(const std::shared_ptr& schema, int64_t num_rows); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 739391deb32..2f93b327d43 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/array/concatenate.h" +#include "arrow/array/validate.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" @@ -189,11 +190,6 @@ Status ChunkedArray::Validate() const { return Status::OK(); } - for (auto chunk : chunks_) { - // Validate the chunks themselves - RETURN_NOT_OK(chunk->Validate()); - } - const auto& type = *chunks_[0]->type(); // Make sure chunks all have the same type for (size_t i = 1; i < chunks_.size(); ++i) { @@ -203,6 +199,26 @@ Status ChunkedArray::Validate() const { " but saw ", chunk.type()->ToString()); } } + // Validate the chunks themselves + for (size_t i = 0; i < chunks_.size(); ++i) { + const Array& chunk = *chunks_[i]; + const Status st = internal::ValidateArray(chunk); + if (!st.ok()) { + return Status::Invalid("In chunk ", i, ": ", st.ToString()); + } + } + return Status::OK(); +} + +Status ChunkedArray::ValidateFull() const { + RETURN_NOT_OK(Validate()); + for (size_t i = 0; i < chunks_.size(); ++i) { + const Array& chunk = *chunks_[i]; + const Status st = internal::ValidateArrayData(chunk); + if (!st.ok()) { + return Status::Invalid("In chunk ", i, ": ", st.ToString()); + } + } return Status::OK(); } @@ -340,6 +356,35 @@ class SimpleTable : public Table { } Status Validate() const override { + RETURN_NOT_OK(ValidateMeta()); + for (int i = 0; i < num_columns(); ++i) { + const ChunkedArray* col = columns_[i].get(); + Status st = col->Validate(); + if (!st.ok()) { + std::stringstream ss; + ss << "Column " << i << ": " << st.message(); + return st.WithMessage(ss.str()); + } + } + return Status::OK(); + } + + Status ValidateFull() const override { + RETURN_NOT_OK(ValidateMeta()); + for (int i = 0; i < num_columns(); ++i) { + const ChunkedArray* col = columns_[i].get(); + Status st = col->ValidateFull(); + if (!st.ok()) { + std::stringstream ss; + ss << "Column " << i << ": " << st.message(); + return st.WithMessage(ss.str()); + } + } + return Status::OK(); + } + + protected: + Status ValidateMeta() const { // Make sure columns and schema are consistent if (static_cast(columns_.size()) != schema_->num_fields()) { return Status::Invalid("Number of columns did not match schema"); diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 8afd707ac4e..92a12edb802 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -106,9 +106,24 @@ class ARROW_EXPORT ChunkedArray { /// \brief Determine if two chunked arrays are equal. bool Equals(const std::shared_ptr& other) const; - /// \brief Check that all chunks have the same data type + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the chunk array's internal data. + /// + /// This is O(k*m) where k is the number of array descendents, + /// and m is the number of chunks. + /// + /// \return Status Status Validate() const; + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the chunk array's internal data. + /// + /// This is O(k*n) where k is the number of array descendents, + /// and n is the length in elements. + /// + /// \return Status + Status ValidateFull() const; + protected: ArrayVector chunks_; int64_t length_; @@ -239,9 +254,24 @@ class ARROW_EXPORT Table { /// \param[out] out The returned table virtual Status Flatten(MemoryPool* pool, std::shared_ptr
* out) const = 0; - /// \brief Perform any checks to validate the input arguments + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the table's schema and internal data. + /// + /// This is O(k*m) where k is the total number of field descendents, + /// and m is the number of chunks. + /// + /// \return Status virtual Status Validate() const = 0; + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the table's schema and internal data. + /// + /// This is O(k*n) where k is the total number of field descendents, + /// and n is the number of rows. + /// + /// \return Status + virtual Status ValidateFull() const = 0; + /// \brief Return the number of columns in the table int num_columns() const { return schema_->num_fields(); } diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 86163bcc84e..ac1e41218c3 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -142,20 +142,20 @@ TEST_F(TestChunkedArray, Validate) { // Valid if empty ArrayVector empty = {}; auto no_chunks = std::make_shared(empty, utf8()); - ASSERT_OK(no_chunks->Validate()); + ASSERT_OK(no_chunks->ValidateFull()); random::RandomArrayGenerator gen(0); arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); Construct(); - ASSERT_OK(one_->Validate()); + ASSERT_OK(one_->ValidateFull()); arrays_one_.push_back(gen.Int32(50, 0, 100, 0.1)); Construct(); - ASSERT_OK(one_->Validate()); + ASSERT_OK(one_->ValidateFull()); arrays_one_.push_back(gen.String(50, 0, 10, 0.1)); Construct(); - ASSERT_RAISES(Invalid, one_->Validate()); + ASSERT_RAISES(Invalid, one_->ValidateFull()); } TEST_F(TestChunkedArray, View) { @@ -212,7 +212,7 @@ class TestTable : public TestBase { TEST_F(TestTable, EmptySchema) { auto empty_schema = ::arrow::schema({}); table_ = Table::Make(empty_schema, columns_); - ASSERT_OK(table_->Validate()); + ASSERT_OK(table_->ValidateFull()); ASSERT_EQ(0, table_->num_rows()); ASSERT_EQ(0, table_->num_columns()); } @@ -222,7 +222,7 @@ TEST_F(TestTable, Ctors) { MakeExample1(length); table_ = Table::Make(schema_, columns_); - ASSERT_OK(table_->Validate()); + ASSERT_OK(table_->ValidateFull()); ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); @@ -230,11 +230,11 @@ TEST_F(TestTable, Ctors) { ASSERT_TRUE(table_->Equals(*array_ctor)); table_ = Table::Make(schema_, columns_, length); - ASSERT_OK(table_->Validate()); + ASSERT_OK(table_->ValidateFull()); ASSERT_EQ(length, table_->num_rows()); table_ = Table::Make(schema_, arrays_); - ASSERT_OK(table_->Validate()); + ASSERT_OK(table_->ValidateFull()); ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); } @@ -257,20 +257,20 @@ TEST_F(TestTable, InvalidColumns) { MakeExample1(length); table_ = Table::Make(schema_, columns_, length - 1); - ASSERT_RAISES(Invalid, table_->Validate()); + ASSERT_RAISES(Invalid, table_->ValidateFull()); columns_.clear(); // Wrong number of columns table_ = Table::Make(schema_, columns_, length); - ASSERT_RAISES(Invalid, table_->Validate()); + ASSERT_RAISES(Invalid, table_->ValidateFull()); columns_ = {std::make_shared(MakeRandomArray(length)), std::make_shared(MakeRandomArray(length)), std::make_shared(MakeRandomArray(length - 1))}; table_ = Table::Make(schema_, columns_, length); - ASSERT_RAISES(Invalid, table_->Validate()); + ASSERT_RAISES(Invalid, table_->ValidateFull()); } TEST_F(TestTable, Equals) { @@ -655,7 +655,7 @@ TEST_F(TestTable, RenameColumns) { std::shared_ptr
renamed; ASSERT_OK(table->RenameColumns({"zero", "one", "two"}, &renamed)); EXPECT_THAT(renamed->ColumnNames(), testing::ElementsAre("zero", "one", "two")); - ASSERT_OK(renamed->Validate()); + ASSERT_OK(renamed->ValidateFull()); ASSERT_RAISES(Invalid, table->RenameColumns({"hello", "world"}, &renamed)); } @@ -778,15 +778,15 @@ TEST_F(TestRecordBatch, Validate) { auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - ASSERT_OK(b1->Validate()); + ASSERT_OK(b1->ValidateFull()); // Length mismatch auto b2 = RecordBatch::Make(schema, length, {a0, a1, a3}); - ASSERT_RAISES(Invalid, b2->Validate()); + ASSERT_RAISES(Invalid, b2->ValidateFull()); // Type mismatch auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); - ASSERT_RAISES(Invalid, b3->Validate()); + ASSERT_RAISES(Invalid, b3->ValidateFull()); } TEST_F(TestRecordBatch, Slice) { @@ -997,19 +997,19 @@ TEST_F(TestTableBatchReader, Chunksize) { std::shared_ptr batch; ASSERT_OK(i1.ReadNext(&batch)); - ASSERT_OK(batch->Validate()); + ASSERT_OK(batch->ValidateFull()); ASSERT_EQ(10, batch->num_rows()); ASSERT_OK(i1.ReadNext(&batch)); - ASSERT_OK(batch->Validate()); + ASSERT_OK(batch->ValidateFull()); ASSERT_EQ(15, batch->num_rows()); ASSERT_OK(i1.ReadNext(&batch)); - ASSERT_OK(batch->Validate()); + ASSERT_OK(batch->ValidateFull()); ASSERT_EQ(5, batch->num_rows()); ASSERT_OK(i1.ReadNext(&batch)); - ASSERT_OK(batch->Validate()); + ASSERT_OK(batch->ValidateFull()); ASSERT_EQ(10, batch->num_rows()); ASSERT_OK(i1.ReadNext(&batch)); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 9bf4176bde5..e2d0e972812 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -288,13 +288,25 @@ std::string DurationType::ToString() const { // ---------------------------------------------------------------------- // Union type +constexpr int8_t UnionType::kMaxTypeId; +constexpr int UnionType::kInvalidChildId; + UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode::type mode) - : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { + const std::vector& type_codes, UnionMode::type mode) + : NestedType(Type::UNION), + mode_(mode), + type_codes_(type_codes), + child_ids_(kMaxTypeId + 1, kInvalidChildId) { DCHECK_LE(fields.size(), type_codes.size()) << "union field with unknown type id"; DCHECK_GE(fields.size(), type_codes.size()) << "type id provided without corresponding union field"; children_ = fields; + for (int child_id = 0; child_id < static_cast(type_codes_.size()); ++child_id) { + const auto type_code = type_codes_[child_id]; + DCHECK_GE(type_code, 0); + DCHECK_LE(type_code, kMaxTypeId); + child_ids_[type_code] = child_id; + } } DataTypeLayout UnionType::layout() const { @@ -1039,7 +1051,7 @@ std::string UnionType::ComputeFingerprint() const { } for (const auto code : type_codes_) { // Represent code as integer, not raw character - ss << ':' << static_cast(code); + ss << ':' << static_cast(code); } ss << "]{"; for (const auto& child : children_) { @@ -1180,18 +1192,32 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, + const std::vector& type_codes, + UnionMode::type mode) { + return std::make_shared(child_fields, type_codes, mode); +} + +std::shared_ptr union_(const std::vector>& child_fields, UnionMode::type mode) { + std::vector type_codes(child_fields.size()); + for (int i = 0; i < static_cast(child_fields.size()); ++i) { + type_codes[i] = static_cast(i); + } return std::make_shared(child_fields, type_codes, mode); } +std::shared_ptr union_(UnionMode::type mode) { + std::vector> child_fields; + return union_(child_fields, mode); +} + std::shared_ptr union_(const std::vector>& children, const std::vector& field_names, - const std::vector& given_type_codes, + const std::vector& given_type_codes, UnionMode::type mode) { std::vector> fields; - std::vector type_codes(given_type_codes); - uint8_t counter = 0; + std::vector type_codes(given_type_codes); + int8_t counter = 0; for (const auto& child : children) { if (field_names.size() == 0) { fields.push_back(field(std::to_string(counter), child->type())); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 41ac309c419..bd29154d9fa 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -928,11 +928,13 @@ struct UnionMode { class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; + static constexpr int8_t kMaxTypeId = 127; + static constexpr int kInvalidChildId = -1; static constexpr const char* type_name() { return "union"; } UnionType(const std::vector>& fields, - const std::vector& type_codes, + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); DataTypeLayout layout() const override; @@ -940,7 +942,14 @@ class ARROW_EXPORT UnionType : public NestedType { std::string ToString() const override; std::string name() const override { return "union"; } - const std::vector& type_codes() const { return type_codes_; } + /// The array of logical type ids. + /// + /// For example, the first type in the union might be denoted by the id 5 + /// (instead of 0). + const std::vector& type_codes() const { return type_codes_; } + + /// An array mapping logical type ids to physical child ids. + const std::vector& child_ids() const { return child_ids_; } uint8_t max_type_code() const; @@ -951,10 +960,8 @@ class ARROW_EXPORT UnionType : public NestedType { UnionMode::type mode_; - // The type id used in the data to indicate each data type in the union. For - // example, the first type in the union might be denoted by the id 5 (instead - // of 0). - std::vector type_codes_; + std::vector type_codes_; + std::vector child_ids_; }; // ---------------------------------------------------------------------- @@ -1466,13 +1473,21 @@ struct_(const std::vector>& fields); /// \brief Create a UnionType instance std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create a UnionType instance +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& child_fields, + UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create a UnionType instance +std::shared_ptr ARROW_EXPORT union_(UnionMode::type mode = UnionMode::SPARSE); /// \brief Create a UnionType instance std::shared_ptr ARROW_EXPORT union_(const std::vector>& children, - const std::vector& field_names, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + const std::vector& field_names, const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create a UnionType instance inline std::shared_ptr ARROW_EXPORT diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 94107ab7e60..4f70e43b9c2 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -35,6 +35,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; template void AssertFingerprintablesEqual(const T& left, const T& right, bool check_metadata, @@ -883,7 +884,7 @@ TEST(TestNestedType, Equals) { std::string union_name) -> std::shared_ptr { auto f_type = field(inner_name, int32()); std::vector> fields = {f_type}; - std::vector codes = {Type::INT32}; + std::vector codes = {42}; auto u_type = std::make_shared(fields, codes, UnionMode::SPARSE); return field(union_name, u_type); }; @@ -995,6 +996,52 @@ TEST(TestStructType, GetFieldDuplicates) { ASSERT_EQ(results.size(), 0); } +TEST(TestUnionType, Basics) { + auto f0_type = int32(); + auto f0 = field("f0", f0_type); + auto f1_type = utf8(); + auto f1 = field("f1", f1_type); + auto f2_type = uint8(); + auto f2 = field("f2", f2_type); + + std::vector> fields = {f0, f1, f2}; + std::vector type_codes1 = {0, 1, 2}; + std::vector type_codes2 = {10, 11, 12}; + std::vector child_ids1(128, -1); + std::vector child_ids2(128, -1); + child_ids1[0] = 0; + child_ids1[1] = 1; + child_ids1[2] = 2; + child_ids2[10] = 0; + child_ids2[11] = 1; + child_ids2[12] = 2; + + auto ty1 = checked_pointer_cast(union_(fields, UnionMode::DENSE)); + auto ty2 = + checked_pointer_cast(union_(fields, type_codes1, UnionMode::DENSE)); + auto ty3 = + checked_pointer_cast(union_(fields, type_codes2, UnionMode::DENSE)); + auto ty4 = checked_pointer_cast(union_(fields, UnionMode::SPARSE)); + auto ty5 = + checked_pointer_cast(union_(fields, type_codes1, UnionMode::SPARSE)); + auto ty6 = + checked_pointer_cast(union_(fields, type_codes2, UnionMode::SPARSE)); + + ASSERT_EQ(ty1->type_codes(), type_codes1); + ASSERT_EQ(ty2->type_codes(), type_codes1); + ASSERT_EQ(ty3->type_codes(), type_codes2); + ASSERT_EQ(ty4->type_codes(), type_codes1); + ASSERT_EQ(ty5->type_codes(), type_codes1); + ASSERT_EQ(ty6->type_codes(), type_codes2); + + ASSERT_EQ(ty1->child_ids(), child_ids1); + ASSERT_EQ(ty2->child_ids(), child_ids1); + ASSERT_EQ(ty3->child_ids(), child_ids2); + ASSERT_EQ(ty4->child_ids(), child_ids1); + ASSERT_EQ(ty5->child_ids(), child_ids1); + ASSERT_EQ(ty6->child_ids(), child_ids2); +} + TEST(TestDictionaryType, Basics) { auto value_type = int32(); diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 81b3da183c4..86768494985 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -2295,7 +2295,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam { } } ASSERT_EQ(local_null_count, expected_nulls); - ASSERT_OK(array.Validate()); + ASSERT_OK(array.ValidateFull()); } void ValidateColumnArray(const ::arrow::Int32Array& array, size_t expected_nulls) { @@ -2696,7 +2696,7 @@ TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { std::unique_ptr arrow_reader; ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(reader), &arrow_reader)); ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); - ASSERT_OK(table->Validate()); + ASSERT_OK(table->ValidateFull()); } TEST(TestArrowReaderAdHoc, HandleDictPageOffsetZero) { diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 99edc0f8129..f483d7b10ed 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1069,18 +1069,28 @@ cdef class Array(_PandasConvertible): """ return [x.as_py() for x in self] - def validate(self): + def validate(self, *, full=False): """ - Perform any validation checks implemented by - arrow::Array::Validate(). Raises exception with error message if - array does not validate. + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full: bool, default False + If True, run expensive checks, otherwise cheap checks only. Raises ------ ArrowInvalid """ - with nogil: - check_status(self.ap.Validate()) + if full: + with nogil: + check_status(self.ap.ValidateFull()) + else: + with nogil: + check_status(self.ap.Validate()) @property def offset(self): @@ -1407,7 +1417,7 @@ cdef class UnionArray(Array): cdef vector[shared_ptr[CArray]] c cdef Array child cdef vector[c_string] c_field_names - cdef vector[uint8_t] c_type_codes + cdef vector[int8_t] c_type_codes for child in children: c.push_back(child.sp_array) if field_names is not None: @@ -1446,7 +1456,7 @@ cdef class UnionArray(Array): cdef vector[shared_ptr[CArray]] c cdef Array child cdef vector[c_string] c_field_names - cdef vector[uint8_t] c_type_codes + cdef vector[int8_t] c_type_codes for child in children: c.push_back(child.sp_array) if field_names is not None: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 009ab60bb4e..0064e37b2ae 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -166,6 +166,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] Slice(int64_t offset, int64_t length) CStatus Validate() const + CStatus ValidateFull() const CStatus View(const shared_ptr[CDataType]& type, shared_ptr[CArray]* out) @@ -333,9 +334,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CUnionType" arrow::UnionType"(CDataType): CUnionType(const vector[shared_ptr[CField]]& fields, - const vector[uint8_t]& type_codes, UnionMode mode) + const vector[int8_t]& type_codes, UnionMode mode) UnionMode mode() - const vector[uint8_t]& type_codes() + const vector[int8_t]& type_codes() + const vector[int]& child_ids() cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) @@ -475,17 +477,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus MakeSparse(const CArray& type_ids, const vector[shared_ptr[CArray]]& children, const vector[c_string]& field_names, - const vector[uint8_t]& type_codes, + const vector[int8_t]& type_codes, shared_ptr[CArray]* out) @staticmethod CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, const vector[shared_ptr[CArray]]& children, const vector[c_string]& field_names, - const vector[uint8_t]& type_codes, + const vector[int8_t]& type_codes, shared_ptr[CArray]* out) - uint8_t* raw_type_ids() + int8_t* raw_type_ids() int32_t value_offset(int i) + int child_id(int64_t index) shared_ptr[CArray] child(int pos) const CArray* UnsafeChild(int pos) UnionMode mode() @@ -569,6 +572,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: vector[shared_ptr[CChunkedArray]]* out) CStatus Validate() const + CStatus ValidateFull() const cdef cppclass CRecordBatch" arrow::RecordBatch": @staticmethod @@ -591,7 +595,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int num_columns() int64_t num_rows() - CStatus Validate() + CStatus Validate() const + CStatus ValidateFull() const shared_ptr[CRecordBatch] ReplaceSchemaMetadata( const shared_ptr[CKeyValueMetadata]& metadata) @@ -643,7 +648,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus CombineChunks(CMemoryPool* pool, shared_ptr[CTable]* out) - CStatus Validate() + CStatus Validate() const + CStatus ValidateFull() const shared_ptr[CTable] ReplaceSchemaMetadata( const shared_ptr[CKeyValueMetadata]& metadata) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index f79339d5b55..f8c42ac667f 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -741,12 +741,12 @@ cdef class UnionValue(ArrayValue): self.ap = sp_array.get() cdef getitem(self, int64_t i): - cdef int8_t type_id = self.ap.raw_type_ids()[i] - cdef shared_ptr[CArray] child = self.ap.child(type_id) + cdef int child_id = self.ap.child_id(i) + cdef shared_ptr[CArray] child = self.ap.child(child_id) if self.ap.mode() == _UnionMode_SPARSE: - return box_scalar(self.type[type_id].type, child, i) + return box_scalar(self.type[child_id].type, child, i) else: - return box_scalar(self.type[type_id].type, child, + return box_scalar(self.type[child_id].type, child, self.ap.value_offset(i)) def as_py(self): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index cb2a17c7fb1..84a8dcfa858 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -81,12 +81,28 @@ cdef class ChunkedArray(_PandasConvertible): def __str__(self): return self.format() - def validate(self): + def validate(self, *, full=False): """ - Validate chunked array consistency. + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full: bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid """ - with nogil: - check_status(self.sp_chunked_array.get().Validate()) + if full: + with nogil: + check_status(self.sp_chunked_array.get().ValidateFull()) + else: + with nogil: + check_status(self.sp_chunked_array.get().Validate()) @property def null_count(self): @@ -517,12 +533,28 @@ cdef class RecordBatch(_PandasConvertible): def __len__(self): return self.batch.num_rows() - def validate(self): + def validate(self, *, full=False): """ - Validate RecordBatch consistency. + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full: bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid """ - with nogil: - check_status(self.batch.Validate()) + if full: + with nogil: + check_status(self.batch.ValidateFull()) + else: + with nogil: + check_status(self.batch.Validate()) def replace_schema_metadata(self, metadata=None): """ @@ -881,12 +913,28 @@ cdef class Table(_PandasConvertible): self.sp_table = table self.table = table.get() - def validate(self): + def validate(self, *, full=False): """ - Validate table consistency. + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full: bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid """ - with nogil: - check_status(self.table.Validate()) + if full: + with nogil: + check_status(self.table.ValidateFull()) + else: + with nogil: + check_status(self.table.Validate()) def __reduce__(self): # Reduce the columns as ChunkedArrays to avoid serializing schema diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d79fe2cb5da..7cb81847e16 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -578,7 +578,7 @@ def test_dictionary_indices(): indices = pa.array([0, 1, 2, 0, 1, 2]) dictionary = pa.array(['foo', 'bar', 'baz']) arr = pa.DictionaryArray.from_arrays(indices, dictionary) - arr.indices.validate() + arr.indices.validate(full=True) @pytest.mark.parametrize(('list_array_type', 'list_type_factory'), @@ -619,19 +619,29 @@ def test_list_from_arrays(list_array_type, list_type_factory): with pytest.raises(ValueError): list_array_type.from_arrays(offsets, values) + # Non-monotonic offsets + offsets = [0, 3, 2, 6] + values = list(range(6)) + result = list_array_type.from_arrays(offsets, values) + with pytest.raises(ValueError): + result.validate(full=True) + def test_union_from_dense(): binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') int64 = pa.array([1, 2, 3], type='int64') types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') - value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') + value_offsets = pa.array([1, 0, 0, 2, 1, 2, 3], type='int32') + py_value = [b'b', 1, b'a', b'c', 2, 3, b'd'] def check_result(result, expected_field_names, expected_type_codes): - assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd'] + result.validate(full=True) actual_field_names = [result.type[i].name for i in range(result.type.num_children)] assert actual_field_names == expected_field_names assert result.type.type_codes == expected_type_codes + assert result.to_pylist() == py_value # without field names and type codes check_result(pa.UnionArray.from_dense(types, value_offsets, @@ -647,28 +657,47 @@ def check_result(result, expected_field_names, expected_type_codes): expected_type_codes=[0, 1]) # with type codes - check_result(pa.UnionArray.from_dense(types, value_offsets, + check_result(pa.UnionArray.from_dense(logical_types, value_offsets, [binary, int64], type_codes=[11, 13]), expected_field_names=['0', '1'], expected_type_codes=[11, 13]) # with field names and type codes - check_result(pa.UnionArray.from_dense(types, value_offsets, + check_result(pa.UnionArray.from_dense(logical_types, value_offsets, [binary, int64], ['bin', 'int'], [11, 13]), expected_field_names=['bin', 'int'], expected_type_codes=[11, 13]) + # Bad type ids + arr = pa.UnionArray.from_dense(logical_types, value_offsets, + [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64], + type_codes=[11, 13]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + + # Offset larger than child size + bad_offsets = pa.array([0, 0, 1, 2, 1, 2, 4], type='int32') + arr = pa.UnionArray.from_dense(types, bad_offsets, [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + def test_union_from_sparse(): binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], type='binary') int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') + py_value = [b'a', 1, b'b', b'c', 2, 3, b'd'] def check_result(result, expected_field_names, expected_type_codes): - assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd'] + result.validate(full=True) + assert result.to_pylist() == py_value actual_field_names = [result.type[i].name for i in range(result.type.num_children)] assert actual_field_names == expected_field_names @@ -686,18 +715,31 @@ def check_result(result, expected_field_names, expected_type_codes): expected_type_codes=[0, 1]) # with type codes - check_result(pa.UnionArray.from_sparse(types, [binary, int64], + check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], type_codes=[11, 13]), expected_field_names=['0', '1'], expected_type_codes=[11, 13]) # with field names and type codes - check_result(pa.UnionArray.from_sparse(types, [binary, int64], + check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], ['bin', 'int'], [11, 13]), expected_field_names=['bin', 'int'], expected_type_codes=[11, 13]) + # Bad type ids + arr = pa.UnionArray.from_sparse(logical_types, [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + arr = pa.UnionArray.from_sparse(types, [binary, int64], + type_codes=[11, 13]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + + # Invalid child length + with pytest.raises(pa.ArrowInvalid): + arr = pa.UnionArray.from_sparse(logical_types, [binary, int64[1:]]) + def test_union_array_slice(): # ARROW-2314 @@ -733,7 +775,7 @@ def _check_cast_case(case, safe=True): else: in_arr = pa.array(in_data, type=in_type) casted = in_arr.cast(out_type, safe=safe) - casted.validate() + casted.validate(full=True) assert casted.equals(expected) # constructing an array with out type which optionally involves casting @@ -909,10 +951,10 @@ def test_cast_string_to_number_roundtrip(): ] for in_arr, expected in cases: casted = in_arr.cast(expected.type, safe=True) - casted.validate() + casted.validate(full=True) assert casted.equals(expected) casted_back = casted.cast(in_arr.type, safe=True) - casted_back.validate() + casted_back.validate(full=True) assert casted_back.equals(in_arr) diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 0fda62a9d21..6e792199e1b 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -740,7 +740,7 @@ def read_csv(self, *args, **kwargs): read_options = kwargs.setdefault('read_options', ReadOptions()) read_options.use_threads = False table = read_csv(*args, **kwargs) - table.validate() + table.validate(full=True) return table @@ -750,7 +750,7 @@ def read_csv(self, *args, **kwargs): read_options = kwargs.setdefault('read_options', ReadOptions()) read_options.use_threads = True table = read_csv(*args, **kwargs) - table.validate() + table.validate(full=True) return table @@ -773,7 +773,7 @@ def test_random_csv(self): csv_path = os.path.join(self.tmpdir, self.csv_filename) self.write_file(csv_path, csv) table = self.read_csv(csv_path) - table.validate() + table.validate(full=True) assert table.schema == expected.schema assert table.equals(expected) assert table.to_pydict() == expected.to_pydict() diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 796b7bf5d6c..fe9131ccc54 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -217,7 +217,7 @@ def read_json(self, *args, **kwargs): read_options = kwargs.setdefault('read_options', ReadOptions()) read_options.use_threads = False table = read_json(*args, **kwargs) - table.validate() + table.validate(full=True) return table @@ -227,5 +227,5 @@ def read_json(self, *args, **kwargs): read_options = kwargs.setdefault('read_options', ReadOptions()) read_options.use_threads = True table = read_json(*args, **kwargs) - table.validate() + table.validate(full=True) return table diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e13e594482e..3b59d64e63e 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -62,6 +62,9 @@ def get_many_types(): pa.field('c', pa.string())]), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), + pa.union([pa.field('a', pa.binary(10)), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE, + type_codes=[4, 8]), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), pa.union([pa.field('a', pa.binary(10), nullable=False), @@ -346,16 +349,34 @@ def check_fields(ty, fields): fields = [pa.field('x', pa.list_(pa.int32())), pa.field('y', pa.binary())] + type_codes = [5, 9] + for mode in ('sparse', pa.lib.UnionMode_SPARSE): ty = pa.union(fields, mode=mode) assert ty.mode == 'sparse' check_fields(ty, fields) assert ty.type_codes == [0, 1] + ty = pa.union(fields, mode=mode, type_codes=type_codes) + assert ty.mode == 'sparse' + check_fields(ty, fields) + assert ty.type_codes == type_codes + # Invalid number of type codes + with pytest.raises(ValueError): + pa.union(fields, mode=mode, type_codes=type_codes[1:]) + for mode in ('dense', pa.lib.UnionMode_DENSE): ty = pa.union(fields, mode=mode) assert ty.mode == 'dense' check_fields(ty, fields) assert ty.type_codes == [0, 1] + ty = pa.union(fields, mode=mode, type_codes=type_codes) + assert ty.mode == 'dense' + check_fields(ty, fields) + assert ty.type_codes == type_codes + # Invalid number of type codes + with pytest.raises(ValueError): + pa.union(fields, mode=mode, type_codes=type_codes[1:]) + for mode in ('unknown', 2): with pytest.raises(ValueError, match='Invalid union mode'): pa.union(fields, mode=mode) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 04d731fce61..a4741293272 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -389,7 +389,7 @@ cdef class UnionType(DataType): return self.child(i) def __reduce__(self): - return union, (list(self), self.mode) + return union, (list(self), self.mode, self.type_codes) cdef class TimestampType(DataType): @@ -1926,7 +1926,7 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) -def union(children_fields, mode): +def union(children_fields, mode, type_codes=None): """ Create UnionType from children fields. @@ -1935,6 +1935,7 @@ def union(children_fields, mode): fields : sequence of Field values mode : str 'dense' or 'sparse' + type_codes : list of integers, default None Returns ------- @@ -1943,7 +1944,7 @@ def union(children_fields, mode): cdef: Field child_field vector[shared_ptr[CField]] c_fields - vector[uint8_t] type_codes + vector[int8_t] c_type_codes shared_ptr[CDataType] union_type int i @@ -1958,15 +1959,23 @@ def union(children_fields, mode): else: raise ValueError("Invalid union mode {0!r}".format(mode)) - for i, child_field in enumerate(children_fields): - type_codes.push_back(i) + for child_field in children_fields: c_fields.push_back(child_field.sp_field) + if type_codes is not None: + if len(type_codes) != c_fields.size(): + raise ValueError("type_codes should have the same length " + "as fields") + for code in type_codes: + c_type_codes.push_back(code) + else: + c_type_codes = range(c_fields.size()) + if mode == UnionMode_SPARSE: - union_type.reset(new CUnionType(c_fields, type_codes, + union_type.reset(new CUnionType(c_fields, c_type_codes, _UnionMode_SPARSE)) else: - union_type.reset(new CUnionType(c_fields, type_codes, + union_type.reset(new CUnionType(c_fields, c_type_codes, _UnionMode_DENSE)) return pyarrow_wrap_data_type(union_type)