From bfa5aeceb6e306a6a8a25c428f723e2efe5614b1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 31 Mar 2021 17:21:44 +0200 Subject: [PATCH] PARQUET-1990: [C++] Refuse to write ConvertedType::NA ConvertedType::NA corresponds to an invalid converted type that was once added to the Parquet spec: https://github.com/apache/parquet-format/pull/45 but then quickly removed in favour of the Null logical type: https://github.com/apache/parquet-format/pull/51 Unfortunately, Parquet C++ could still in some cases emit the unofficial converted type. Also remove the confusingly-named LogicalType::Unknown, while "UNKNOWN" in the Thrift specification points to LogicalType::Null. --- cpp/src/parquet/printer.cc | 12 ++++++---- cpp/src/parquet/reader_test.cc | 2 +- cpp/src/parquet/schema.cc | 13 +++++++++-- cpp/src/parquet/schema_test.cc | 14 +++++------- cpp/src/parquet/thrift_internal.h | 3 +++ cpp/src/parquet/types.cc | 37 +++++++++++++++---------------- cpp/src/parquet/types.h | 29 +++++++++++++++--------- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 2 +- 9 files changed, 67 insertions(+), 47 deletions(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 1b921d566a36..dfd4bd802ee1 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -87,11 +87,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte const ColumnDescriptor* descr = file_metadata->schema()->Column(i); stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" << TypeToString(descr->physical_type()); - if (descr->converted_type() != ConvertedType::NONE) { - stream << "/" << ConvertedTypeToString(descr->converted_type()); + const auto& logical_type = descr->logical_type(); + if (!logical_type->is_none()) { + stream << " / " << logical_type->ToString(); } - if (descr->converted_type() == ConvertedType::DECIMAL) { - stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")"; + if (descr->converted_type() != ConvertedType::NONE) { + stream << " / " << ConvertedTypeToString(descr->converted_type()); + if (descr->converted_type() == ConvertedType::DECIMAL) { + stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")"; + } } stream << ")" << std::endl; } diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 01710349b304..7c2f9d7aa58c 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -317,7 +317,7 @@ Number of RowGroups: 1 Number of Real Columns: 2 Number of Columns: 2 Number of Selected Columns: 2 -Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8) +Column 0: a.list.element.list.element.list.element (BYTE_ARRAY / String / UTF8) Column 1: b (INT32) --- Row Group: 0 --- --- Total Bytes: 155 --- diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index f2a99c8780db..bfb295f0be30 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -448,7 +448,7 @@ std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, LogicalType::FromThrift(element->logicalType), LoadEnumSafe(&element->type), element->type_length, field_id)); } else if (element->__isset.converted_type) { - // legacy writer with logical type present + // legacy writer with converted type present primitive_node = std::unique_ptr(new PrimitiveNode( element->name, LoadEnumSafe(&element->repetition_type), LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type), @@ -500,7 +500,16 @@ void PrimitiveNode::ToParquet(void* opaque_element) const { element->__set_name(name_); element->__set_repetition_type(ToThrift(repetition_)); if (converted_type_ != ConvertedType::NONE) { - element->__set_converted_type(ToThrift(converted_type_)); + if (converted_type_ != ConvertedType::NA) { + element->__set_converted_type(ToThrift(converted_type_)); + } else { + // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null. + // Never emit it (see PARQUET-1990 for discussion). + if (!logical_type_ || !logical_type_->is_null()) { + throw ParquetException( + "ConvertedType::NA is obsolete, please use LogicalType::Null instead"); + } + } } if (field_id_ >= 0) { element->__set_field_id(field_id_); diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index c095e7adbf41..43760d34ab41 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -1101,12 +1101,12 @@ TEST(TestLogicalTypeConstruction, ConvertedTypeCompatibility) { ASSERT_TRUE(reconstructed->is_valid()); ASSERT_TRUE(reconstructed->Equals(*original)); - // Unknown - original = LogicalType::Unknown(); + // Undefined + original = UndefinedLogicalType::Make(); ASSERT_TRUE(original->is_invalid()); ASSERT_FALSE(original->is_valid()); converted_type = original->ToConvertedType(&converted_decimal_metadata); - ASSERT_EQ(converted_type, ConvertedType::NA); + ASSERT_EQ(converted_type, ConvertedType::UNDEFINED); ASSERT_FALSE(converted_decimal_metadata.isset); ASSERT_TRUE(original->is_compatible(converted_type, converted_decimal_metadata)); ASSERT_TRUE(original->is_compatible(converted_type)); @@ -1243,7 +1243,6 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) { {BSONLogicalType::Make(), false, true, true}, {UUIDLogicalType::Make(), false, true, true}, {NoLogicalType::Make(), false, false, true}, - {UnknownLogicalType::Make(), false, false, false}, }; for (const ExpectedProperties& c : cases) { @@ -1339,7 +1338,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeApplicability) { } std::vector> any_type_cases = { - LogicalType::Null(), LogicalType::None(), LogicalType::Unknown()}; + LogicalType::Null(), LogicalType::None(), UndefinedLogicalType::Make()}; for (auto c : any_type_cases) { ConfirmAnyPrimitiveTypeApplicability(c); @@ -1453,7 +1452,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) { }; std::vector cases = { - {LogicalType::Unknown(), "Unknown", R"({"Type": "Unknown"})"}, + {UndefinedLogicalType::Make(), "Undefined", R"({"Type": "Undefined"})"}, {LogicalType::String(), "String", R"({"Type": "String"})"}, {LogicalType::Map(), "Map", R"({"Type": "Map"})"}, {LogicalType::List(), "List", R"({"Type": "List"})"}, @@ -1550,7 +1549,6 @@ TEST(TestLogicalTypeOperation, LogicalTypeSortOrder) { }; std::vector cases = { - {LogicalType::Unknown(), SortOrder::UNKNOWN}, {LogicalType::String(), SortOrder::UNSIGNED}, {LogicalType::Map(), SortOrder::UNKNOWN}, {LogicalType::List(), SortOrder::UNKNOWN}, @@ -1888,8 +1886,6 @@ TEST_F(TestSchemaElementConstruction, SimpleCases) { {"uuid", LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16, false, ConvertedType::NA, true, [this]() { return element_->logicalType.__isset.UUID; }}, {"none", LogicalType::None(), Type::INT64, -1, false, ConvertedType::NA, false, - check_nothing}, - {"unknown", LogicalType::Unknown(), Type::INT64, -1, true, ConvertedType::NA, false, check_nothing}}; for (const SchemaElementConstructionArguments& c : cases) { diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index ba3fefd0d4f4..c9e02696f5dd 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -256,6 +256,9 @@ static inline format::Type::type ToThrift(Type::type type) { static inline format::ConvertedType::type ToThrift(ConvertedType::type type) { // item 0 is NONE DCHECK_NE(type, ConvertedType::NONE); + // it is forbidden to emit "NA" (PARQUET-1990) + DCHECK_NE(type, ConvertedType::NA); + DCHECK_NE(type, ConvertedType::UNDEFINED); return static_cast(static_cast(type) - 1); } diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 3d3e7c7a64f6..4e5bcee4ce8f 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -366,13 +366,14 @@ std::shared_ptr LogicalType::FromConvertedType( return JSONLogicalType::Make(); case ConvertedType::BSON: return BSONLogicalType::Make(); + case ConvertedType::NA: + return NullLogicalType::Make(); case ConvertedType::NONE: return NoLogicalType::Make(); - case ConvertedType::NA: case ConvertedType::UNDEFINED: - return UnknownLogicalType::Make(); + return UndefinedLogicalType::Make(); } - return UnknownLogicalType::Make(); + return UndefinedLogicalType::Make(); } std::shared_ptr LogicalType::FromThrift( @@ -483,10 +484,6 @@ std::shared_ptr LogicalType::UUID() { return UUIDLogicalType: std::shared_ptr LogicalType::None() { return NoLogicalType::Make(); } -std::shared_ptr LogicalType::Unknown() { - return UnknownLogicalType::Make(); -} - /* * The logical type implementation classes are built in four layers: (1) the base * layer, which establishes the interface and provides generally reusable implementations @@ -516,7 +513,7 @@ class LogicalType::Impl { virtual std::string ToString() const = 0; virtual bool is_serialized() const { - return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNKNOWN); + return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED); } virtual std::string ToJSON() const { @@ -567,14 +564,14 @@ class LogicalType::Impl { class BSON; class UUID; class No; - class Unknown; + class Undefined; protected: Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {} Impl() = default; private: - LogicalType::Type::type type_ = LogicalType::Type::UNKNOWN; + LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED; SortOrder::type order_ = SortOrder::UNKNOWN; }; @@ -636,7 +633,9 @@ bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::J bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; } bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; } bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; } -bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNKNOWN; } +bool LogicalType::is_valid() const { + return impl_->type() != LogicalType::Type::UNDEFINED; +} bool LogicalType::is_invalid() const { return !is_valid(); } bool LogicalType::is_nested() const { return (impl_->type() == LogicalType::Type::LIST) || @@ -1555,19 +1554,19 @@ class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible, GENERATE_MAKE(No) -class LogicalType::Impl::Unknown final : public LogicalType::Impl::SimpleCompatible, - public LogicalType::Impl::UniversalApplicable { +class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible, + public LogicalType::Impl::UniversalApplicable { public: - friend class UnknownLogicalType; + friend class UndefinedLogicalType; - OVERRIDE_TOSTRING(Unknown) + OVERRIDE_TOSTRING(Undefined) private: - Unknown() - : LogicalType::Impl(LogicalType::Type::UNKNOWN, SortOrder::UNKNOWN), - LogicalType::Impl::SimpleCompatible(ConvertedType::NA) {} + Undefined() + : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN), + LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {} }; -GENERATE_MAKE(Unknown) +GENERATE_MAKE(Undefined) } // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 953da832fbb2..f3d3abfc918c 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -71,7 +71,7 @@ struct Type { // Mirrors parquet::ConvertedType struct ConvertedType { enum type { - NONE, + NONE, // Not a real converted type, but means no converted type is specified UTF8, MAP, MAP_KEY_VALUE, @@ -94,9 +94,12 @@ struct ConvertedType { JSON, BSON, INTERVAL, + // DEPRECATED INVALID ConvertedType for all-null data. + // Only useful for reading legacy files written out by interim Parquet C++ releases. + // For writing, always emit LogicalType::Null instead. + // See PARQUET-1990. NA = 25, - // Should always be last element. - UNDEFINED = 26 + UNDEFINED = 26 // Not a real converted type; should always be last element }; }; @@ -140,7 +143,7 @@ class PARQUET_EXPORT LogicalType { public: struct Type { enum type { - UNKNOWN = 0, + UNDEFINED = 0, // Not a real logical type STRING = 1, MAP, LIST, @@ -151,11 +154,11 @@ class PARQUET_EXPORT LogicalType { TIMESTAMP, INTERVAL, INT, - NIL, // Thrift NullType + NIL, // Thrift NullType: annotates data that is always null JSON, BSON, UUID, - NONE + NONE // Not a real logical type; should always be last element }; }; @@ -199,12 +202,18 @@ class PARQUET_EXPORT LogicalType { static std::shared_ptr Interval(); static std::shared_ptr Int(int bit_width, bool is_signed); + + /// \brief Create a logical type for data that's always null + /// + /// Any physical type can be annotated with this logical type. static std::shared_ptr Null(); + static std::shared_ptr JSON(); static std::shared_ptr BSON(); static std::shared_ptr UUID(); + + /// \brief Create a placeholder for when no logical type is specified static std::shared_ptr None(); - static std::shared_ptr Unknown(); /// \brief Return true if this logical type is consistent with the given underlying /// physical type. @@ -434,13 +443,13 @@ class PARQUET_EXPORT NoLogicalType : public LogicalType { NoLogicalType() = default; }; -/// \brief Allowed for any type. -class PARQUET_EXPORT UnknownLogicalType : public LogicalType { +// Internal API, for unrecognized logical types +class PARQUET_EXPORT UndefinedLogicalType : public LogicalType { public: static std::shared_ptr Make(); private: - UnknownLogicalType() = default; + UndefinedLogicalType() = default; }; // Data encodings. Mirrors parquet::Encoding diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index e97f32411e62..8fa1c855b3e4 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -54,7 +54,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY" enum ParquetLogicalTypeId" parquet::LogicalType::Type::type": - ParquetLogicalType_UNKNOWN" parquet::LogicalType::Type::UNKNOWN" + ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED" ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING" ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP" ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST" diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index ce16acc68908..67c1c5a4fc80 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -809,7 +809,7 @@ cdef physical_type_name_from_enum(ParquetType type_): cdef logical_type_name_from_enum(ParquetLogicalTypeId type_): return { - ParquetLogicalType_UNKNOWN: 'UNKNOWN', + ParquetLogicalType_UNDEFINED: 'UNDEFINED', ParquetLogicalType_STRING: 'STRING', ParquetLogicalType_MAP: 'MAP', ParquetLogicalType_LIST: 'LIST',