From 08f5d608085b42550377a7a80e51bcc61ae9b47f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 16 Jan 2023 20:33:48 +0800 Subject: [PATCH 1/4] MINOR: [C++][Parquet] Rephrase decimal annotation --- .../parquet/arrow/arrow_reader_writer_test.cc | 8 +-- cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/properties.h | 54 +++++++++++++------ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index bdfd0fe07dc..498ebd90744 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3443,8 +3443,8 @@ TEST(ArrowReadWrite, Decimal256AsInt) { auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array}); parquet::WriterProperties::Builder builder; - // Enforce integer type to annotate decimal type - auto writer_properties = builder.enable_integer_annotate_decimal()->build(); + // Enforce integer type to store short decimal type + auto writer_properties = builder.enable_short_decimal_stored_as_integer()->build(); auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); CheckConfiguredRoundtrip(table, table, writer_properties, props_store_schema); @@ -4821,8 +4821,8 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { auto arrow_schema = ::arrow::schema({::arrow::field("a", values->type())}); parquet::WriterProperties::Builder builder; - // Enforce integer type to annotate decimal type - auto writer_properties = builder.enable_integer_annotate_decimal()->build(); + // Enforce integer type to store short decimal type + auto writer_properties = builder.enable_short_decimal_stored_as_integer()->build(); std::shared_ptr parquet_schema; ASSERT_OK_NO_THROW(ToParquetSchema(arrow_schema.get(), *writer_properties, *default_arrow_writer_properties(), diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 267b892e4b4..a1cd0ee608b 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -357,7 +357,8 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, const auto& decimal_type = static_cast(*field->type()); precision = decimal_type.precision(); scale = decimal_type.scale(); - if (properties.integer_annotate_decimal() && 1 <= precision && precision <= 18) { + if (properties.store_short_decimal_as_integer() && 1 <= precision && + precision <= 18) { type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64; } else { type = ParquetType::FIXED_LEN_BYTE_ARRAY; diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d45dcfe69fc..797defb64a9 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -201,7 +201,7 @@ class PARQUET_EXPORT WriterProperties { version_(ParquetVersion::PARQUET_2_4), data_page_version_(ParquetDataPageVersion::V1), created_by_(DEFAULT_CREATED_BY), - integer_annotate_decimal_(false) {} + store_short_decimal_as_integer_(false) {} virtual ~Builder() {} /// Specify the memory pool for the writer. Default default_memory_pool. @@ -452,19 +452,39 @@ class PARQUET_EXPORT WriterProperties { return this->disable_statistics(path->ToDotString()); } - /// Enable integer type to annotate decimal type as below: - /// int32: 1 <= precision <= 9 - /// int64: 10 <= precision <= 18 - /// Default disabled. - Builder* enable_integer_annotate_decimal() { - integer_annotate_decimal_ = true; + /// Enable decimal logical type with 1 <= precision <= 18 to be stored as + /// integer physical type. + /// + /// According to the specs, DECIMAL can be used to annotate the following types: + /// - int32: for 1 <= precision <= 9. + /// - int64: for 1 <= precision <= 18; precision < 10 will produce a warning. + /// - fixed_len_byte_array: precision is limited by the array size. + /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + /// - binary: precision is not limited, but is required. precision is not limited, + /// but is required. The minimum number of bytes to store the unscaled value + /// should be used. + /// + /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + /// + /// When enabled, the C++ writer will use following physical types to store decimals: + /// - int32: for 1 <= precision <= 9. + /// - int64: for 10 <= precision <= 18. + /// - fixed_len_byte_array: for precision > 18. + /// + /// As a consequence, decimal columns stored in integer types are more compact + /// but in a risk that the parquet file may not be readable by previous Arrow C++ + /// versions or other implementations. + Builder* enable_short_decimal_stored_as_integer() { + store_short_decimal_as_integer_ = true; return this; } - /// Disable integer type to annotate decimal type. + /// Disable decimal logical type with 1 <= precision <= 18 to be stored as + /// integer physical type. + /// /// Default disabled. - Builder* disable_integer_annotate_decimal() { - integer_annotate_decimal_ = false; + Builder* disable_short_decimal_stored_as_integer() { + store_short_decimal_as_integer_ = false; return this; } @@ -493,7 +513,7 @@ class PARQUET_EXPORT WriterProperties { pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, pagesize_, version_, created_by_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, - integer_annotate_decimal_)); + store_short_decimal_as_integer_)); } private: @@ -505,7 +525,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; ParquetDataPageVersion data_page_version_; std::string created_by_; - bool integer_annotate_decimal_; + bool store_short_decimal_as_integer_; std::shared_ptr file_encryption_properties_; @@ -536,7 +556,9 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline bool integer_annotate_decimal() const { return integer_annotate_decimal_; } + inline bool store_short_decimal_as_integer() const { + return store_short_decimal_as_integer_; + } inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { @@ -606,7 +628,7 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr file_encryption_properties, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties, - ParquetDataPageVersion data_page_version, bool integer_annotate_decimal) + ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -615,7 +637,7 @@ class PARQUET_EXPORT WriterProperties { parquet_data_page_version_(data_page_version), parquet_version_(version), parquet_created_by_(created_by), - integer_annotate_decimal_(integer_annotate_decimal), + store_short_decimal_as_integer_(store_short_decimal_as_integer), file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -628,7 +650,7 @@ class PARQUET_EXPORT WriterProperties { ParquetDataPageVersion parquet_data_page_version_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - bool integer_annotate_decimal_; + bool store_short_decimal_as_integer_; std::shared_ptr file_encryption_properties_; From ff2869250377986ba8f03f63fa690cb0c1ce14ef Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Jan 2023 17:53:09 +0800 Subject: [PATCH 2/4] use shorter name --- .../parquet/arrow/arrow_reader_writer_test.cc | 8 ++--- cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/properties.h | 35 ++++++++----------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 498ebd90744..6585d286d1a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3443,8 +3443,8 @@ TEST(ArrowReadWrite, Decimal256AsInt) { auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array}); parquet::WriterProperties::Builder builder; - // Enforce integer type to store short decimal type - auto writer_properties = builder.enable_short_decimal_stored_as_integer()->build(); + // Allow small decimals to be stored as int32 or int64. + auto writer_properties = builder.enable_store_decimal_as_integer()->build(); auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); CheckConfiguredRoundtrip(table, table, writer_properties, props_store_schema); @@ -4821,8 +4821,8 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { auto arrow_schema = ::arrow::schema({::arrow::field("a", values->type())}); parquet::WriterProperties::Builder builder; - // Enforce integer type to store short decimal type - auto writer_properties = builder.enable_short_decimal_stored_as_integer()->build(); + // Allow small decimals to be stored as int32 or int64. + auto writer_properties = builder.enable_store_decimal_as_integer()->build(); std::shared_ptr parquet_schema; ASSERT_OK_NO_THROW(ToParquetSchema(arrow_schema.get(), *writer_properties, *default_arrow_writer_properties(), diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index a1cd0ee608b..07d481ba8e2 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -357,8 +357,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, const auto& decimal_type = static_cast(*field->type()); precision = decimal_type.precision(); scale = decimal_type.scale(); - if (properties.store_short_decimal_as_integer() && 1 <= precision && - precision <= 18) { + if (properties.store_decimal_as_integer() && 1 <= precision && precision <= 18) { type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64; } else { type = ParquetType::FIXED_LEN_BYTE_ARRAY; diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 797defb64a9..01872d581e3 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -201,7 +201,7 @@ class PARQUET_EXPORT WriterProperties { version_(ParquetVersion::PARQUET_2_4), data_page_version_(ParquetDataPageVersion::V1), created_by_(DEFAULT_CREATED_BY), - store_short_decimal_as_integer_(false) {} + store_decimal_as_integer_(false) {} virtual ~Builder() {} /// Specify the memory pool for the writer. Default default_memory_pool. @@ -452,17 +452,15 @@ class PARQUET_EXPORT WriterProperties { return this->disable_statistics(path->ToDotString()); } - /// Enable decimal logical type with 1 <= precision <= 18 to be stored as - /// integer physical type. + /// Allow decimals with 1 <= precision <= 18 to be stored as integers. /// - /// According to the specs, DECIMAL can be used to annotate the following types: + /// In Parquet, DECIMAL can be stored in any of the following physical types: /// - int32: for 1 <= precision <= 9. /// - int64: for 1 <= precision <= 18; precision < 10 will produce a warning. /// - fixed_len_byte_array: precision is limited by the array size. /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - /// - binary: precision is not limited, but is required. precision is not limited, - /// but is required. The minimum number of bytes to store the unscaled value - /// should be used. + /// - binary: precision is not limited, but is required. The minimum number + /// of bytes to store the unscaled value should be used. /// /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. /// @@ -472,10 +470,9 @@ class PARQUET_EXPORT WriterProperties { /// - fixed_len_byte_array: for precision > 18. /// /// As a consequence, decimal columns stored in integer types are more compact - /// but in a risk that the parquet file may not be readable by previous Arrow C++ - /// versions or other implementations. - Builder* enable_short_decimal_stored_as_integer() { - store_short_decimal_as_integer_ = true; + /// but in a risk that the parquet file may not be readable by other implementations. + Builder* enable_store_decimal_as_integer() { + store_decimal_as_integer_ = true; return this; } @@ -483,8 +480,8 @@ class PARQUET_EXPORT WriterProperties { /// integer physical type. /// /// Default disabled. - Builder* disable_short_decimal_stored_as_integer() { - store_short_decimal_as_integer_ = false; + Builder* disable_store_decimal_as_integer() { + store_decimal_as_integer_ = false; return this; } @@ -513,7 +510,7 @@ class PARQUET_EXPORT WriterProperties { pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, pagesize_, version_, created_by_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, - store_short_decimal_as_integer_)); + store_decimal_as_integer_)); } private: @@ -525,7 +522,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; ParquetDataPageVersion data_page_version_; std::string created_by_; - bool store_short_decimal_as_integer_; + bool store_decimal_as_integer_; std::shared_ptr file_encryption_properties_; @@ -556,9 +553,7 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline bool store_short_decimal_as_integer() const { - return store_short_decimal_as_integer_; - } + inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; } inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { @@ -637,7 +632,7 @@ class PARQUET_EXPORT WriterProperties { parquet_data_page_version_(data_page_version), parquet_version_(version), parquet_created_by_(created_by), - store_short_decimal_as_integer_(store_short_decimal_as_integer), + store_decimal_as_integer_(store_short_decimal_as_integer), file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -650,7 +645,7 @@ class PARQUET_EXPORT WriterProperties { ParquetDataPageVersion parquet_data_page_version_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - bool store_short_decimal_as_integer_; + bool store_decimal_as_integer_; std::shared_ptr file_encryption_properties_; From 68e8866a067a75d9de94ce716ec6f6f32ddef6a3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 21 Jan 2023 14:44:32 +0800 Subject: [PATCH 3/4] refine comments --- cpp/src/parquet/properties.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 01872d581e3..72df57e993b 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -459,8 +459,8 @@ class PARQUET_EXPORT WriterProperties { /// - int64: for 1 <= precision <= 18; precision < 10 will produce a warning. /// - fixed_len_byte_array: precision is limited by the array size. /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - /// - binary: precision is not limited, but is required. The minimum number - /// of bytes to store the unscaled value should be used. + /// - binary: precision is unlimited. The minimum number of bytes to store + /// the unscaled value is used. /// /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. /// @@ -469,8 +469,7 @@ class PARQUET_EXPORT WriterProperties { /// - int64: for 10 <= precision <= 18. /// - fixed_len_byte_array: for precision > 18. /// - /// As a consequence, decimal columns stored in integer types are more compact - /// but in a risk that the parquet file may not be readable by other implementations. + /// As a consequence, decimal columns stored in integer types are more compact. Builder* enable_store_decimal_as_integer() { store_decimal_as_integer_ = true; return this; From b77931cffddebd7698d214d07633a7a7f155a087 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 24 Jan 2023 10:47:16 +0800 Subject: [PATCH 4/4] refine comment about decimal --- cpp/src/parquet/properties.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 72df57e993b..ef4ae124a29 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -456,7 +456,7 @@ class PARQUET_EXPORT WriterProperties { /// /// In Parquet, DECIMAL can be stored in any of the following physical types: /// - int32: for 1 <= precision <= 9. - /// - int64: for 1 <= precision <= 18; precision < 10 will produce a warning. + /// - int64: for 10 <= precision <= 18. /// - fixed_len_byte_array: precision is limited by the array size. /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. /// - binary: precision is unlimited. The minimum number of bytes to store