From 25546eea8437ac49c166651b3cda21d5ee359c31 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 31 Jul 2023 23:23:00 +0800 Subject: [PATCH 1/3] GH-36882: [C++][Parquet] Default RLE for bool values in v2 pages --- cpp/src/parquet/column_writer.cc | 8 ++++- cpp/src/parquet/column_writer_test.cc | 48 +++++++++++++++++++-------- cpp/src/parquet/properties.h | 11 +++--- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e34420b9f6e..613dbfc0cec 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2328,7 +2328,13 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met const ColumnDescriptor* descr = metadata->descr(); const bool use_dictionary = properties->dictionary_enabled(descr->path()) && descr->physical_type() != Type::BOOLEAN; - Encoding::type encoding = properties->encoding(descr->path()); + Encoding::type default_encoding = + (descr->physical_type() == Type::BOOLEAN && + properties->data_page_version() == ParquetDataPageVersion::V2) + ? Encoding::RLE + : Encoding::PLAIN; + Encoding::type encoding = + properties->encoding(descr->path()).value_or(default_encoding); if (use_dictionary) { encoding = properties->dictionary_index_encoding(); } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 58199c402bd..890ec95d6ca 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -98,7 +98,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { int64_t output_size = SMALL_SIZE, const ColumnProperties& column_properties = ColumnProperties(), const ParquetVersion::type version = ParquetVersion::PARQUET_1_0, - bool enable_checksum = false) { + bool enable_checksum = false, + ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1) { sink_ = CreateOutputStream(); WriterProperties::Builder wp_builder; wp_builder.version(version); @@ -108,12 +109,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { wp_builder.dictionary_pagesize_limit(DICTIONARY_PAGE_SIZE); } else { wp_builder.disable_dictionary(); - wp_builder.encoding(column_properties.encoding()); + if (column_properties.encoding().has_value()) { + wp_builder.encoding(column_properties.encoding().value()); + } } if (enable_checksum) { wp_builder.enable_page_checksum(); } wp_builder.max_statistics_size(column_properties.max_statistics_size()); + wp_builder.data_page_version(data_page_version); writer_properties_ = wp_builder.build(); metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); @@ -757,21 +761,37 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) { ASSERT_EQ(0, this->values_read_); } +class TestBooleanValuesWriter : public TestPrimitiveWriter { + public: + void TestWithEncoding(ParquetDataPageVersion version, Encoding::type encoding) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = + this->BuildWriter(SMALL_SIZE, ColumnProperties(), ParquetVersion::PARQUET_1_0, + /*enable_checksum*/ false, version); + for (int i = 0; i < SMALL_SIZE; i++) { + bool value = (i % 2 == 0) ? true : false; + writer->WriteBatch(1, nullptr, nullptr, &value); + } + writer->Close(); + this->ReadColumn(); + for (int i = 0; i < SMALL_SIZE; i++) { + ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i; + } + const auto& encodings = this->metadata_encodings(); + auto iter = std::find(encodings.begin(), encodings.end(), encoding); + ASSERT_TRUE(iter != encodings.end()); + } +}; + // PARQUET-764 // Correct bitpacking for boolean write at non-byte boundaries -using TestBooleanValuesWriter = TestPrimitiveWriter; TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) { - this->SetUpSchema(Repetition::REQUIRED); - auto writer = this->BuildWriter(); - for (int i = 0; i < SMALL_SIZE; i++) { - bool value = (i % 2 == 0) ? true : false; - writer->WriteBatch(1, nullptr, nullptr, &value); - } - writer->Close(); - this->ReadColumn(); - for (int i = 0; i < SMALL_SIZE; i++) { - ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i; - } + TestWithEncoding(ParquetDataPageVersion::V1, Encoding::PLAIN); +} + +// Default encoding for boolean is RLE when using V2 pages +TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) { + TestWithEncoding(ParquetDataPageVersion::V2, Encoding::RLE); } // PARQUET-979 diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index bd7eb9dc7ab..94fcf889fab 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include #include @@ -135,14 +136,13 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024; static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; -static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; class PARQUET_EXPORT ColumnProperties { public: - ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, + ColumnProperties(std::optional encoding = std::nullopt, Compression::type codec = DEFAULT_COMPRESSION_TYPE, bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, @@ -186,7 +186,7 @@ class PARQUET_EXPORT ColumnProperties { page_index_enabled_ = page_index_enabled; } - Encoding::type encoding() const { return encoding_; } + std::optional encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -203,7 +203,7 @@ class PARQUET_EXPORT ColumnProperties { bool page_index_enabled() const { return page_index_enabled_; } private: - Encoding::type encoding_; + std::optional encoding_; Compression::type codec_; bool dictionary_enabled_; bool statistics_enabled_; @@ -710,7 +710,8 @@ class PARQUET_EXPORT WriterProperties { return default_column_properties_; } - Encoding::type encoding(const std::shared_ptr& path) const { + std::optional encoding( + const std::shared_ptr& path) const { return column_properties(path).encoding(); } From a896c97cb2222e45c0219537121b840093771be0 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 11 Aug 2023 10:03:29 +0800 Subject: [PATCH 2/3] Use Encoding::UNKNOWN as the default --- cpp/src/parquet/column_writer.cc | 14 +++++++------- cpp/src/parquet/column_writer_test.cc | 4 +--- cpp/src/parquet/properties.h | 11 +++++------ 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 613dbfc0cec..fd6595793f4 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2328,13 +2328,13 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met const ColumnDescriptor* descr = metadata->descr(); const bool use_dictionary = properties->dictionary_enabled(descr->path()) && descr->physical_type() != Type::BOOLEAN; - Encoding::type default_encoding = - (descr->physical_type() == Type::BOOLEAN && - properties->data_page_version() == ParquetDataPageVersion::V2) - ? Encoding::RLE - : Encoding::PLAIN; - Encoding::type encoding = - properties->encoding(descr->path()).value_or(default_encoding); + Encoding::type encoding = properties->encoding(descr->path()); + if (encoding == Encoding::UNKNOWN) { + encoding = (descr->physical_type() == Type::BOOLEAN && + properties->data_page_version() == ParquetDataPageVersion::V2) + ? Encoding::RLE + : Encoding::PLAIN; + } if (use_dictionary) { encoding = properties->dictionary_index_encoding(); } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 890ec95d6ca..2c50bef169d 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -109,9 +109,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { wp_builder.dictionary_pagesize_limit(DICTIONARY_PAGE_SIZE); } else { wp_builder.disable_dictionary(); - if (column_properties.encoding().has_value()) { - wp_builder.encoding(column_properties.encoding().value()); - } + wp_builder.encoding(column_properties.encoding()); } if (enable_checksum) { wp_builder.enable_page_checksum(); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 94fcf889fab..bdc5b15332d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -18,7 +18,6 @@ #pragma once #include -#include #include #include #include @@ -136,13 +135,14 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024; static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; +static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; class PARQUET_EXPORT ColumnProperties { public: - ColumnProperties(std::optional encoding = std::nullopt, + ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, Compression::type codec = DEFAULT_COMPRESSION_TYPE, bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, @@ -186,7 +186,7 @@ class PARQUET_EXPORT ColumnProperties { page_index_enabled_ = page_index_enabled; } - std::optional encoding() const { return encoding_; } + Encoding::type encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -203,7 +203,7 @@ class PARQUET_EXPORT ColumnProperties { bool page_index_enabled() const { return page_index_enabled_; } private: - std::optional encoding_; + Encoding::type encoding_; Compression::type codec_; bool dictionary_enabled_; bool statistics_enabled_; @@ -710,8 +710,7 @@ class PARQUET_EXPORT WriterProperties { return default_column_properties_; } - std::optional encoding( - const std::shared_ptr& path) const { + Encoding::type encoding(const std::shared_ptr& path) const { return column_properties(path).encoding(); } From 2b680af5a17b6d416878771705e23738c6370948 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 15 Aug 2023 10:39:31 +0800 Subject: [PATCH 3/3] Use ParquetVersion instead of DataPageVersion --- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/column_writer_test.cc | 29 +++++++++++-------- python/pyarrow/tests/parquet/test_metadata.py | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fd6595793f4..56219e63de7 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2331,7 +2331,7 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met Encoding::type encoding = properties->encoding(descr->path()); if (encoding == Encoding::UNKNOWN) { encoding = (descr->physical_type() == Type::BOOLEAN && - properties->data_page_version() == ParquetDataPageVersion::V2) + properties->version() != ParquetVersion::PARQUET_1_0) ? Encoding::RLE : Encoding::PLAIN; } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 2c50bef169d..b2fd9cf9308 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -98,8 +98,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { int64_t output_size = SMALL_SIZE, const ColumnProperties& column_properties = ColumnProperties(), const ParquetVersion::type version = ParquetVersion::PARQUET_1_0, - bool enable_checksum = false, - ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1) { + bool enable_checksum = false) { sink_ = CreateOutputStream(); WriterProperties::Builder wp_builder; wp_builder.version(version); @@ -115,7 +114,6 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { wp_builder.enable_page_checksum(); } wp_builder.max_statistics_size(column_properties.max_statistics_size()); - wp_builder.data_page_version(data_page_version); writer_properties_ = wp_builder.build(); metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); @@ -205,8 +203,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { if (this->type_num() == Type::BOOLEAN) { // Dictionary encoding is not allowed for boolean type - // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case - std::set expected({Encoding::PLAIN, Encoding::RLE}); + std::set expected; + if (version == ParquetVersion::PARQUET_1_0) { + // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for + // version 1.0. Note that RLE is used for DL/RL. + expected = {Encoding::PLAIN, Encoding::RLE}; + } else { + // There is only 1 encoding (RLE) in a fallback case for version 2.0 + expected = {Encoding::RLE}; + } ASSERT_EQ(encodings, expected); } else if (version == ParquetVersion::PARQUET_1_0) { // There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback case @@ -225,7 +230,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { std::vector encoding_stats = this->metadata_encoding_stats(); if (this->type_num() == Type::BOOLEAN) { - ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN); + ASSERT_EQ(encoding_stats[0].encoding, + version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE); ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE); } else if (version == ParquetVersion::PARQUET_1_0) { std::vector expected( @@ -761,11 +767,10 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) { class TestBooleanValuesWriter : public TestPrimitiveWriter { public: - void TestWithEncoding(ParquetDataPageVersion version, Encoding::type encoding) { + void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) { this->SetUpSchema(Repetition::REQUIRED); - auto writer = - this->BuildWriter(SMALL_SIZE, ColumnProperties(), ParquetVersion::PARQUET_1_0, - /*enable_checksum*/ false, version); + auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, + /*enable_checksum*/ false); for (int i = 0; i < SMALL_SIZE; i++) { bool value = (i % 2 == 0) ? true : false; writer->WriteBatch(1, nullptr, nullptr, &value); @@ -784,12 +789,12 @@ class TestBooleanValuesWriter : public TestPrimitiveWriter { // PARQUET-764 // Correct bitpacking for boolean write at non-byte boundaries TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) { - TestWithEncoding(ParquetDataPageVersion::V1, Encoding::PLAIN); + TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN); } // Default encoding for boolean is RLE when using V2 pages TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) { - TestWithEncoding(ParquetDataPageVersion::V2, Encoding::RLE); + TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE); } // PARQUET-979 diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 3efaf1dbf55..82182b0449a 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -128,7 +128,7 @@ def test_parquet_metadata_api(): assert col_meta.is_stats_set is True assert isinstance(col_meta.statistics, pq.Statistics) assert col_meta.compression == 'SNAPPY' - assert set(col_meta.encodings) == {'PLAIN', 'RLE'} + assert set(col_meta.encodings) == {'RLE'} assert col_meta.has_dictionary_page is False assert col_meta.dictionary_page_offset is None assert col_meta.data_page_offset > 0