diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e34420b9f6e..56219e63de7 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2329,6 +2329,12 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met const bool use_dictionary = properties->dictionary_enabled(descr->path()) && descr->physical_type() != Type::BOOLEAN; Encoding::type encoding = properties->encoding(descr->path()); + if (encoding == Encoding::UNKNOWN) { + encoding = (descr->physical_type() == Type::BOOLEAN && + properties->version() != ParquetVersion::PARQUET_1_0) + ? Encoding::RLE + : Encoding::PLAIN; + } if (use_dictionary) { encoding = properties->dictionary_index_encoding(); } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 58199c402bd..b2fd9cf9308 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -203,8 +203,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { if (this->type_num() == Type::BOOLEAN) { // Dictionary encoding is not allowed for boolean type - // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case - std::set expected({Encoding::PLAIN, Encoding::RLE}); + std::set expected; + if (version == ParquetVersion::PARQUET_1_0) { + // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for + // version 1.0. Note that RLE is used for DL/RL. + expected = {Encoding::PLAIN, Encoding::RLE}; + } else { + // There is only 1 encoding (RLE) in a fallback case for version 2.0 + expected = {Encoding::RLE}; + } ASSERT_EQ(encodings, expected); } else if (version == ParquetVersion::PARQUET_1_0) { // There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback case @@ -223,7 +230,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { std::vector encoding_stats = this->metadata_encoding_stats(); if (this->type_num() == Type::BOOLEAN) { - ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN); + ASSERT_EQ(encoding_stats[0].encoding, + version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE); ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE); } else if (version == ParquetVersion::PARQUET_1_0) { std::vector expected( @@ -757,21 +765,36 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) { ASSERT_EQ(0, this->values_read_); } +class TestBooleanValuesWriter : public TestPrimitiveWriter { + public: + void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, + /*enable_checksum*/ false); + for (int i = 0; i < SMALL_SIZE; i++) { + bool value = (i % 2 == 0) ? true : false; + writer->WriteBatch(1, nullptr, nullptr, &value); + } + writer->Close(); + this->ReadColumn(); + for (int i = 0; i < SMALL_SIZE; i++) { + ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i; + } + const auto& encodings = this->metadata_encodings(); + auto iter = std::find(encodings.begin(), encodings.end(), encoding); + ASSERT_TRUE(iter != encodings.end()); + } +}; + // PARQUET-764 // Correct bitpacking for boolean write at non-byte boundaries -using TestBooleanValuesWriter = TestPrimitiveWriter; TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) { - this->SetUpSchema(Repetition::REQUIRED); - auto writer = this->BuildWriter(); - for (int i = 0; i < SMALL_SIZE; i++) { - bool value = (i % 2 == 0) ? true : false; - writer->WriteBatch(1, nullptr, nullptr, &value); - } - writer->Close(); - this->ReadColumn(); - for (int i = 0; i < SMALL_SIZE; i++) { - ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i; - } + TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN); +} + +// Default encoding for boolean is RLE when using V2 pages +TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) { + TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE); } // PARQUET-979 diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index bd7eb9dc7ab..bdc5b15332d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -135,7 +135,7 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024; static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; -static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; +static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 3efaf1dbf55..82182b0449a 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -128,7 +128,7 @@ def test_parquet_metadata_api(): assert col_meta.is_stats_set is True assert isinstance(col_meta.statistics, pq.Statistics) assert col_meta.compression == 'SNAPPY' - assert set(col_meta.encodings) == {'PLAIN', 'RLE'} + assert set(col_meta.encodings) == {'RLE'} assert col_meta.has_dictionary_page is False assert col_meta.dictionary_page_offset is None assert col_meta.data_page_offset > 0