Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
Encoding::type encoding = properties->encoding(descr->path());
if (encoding == Encoding::UNKNOWN) {
encoding = (descr->physical_type() == Type::BOOLEAN &&
properties->version() != ParquetVersion::PARQUET_1_0)
properties->version() != ParquetVersion::PARQUET_1_0 &&
properties->data_page_version() == ParquetDataPageVersion::V2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, if the data page version is v2, it is user's responsibility to set parquet version to PARQUET_2_X. Unfortunately, this is not enforced yet.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we prevent from this or just keep the behavior here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's do it separately.

? Encoding::RLE
: Encoding::PLAIN;
}
Expand Down
78 changes: 53 additions & 25 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,11 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
int64_t output_size = SMALL_SIZE,
const ColumnProperties& column_properties = ColumnProperties(),
const ParquetVersion::type version = ParquetVersion::PARQUET_1_0,
const ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1,
bool enable_checksum = false) {
sink_ = CreateOutputStream();
WriterProperties::Builder wp_builder;
wp_builder.version(version);
wp_builder.version(version)->data_page_version(data_page_version);
if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY ||
column_properties.encoding() == Encoding::RLE_DICTIONARY) {
wp_builder.enable_dictionary();
Expand Down Expand Up @@ -176,7 +177,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum));
}

void TestDictionaryFallbackEncoding(ParquetVersion::type version) {
void TestDictionaryFallbackEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version) {
this->GenerateData(VERY_LARGE_SIZE);
ColumnProperties column_properties;
column_properties.set_dictionary_enabled(true);
Expand All @@ -187,7 +189,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
column_properties.set_encoding(Encoding::RLE_DICTIONARY);
}

auto writer = this->BuildWriter(VERY_LARGE_SIZE, column_properties, version);
auto writer =
this->BuildWriter(VERY_LARGE_SIZE, column_properties, version, data_page_version);

writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
writer->Close();
Expand All @@ -204,13 +207,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
std::set<Encoding::type> expected;
if (version == ParquetVersion::PARQUET_1_0) {
if (version != ParquetVersion::PARQUET_1_0 &&
data_page_version == ParquetDataPageVersion::V2) {
// There is only 1 encoding (RLE) in a fallback case for version 2.0 and data page
// v2 enabled.
expected = {Encoding::RLE};
} else {
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for
// version 1.0. Note that RLE is used for DL/RL.
// version 1.0 or data page v1. Note that RLE is used for DL/RL.
expected = {Encoding::PLAIN, Encoding::RLE};
} else {
// There is only 1 encoding (RLE) in a fallback case for version 2.0
expected = {Encoding::RLE};
}
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
Expand All @@ -231,7 +236,10 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
this->metadata_encoding_stats();
if (this->type_num() == Type::BOOLEAN) {
ASSERT_EQ(encoding_stats[0].encoding,
version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE);
version != ParquetVersion::PARQUET_1_0 &&
data_page_version == ParquetDataPageVersion::V2
? Encoding::RLE
: Encoding::PLAIN);
ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
} else if (version == ParquetVersion::PARQUET_1_0) {
std::vector<Encoding::type> expected(
Expand Down Expand Up @@ -262,8 +270,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
writer->Close();
Expand All @@ -281,8 +290,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatchSpaced(this->values_.size(), nullptr, nullptr, valid_bits.data(), 0,
this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
Expand Down Expand Up @@ -693,12 +703,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) {

// Test cases for dictionary fallback encoding
TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1);
}

TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4,
ParquetDataPageVersion::V1);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4,
ParquetDataPageVersion::V2);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6,
ParquetDataPageVersion::V1);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6,
ParquetDataPageVersion::V2);
}

TEST(TestWriter, NullValuesBuffer) {
Expand Down Expand Up @@ -767,10 +784,13 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) {

class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
public:
void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) {
void TestWithEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version,
const std::set<Encoding::type>& expected_encodings) {
this->SetUpSchema(Repetition::REQUIRED);
auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
/*enable_checksum*/ false);
auto writer =
this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, data_page_version,
/*enable_checksum*/ false);
for (int i = 0; i < SMALL_SIZE; i++) {
bool value = (i % 2 == 0) ? true : false;
writer->WriteBatch(1, nullptr, nullptr, &value);
Expand All @@ -780,21 +800,29 @@ class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
for (int i = 0; i < SMALL_SIZE; i++) {
ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
}
const auto& encodings = this->metadata_encodings();
auto iter = std::find(encodings.begin(), encodings.end(), encoding);
ASSERT_TRUE(iter != encodings.end());
auto metadata_encodings = this->metadata_encodings();
std::set<Encoding::type> metadata_encodings_set{metadata_encodings.begin(),
metadata_encodings.end()};
EXPECT_EQ(expected_encodings, metadata_encodings_set);
}
};

// PARQUET-764
// Correct bitpacking for boolean write at non-byte boundaries
TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
for (auto data_page_version :
{ParquetDataPageVersion::V1, ParquetDataPageVersion::V2}) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, data_page_version,
{Encoding::PLAIN, Encoding::RLE});
}
}

// Default encoding for boolean is RLE when using V2 pages
// Default encoding for boolean is RLE when both V2 format and V2 pages enabled.
TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V1,
{Encoding::PLAIN, Encoding::RLE});
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V2,
{Encoding::RLE});
}

// PARQUET-979
Expand Down
22 changes: 10 additions & 12 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <algorithm>
#include <cinttypes>
#include <memory>
#include <ostream>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -1501,8 +1502,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
thrift_encoding_stats.push_back(data_enc_stat);
add_encoding(data_encoding);
}
column_chunk_->meta_data.__set_encodings(thrift_encodings);
column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings));
column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats));

const auto& encrypt_md =
properties_->column_encryption_properties(column_->path()->ToDotString());
Expand All @@ -1521,7 +1522,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
}
column_chunk_->__set_crypto_metadata(ccmd);
column_chunk_->__set_crypto_metadata(std::move(ccmd));

bool encrypted_footer =
properties_->file_encryption_properties()->encrypted_footer();
Expand Down Expand Up @@ -1601,16 +1602,13 @@ std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(

ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
: impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
: impl_{std::make_unique<ColumnChunkMetaDataBuilderImpl>(std::move(props), column)} {}

ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
void* contents)
: impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
new ColumnChunkMetaDataBuilderImpl(
std::move(props), column,
reinterpret_cast<format::ColumnChunk*>(contents)))} {}
: impl_{std::make_unique<ColumnChunkMetaDataBuilderImpl>(
std::move(props), column, reinterpret_cast<format::ColumnChunk*>(contents))} {}

ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;

Expand Down Expand Up @@ -1782,7 +1780,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
key_value_metadata_(std::move(key_value_metadata)) {
if (properties_->file_encryption_properties() != nullptr &&
properties_->file_encryption_properties()->encrypted_footer()) {
crypto_metadata_.reset(new format::FileCryptoMetaData());
crypto_metadata_ = std::make_unique<format::FileCryptoMetaData>();
}
}

Expand Down Expand Up @@ -1956,8 +1954,8 @@ std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
FileMetaDataBuilder::FileMetaDataBuilder(
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
std::shared_ptr<const KeyValueMetadata> key_value_metadata)
: impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
schema, std::move(props), std::move(key_value_metadata)))} {}
: impl_{std::make_unique<FileMetaDataBuilderImpl>(schema, std::move(props),
std::move(key_value_metadata))} {}

FileMetaDataBuilder::~FileMetaDataBuilder() = default;

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_parquet_metadata_api():
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
assert set(col_meta.encodings) == {'RLE'}
assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0
Expand Down