From dce2d1ea74a1031f8bd00f139fb5bb7fd3a0d561 Mon Sep 17 00:00:00 2001 From: Patrick Pai Date: Tue, 14 Jul 2020 10:32:24 -0500 Subject: [PATCH 1/6] disable writing files with LZ4 codec in c++ library --- cpp/src/parquet/column_writer_test.cc | 8 ++++---- cpp/src/parquet/file_deserialize_test.cc | 5 ++--- cpp/src/parquet/file_serialize_test.cc | 2 +- cpp/src/parquet/types.cc | 5 ++++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 23554aa3dcd..3f34dfe195e 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -488,13 +488,13 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) { #ifdef ARROW_WITH_LZ4 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) { - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false, - LARGE_SIZE); + ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false, + LARGE_SIZE), ParquetException); } TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) { - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true, - LARGE_SIZE); + ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true, + LARGE_SIZE), ParquetException); } #endif diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 3fe22301ec4..d70cd97bbdb 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -249,9 +249,8 @@ TEST_F(TestPageSerde, Compression) { codec_types.push_back(Compression::GZIP); #endif -#ifdef ARROW_WITH_LZ4 - codec_types.push_back(Compression::LZ4); -#endif +// TODO: Add LZ4 compression type after PARQUET-1878 is complete. +// Testing for deserializing LZ4 is hard without writing enabled, so it is not included here. #ifdef ARROW_WITH_ZSTD codec_types.push_back(Compression::ZSTD); diff --git a/cpp/src/parquet/file_serialize_test.cc b/cpp/src/parquet/file_serialize_test.cc index c5c4df23f42..72d7d6f742d 100644 --- a/cpp/src/parquet/file_serialize_test.cc +++ b/cpp/src/parquet/file_serialize_test.cc @@ -309,7 +309,7 @@ TYPED_TEST(TestSerialize, SmallFileGzip) { #ifdef ARROW_WITH_LZ4 TYPED_TEST(TestSerialize, SmallFileLz4) { - ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4)); + ASSERT_THROW(this->FileSerializeTest(Compression::LZ4), ParquetException); } #endif diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7e22da4c7c1..ad907c5ff8e 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -42,8 +42,11 @@ bool IsCodecSupported(Compression::type codec) { case Compression::GZIP: case Compression::BROTLI: case Compression::ZSTD: - case Compression::LZ4: return true; + case Compression::LZ4: + // TODO: Re-enable after PARQUET-1878 is complete. + // Temporarily disabled because of ARROW-9424. + return false; default: return false; } From 48c42cb4f08388efb4079b67f579422a0bcb61ab Mon Sep 17 00:00:00 2001 From: Patrick Pai Date: Tue, 14 Jul 2020 12:41:08 -0500 Subject: [PATCH 2/6] fix cpp lint --- cpp/src/parquet/column_writer_test.cc | 14 ++++++++++---- cpp/src/parquet/file_deserialize_test.cc | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 3f34dfe195e..a25f641ae41 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -488,13 +488,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) { #ifdef ARROW_WITH_LZ4 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) { - ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false, - LARGE_SIZE), ParquetException); + ASSERT_THROW( + this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false, + LARGE_SIZE), + ParquetException + ); } TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) { - ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true, - LARGE_SIZE), ParquetException); + ASSERT_THROW( + this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true, + LARGE_SIZE), + ParquetException + ); } #endif diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index d70cd97bbdb..f8a9e7d92f4 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -250,7 +250,7 @@ TEST_F(TestPageSerde, Compression) { #endif // TODO: Add LZ4 compression type after PARQUET-1878 is complete. -// Testing for deserializing LZ4 is hard without writing enabled, so it is not included here. +// Testing for deserializing LZ4 is hard without writing enabled, so it is not included. #ifdef ARROW_WITH_ZSTD codec_types.push_back(Compression::ZSTD); From c5dfdd26c10e91d551aecc535b8e845825bf98b7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 15:21:36 -0500 Subject: [PATCH 3/6] clang-format --- cpp/src/parquet/column_writer_test.cc | 16 ++++++---------- cpp/src/parquet/file_deserialize_test.cc | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index a25f641ae41..a92d4d28f7d 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -488,19 +488,15 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) { #ifdef ARROW_WITH_LZ4 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) { - ASSERT_THROW( - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false, - LARGE_SIZE), - ParquetException - ); + ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, + false, LARGE_SIZE), + ParquetException); } TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) { - ASSERT_THROW( - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true, - LARGE_SIZE), - ParquetException - ); + ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, + true, LARGE_SIZE), + ParquetException); } #endif diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index f8a9e7d92f4..1dd34922ae9 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -249,8 +249,8 @@ TEST_F(TestPageSerde, Compression) { codec_types.push_back(Compression::GZIP); #endif -// TODO: Add LZ4 compression type after PARQUET-1878 is complete. -// Testing for deserializing LZ4 is hard without writing enabled, so it is not included. + // TODO: Add LZ4 compression type after PARQUET-1878 is complete. + // Testing for deserializing LZ4 is hard without writing enabled, so it is not included. #ifdef ARROW_WITH_ZSTD codec_types.push_back(Compression::ZSTD); From 357b8664ca008cb39d68dda189b0bc7997884262 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 17:18:59 -0500 Subject: [PATCH 4/6] Fix up Python unit tests, throw more helpful error message --- cpp/src/parquet/thrift_internal.h | 1 + cpp/src/parquet/types.cc | 9 +++++++++ python/pyarrow/tests/test_parquet.py | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 103dc6503d7..e1959e79438 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -101,6 +101,7 @@ static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type case format::CompressionCodec::BROTLI: return Compression::BROTLI; case format::CompressionCodec::LZ4: + // ARROW-9424: Existing files use LZ4_RAW but this may need to change return Compression::LZ4; case format::CompressionCodec::ZSTD: return Compression::ZSTD; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index ad907c5ff8e..e75d4034f10 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -43,6 +43,7 @@ bool IsCodecSupported(Compression::type codec) { case Compression::BROTLI: case Compression::ZSTD: return true; + case Compression::LZ4_FRAME: case Compression::LZ4: // TODO: Re-enable after PARQUET-1878 is complete. // Temporarily disabled because of ARROW-9424. @@ -58,6 +59,14 @@ std::unique_ptr GetCodec(Compression::type codec) { std::unique_ptr GetCodec(Compression::type codec, int compression_level) { std::unique_ptr result; + if (codec == Compression::LZ4 || codec == Compression::LZ4_FRAME) { + throw ParquetException( + "Per ARROW-9424, writing files with LZ4 compression has been " + "disabled until implementation issues have been resolved. " + "It is recommended to read any existing files and rewrite them " + "using a different compression."); + } + if (!IsCodecSupported(codec)) { std::stringstream ss; ss << "Codec type " << Codec::GetCodecAsString(codec) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 410eee158e9..aea3c076b0c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -698,6 +698,10 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): tm.assert_frame_equal(df, df_read) +# ARROW-9424: LZ4 support is currently disabled +SUPPORTED_COMPRESSIONS = ['NONE', 'SNAPPY', 'GZIP', 'ZSTD'] + + @pytest.mark.pandas @parametrize_legacy_dataset def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): @@ -735,7 +739,7 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) - for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']: + for compression in SUPPORTED_COMPRESSIONS: if (compression != 'NONE' and not pa.lib.Codec.is_available(compression)): continue @@ -747,6 +751,13 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): tm.assert_frame_equal(df, df_read) +# ARROW-9424: LZ4 support is currently disabled +def test_lz4_compression_disabled(): + table = pa.table([pa.array([1, 2, 3, 4, 5])], names=['f0']) + with pytest.raises(IOError): + pq.write_table(table, pa.BufferOutputStream(), compression='lz4') + + def make_sample_file(table_or_df): if isinstance(table_or_df, pa.Table): a_table = table_or_df @@ -828,8 +839,9 @@ def test_compression_level(use_legacy_dataset): # level. # GZIP (zlib) allows for specifying a compression level but as of up # to version 1.2.11 the valid range is [-1, 9]. - invalid_combinations = [("snappy", 4), ("lz4", 5), ("gzip", -1337), + invalid_combinations = [("snappy", 4), ("gzip", -1337), ("None", 444), ("lzo", 14)] + # ARROW-9424: lz4 is disabled for now ("lz4", 5), buf = io.BytesIO() for (codec, level) in invalid_combinations: with pytest.raises((ValueError, OSError)): From 7e51a6096ae85bf94681412c13d5e718c62c9524 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 17:27:28 -0500 Subject: [PATCH 5/6] Only disallow LZ4 for writing files, not reading --- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/types.cc | 27 ++++++++++++++++++++++----- cpp/src/parquet/types.h | 9 +++++++++ 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 0bfc303dba0..bc462adc53f 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -182,7 +182,7 @@ class SerializedPageReader : public PageReader { InitDecryption(); } max_page_header_size_ = kDefaultMaxPageHeaderSize; - decompressor_ = GetCodec(codec); + decompressor_ = internal::GetReadCodec(codec); } // Implement the PageReader interface diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 13f91e36529..f9cf37cb133 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -172,7 +172,7 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) { InitEncryption(); } - compressor_ = GetCodec(codec, compression_level); + compressor_ = internal::GetWriteCodec(codec, compression_level); thrift_serializer_.reset(new ThriftSerializer); } diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index e75d4034f10..d711528c423 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -53,13 +53,12 @@ bool IsCodecSupported(Compression::type codec) { } } -std::unique_ptr GetCodec(Compression::type codec) { - return GetCodec(codec, Codec::UseDefaultCompressionLevel()); -} +namespace internal { -std::unique_ptr GetCodec(Compression::type codec, int compression_level) { +std::unique_ptr GetCodec(Compression::type codec, int compression_level, + bool for_writing) { std::unique_ptr result; - if (codec == Compression::LZ4 || codec == Compression::LZ4_FRAME) { + if (for_writing && (codec == Compression::LZ4 || codec == Compression::LZ4_FRAME)) { throw ParquetException( "Per ARROW-9424, writing files with LZ4 compression has been " "disabled until implementation issues have been resolved. " @@ -78,6 +77,24 @@ std::unique_ptr GetCodec(Compression::type codec, int compression_level) return result; } +std::unique_ptr GetReadCodec(Compression::type codec) { + return GetCodec(codec, Codec::UseDefaultCompressionLevel(), /*for_writing=*/false); +} + +std::unique_ptr GetWriteCodec(Compression::type codec, int compression_level) { + return GetCodec(codec, compression_level, /*for_writing=*/true); +} + +} // namespace internal + +std::unique_ptr GetCodec(Compression::type codec, int compression_level) { + return internal::GetCodec(codec, compression_level, /*for_writing=*/false); +} + +std::unique_ptr GetCodec(Compression::type codec) { + return GetCodec(codec, Codec::UseDefaultCompressionLevel()); +} + std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) { std::stringstream result; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index a1586b6af7c..f7f7bb7c2eb 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -465,6 +465,15 @@ struct Encoding { PARQUET_EXPORT bool IsCodecSupported(Compression::type codec); +namespace internal { + +// ARROW-9424: Separate functions for reading and writing so we can disable LZ4 +// on writing +std::unique_ptr GetReadCodec(Compression::type codec); +std::unique_ptr GetWriteCodec(Compression::type codec, int compression_level); + +} // namespace internal + PARQUET_EXPORT std::unique_ptr GetCodec(Compression::type codec); From a8be0bcdb69afdfd7a04363879a2f4cd491c7c0b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 18:52:04 -0500 Subject: [PATCH 6/6] Ensure that existing lz4-compressed files can still be read --- cpp/src/parquet/types.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index d711528c423..1b4bb4acdd5 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -42,12 +42,8 @@ bool IsCodecSupported(Compression::type codec) { case Compression::GZIP: case Compression::BROTLI: case Compression::ZSTD: - return true; - case Compression::LZ4_FRAME: case Compression::LZ4: - // TODO: Re-enable after PARQUET-1878 is complete. - // Temporarily disabled because of ARROW-9424. - return false; + return true; default: return false; }