From 79f8eb031e1a07071bda095609da8797a3d75c6c Mon Sep 17 00:00:00 2001 From: Tham Ha Thi Date: Thu, 13 Sep 2018 17:25:39 -0400 Subject: [PATCH 001/125] encryption (from apache/parquet-cpp github repo) --- cpp/src/parquet/column_reader.cc | 30 ++- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/column_writer-test.cc | 2 +- cpp/src/parquet/column_writer.cc | 69 +++-- cpp/src/parquet/column_writer.h | 1 + cpp/src/parquet/encryption.cc | 45 ++++ cpp/src/parquet/encryption.h | 55 ++++ cpp/src/parquet/file_reader.cc | 162 +++++++++-- cpp/src/parquet/file_writer.cc | 81 ++++-- cpp/src/parquet/file_writer.h | 5 +- cpp/src/parquet/metadata.cc | 290 +++++++++++++++++--- cpp/src/parquet/metadata.h | 52 +++- cpp/src/parquet/properties.h | 371 +++++++++++++++++++++++++- cpp/src/parquet/thrift.h | 101 +++++-- cpp/src/parquet/types.h | 58 ++++ 15 files changed, 1195 insertions(+), 129 deletions(-) create mode 100644 cpp/src/parquet/encryption.cc create mode 100644 cpp/src/parquet/encryption.h diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 130b75a5210..d6ac3bae047 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -35,6 +35,8 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" +#include "parquet/util/crypto.h" + using arrow::MemoryPool; namespace parquet { @@ -106,11 +108,14 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, + std::shared_ptr encryption, ::arrow::MemoryPool* pool) - : stream_(stream), + : : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), seen_num_rows_(0), - total_num_rows_(total_num_rows) { + total_num_rows_(total_num_rows), + encryption_(encryption), + decryption_buffer_(AllocateBuffer(pool, 0)) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); } @@ -138,6 +143,10 @@ class SerializedPageReader : 
public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; + + // Encryption + std::shared_ptr encryption_; + std::shared_ptr decryption_buffer_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -161,7 +170,8 @@ std::shared_ptr SerializedPageReader::NextPage() { header_size = static_cast(buffer.size()); try { DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, &current_page_header_); + &header_size, &current_page_header_, + encryption_.get()); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -190,6 +200,15 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } + // Decrypt it if we need to + if (encryption_ != nullptr) { + decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); + compressed_len = parquet_encryption::Decrypt( + encryption_, false, buffer, compressed_len, decryption_buffer_->mutable_data()); + + buffer = decryption_buffer_->data(); + } + // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. 
@@ -258,9 +277,10 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool) { + Compression::type codec, std::shared_ptr encryption, + ::arrow::MemoryPool* pool) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool)); + new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index e7d6afbb467..052b21218dc 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -81,7 +81,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, + Compression::type codec, std::shared_ptr encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index dd0d65aa5cd..a0b916db667 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -107,7 +107,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); std::unique_ptr pager = - PageWriter::Open(sink_, column_properties.compression(), metadata_.get()); + PageWriter::Open(sink_, column_properties.compression(), nullptr, metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f2783d00964..eb5523910ec 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -36,6 
+36,7 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" +#include "parquet/util/crypto.h" namespace parquet { @@ -125,8 +126,9 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { // and the page metadata. class SerializedPageWriter : public PageWriter { public: - SerializedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, + const std::shared_ptr& encryption, + ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), @@ -135,7 +137,8 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_(0), data_page_offset_(0), total_uncompressed_size_(0), - total_compressed_size_(0) { + total_compressed_size_(0), + encryption_(encryption) { compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -157,10 +160,23 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer = nullptr; + if (encryption_.get()) { + encrypted_data_buffer = std::static_pointer_cast( + AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); + output_data_len = parquet_encryption::Encrypt( + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + 
page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -169,11 +185,11 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; int64_t final_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(&final_pos)); @@ -224,10 +240,22 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); + if (encryption_.get()) { + encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); + output_data_len = parquet_encryption::Encrypt( + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_data_page_header(data_page_header); // 
TODO(PARQUET-594) crc checksum @@ -237,11 +265,11 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); int64_t current_pos = -1; @@ -275,19 +303,21 @@ class SerializedPageWriter : public PageWriter { // Compression codec to use. std::unique_ptr<::arrow::util::Codec> compressor_; + std::shared_ptr encryption_; }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: - BufferedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, + const std::shared_ptr& encryption, + ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { - in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, pool)); - } + in_memory_sink_ = CreateOutputStream(pool); + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, pool)); + } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -330,14 +360,15 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& 
sink, Compression::type codec, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, pool)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, metadata, pool)); + new SerializedPageWriter(sink, codec, encryption, metadata, pool)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 023b96585eb..7c601fe7499 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -83,6 +83,7 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc new file mode 100644 index 00000000000..0a2d9ef939a --- /dev/null +++ b/cpp/src/parquet/encryption.cc @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "encryption.h" + +#include + +namespace parquet { + +// integer key retriever +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { + key_map_.insert(std::make_pair(key_id, key)); +} + +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { + uint32_t key_id; + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + + return key_map_[key_id]; +} + +// string key retriever +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { + key_map_.insert(std::make_pair(key_id, key)); +} + +const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { + return key_map_[key_id]; +} + +} // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h new file mode 100644 index 00000000000..1dbf0d20f39 --- /dev/null +++ b/cpp/src/parquet/encryption.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_ENCRYPTION_H +#define PARQUET_ENCRYPTION_H + +#include +#include +#include + +namespace parquet { + +class PARQUET_EXPORT DecryptionKeyRetriever { + public: + virtual const std::string& GetKey(const std::string& key_metadata) = 0; + virtual ~DecryptionKeyRetriever() {} +}; + +// Simple integer key retriever +class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +// Simple string key retriever +class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(const std::string& key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index d0ca9ca809d..a73c5b7bb21 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -46,6 +46,7 @@ namespace parquet { static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; @@ -82,9 +83,13 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, - FileMetaData* file_metadata, int row_group_number, + FileMetaData* file_metadata, + FileCryptoMetaData* file_crypto_metadata, int row_group_number, const ReaderProperties& props) - : source_(source), file_metadata_(file_metadata), properties_(props) { + : source_(source), + file_metadata_(file_metadata), + 
file_crypto_metadata_(file_crypto_metadata), + properties_(props) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -119,13 +124,65 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); + std::unique_ptr crypto_meta_data = col->crypto_meta_data(); + + bool encrypted = true; + + // file is unencrypted + // or file is encrypted but column is unencrypted + if (!file_crypto_metadata_ || !crypto_meta_data) { + encrypted = false; + } + + if (!encrypted) { + return PageReader::Open(stream, col->num_values(), col->compression(), + nullptr, properties_.memory_pool()); + } + + // the column is encrypted + + auto file_decryption = properties_.file_decryption(); + + // the column is encrypted with footer key + if (crypto_meta_data->encrypted_with_footer_key()) { + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + + if (footer_key.empty()) { + throw ParquetException("column is encrypted with null footer key"); + } + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); + + return PageReader::Open(stream, col->num_values(), col->compression(), + footer_encryption, properties_.memory_pool()); + } + + // file is non-uniform encrypted and the column is encrypted with its own key + + std::string column_key_metadata = crypto_meta_data->column_key_metadata(); + // encrypted with column key + std::string column_key = + file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); + + if (column_key.empty()) { + throw ParquetException("column is encrypted with null key, path=" + + col->path_in_schema()->ToDotString()); + } + auto column_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, column_key, + 
file_decryption->GetAad()); + return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool()); + column_encryption, properties_.memory_pool()); } private: std::shared_ptr source_; FileMetaData* file_metadata_; + FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; ReaderProperties properties_; }; @@ -146,7 +203,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); + new SerializedRowGroup(source_, file_metadata_.get(), + file_crypto_metadata_.get(), i, properties_)); return std::make_shared(std::move(contents)); } @@ -176,38 +234,96 @@ class SerializedFile : public ParquetFileReader::Contents { // Check if all bytes are read. Check if last 4 bytes read have the magic bits if (footer_buffer->size() != footer_read_size || - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0) { + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); } - uint32_t metadata_len = arrow::util::SafeLoadAs( + // no encryption + if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { + uint32_t metadata_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = file_size - kFooterSize - metadata_len; + if (kFooterSize + metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + std::shared_ptr metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (metadata_len + kFooterSize)) { + metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + } + // encryption + else { + // read crypto metadata + uint32_t crypto_metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); - int64_t metadata_start = file_size - kFooterSize - metadata_len; - if (kFooterSize + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } + int64_t crypto_metadata_start = file_size - kFooterSize - crypto_metadata_len; + + if (kFooterSize + crypto_metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + + std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (crypto_metadata_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - crypto_metadata_len - kFooterSize, crypto_metadata_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(crypto_metadata_start, crypto_metadata_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != crypto_metadata_len) { + throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); + } + } + + file_crypto_metadata_ = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + + int64_t footer_offset = file_crypto_metadata_->footer_offset(); + uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); + + std::shared_ptr metadata_buffer = + SliceBuffer(footer_buffer, footer_offset, footer_read_size); + + if (file_crypto_metadata_->encrypted_footer()) { + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + + auto file_decryption = properties_.file_decryption(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); - std::shared_ptr metadata_buffer; - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + kFooterSize)) { - metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size, + footer_encryption); + } else { + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size); } } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } private: std::shared_ptr source_; std::shared_ptr file_metadata_; + std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; }; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index f2f42f38441..d79274c50fd 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -33,6 +33,7 @@ namespace parquet { // FIXME: copied from reader-internal.cc static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -123,7 +124,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), col_meta, + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), col_meta, // TODO properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -221,7 +223,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, properties_->memory_pool(), buffered_row_group_); + properties_->encryption(column_descr->path()), col_meta, + properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ 
-262,7 +265,22 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*file_metadata_, sink_.get()); + + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + WriteFileMetaData(*file_metadata_, sink_.get()); + } else { + uint64_t metadata_start = static_cast(sink_->Tell()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + + auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + } + + sink_->Close(); } } @@ -324,8 +342,12 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; void StartFile() { - // Parquet files always start with PAR1 - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + if (properties_->file_encryption() == nullptr) { + // Parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + } else { + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + } } }; @@ -361,25 +383,52 @@ std::unique_ptr ParquetFileWriter::Open( key_value_metadata); } -void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, + EncryptionProperties* footer_encryption) { + if (footer_encryption == nullptr) { + // Write MetaData + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); + + file_metadata.WriteTo(sink); + + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; + + // Write Footer + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + } else { + // encrypt and write 
to sink + file_metadata.WriteTo(sink, footer_encryption); + } +} + +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { + ParquetOutputWrapper wrapper(sink); + return WriteFileMetaData(file_metadata, &wrapper); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ArrowOutputStream* sink) { int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint64_t crypto_offset = static_cast(position); - // Write MetaData - uint32_t metadata_len = static_cast(position); - - file_metadata.WriteTo(sink); + // Get a FileCryptoMetaData + crypto_metadata.WriteTo(sink); PARQUET_THROW_NOT_OK(sink->Tell(&position)); - metadata_len = static_cast(position) - metadata_len; + auto crypto_len = static_cast(position) - crypto_offset; + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&crypto_len), 4)); - // Write Footer - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(PARQUET_EMAGIC, 4)); } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper); + return WriteFileCryptoMetaData(crypto_metadata, &wrapper); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index cdc787f15de..5d3a5aa6359 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -107,7 +107,10 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + EncryptionProperties* 
encryption_properties = NULLPTR);
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+                             OutputStream* sink);
 
 PARQUET_EXPORT
 void WriteFileMetaData(const FileMetaData& file_metadata,
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index e6764862a57..c9074eb3b1d 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -111,6 +111,54 @@ std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_d
 }
 
 // MetaData Accessor
+// ColumnCryptoMetaData
+// Read-only accessor over a format::ColumnCryptoMetaData. Only the pointer is kept:
+// the Thrift struct is not owned and must outlive the accessor.
+class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
+ public:
+  explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
+      : crypto_metadata_(crypto_metadata) {}
+
+  ~ColumnCryptoMetaDataImpl() {}
+
+  bool encrypted_with_footer_key() const {
+    return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
+  }
+  bool encrypted_with_column_key() const {
+    return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
+  }
+  const std::vector<std::string>& path_in_schema() const {
+    return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema;
+  }
+  const std::string& column_key_metadata() const {
+    return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.column_key_metadata;
+  }
+
+ private:
+  const format::ColumnCryptoMetaData* crypto_metadata_;
+};
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
+    const uint8_t* metadata) {
+  return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
+    : impl_(new ColumnCryptoMetaDataImpl(
+          reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() {}
+
+const std::vector<std::string>& ColumnCryptoMetaData::path_in_schema() const {
+  return impl_->path_in_schema();
+}
+bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
+  return impl_->encrypted_with_footer_key();
+}
+const std::string& ColumnCryptoMetaData::column_key_metadata() const {
+  return impl_->column_key_metadata();
+}
+
 // ColumnChunk metadata
 class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
  public:
@@ -193,6 +241,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     return column_->meta_data.total_uncompressed_size;
   }
 
+  inline std::unique_ptr<ColumnCryptoMetaData> crypto_meta_data() const {
+    if (column_->__isset.crypto_meta_data) {
+      return ColumnCryptoMetaData::Make(
+          reinterpret_cast<const uint8_t*>(&column_->crypto_meta_data));
+    } else {
+      return nullptr;
+    }
+  }
+
  private:
   mutable std::shared_ptr<Statistics> possible_stats_;
   std::vector<Encoding::type> encodings_;
@@ -270,6 +327,10 @@ int64_t ColumnChunkMetaData::total_compressed_size() const {
   return impl_->total_compressed_size();
 }
 
+std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_meta_data() const {
+  return impl_->crypto_meta_data();
+}
+
 // row-group metadata
 class RowGroupMetaData::RowGroupMetaDataImpl {
  public:
@@ -334,11 +395,12 @@ class FileMetaData::FileMetaDataImpl {
  public:
   FileMetaDataImpl() : metadata_len_(0) {}
 
-  explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len)
+  explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len,
+                            std::shared_ptr<EncryptionProperties> encryption = nullptr)
       : metadata_len_(0) {
     metadata_.reset(new format::FileMetaData);
     DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
-                         metadata_.get());
+                         metadata_.get(), encryption.get());
     metadata_len_ = *metadata_len;
 
     if (metadata_->__isset.created_by) {
@@ -366,9 +428,9 @@ class FileMetaData::FileMetaDataImpl {
 
   const ApplicationVersion& writer_version() const { return writer_version_; }
 
-  void WriteTo(::arrow::io::OutputStream* dst) const {
+  void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const {
     ThriftSerializer serializer;
-    serializer.Serialize(metadata_.get(), dst);
+    serializer.Serialize(metadata_.get(), dst, encryption);
   }
 
   std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
@@ -453,14 +515,17 @@ class FileMetaData::FileMetaDataImpl {
 };
 
 std::shared_ptr<FileMetaData> FileMetaData::Make(const void* metadata,
-                                                 uint32_t* metadata_len) {
+                                                 uint32_t* metadata_len,
+                                                 std::shared_ptr<EncryptionProperties> encryption) {
   // This FileMetaData ctor is private, not compatible with std::make_shared
-  return std::shared_ptr<FileMetaData>(new FileMetaData(metadata, metadata_len));
+  return std::shared_ptr<FileMetaData>(
+      new FileMetaData(metadata, metadata_len, encryption));
 }
 
-FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len)
+FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
+                           std::shared_ptr<EncryptionProperties> encryption)
     : impl_{std::unique_ptr<FileMetaDataImpl>(
-          new FileMetaDataImpl(metadata, metadata_len))} {}
+          new FileMetaDataImpl(metadata, metadata_len, encryption))} {}
 
 FileMetaData::FileMetaData()
     : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
@@ -512,10 +577,69 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) {
   impl_->AppendRowGroups(other.impl_);
 }
 
-void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
-  return impl_->WriteTo(dst);
+void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const {
+  return impl_->WriteTo(dst, encryption);
+}
+
+// Thrift-backed implementation behind FileCryptoMetaData (PIMPL).
+class FileCryptoMetaData::FileCryptoMetaDataImpl {
+ public:
+  FileCryptoMetaDataImpl() : metadata_len_(0) {}
+
+  explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
+    metadata_.reset(new format::FileCryptoMetaData);
+    DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
+    metadata_len_ = *metadata_len;
+  }
+
+  ~FileCryptoMetaDataImpl() {}
+
+  EncryptionAlgorithm encryption_algorithm() {
+    return FromThrift(metadata_->encryption_algorithm);
+  }
+
+  bool encrypted_footer() { return metadata_->encrypted_footer; }
+
+  const std::string& footer_key_metadata() { return metadata_->footer_key_metadata; }
+
+  uint64_t footer_offset() { return metadata_->footer_offset; }
+
+  void WriteTo(::arrow::io::OutputStream* dst) const {
+    ThriftSerializer serializer;
+    serializer.Serialize(metadata_.get(), dst);
+  }
+
+ private:
+  friend FileMetaDataBuilder;
+  std::unique_ptr<format::FileCryptoMetaData> metadata_;
+  uint32_t metadata_len_;
+};
+
+EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() {
+  return impl_->encryption_algorithm();
+}
+bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); }
+const std::string& FileCryptoMetaData::footer_key_metadata() {
+  return impl_->footer_key_metadata();
+}
+uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); }
+
+std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
+    const uint8_t* serialized_metadata, uint32_t* metadata_len) {
+  return std::shared_ptr<FileCryptoMetaData>(
+      new FileCryptoMetaData(serialized_metadata, metadata_len));
 }
 
+FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
+                                       uint32_t* metadata_len)
+    : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
+
+FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
+
+FileCryptoMetaData::~FileCryptoMetaData() {}
+
+void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); }
+
 ApplicationVersion::ApplicationVersion(const std::string& application, int major,
                                        int minor, int patch)
     : application_(application), version{major, minor, patch, "", "", ""} {}
@@ -646,7 +770,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
 
   // column metadata
   void SetStatistics(const EncodedStatistics& val) {
-    column_chunk_->meta_data.__set_statistics(ToThrift(val));
+    column_metadata_.__set_statistics(ToThrift(val));
   }
 
   void Finish(int64_t num_values, int64_t dictionary_page_offset,
@@ -654,19 +778,20 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
               int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
               bool dictionary_fallback) {
     if (dictionary_page_offset > 0) {
-      column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
+      column_metadata_.__set_dictionary_page_offset(dictionary_page_offset);
       column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
     } else {
       column_chunk_->__set_file_offset(data_page_offset + compressed_size);
     }
-    column_chunk_->__isset.meta_data = true;
-    column_chunk_->meta_data.__set_num_values(num_values);
+
+    column_metadata_.__set_num_values(num_values);
     if (index_page_offset >= 0) {
-      column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
+      column_metadata_.__set_index_page_offset(index_page_offset);
     }
-    column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
-    column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
-    column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
+    column_metadata_.__set_data_page_offset(data_page_offset);
+    column_metadata_.__set_total_uncompressed_size(uncompressed_size);
+    column_metadata_.__set_total_compressed_size(compressed_size);
+
     std::vector<format::Encoding::type> thrift_encodings;
     if (has_dictionary) {
       thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
@@ -684,12 +809,67 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
     if (dictionary_fallback) {
       thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
     }
-    column_chunk_->meta_data.__set_encodings(thrift_encodings);
+    column_metadata_.__set_encodings(thrift_encodings);
   }
 
   void WriteTo(::arrow::io::OutputStream* sink) {
     ThriftSerializer serializer;
-    serializer.Serialize(column_chunk_, sink);
+    const auto& encrypt_md = properties_->column_encryption_props(column_->path());
+
+    // column is unencrypted
+    if (!encrypt_md || !encrypt_md->encrypted()) {
+      column_chunk_->__isset.meta_data = true;
+      column_chunk_->__set_meta_data(column_metadata_);
+
+      serializer.Serialize(column_chunk_, sink);
+    } else {  // column is encrypted
+      column_chunk_->__isset.crypto_meta_data = true;
+
+      // encrypted with footer key
+      format::ColumnCryptoMetaData ccmd;
+      if (encrypt_md->encrypted_with_footer_key()) {
+        ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+        ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
+      } else {  // encrypted with column key
+        format::EncryptionWithColumnKey eck;
+        eck.__set_column_key_metadata(encrypt_md->key_metadata());
+        eck.__set_path_in_schema(column_->path()->ToDotVector());
+        ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+        ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
+      }
+      column_chunk_->__set_crypto_meta_data(ccmd);
+
+      auto footer_encryption = properties_->footer_encryption();
+
+      // non-uniform: footer is unencrypted, or column is encrypted with a column-specific
+      // key
+      if ((footer_encryption == nullptr && encrypt_md->encrypted()) ||
+          !encrypt_md->encrypted_with_footer_key()) {
+        // don't set meta_data
+        column_chunk_->__isset.meta_data = false;
+
+        // Thrift-serialize the ColumnMetaData structure,
+        // encrypt it with the column key, and write the result to the output stream
+        // (first length, then buffer)
+        auto encrypt_props = properties_->encryption(column_->path());
+        if (encrypt_props == nullptr) {
+          // Serialize() falls back to plaintext when it gets no properties; refuse
+          // to do that for a column that is announced as encrypted.
+          throw ParquetException("Missing encryption properties for encrypted column");
+        }
+        uint64_t metadata_start = sink->Tell();
+
+        serializer.Serialize(&column_metadata_, sink, encrypt_props.get());
+
+        // Set the ColumnMetaData offset at the "file_offset" field in the ColumnChunk.
+        column_chunk_->__set_file_offset(metadata_start);
+      } else {
+        column_chunk_->__isset.meta_data = true;
+        column_chunk_->__set_meta_data(column_metadata_);
+      }
+
+      serializer.Serialize(column_chunk_, sink);
+    }
   }
 
   const ColumnDescriptor* descr() const { return column_; }
@@ -697,14 +877,15 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
  private:
   void Init(format::ColumnChunk* column_chunk) {
     column_chunk_ = column_chunk;
-    column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
-    column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
-    column_chunk_->meta_data.__set_codec(
-        ToThrift(properties_->compression(column_->path())));
+    column_metadata_ = column_chunk_->meta_data;
+    column_metadata_.__set_type(ToThrift(column_->physical_type()));
+    column_metadata_.__set_path_in_schema(column_->path()->ToDotVector());
+    column_metadata_.__set_codec(ToThrift(properties_->compression(column_->path())));
   }
 
   format::ColumnChunk* column_chunk_;
   std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
+  format::ColumnMetaData column_metadata_;
   const std::shared_ptr<WriterProperties> properties_;
   const ColumnDescriptor* column_;
 };
@@ -797,20 +978,17 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
          << " columns are initialized";
       throw ParquetException(ss.str());
     }
-    int64_t total_byte_size = 0;
-
     for (int i = 0; i < schema_->num_columns(); i++) {
       if (!(row_group_->columns[i].file_offset >= 0)) {
         std::stringstream ss;
         ss << "Column " << i << " is not complete.";
         throw ParquetException(ss.str());
       }
-      total_byte_size += row_group_->columns[i].meta_data.total_compressed_size;
     }
-    DCHECK(total_bytes_written == total_byte_size)
-        << "Total bytes in this RowGroup does not match with compressed sizes of columns";
 
-    row_group_->__set_total_byte_size(total_byte_size);
+    // meta_data is not set on the ColumnChunk of every encrypted column (see WriteTo),
+    // so the row group size is taken from the bytes actually written.
+    row_group_->__set_total_byte_size(total_bytes_written);
   }
 
   void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
@@ -871,6 +1049,9 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
       const std::shared_ptr<const KeyValueMetadata>& key_value_metadata)
       : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) {
     metadata_.reset(new format::FileMetaData());
+    if (props->footer_encryption() != nullptr) {
+      crypto_metadata_.reset(new format::FileCryptoMetaData());
+    }
   }
 
   RowGroupMetaDataBuilder* AppendRowGroup() {
@@ -936,8 +1117,41 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
     return file_meta_data;
   }
 
+  // Builds the FileCryptoMetaData for a footer located at footerOffset. Returns nullptr
+  // when no footer encryption is configured, and on any call after the first one.
+  std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData(uint64_t footerOffset) {
+    if (crypto_metadata_ == nullptr) {
+      return nullptr;
+    }
+
+    auto file_encryption = properties_->file_encryption();
+    auto footer_encryption = properties_->footer_encryption();
+
+    // build format::FileCryptoMetaData
+    EncryptionAlgorithm encryption_algorithm;
+    encryption_algorithm.algorithm = footer_encryption->algorithm();
+    encryption_algorithm.aad_metadata = file_encryption->aad_metadata();
+    crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm));
+    crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty());
+
+    std::string footer_key_metadata = file_encryption->footer_key_metadata();
+    if (!footer_key_metadata.empty()) {
+      crypto_metadata_->__set_footer_key_metadata(footer_key_metadata);
+    }
+    crypto_metadata_->__set_footer_offset(footerOffset);
+
+    // TODO set iv_prefix???
+
+    // return as FileCryptoMetaData
+    std::unique_ptr<FileCryptoMetaData> file_crypto_meta_data =
+        std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
+    file_crypto_meta_data->impl_->metadata_ = std::move(crypto_metadata_);
+    return file_crypto_meta_data;
+  }
+
  protected:
   std::unique_ptr<format::FileMetaData> metadata_;
+  std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
 
  private:
   const std::shared_ptr<WriterProperties> properties_;
@@ -969,4 +1183,9 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
 
 std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
 
+std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData(
+    uint64_t footerOffset) {
+  return impl_->BuildFileCryptoMetaData(footerOffset);
+}
+
 }  // namespace parquet
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index b3a5f7b808a..ddc34023765 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -98,6 +98,22 @@ class PARQUET_EXPORT ApplicationVersion {
                         SortOrder::type sort_order = SortOrder::SIGNED) const;
 };
 
+class PARQUET_EXPORT ColumnCryptoMetaData {
+ public:
+  static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
+  ~ColumnCryptoMetaData();
+
+  const std::vector<std::string>& path_in_schema() const;
+  bool encrypted_with_footer_key() const;
+  const std::string& column_key_metadata() const;
+
+ private:
+  explicit ColumnCryptoMetaData(const uint8_t* metadata);
+
+  class ColumnCryptoMetaDataImpl;
+  std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
+};
+
 class PARQUET_EXPORT ColumnChunkMetaData {
  public:
   // API convenience to get a MetaData accessor
@@ -128,6 +144,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
   int64_t index_page_offset() const;
   int64_t total_compressed_size() const;
   int64_t total_uncompressed_size() const;
+  std::unique_ptr<ColumnCryptoMetaData> crypto_meta_data() const;
 
  private:
   explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr,
@@ -168,7 +185,8 @@ class PARQUET_EXPORT FileMetaData {
  public:
   // API convenience to get a MetaData accessor
   static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata,
-                                            uint32_t* metadata_len);
+                                            uint32_t* metadata_len,
+                                            std::shared_ptr<EncryptionProperties> encryption = NULLPTR);
 
   ~FileMetaData();
 
@@ -184,7 +202,7 @@ class PARQUET_EXPORT FileMetaData {
 
   const ApplicationVersion& writer_version() const;
 
-  void WriteTo(::arrow::io::OutputStream* dst) const;
+  void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption = NULLPTR) const;
 
   // Return const-pointer to make it clear that this object is not to be copied
   const SchemaDescriptor* schema() const;
@@ -199,7 +217,8 @@ class PARQUET_EXPORT FileMetaData {
 
  private:
   friend FileMetaDataBuilder;
-  explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
+  explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
+                        std::shared_ptr<EncryptionProperties> encryption = NULLPTR);
 
   // PIMPL Idiom
   FileMetaData();
@@ -207,6 +226,30 @@ class PARQUET_EXPORT FileMetaData {
   std::unique_ptr<FileMetaDataImpl> impl_;
 };
 
+class PARQUET_EXPORT FileCryptoMetaData {
+ public:
+  // API convenience to get a MetaData accessor
+  static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
+                                                  uint32_t* metadata_len);
+  ~FileCryptoMetaData();
+
+  EncryptionAlgorithm encryption_algorithm();
+  bool encrypted_footer();
+  const std::string& footer_key_metadata();
+  uint64_t footer_offset();
+
+  void WriteTo(::arrow::io::OutputStream* dst) const;
+
+ private:
+  friend FileMetaDataBuilder;
+  FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
+
+  // PIMPL Idiom
+  FileCryptoMetaData();
+  class FileCryptoMetaDataImpl;
+  std::unique_ptr<FileCryptoMetaDataImpl> impl_;
+};
+
 // Builder API
 class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
  public:
@@ -291,6 +334,9 @@ class PARQUET_EXPORT FileMetaDataBuilder {
   // Complete the Thrift structure
   std::unique_ptr<FileMetaData> Finish();
 
+  // crypto metadata
+  std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData(uint64_t footerOffset);
+
  private:
   explicit FileMetaDataBuilder(
       const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props,
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index
d08d7b0c8fe..d441769d0c3 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -22,11 +22,13 @@
 #include <string>
 #include <unordered_map>
 
+#include "parquet/encryption.h"
 #include "parquet/exception.h"
 #include "parquet/parquet_version.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
+#include "arrow/util/logging.h"
 
 namespace parquet {
 
@@ -37,6 +39,133 @@ struct ParquetVersion {
 static int64_t DEFAULT_BUFFER_SIZE = 0;
 static bool DEFAULT_USE_BUFFERED_STREAM = false;
 
+// Encryption settings of a single column: whether it is encrypted and, if so, whether
+// with the footer key or with a column-specific key (plus that key's metadata).
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+  class Builder {
+   public:
+    Builder(const std::string& path, bool encrypt)
+        : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypt) {}
+
+    Builder* key(const std::string& key) {
+      DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+      DCHECK(encrypt_);
+
+      key_ = key;
+      // A column that carries its own key is not encrypted with the footer key.
+      encrypted_with_footer_key_ = false;
+      return this;
+    }
+    Builder* key_metadata(const std::string& key_id) {
+      DCHECK(!key_id.empty());
+      key_metadata_ = key_id;
+      return this;
+    }
+
+    Builder* key_id(uint32_t key_id) {
+      std::string key_metadata = std::string(reinterpret_cast<char*>(&key_id), 4);
+      this->key_metadata(key_metadata);
+      return this;
+    }
+
+    std::shared_ptr<ColumnEncryptionProperties> build() {
+      return std::make_shared<ColumnEncryptionProperties>(
+          path_, encrypt_, encrypted_with_footer_key_, key_, key_metadata_);
+    }
+
+   private:
+    std::string path_;
+    bool encrypt_;
+    bool encrypted_with_footer_key_;
+    std::string key_;
+    std::string key_metadata_;
+  };
+
+  ColumnEncryptionProperties() = default;
+  ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
+  ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
+
+  ColumnEncryptionProperties(const std::string& path, bool encrypt,
+                             bool encrypted_with_footer_key, const std::string& key,
+                             const std::string& key_metadata)
+      : path_(path),
+        encrypt_(encrypt),
+        encrypted_with_footer_key_(encrypted_with_footer_key),
+        key_(key),
+        key_metadata_(key_metadata) {}
+
+  const std::string& path() const { return path_; }
+  bool encrypted() const { return encrypt_; }
+  bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
+  const std::string& key() const { return key_; }
+  const std::string& key_metadata() const { return key_metadata_; }
+
+ private:
+  std::string path_;
+  bool encrypt_;
+  bool encrypted_with_footer_key_;
+  std::string key_;
+  std::string key_metadata_;
+};
+
+// Keys and AAD used to read an encrypted file. Keys are either stored explicitly
+// (footer key, per-column keys) or resolved from key metadata by a key retriever.
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+  FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) {
+    DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 ||
+           footer_key_.length() == 32);
+  }
+
+  FileDecryptionProperties(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever)
+      : key_retriever_(key_retriever) {}
+
+  void SetAad(const std::string& aad) { aad_ = aad; }
+
+  void SetColumnKey(const std::string& name, const std::string& key) {
+    SetColumnKey(std::vector<std::string>({name}), key);
+  }
+
+  void SetColumnKey(const std::vector<std::string>& paths, const std::string& key) {
+    DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+
+    schema::ColumnPath columnPath(paths);
+
+    column_keys_[columnPath.ToDotString()] = key;
+  }
+
+  const std::string& GetColumnKey(const std::shared_ptr<schema::ColumnPath>& columnPath,
+                                  const std::string& key_metadata = "") {
+    if (key_metadata.empty()) {
+      return column_keys_.at(columnPath->ToDotString());
+    }
+    if (key_retriever_ == NULLPTR) {
+      throw ParquetException("no key retriever is provided for column key metadata");
+    }
+    return key_retriever_->GetKey(key_metadata);
+  }
+
+  const std::string& GetFooterKey(const std::string& footer_key_metadata = "") {
+    if (footer_key_metadata.empty()) {
+      return footer_key_;
+    }
+    if (key_retriever_ == NULLPTR) {
+      throw ParquetException("no key retriever is provided for footer key metadata");
+    }
+    return key_retriever_->GetKey(footer_key_metadata);
+  }
+  const std::string& GetAad() { return aad_; }
+
+ private:
+  std::string footer_key_;
+  std::string aad_;
+
+  std::map<std::string, std::string> column_keys_;
+
+  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+};
+
 class PARQUET_EXPORT ReaderProperties {
  public:
   explicit ReaderProperties(::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
@@ -60,10 +189,17 @@ class PARQUET_EXPORT ReaderProperties {
 
   int64_t buffer_size() const { return buffer_size_; }
 
+  void file_decryption(const std::shared_ptr<FileDecryptionProperties>& decryption) {
+    file_decryption_ = decryption;
+  }
+
+  FileDecryptionProperties* file_decryption() { return file_decryption_.get(); }
+
  private:
   ::arrow::MemoryPool* pool_;
   int64_t buffer_size_;
   bool buffered_stream_enabled_;
+  std::shared_ptr<FileDecryptionProperties> file_decryption_;
 };
 
 ReaderProperties PARQUET_EXPORT default_reader_properties();
@@ -80,6 +216,10 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION =
     ParquetVersion::PARQUET_1_0;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
+static constexpr Encryption::type DEFAULT_ENCRYPTION_ALGORITHM = Encryption::AES_GCM_V1;
+static constexpr int32_t MAXIMAL_KEY_METADATA_LENGTH = 256;
+static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256;
+static constexpr bool DEFAULT_ENCRYPT_THE_REST = true;
 
 class PARQUET_EXPORT ColumnProperties {
  public:
@@ -128,6 +268,227 @@ class PARQUET_EXPORT ColumnProperties {
   size_t max_stats_size_;
 };
 
+// File-level encryption settings: the optional footer key/AAD plus the per-column
+// settings. Instances are created through the nested Builder.
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+  class Builder {
+   public:
+    Builder()
+        : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM),
+          uniform_encryption_(true),
+          encrypt_the_rest_(DEFAULT_ENCRYPT_THE_REST) {}
+
+    Builder(const std::string& key)
+        : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM),
+          uniform_encryption_(true),
+          encrypt_the_rest_(DEFAULT_ENCRYPT_THE_REST) {
+      DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+      footer_key_ = key;
+    }
+
+    Builder* algorithm(Encryption::type algorithm) {
+      algorithm_ = algorithm;
+      return this;
+    }
+
+    Builder* footer_key(const std::string& key) {
+      DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+      footer_key_ = key;
+      return this;
+    }
+
+    Builder* footer_key_metadata(const std::string& key_metadata) {
+      DCHECK(!footer_key_.empty());
+      DCHECK(!key_metadata.empty() &&
+             key_metadata.length() < MAXIMAL_KEY_METADATA_LENGTH);
+      footer_key_metadata_ = key_metadata;
+      return this;
+    }
+
+    Builder* aad(const std::string& aad) {
+      DCHECK(!aad.empty());
+      aad_ = aad;
+      return this;
+    }
+
+    Builder* aad_metadata(const std::string& aad_metadata) {
+      DCHECK(!aad_.empty());
+      DCHECK(!aad_metadata.empty() &&
+             aad_metadata.length() < MAXIMAL_AAD_METADATA_LENGTH);
+      aad_metadata_ = aad_metadata;
+      return this;
+    }
+
+    /**
+     * encrypt_the_rest will define if other columns (not defined in columns argument)
+     * will be encrypted or not
+     * if encrypt_the_rest = true, other columns will be encrypted with footer key
+     * else, other columns will be unencrypted
+     */
+    Builder* column_properties(
+        const std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>&
+            column_properties,
+        bool encrypt_the_rest = DEFAULT_ENCRYPT_THE_REST) {
+      encrypt_the_rest_ = encrypt_the_rest;
+      column_properties_ = column_properties;
+
+      if (!footer_key_.empty()) {
+        uniform_encryption_ = true;
+
+        for (const auto& col : column_properties) {
+          if (col.second->key().compare(footer_key_) != 0) {
+            uniform_encryption_ = false;
+            break;
+          }
+        }
+      } else {
+        // Without a footer key there is nothing to encrypt "uniformly" with: every
+        // column has to be resolved through column_properties_.
+        uniform_encryption_ = false;
+        if (encrypt_the_rest) {
+          throw ParquetException("Encrypt the rest with null footer key");
+        }
+        bool all_are_unencrypted = true;
+        for (const auto& col : column_properties) {
+          if (col.second->encrypted()) {
+            if (col.second->key().empty()) {
+              throw ParquetException("Encrypt column with null footer key");
+            }
+            all_are_unencrypted = false;
+          }
+        }
+
+        if (all_are_unencrypted) {
+          throw ParquetException("Footer and all columns unencrypted");
+        }
+      }
+      return this;
+    }
+
+    std::shared_ptr<FileEncryptionProperties> build() {
+      std::shared_ptr<EncryptionProperties> footer_encryption;
+      if (!footer_key_.empty()) {
+        footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_));
+      }
+      return std::make_shared<FileEncryptionProperties>(
+          footer_encryption, footer_key_metadata_, aad_metadata_, uniform_encryption_,
+          column_properties_, encrypt_the_rest_, algorithm_, aad_);
+    }
+
+   private:
+    Encryption::type algorithm_;
+    std::string footer_key_;
+    std::string footer_key_metadata_;
+
+    std::string aad_;
+    std::string aad_metadata_;
+
+    bool uniform_encryption_;
+
+    std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>> column_properties_;
+    bool encrypt_the_rest_;
+  };
+
+  FileEncryptionProperties(
+      const std::shared_ptr<EncryptionProperties>& footer_encryption,
+      const std::string& footer_key_metadata, const std::string& aad_metadata,
+      bool uniform_encryption,
+      const std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>&
+          column_properties,
+      bool encrypt_the_rest, Encryption::type algorithm = DEFAULT_ENCRYPTION_ALGORITHM,
+      const std::string& aad = "")
+      : footer_encryption_(footer_encryption),
+        footer_key_metadata_(footer_key_metadata),
+        aad_metadata_(aad_metadata),
+        uniform_encryption_(uniform_encryption),
+        column_properties_(column_properties),
+        encrypt_the_rest_(encrypt_the_rest),
+        algorithm_(algorithm),
+        aad_(aad) {}
+
+  std::shared_ptr<EncryptionProperties> GetFooterEncryptionProperties() {
+    return footer_encryption_;
+  }
+
+  const std::string& footer_key_metadata() const { return footer_key_metadata_; }
+
+  const std::string& aad_metadata() const { return aad_metadata_; }
+
+  std::shared_ptr<ColumnEncryptionProperties> GetColumnCryptoMetaData(
+      const std::shared_ptr<schema::ColumnPath>& path) {
+    // uniform encryption
+    if (uniform_encryption_) {
+      return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build();
+    }
+
+    // non-uniform encryption
+    const std::string path_str = path->ToDotString();
+    const auto it = column_properties_.find(path_str);
+    if (it != column_properties_.end()) {
+      return it->second;
+    }
+
+    // encrypted with footer key
+    if (encrypt_the_rest_) {
+      return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build();
+    }
+
+    // unencrypted
+    return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build();
+  }
+
+  std::shared_ptr<EncryptionProperties> GetColumnEncryptionProperties(
+      const std::shared_ptr<schema::ColumnPath>& path) {
+    // uniform encryption
+    if (uniform_encryption_) {
+      return footer_encryption_;
+    }
+
+    // non-uniform encryption
+    const std::string path_str = path->ToDotString();
+    const auto it = column_properties_.find(path_str);
+    if (it != column_properties_.end()) {
+      const auto& column = it->second;
+      if (!column->encrypted()) {
+        return NULLPTR;
+      }
+      if (column->key().empty()) {
+        // no column-specific key: the column is encrypted with the footer key
+        return footer_encryption_;
+      }
+      // footer_encryption_ is null when the footer is not encrypted; use the
+      // algorithm and AAD passed to the constructor in that case.
+      if (footer_encryption_ != NULLPTR) {
+        return std::make_shared<EncryptionProperties>(
+            footer_encryption_->algorithm(), column->key(), footer_encryption_->aad());
+      }
+      return std::make_shared<EncryptionProperties>(algorithm_, column->key(), aad_);
+    }
+
+    if (encrypt_the_rest_) {
+      return footer_encryption_;
+    }
+
+    return NULLPTR;
+  }
+
+ private:
+  std::shared_ptr<EncryptionProperties> footer_encryption_;
+  std::string footer_key_metadata_;
+  std::string aad_metadata_;
+
+  bool uniform_encryption_;
+
+  std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>> column_properties_;
+  bool encrypt_the_rest_;
+
+  // Algorithm and AAD for column-specific keys; only consulted when
+  // footer_encryption_ is null.
+  Encryption::type algorithm_;
+  std::string aad_;
+};
+
 class PARQUET_EXPORT WriterProperties {
  public:
   class Builder {
@@ -268,6 +629,12 @@ class PARQUET_EXPORT WriterProperties {
       return this->compression(path->ToDotString(), codec);
     }
 
+    Builder* encryption(
+        const std::shared_ptr<FileEncryptionProperties>& file_encryption) {
+      file_encryption_ = file_encryption;
+      return this;
+    }
+
     Builder* enable_statistics() {
       default_column_properties_.set_statistics_enabled(true);
       return this;
@@ -313,10 +680,10 @@ class PARQUET_EXPORT WriterProperties {
       for (const auto& item : statistics_enabled_)
         get(item.first).set_statistics_enabled(item.second);
 
-      return std::shared_ptr<WriterProperties>(
-          new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_,
-                               max_row_group_length_, pagesize_, version_, created_by_,
-                               default_column_properties_, column_properties));
+      return std::shared_ptr<WriterProperties>(new WriterProperties(
+          pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
+          pagesize_, version_, created_by_, std::move(file_encryption_),
+          default_column_properties_, column_properties));
     }
 
    private:
@@ -327,6 +694,7 @@ class PARQUET_EXPORT WriterProperties {
     int64_t pagesize_;
     ParquetVersion::type version_;
     std::string created_by_;
+    std::shared_ptr<FileEncryptionProperties> file_encryption_;
 
     // Settings used for each column unless overridden in any of the maps below
     ColumnProperties default_column_properties_;
@@ -350,6 +718,18 @@ class PARQUET_EXPORT WriterProperties {
 
   inline std::string created_by() const { return parquet_created_by_; }
 
+  inline FileEncryptionProperties* file_encryption() const {
+    return parquet_file_encryption_.get();
+  }
+
+  inline std::shared_ptr<EncryptionProperties> footer_encryption() const {
+    if (parquet_file_encryption_ == NULLPTR) {
+      return NULLPTR;
+    } else {
+      return parquet_file_encryption_->GetFooterEncryptionProperties();
+    }
+  }
+
   inline Encoding::type dictionary_index_encoding() const {
     if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
       return Encoding::PLAIN_DICTIONARY;
@@ -393,11 +773,30 @@ class PARQUET_EXPORT WriterProperties {
     return column_properties(path).max_statistics_size();
   }
 
+  std::shared_ptr<ColumnEncryptionProperties> column_encryption_props(
+      const std::shared_ptr<schema::ColumnPath>& path) const {
+    if (parquet_file_encryption_) {
+      return parquet_file_encryption_->GetColumnCryptoMetaData(path);
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  std::shared_ptr<EncryptionProperties> encryption(
+      const std::shared_ptr<schema::ColumnPath>& path) const {
+    if (parquet_file_encryption_) {
+      return parquet_file_encryption_->GetColumnEncryptionProperties(path);
+    } else {
+      return NULLPTR;
+    }
+  }
+
  private:
   explicit WriterProperties(
       ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit,
       int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize,
       ParquetVersion::type version, const std::string& created_by,
+      std::shared_ptr<FileEncryptionProperties> file_encryption,
       const ColumnProperties& default_column_properties,
       const std::unordered_map<std::string, ColumnProperties>& column_properties)
       : pool_(pool),
@@ -407,6 +806,7 @@ class PARQUET_EXPORT WriterProperties {
         pagesize_(pagesize),
         parquet_version_(version),
         parquet_created_by_(created_by),
+        parquet_file_encryption_(file_encryption),
         default_column_properties_(default_column_properties),
         column_properties_(column_properties) {}
 
@@ -417,6 +817,7 @@ class PARQUET_EXPORT WriterProperties {
   int64_t pagesize_;
   ParquetVersion::type parquet_version_;
   std::string parquet_created_by_;
+  std::shared_ptr<FileEncryptionProperties> parquet_file_encryption_;
   ColumnProperties default_column_properties_;
   std::unordered_map<std::string, ColumnProperties> column_properties_;
 };
diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h
index c7b62073df5..092a881e3eb 100644
---
a/cpp/src/parquet/thrift.h
+++ b/cpp/src/parquet/thrift.h
@@ -44,6 +44,7 @@
 #include "parquet/exception.h"
 #include "parquet/platform.h"
 #include "parquet/statistics.h"
+#include "parquet/util/crypto.h"
 
 #include "parquet/parquet_types.h"  // IYWU pragma: export
 
@@ -81,6 +82,18 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type)
   return static_cast<Compression::type>(type);
 }
 
+// Convert a Thrift EncryptionAlgorithm into the in-memory EncryptionAlgorithm.
+// A value that does not have AES_GCM_V1 set is read as AES_GCM_CTR_V1.
+static inline EncryptionAlgorithm FromThrift(
+    const format::EncryptionAlgorithm& encryption) {
+  if (encryption.__isset.AES_GCM_V1) {
+    return EncryptionAlgorithm{Encryption::AES_GCM_V1,
+                               encryption.AES_GCM_V1.aad_metadata};
+  }
+  return EncryptionAlgorithm{Encryption::AES_GCM_CTR_V1,
+                             encryption.AES_GCM_CTR_V1.aad_metadata};
+}
+
 static inline format::Type::type ToThrift(Type::type type) {
   return static_cast<format::Type::type>(type);
 }
@@ -131,6 +144,24 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
   return statistics;
 }
 
+// Convert the in-memory EncryptionAlgorithm into its Thrift representation.
+// The generated __set_* setters are used instead of direct member assignment so that
+// any __isset flag of the nested fields is kept in sync with the stored value.
+static inline format::EncryptionAlgorithm ToThrift(
+    const EncryptionAlgorithm& encryption) {
+  format::EncryptionAlgorithm encryption_algorithm;
+  if (encryption.algorithm == Encryption::AES_GCM_V1) {
+    format::AesGcmV1 aes_gcm_v1;
+    aes_gcm_v1.__set_aad_metadata(encryption.aad_metadata);
+    encryption_algorithm.__set_AES_GCM_V1(aes_gcm_v1);
+  } else {
+    format::AesGcmCtrV1 aes_gcm_ctr_v1;
+    aes_gcm_ctr_v1.__set_aad_metadata(encryption.aad_metadata);
+    encryption_algorithm.__set_AES_GCM_CTR_V1(aes_gcm_ctr_v1);
+  }
+  return encryption_algorithm;
+}
+
 // ----------------------------------------------------------------------
 // Thrift struct serialization / deserialization utilities
 
@@ -140,22 +171,57 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
 // all the bytes needed to store the thrift message.  On return, len will be
 // set to the actual length of the header.
 template <class T>
-inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) {
-  // Deserialize msg bytes into c++ thrift msg using memory transport.
-  shared_ptr<ThriftBuffer> tmem_transport(
-      new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
-  apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
-  shared_ptr<apache::thrift::protocol::TProtocol> tproto =  //
-      tproto_factory.getProtocol(tmem_transport);
-  try {
-    deserialized_msg->read(tproto.get());
-  } catch (std::exception& e) {
-    std::stringstream ss;
-    ss << "Couldn't deserialize thrift: " << e.what() << "\n";
-    throw ParquetException(ss.str());
+inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
+                                 const EncryptionProperties* encryption = NULLPTR) {
+  if (encryption == NULLPTR) {
+    // Deserialize msg bytes into c++ thrift msg using memory transport.
+    shared_ptr<ThriftBuffer> tmem_transport(
+        new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
+    apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
+    shared_ptr<apache::thrift::protocol::TProtocol> tproto =  //
+        tproto_factory.getProtocol(tmem_transport);
+    try {
+      deserialized_msg->read(tproto.get());
+    } catch (std::exception& e) {
+      std::stringstream ss;
+      ss << "Couldn't deserialize thrift: " << e.what() << "\n";
+      throw ParquetException(ss.str());
+    }
+    uint32_t bytes_left = tmem_transport->available_read();
+    *len = *len - bytes_left;
+  } else {
+    // Encrypted layout: a 4-byte ciphertext length followed by the ciphertext.
+    // Both sizes come from the input, so check them against *len before reading.
+    const uint32_t kLengthPrefixSize = 4;
+    if (*len < kLengthPrefixSize) {
+      throw ParquetException("Encrypted thrift message is truncated\n");
+    }
+    uint32_t clen;
+    memcpy(&clen, buf, kLengthPrefixSize);
+    if (clen > *len - kLengthPrefixSize) {
+      throw ParquetException("Encrypted thrift message exceeds the buffer\n");
+    }
+
+    // decrypt
+    std::vector<uint8_t> decrypted_buffer(encryption->CalculatePlainSize(clen));
+
+    // Keep Decrypt's own return type: storing it in a uint32_t, as before, made the
+    // "<= 0" test below unable to see a negative result.
+    const auto decrypted_len = parquet_encryption::Decrypt(
+        encryption->algorithm(), true, &buf[kLengthPrefixSize], clen,
+        encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(),
+        encryption->aad_length(), decrypted_buffer.data());
+
+    if (decrypted_len <= 0) {
+      throw ParquetException("Couldn't decrypt buffer\n");
+    }
+
+    uint32_t decrypted_buffer_len = static_cast<uint32_t>(decrypted_len);
+    DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len,
+                         deserialized_msg);
+
+    *len = kLengthPrefixSize + clen;
   }
-  uint32_t bytes_left = tmem_transport->available_read();
-  *len = *len - bytes_left;
 }
 
 /// Utility class to serialize thrift objects to a binary format.  This object
@@ -186,12 +252,32 @@ class ThriftSerializer {
   }
 
   template <class T>
-  int64_t Serialize(const T* obj, ArrowOutputStream* out) {
+  int64_t Serialize(const T* obj, ArrowOutputStream* out,
+                    const EncryptionProperties* encryption = NULLPTR) {
     uint8_t* out_buffer;
     uint32_t out_length;
     SerializeToBuffer(obj, &out_length, &out_buffer);
-    PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
-    return static_cast<int64_t>(out_length);
+
+    if (encryption == NULLPTR) {
+      PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
+      return static_cast<int64_t>(out_length);
+    } else {
+      std::vector<uint8_t> cipher_buffer(encryption->CalculateCipherSize(out_length));
+      int cipher_buffer_len = parquet_encryption::Encrypt(
+          encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(),
+          encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(),
+          cipher_buffer.data());
+      // Never emit a length prefix for a ciphertext that was not produced.
+      if (cipher_buffer_len <= 0) {
+        throw ParquetException("Couldn't encrypt buffer\n");
+      }
+
+      // Layout read back by DeserializeThriftMsg: 4-byte length, then ciphertext.
+      PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast<uint8_t*>(&cipher_buffer_len), 4));
+      PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len));
+
+      return static_cast<int64_t>(cipher_buffer_len + 4);
+    }
   }
 
  private:
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 0bfaf99b381..02bc59f5433 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -447,6 +447,64 @@ struct Encryption {
   enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
 };
 
+struct EncryptionAlgorithm {
+  Encryption::type algorithm;
+  std::string aad_metadata;
+};
+
+class PARQUET_EXPORT EncryptionProperties {
+ private:
+  static inline uint8_t* str2bytes(const std::string& str) {
+    if (str.empty()) return NULLPTR;
+
+    char* cbytes = const_cast<char*>(str.c_str());
+    return reinterpret_cast<uint8_t*>(cbytes);
+  }
+
+ public:
+  EncryptionProperties() = default;
+  EncryptionProperties(Encryption::type algorithm, const std::string& key,
+ const std::string& aad = "") + : algorithm_(algorithm), key_(key), aad_(aad) {} + + ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } + + int key_length() const { return static_cast(key_.length()); } + uint8_t* key_bytes() const { return str2bytes(key_); } + + void aad(const std::string& aad) { aad_ = aad; } + int aad_length() const { return static_cast(aad_.length()); } + uint8_t* aad_bytes() const { return str2bytes(aad_); } + + Encryption::type algorithm() const { return algorithm_; } + + const std::string& key() const { return key_; } + const std::string& aad() const { return aad_; } + + uint32_t CalculateCipherSize(uint32_t plain_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return plain_len + 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return plain_len + 16; + } + return plain_len; + } + + uint32_t CalculatePlainSize(uint32_t cipher_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return cipher_len - 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return cipher_len - 16; + } + return cipher_len; + } + + private: + Encryption::type algorithm_; // encryption algorithm + std::string key_; // encryption key, should have 16, 24, 32-byte length + std::string aad_; // encryption additional authenticated data +}; + // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; From 138feebff84e6bb7006a4560a5f65748a94617be Mon Sep 17 00:00:00 2001 From: Tham Ha Thi Date: Thu, 24 Jan 2019 10:30:32 +0700 Subject: [PATCH 002/125] update thrift change and update encrypted footer --- cpp/src/parquet/file_reader.cc | 76 ++++++++++++++------------- cpp/src/parquet/file_writer.cc | 27 +++------- cpp/src/parquet/metadata.cc | 76 ++++++++++++--------------- cpp/src/parquet/metadata.h | 12 ++--- cpp/src/parquet/thrift.h | 94 ++++++++++++++++++++++++---------- cpp/src/parquet/types.h | 16 ++++-- 6 files changed, 163 insertions(+), 138 
deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index a73c5b7bb21..4b3027d7397 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -130,7 +130,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if (!file_crypto_metadata_ || !crypto_meta_data) { + if (!file_crypto_metadata_ || !crypto_metadata) { encrypted = false; } @@ -144,8 +144,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto file_decryption = properties_.file_decryption(); // the column is encrypted with footer key - if (crypto_meta_data->encrypted_with_footer_key()) { - std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + if (crypto_metadata->encrypted_with_footer_key()) { + std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); if (footer_key.empty()) { @@ -162,7 +162,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is non-uniform encrypted and the column is encrypted with its own key - std::string column_key_metadata = crypto_meta_data->column_key_metadata(); + std::string column_key_metadata = crypto_metadata->key_metadata(); // encrypted with column key std::string column_key = file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); @@ -239,7 +239,8 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. 
Corrupt footer."); } - // no encryption + // no encryption or encryption with plaintext footer + // TODO: encryption with plaintext footer if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -265,58 +266,61 @@ class SerializedFile : public ParquetFileReader::Contents { } file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } - // encryption + // encryption with encrypted footer else { - // read crypto metadata - uint32_t crypto_metadata_len = arrow::util::SafeLoadAs( + // both metadata & crypto metadata length + uint32_t footer_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); - int64_t crypto_metadata_start = file_size - kFooterSize - crypto_metadata_len; + int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + crypto_metadata_len > file_size) { + if (kFooterSize + footer_len > file_size) { throw ParquetException( "Invalid parquet file. File is less than " "file metadata size."); } - std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (crypto_metadata_len + kFooterSize)) { + if (footer_read_size >= (footer_len + kFooterSize)) { crypto_metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - crypto_metadata_len - kFooterSize, crypto_metadata_len); + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); } else { PARQUET_THROW_NOT_OK( - source_->ReadAt(crypto_metadata_start, crypto_metadata_len, &crypto_metadata_buffer)); - if (crypto_metadata_buffer->size() != crypto_metadata_len) { + source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != footer_len) { throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } } + uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - - int64_t footer_offset = file_crypto_metadata_->footer_offset(); - uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); - - std::shared_ptr metadata_buffer = - SliceBuffer(footer_buffer, footer_offset, footer_read_size); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - if (file_crypto_metadata_->encrypted_footer()) { - // get footer key metadata - std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - - auto file_decryption = properties_.file_decryption(); - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); - - auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + std::shared_ptr metadata_buffer; + PARQUET_THROW_NOT_OK( + source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException("Invalid encrypted parquet file. Could not read footer metadata bytes."); + } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size, - footer_encryption); - } else { - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size); + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. 
Could not read encrypted footer metadata"); + } + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + if (footer_key.size() == 0) { + throw ParquetException("Invalid footer encryption key. Could not parse footer metadata"); } + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d79274c50fd..63a6aeddc41 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -271,13 +271,16 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get()); } else { uint64_t metadata_start = static_cast(sink_->Tell()); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + file_encryption->GetFooterEncryptionProperties(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + sink_->Write(PARQUET_EMAGIC, 4); } sink_->Close(); @@ -392,7 +395,6 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin uint32_t metadata_len = static_cast(position); file_metadata.WriteTo(sink); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; @@ -410,25 +412,10 @@ void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { return WriteFileMetaData(file_metadata, &wrapper); } -void 
WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, - ArrowOutputStream* sink) { - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - uint64_t crypto_offset = static_cast(position); - - // Get a FileCryptoMetaData - crypto_metadata.WriteTo(sink); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - auto crypto_len = static_cast(position) - crypto_offset; - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&crypto_len), 4)); - - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_EMAGIC, 4)); -} - void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - return WriteFileCryptoMetaData(crypto_metadata, &wrapper); + crypto_metadata.WriteTo(sink); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c9074eb3b1d..8f521bf2b24 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -75,7 +75,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); + metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior return TypedStatistics::Make( @@ -128,8 +128,8 @@ class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { const std::vector& path_in_schema() const { return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema; } - const std::string& column_key_metadata() const { - return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.column_key_metadata; + const std::string& key_metadata() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata; } private: @@ -153,8 +153,8 @@ const std::vector& ColumnCryptoMetaData::path_in_schema() 
const { bool ColumnCryptoMetaData::encrypted_with_footer_key() const { return impl_->encrypted_with_footer_key(); } -const std::string& ColumnCryptoMetaData::column_key_metadata() const { - return impl_->column_key_metadata(); +const std::string& ColumnCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); } // ColumnChunk metadata @@ -239,10 +239,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return column_->meta_data.total_uncompressed_size; } - inline std::unique_ptr crypto_meta_data() const { - if (column_->__isset.crypto_meta_data) { + inline std::unique_ptr crypto_metadata() const { + if (column_->__isset.crypto_metadata) { return ColumnCryptoMetaData::Make( - reinterpret_cast(&column_->crypto_meta_data)); + reinterpret_cast(&column_->crypto_metadata)); } else { return nullptr; } @@ -325,8 +325,8 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } -std::unique_ptr ColumnChunkMetaData::crypto_meta_data() const { - return impl_->crypto_meta_data(); +std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { + return impl_->crypto_metadata(); } // row-group metadata @@ -398,7 +398,7 @@ class FileMetaData::FileMetaDataImpl { : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption.get()); + metadata_.get(), encryption.get(), false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -428,7 +428,7 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst, encryption); + serializer.Serialize(metadata_.get(), dst, encryption, false); } std::unique_ptr RowGroup(int i) { @@ -594,13 +594,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { EncryptionAlgorithm encryption_algorithm() { return 
FromThrift(metadata_->encryption_algorithm); } - - bool encrypted_footer() { return metadata_->encrypted_footer; } - - const std::string& footer_key_metadata() { return metadata_->footer_key_metadata; } - - uint64_t footer_offset() { return metadata_->footer_offset; } - + const std::string& key_metadata() { return metadata_->key_metadata; } void WriteTo(::arrow::io::OutputStream* dst) const { ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst); @@ -612,14 +606,13 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { uint32_t metadata_len_; }; -EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() { +EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); } -bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } -const std::string& FileCryptoMetaData::footer_key_metadata() { - return impl_->footer_key_metadata(); + +const std::string& FileCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); } -uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } std::shared_ptr FileCryptoMetaData::Make( const uint8_t* serialized_metadata, uint32_t* metadata_len) { @@ -820,7 +813,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { serializer.Serialize(column_chunk_, sink); } else { // column is encrypted - column_chunk_->__isset.crypto_meta_data = true; + column_chunk_->__isset.crypto_metadata = true; // encrypted with footer key format::ColumnCryptoMetaData ccmd; @@ -829,12 +822,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key format::EncryptionWithColumnKey eck; - eck.__set_column_key_metadata(encrypt_md->key_metadata()); + eck.__set_key_metadata(encrypt_md->key_metadata()); eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = 
true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk_->__set_crypto_meta_data(ccmd); + column_chunk_->__set_crypto_metadata(ccmd); auto footer_encryption = properties_->footer_encryption(); @@ -1114,7 +1107,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } - std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) { + std::unique_ptr BuildFileCryptoMetaData() { if (crypto_metadata_ == nullptr) { return nullptr; } @@ -1125,23 +1118,19 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { // build format::FileCryptoMetaData EncryptionAlgorithm encryption_algorithm; encryption_algorithm.algorithm = footer_encryption->algorithm(); - encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); + // TODO: aad metadata + //encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); - crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty()); - - std::string footer_key_metadata = file_encryption->footer_key_metadata(); - if (!footer_key_metadata.empty()) { - crypto_metadata_->__set_footer_key_metadata(footer_key_metadata); + std::string key_metadata = file_encryption->footer_key_metadata(); + if (!key_metadata.empty()) { + crypto_metadata_->__set_key_metadata(key_metadata); } - crypto_metadata_->__set_footer_offset(footerOffset); - // TODO set iv_prefix??? 
- - // return as FileCryptoMetaData - std::unique_ptr file_crypto_meta_data = + std::unique_ptr file_crypto_metadata = std::unique_ptr(new FileCryptoMetaData()); - file_crypto_meta_data->impl_->metadata_ = std::move(crypto_metadata_); - return file_crypto_meta_data; + file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_); + + return file_crypto_metadata; } protected: @@ -1178,9 +1167,8 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } -std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData( - uint64_t footerOffset) { - return impl_->BuildFileCryptoMetaData(footerOffset); +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { + return impl_->BuildFileCryptoMetaData(); } } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index ddc34023765..6d78a1642e3 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -105,7 +105,7 @@ class PARQUET_EXPORT ColumnCryptoMetaData { const std::vector& path_in_schema() const; bool encrypted_with_footer_key() const; - const std::string& column_key_metadata() const; + const std::string& key_metadata() const; private: explicit ColumnCryptoMetaData(const uint8_t* metadata); @@ -144,7 +144,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; - std::unique_ptr crypto_meta_data() const; + std::unique_ptr crypto_metadata() const; private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, @@ -233,10 +233,8 @@ class PARQUET_EXPORT FileCryptoMetaData { uint32_t* metadata_len); ~FileCryptoMetaData(); - EncryptionAlgorithm encryption_algorithm(); - bool encrypted_footer(); - const std::string& footer_key_metadata(); - uint64_t footer_offset(); + EncryptionAlgorithm encryption_algorithm() const; + const std::string& key_metadata() const; 
void WriteTo(::arrow::io::OutputStream* dst) const; @@ -335,7 +333,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { std::unique_ptr Finish(); // crypto metadata - std::unique_ptr GetCryptoMetaData(uint64_t footerOffset); + std::unique_ptr GetCryptoMetaData(); private: explicit FileMetaDataBuilder( diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 092a881e3eb..8f2e58ebb29 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -82,14 +82,34 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } +static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { + return AadMetadata { + aesGcmV1.aad_prefix, + aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix + }; +} + +static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { + return AadMetadata { + aesGcmCtrV1.aad_prefix, + aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix + }; +} + static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + EncryptionAlgorithm encryption_algorithm; + if (encryption.__isset.AES_GCM_V1) { - return EncryptionAlgorithm{Encryption::AES_GCM_V1, - encryption.AES_GCM_V1.aad_metadata}; + encryption_algorithm.algorithm = Encryption::AES_GCM_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); + } else { - return EncryptionAlgorithm{Encryption::AES_GCM_CTR_V1, - encryption.AES_GCM_CTR_V1.aad_metadata}; + encryption_algorithm.algorithm = Encryption::AES_GCM_CTR_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); } + return encryption_algorithm; } static inline format::Type::type ToThrift(Type::type type) { @@ -142,16 +162,30 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { return statistics; } +static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { + format::AesGcmV1 aesGcmV1; + aesGcmV1.aad_prefix = aad.aad_prefix; + aesGcmV1.aad_file_unique = aad.aad_file_unique; + 
aesGcmV1.supply_aad_prefix = aad.supply_aad_prefix; + return aesGcmV1; +} + +static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { + format::AesGcmCtrV1 aesGcmCtrV1; + aesGcmCtrV1.aad_prefix = aad.aad_prefix; + aesGcmCtrV1.aad_file_unique = aad.aad_file_unique; + aesGcmCtrV1.supply_aad_prefix = aad.supply_aad_prefix; + return aesGcmCtrV1; +} + static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; if (encryption.algorithm == Encryption::AES_GCM_V1) { encryption_algorithm.__isset.AES_GCM_V1 = true; - encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); - encryption_algorithm.AES_GCM_V1.aad_metadata = encryption.aad_metadata; + encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); } else { encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; - encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); - encryption_algorithm.AES_GCM_CTR_V1.aad_metadata = encryption.aad_metadata; + encryption_algorithm.AES_GCM_CTR_V1 = ToAesGcmCtrV1Thrift(encryption.aad); } return encryption_algorithm; } @@ -166,7 +200,8 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const EncryptionProperties* encryption = NULLPTR) { + const EncryptionProperties* encryption = NULLPTR, + bool shouldReadLength = true) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( @@ -184,27 +219,28 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; } else { - // first 4 bytes for length - uint8_t clenBytes[4]; - memcpy(clenBytes, buf, 4); - - uint32_t clen = *(reinterpret_cast(clenBytes)); - + uint32_t clen; + if (shouldReadLength) { + // first 4 bytes for length + uint8_t clenBytes[4]; + memcpy(clenBytes, buf, 4); + clen = *(reinterpret_cast(clenBytes)); + } + else { + clen = *len; + } // decrypt - std::vector decrypted_buffer(encryption->CalculatePlainSize(clen)); - + const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; + std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, &buf[4], clen, encryption->key_bytes(), + encryption->algorithm(), true, cipherBuf, clen, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); - if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } - DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = 4 + clen; } } @@ -237,7 +273,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, + bool shouldWriteLength = true) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -252,10 +289,15 @@ class ThriftSerializer { encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); - PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); - - 
return static_cast(cipher_buffer_len + 4); + if (shouldWriteLength) { + PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len + 4); + } + else { + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } } } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 02bc59f5433..b7c1f430c50 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -447,9 +447,15 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct AadMetadata { + std::string aad_prefix; + std::string aad_file_unique; + bool supply_aad_prefix; +}; + struct EncryptionAlgorithm { Encryption::type algorithm; - std::string aad_metadata; + AadMetadata aad; }; class PARQUET_EXPORT EncryptionProperties { @@ -481,8 +487,8 @@ class PARQUET_EXPORT EncryptionProperties { const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } - uint32_t CalculateCipherSize(uint32_t plain_len) const { - if (algorithm_ == Encryption::AES_GCM_V1) { + uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { + if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { return plain_len + 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { return plain_len + 16; @@ -490,8 +496,8 @@ class PARQUET_EXPORT EncryptionProperties { return plain_len; } - uint32_t CalculatePlainSize(uint32_t cipher_len) const { - if (algorithm_ == Encryption::AES_GCM_V1) { + uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { + if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { return cipher_len - 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { return cipher_len - 16; From 65fa0204faab5e3052822704516d13982b269cef Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 28 Jan 2019 17:12:39 
+0700 Subject: [PATCH 003/125] add encryption source files into CMakeLists.txt --- cpp/src/parquet/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index cb8de1657d6..20f1906f59d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -166,6 +166,7 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc + encryption.cc file_reader.cc file_writer.cc metadata.cc @@ -177,7 +178,8 @@ set(PARQUET_SRCS properties.cc schema.cc statistics.cc - types.cc) + types.cc + util/crypto.cc) # Ensure that thrift compilation is done before using its generated headers # in parquet code. From 859fe6b97004fbf79399b77d6d0996ca3a4faccf Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 29 Jan 2019 18:30:31 +0700 Subject: [PATCH 004/125] add example from old PR of parquet-cpp --- cpp/examples/parquet/CMakeLists.txt | 4 + .../low-level-api/encryption-reader-writer.cc | 436 ++++++++++++++++++ 2 files changed, 440 insertions(+) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer.cc diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index f2722b1cbf8..2a2421c18ab 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -17,10 +17,13 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) +add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) +target_include_directories(parquet-encryption-example PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) 
+target_link_libraries(parquet-encryption-example parquet_static) add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -33,4 +36,5 @@ endif() add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 + parquet-encryption-example parquet-arrow-example) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc new file mode 100644 index 00000000000..0f8e9ab3e83 --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -0,0 +1,436 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +/* + * This example describes writing and reading Parquet Files in C++ and serves as a + * reference to the API. + * The file contains all the physical data types supported by Parquet. + * This example uses the RowGroupWriter API that supports writing RowGroups optimized for + *memory consumption + **/ + +/* Parquet is a structured columnar file format + * Parquet File = "Parquet data" + "Parquet Metadata" + * "Parquet data" is simply a vector of RowGroups. 
Each RowGroup is a batch of rows in a + * columnar layout + * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their + * Columns + * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a + * complex (nested) type (internal nodes) + * For specific details, please refer the format here: + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + **/ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; +const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; +const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes +const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes + +int main(int argc, char** argv) { + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // parquet::REQUIRED fields do not need definition and repetition level values + // parquet::OPTIONAL fields require only definition level values + // parquet::REPEATED fields require both definition and repetition level values + try { + // Create a local file output stream instance. 
+ using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // uniform encryption + parquet::FileEncryptionProperties::Builder file_encryption_builder; + file_encryption_builder.footer_key(FOOTER_ENCRYPTION_KEY); + + // non-uniform with column keys + std::map> encryption_cols; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("column_0", true); + encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); + auto encryption_col0 = encryption_col_builder0.build(); + + encryption_cols[encryption_col0->path()] = encryption_col0; + + file_encryption_builder.column_properties(encryption_cols, true); + + builder.encryption(file_encryption_builder.build()); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + try { + // decryption properties + std::shared_ptr decryption_properties = + std::make_shared(FOOTER_ENCRYPTION_KEY); + decryption_properties->SetColumnKey("column_0", COLUMN_ENCRYPTION_KEY); + + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption(decryption_properties); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + 
parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; + return -1; + } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; + + return 0; +} From 60adabbd408ea51e1f6b5aad4f17d0a84ee4a6ad Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 31 Jan 2019 17:26:33 +0700 Subject: [PATCH 005/125] change due to new update in crypto package --- cpp/src/parquet/thrift.h | 13 +++++++++---- cpp/src/parquet/types.h | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 8f2e58ebb29..b57c82e4773 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -201,7 +201,7 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const EncryptionProperties* encryption = NULLPTR, - bool shouldReadLength = true) { + bool shouldReadLength = false) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. shared_ptr tmem_transport( @@ -233,7 +233,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, cipherBuf, clen, encryption->key_bytes(), + encryption->algorithm(), true, cipherBuf, 0, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { @@ -241,7 +241,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = 4 + clen; + *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); } } @@ -274,7 +274,7 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, - bool shouldWriteLength = true) { + bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -288,6 +288,11 @@ class ThriftSerializer { encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); + if (cipher_buffer_len > cipher_buffer.size()) { + std::stringstream ss; + ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + throw ParquetException(ss.str()); + } if (shouldWriteLength) { PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index b7c1f430c50..67aafb5ccb3 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -489,18 +489,18 @@ class PARQUET_EXPORT EncryptionProperties { uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { - return plain_len + 28; + 
return plain_len + 28 + 4; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { - return plain_len + 16; + return plain_len + 16 + 4; } return plain_len; } uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { - return cipher_len - 28; + return cipher_len - 28 - 4; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { - return cipher_len - 16; + return cipher_len - 16 - 4; } return cipher_len; } From 40156572e1717fb876e2015e579b5b2c6d89ffa4 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 1 Feb 2019 15:20:48 +0700 Subject: [PATCH 006/125] pass EncryptionProperties into parquet_encryption::Encrypt()/Decrypt() instead of pass it as many params --- cpp/src/parquet/column_reader.cc | 6 +++--- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/column_writer.cc | 4 ++-- cpp/src/parquet/file_writer.cc | 4 ++-- cpp/src/parquet/file_writer.h | 2 +- cpp/src/parquet/metadata.cc | 14 +++++++------- cpp/src/parquet/metadata.h | 6 +++--- cpp/src/parquet/thrift.h | 13 +++++-------- 8 files changed, 24 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d6ac3bae047..8a642f1e2ca 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -108,7 +108,7 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - std::shared_ptr encryption, + const std::shared_ptr encryption, ::arrow::MemoryPool* pool) : : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), @@ -171,7 +171,7 @@ std::shared_ptr SerializedPageReader::NextPage() { try { DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, - encryption_.get()); + encryption_); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -277,7 +277,7 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, std::shared_ptr encryption, + Compression::type codec, const std::shared_ptr encryption, ::arrow::MemoryPool* pool) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 052b21218dc..a5f1c7b6bd1 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -81,7 +81,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, std::shared_ptr encryption = NULLPTR, + Compression::type codec, const std::shared_ptr& encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index eb5523910ec..44b759a585a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -185,7 +185,7 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -265,7 +265,7 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), 
encryption_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 63a6aeddc41..99ab6ae3c54 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -276,7 +276,7 @@ class FileSerializer : public ParquetFileWriter::Contents { std::shared_ptr footer_encryption = file_encryption->GetFooterEncryptionProperties(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); @@ -387,7 +387,7 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - EncryptionProperties* footer_encryption) { + const std::shared_ptr& footer_encryption) { if (footer_encryption == nullptr) { // Write MetaData int64_t position = -1; diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 5d3a5aa6359..ab18034637e 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -108,7 +108,7 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - EncryptionProperties* encryption_properties = NULLPTR); + const std::shared_ptr& encryption_properties = NULLPTR); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8f521bf2b24..b378b650ea0 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -394,11 +394,11 @@ class FileMetaData::FileMetaDataImpl { FileMetaDataImpl() : metadata_len_(0) {} explicit 
FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption = nullptr) + const std::shared_ptr& encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption.get(), false); + metadata_.get(), encryption, false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -426,7 +426,7 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst, encryption, false); } @@ -514,14 +514,14 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr FileMetaData::Make(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption) { + const std::shared_ptr& encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( new FileMetaData(metadata, metadata_len, encryption)); } FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption) + const std::shared_ptr& encryption) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, encryption))} {} @@ -575,7 +575,7 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { return impl_->WriteTo(dst, encryption); } @@ -844,7 +844,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - 
serializer.Serialize(&column_metadata_, sink, encrypt_props.get()); + serializer.Serialize(&column_metadata_, sink, encrypt_props); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. column_chunk_->__set_file_offset(metadata_start); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 6d78a1642e3..ef8ed175d73 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -186,7 +186,7 @@ class PARQUET_EXPORT FileMetaData { // API convenience to get a MetaData accessor static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len, - std::shared_ptr encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); ~FileMetaData(); @@ -202,7 +202,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption = NULLPTR) const; + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -218,7 +218,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, - std::shared_ptr encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); // PIMPL Idiom FileMetaData(); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b57c82e4773..53f9b84c3a6 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,7 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const EncryptionProperties* encryption = NULLPTR, + const std::shared_ptr& encryption = NULLPTR, bool shouldReadLength = false) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. @@ -233,9 +233,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, cipherBuf, 0, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - decrypted_buffer.data()); + encryption, true, cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } @@ -273,7 +271,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryption = NULLPTR, bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; @@ -285,9 +284,7 @@ class ThriftSerializer { } else { std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); int cipher_buffer_len = parquet_encryption::Encrypt( - encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - cipher_buffer.data()); + encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; From 700e71e1bef82f218927fd164db3aab3711b1d0c Mon Sep 17 00:00:00 2001 From: 
thamht4190 Date: Tue, 26 Feb 2019 09:57:16 +0700 Subject: [PATCH 007/125] fix issue of wrong column name in encryption-example and remove FileEncryptionProperties::uniform_encryption_ --- .../low-level-api/encryption-reader-writer.cc | 4 +-- cpp/src/parquet/properties.h | 30 +++++-------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 0f8e9ab3e83..42f9b45e24f 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -73,7 +73,7 @@ int main(int argc, char** argv) { // non-uniform with column keys std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("column_0", true); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("ba_field", true); encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder0.build(); @@ -199,7 +199,7 @@ int main(int argc, char** argv) { // decryption properties std::shared_ptr decryption_properties = std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties->SetColumnKey("column_0", COLUMN_ENCRYPTION_KEY); + decryption_properties->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption(decryption_properties); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d441769d0c3..cbc42227b2a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -49,6 +49,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { Builder* key(const std::string& key) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); DCHECK(encrypt_); + encrypted_with_footer_key_ = false; key_ = key; return this; @@ -266,10 +267,10 @@ class PARQUET_EXPORT 
FileEncryptionProperties { public: class Builder { public: - Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) {} + Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) {} Builder(const std::string& key) - : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) { + : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); footer_key_ = key; } @@ -321,11 +322,8 @@ class PARQUET_EXPORT FileEncryptionProperties { column_properties_ = column_properties; if (!footer_key_.empty()) { - uniform_encryption_ = true; - for (const auto& col : column_properties) { if (col.second->key().compare(footer_key_) != 0) { - uniform_encryption_ = false; break; } } @@ -356,7 +354,7 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); } return std::make_shared( - footer_encryption, footer_key_metadata_, aad_metadata_, uniform_encryption_, + footer_encryption, footer_key_metadata_, aad_metadata_, column_properties_, encrypt_the_rest_); } @@ -368,8 +366,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_; std::string aad_metadata_; - bool uniform_encryption_; - std::map> column_properties_; bool encrypt_the_rest_; }; @@ -377,14 +373,12 @@ class PARQUET_EXPORT FileEncryptionProperties { FileEncryptionProperties( const std::shared_ptr& footer_encryption, const std::string& footer_key_metadata, const std::string& aad_metadata, - bool uniform_encryption, const std::map>& column_properties, bool encrypt_the_rest) : footer_encryption_(footer_encryption), footer_key_metadata_(footer_key_metadata), aad_metadata_(aad_metadata), - uniform_encryption_(uniform_encryption), column_properties_(column_properties), encrypt_the_rest_(encrypt_the_rest) {} @@ -398,11 +392,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { - // uniform 
encryption - if (uniform_encryption_) { - return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); - } - // non-uniform encryption std::string path_str = path->ToDotString(); if (column_properties_.find(path_str) != column_properties_.end()) { @@ -414,17 +403,12 @@ class PARQUET_EXPORT FileEncryptionProperties { return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); } - // unencrypted + // uniform encryption or unencrypted return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); } std::shared_ptr GetColumnEncryptionProperties( const std::shared_ptr& path) { - // uniform encryption - if (uniform_encryption_) { - return footer_encryption_; - } - // non-uniform encryption std::string path_str = path->ToDotString(); if (column_properties_.find(path_str) != column_properties_.end()) { @@ -433,10 +417,12 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption_->aad()); } + // encrypted with footer key if (encrypt_the_rest_) { return footer_encryption_; } + // uniform encryption or unencrypted return NULLPTR; } @@ -445,8 +431,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key_metadata_; std::string aad_metadata_; - bool uniform_encryption_; - std::map> column_properties_; bool encrypt_the_rest_; }; From 73b936e32da5078428bbb2cd09cf6ef6cc804c2b Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 09:58:20 +0700 Subject: [PATCH 008/125] get column path from ColumnCryptoMetadata when column is encrypted with its own key --- cpp/src/parquet/file_reader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 4b3027d7397..60bfccf33c6 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -163,13 +163,15 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is non-uniform encrypted and the column is encrypted with its own key 
std::string column_key_metadata = crypto_metadata->key_metadata(); + std::shared_ptr column_path = + std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key = - file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); + file_decryption->GetColumnKey(column_path, column_key_metadata); if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" + - col->path_in_schema()->ToDotString()); + column_path->ToDotString()); } auto column_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, column_key, From 015fd0ab0f1b21bf4490a706c9952547dd64e23b Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 18:11:52 +0700 Subject: [PATCH 009/125] let encryption examples to be able to cover more cases --- .../low-level-api/encryption-reader-writer.cc | 728 +++++++++--------- 1 file changed, 371 insertions(+), 357 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 42f9b45e24f..b762f46ddfb 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -48,389 +48,403 @@ const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { - /********************************************************************************** - PARQUET WRITER EXAMPLE - **********************************************************************************/ - // parquet::REQUIRED fields do not need definition and repetition level values - // parquet::OPTIONAL fields require only definition level values - // parquet::REPEATED fields require both definition and repetition level values - try { - // Create a local file output stream instance. 
- using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); - - // Setup the parquet schema - std::shared_ptr schema = SetupSchema(); - - // Add writer properties - parquet::WriterProperties::Builder builder; - builder.compression(parquet::Compression::SNAPPY); - - // uniform encryption - parquet::FileEncryptionProperties::Builder file_encryption_builder; - file_encryption_builder.footer_key(FOOTER_ENCRYPTION_KEY); - - // non-uniform with column keys - std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("ba_field", true); - encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); - auto encryption_col0 = encryption_col_builder0.build(); - - encryption_cols[encryption_col0->path()] = encryption_col0; - - file_encryption_builder.column_properties(encryption_cols, true); - - builder.encryption(file_encryption_builder.build()); - - std::shared_ptr props = builder.build(); - - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - // Append a RowGroup with a specific number of rows. - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - bool value = ((i % 2) == 0) ? 
true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } + std::vector> file_encryption_properties; + std::vector> file_decryption_properties; + + // uniform encryption + parquet::FileEncryptionProperties::Builder file_encryption_builder_1; + file_encryption_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); + + std::shared_ptr decryption_properties_1 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + + // non-uniform with column keys + std::map> encryption_cols; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0("ba_field", true); + encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); + auto encryption_col0 = encryption_col_builder_0.build(); + encryption_cols[encryption_col0->path()] = encryption_col0; + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2; + file_encryption_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_2.column_properties(encryption_cols, true); + + std::shared_ptr decryption_properties_2 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + + file_encryption_properties.push_back(file_encryption_builder_1.build()); + file_encryption_properties.push_back(file_encryption_builder_2.build()); + + file_decryption_properties.push_back(decryption_properties_1); + file_decryption_properties.push_back(decryption_properties_2); + + for (int i = 0; i < file_encryption_properties.size(); ++i) { + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // parquet::REQUIRED fields do not need definition and repetition level values + // 
parquet::OPTIONAL fields require only definition level values + // parquet::REPEATED fields require both definition and repetition level values + // setup for encryption + try { + + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + builder.encryption(file_encryption_properties[i]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - // Write the INT96 column. 
- parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); } - } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); - // Close the ParquetFileWriter - file_writer->Close(); + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the bytes to file - DCHECK(out_file->Close().ok()); - } catch (const std::exception& e) { - std::cerr << "Parquet write error: " << e.what() << std::endl; - return -1; - } + // Close the ParquetFileWriter + file_writer->Close(); - /********************************************************************************** - PARQUET READER EXAMPLE - **********************************************************************************/ - - try { - // decryption properties - std::shared_ptr decryption_properties = - std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); - - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - 
reader_properties.file_decryption(decryption_properties); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); - - // Iterate over all the RowGroups in the file - for (int r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - parquet_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? 
true : false; - assert(value == expected_value); - i++; - } + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - assert(value == i); - i++; - } + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption(file_decryption_properties[i]); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t 
rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; } - i++; - } - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; } - i++; - } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); - i++; - } + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - double expected_value = i * 1.1111111; - assert(value == expected_value); - i++; - } + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); // There are no NULL values in the rows written assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; } - i++; - } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one 
value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; } - } catch (const std::exception& e) { - std::cerr << "Parquet read error: " << e.what() << std::endl; - return -1; - } - - std::cout << "Parquet Writing and Reading Complete" << std::endl; + std::cout << "Example [" << (i+1) << "] Parquet Writing and Reading Complete" << std::endl; + } return 0; -} +} \ No newline at end of file From c0cbac9dc26f2c137124e2605b5b60b8f614d85f Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 18:23:21 +0700 Subject: [PATCH 010/125] footer plaintext mode --- cpp/src/parquet/file_reader.cc | 44 +++++++++++++++++++--- cpp/src/parquet/file_writer.cc | 61 +++++++++++++++++++++--------- cpp/src/parquet/file_writer.h | 3 +- cpp/src/parquet/metadata.cc | 69 ++++++++++++++++++++++++++++++++-- cpp/src/parquet/metadata.h | 8 +++- cpp/src/parquet/properties.h | 12 ++++-- cpp/src/parquet/thrift.h | 23 +++++++++++- 7 files changed, 187 insertions(+), 33 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 60bfccf33c6..f061e1790de 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -130,7 +130,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if (!file_crypto_metadata_ || !crypto_metadata) { + if ((!file_crypto_metadata_ && 
!file_metadata_->is_plaintext_mode()) || !crypto_metadata) { encrypted = false; } @@ -145,16 +145,22 @@ class SerializedRowGroup : public RowGroupReader::Contents { // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { - std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); + const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() + ? file_metadata_->footer_signing_key_metadata() + : file_crypto_metadata_->key_metadata(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } + Encryption::type algorithm = file_metadata_->is_plaintext_mode() + ? file_metadata_->encryption_algorithm().algorithm + : file_crypto_metadata_->encryption_algorithm().algorithm; + auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + algorithm, footer_key, file_decryption->GetAad()); return PageReader::Open(stream, col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -164,7 +170,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = - std::make_shared(crypto_metadata->path_in_schema()); + std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key = file_decryption->GetColumnKey(column_path, column_key_metadata); @@ -266,7 +272,33 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + + uint32_t read_metadata_len; + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + + if (file_metadata_->is_plaintext_mode()) { + if (metadata_len - read_metadata_len != 28) { + throw ParquetException("Invalid parquet file. Cannot verify plaintext mode footer."); + } + // get footer key + std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. Could not verify plaintext footer metadata"); + } + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + if (footer_key.empty()) { + throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); + } + // TODO: aad + auto encryption = std::make_shared( + file_metadata_->encryption_algorithm().algorithm, + footer_key + ); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext footer metadata"); + } + } } // encryption with encrypted footer else { diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 99ab6ae3c54..037e5145a79 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -264,23 +264,39 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - file_metadata_ = metadata_->Finish(); - auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { + file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { - uint64_t metadata_start = static_cast(sink_->Tell()); - auto crypto_metadata = metadata_->GetCryptoMetaData(); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption); - uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - - sink_->Write(PARQUET_EMAGIC, 4); + } + else { + if (file_encryption->encrypt_footer()) { + // encrypted footer + file_metadata_ = metadata_->Finish(); + + uint64_t metadata_start = static_cast(sink_->Tell()); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); + uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); + + sink_->Write(PARQUET_EMAGIC, 4); + } + else { + // footer plain mode + EncryptionAlgorithm signing_encryption; + signing_encryption.algorithm = Encryption::AES_GCM_V1; + // TODO: AAD + file_metadata_ = 
metadata_->Finish(&signing_encryption, file_encryption->footer_key_metadata()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); + } } sink_->Close(); @@ -387,7 +403,8 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& footer_encryption) { + const std::shared_ptr& footer_encryption, + bool encrypt_footer) { if (footer_encryption == nullptr) { // Write MetaData int64_t position = -1; @@ -402,8 +419,18 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); } else { - // encrypt and write to sink - file_metadata.WriteTo(sink, footer_encryption); + if (encrypt_footer) { + // encrypt and write to sink + file_metadata.WriteTo(sink, footer_encryption); + } + else { + uint32_t metadata_len = static_cast(sink->Tell()); + file_metadata.WriteTo(sink, footer_encryption); + metadata_len = static_cast(sink->Tell()) - metadata_len; + + sink->Write(reinterpret_cast(&metadata_len), 4); + sink->Write(PARQUET_MAGIC, 4); + } } } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index ab18034637e..5fb7f3575d1 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -108,7 +108,8 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryption_properties = NULLPTR); + const std::shared_ptr& encryption_properties = NULLPTR, + bool encrypt_footer = false); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index b378b650ea0..882b20afcc3 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -412,6 +412,22 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } + bool verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len) { + // re-encrypt the footer + uint8_t* encrypted_file_metadata; + uint32_t encrypted_file_metadata_len; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, + &encrypted_file_metadata, encryption); + // compare + if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len, + reinterpret_cast(tail), tail_len)) { + return false; + } + return true; + } + inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } inline int64_t num_rows() const { return metadata_->num_rows; } @@ -423,12 +439,32 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } + inline bool is_plaintext_mode() const { return metadata_->__isset.encryption_algorithm; } + inline EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + inline const std::string& footer_signing_key_metadata() { + return metadata_->footer_signing_key_metadata; + } const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst, encryption, false); + if (is_plaintext_mode()) { + serializer.Serialize(metadata_.get(), dst); + // 1. encrypt the footer key + uint8_t* encrypted_file_metadata; + uint32_t encrypted_file_metadata_len; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, + &encrypted_file_metadata, encryption); + // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) + dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-28, 28); + } + else { + serializer.Serialize(metadata_.get(), dst, encryption, false); + } } std::unique_ptr RowGroup(int i) { @@ -534,6 +570,11 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } +bool FileMetaData::verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len) { + return impl_->verify(encryption, tail, tail_len); +} + uint32_t FileMetaData::size() const { return impl_->size(); } int FileMetaData::num_columns() const { return impl_->num_columns(); } @@ -542,6 +583,16 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +bool FileMetaData::is_plaintext_mode() const { return impl_->is_plaintext_mode(); } + +EncryptionAlgorithm FileMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileMetaData::footer_signing_key_metadata() const { + return impl_->footer_signing_key_metadata(); +} + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: @@ -1051,7 +1102,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return current_row_group_builder_.get(); } - std::unique_ptr Finish() { + std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm, + const std::string& footer_signing_key_metadata) { int64_t total_rows = 0; for (auto row_group : row_groups_) { total_rows += row_group.num_rows; @@ -1097,6 +1149,13 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; + if (signing_algorithm != NULLPTR) { + metadata_->__set_encryption_algorithm(ToThrift(*signing_algorithm)); + if (footer_signing_key_metadata.size() > 0) { + 
metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); + } + } + parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), &metadata_->schema); @@ -1165,7 +1224,11 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { return impl_->AppendRowGroup(); } -std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +std::unique_ptr FileMetaDataBuilder::Finish( + const EncryptionAlgorithm* signing_algorithm, + const std::string& footer_signing_key_metadata) { + return impl_->Finish(signing_algorithm, footer_signing_key_metadata); +} std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index ef8ed175d73..97563570085 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -190,11 +190,16 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); + bool verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len); // file metadata uint32_t size() const; int num_columns() const; int64_t num_rows() const; int num_row_groups() const; + bool is_plaintext_mode() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; const std::string& created_by() const; int num_schema_elements() const; @@ -330,7 +335,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish(); + std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm = NULLPTR, + const std::string& footer_signing_key_metadata = ""); // crypto metadata std::unique_ptr GetCryptoMetaData(); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index cbc42227b2a..a6a744b1b35 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -280,9 +280,10 @@ class PARQUET_EXPORT 
FileEncryptionProperties { return this; } - Builder* footer_key(const std::string& key) { + Builder* footer_key(const std::string& key, bool encrypt_footer=true) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); footer_key_ = key; + encrypt_footer_ = encrypt_footer; return this; } @@ -354,13 +355,14 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); } return std::make_shared( - footer_encryption, footer_key_metadata_, aad_metadata_, + footer_encryption, encrypt_footer_, footer_key_metadata_, aad_metadata_, column_properties_, encrypt_the_rest_); } private: Encryption::type algorithm_; std::string footer_key_; + bool encrypt_footer_; std::string footer_key_metadata_; std::string aad_; @@ -371,12 +373,13 @@ class PARQUET_EXPORT FileEncryptionProperties { }; FileEncryptionProperties( - const std::shared_ptr& footer_encryption, + const std::shared_ptr& footer_encryption, bool encrypt_footer, const std::string& footer_key_metadata, const std::string& aad_metadata, const std::map>& column_properties, bool encrypt_the_rest) : footer_encryption_(footer_encryption), + encrypt_footer_(encrypt_footer), footer_key_metadata_(footer_key_metadata), aad_metadata_(aad_metadata), column_properties_(column_properties), @@ -386,6 +389,8 @@ class PARQUET_EXPORT FileEncryptionProperties { return footer_encryption_; } + bool encrypt_footer() const { return encrypt_footer_; } + const std::string& footer_key_metadata() const { return footer_key_metadata_; } const std::string& aad_metadata() const { return aad_metadata_; } @@ -428,6 +433,7 @@ class PARQUET_EXPORT FileEncryptionProperties { private: std::shared_ptr footer_encryption_; + bool encrypt_footer_; std::string footer_key_metadata_; std::string aad_metadata_; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 53f9b84c3a6..a1d6493a6c6 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ 
-259,9 +259,28 @@ class ThriftSerializer { /// memory returned is owned by this object and will be invalid when another object /// is serialized. template - void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer, + const std::shared_ptr& encryption = NULLPTR) { SerializeObject(obj); - mem_buffer_->getBuffer(buffer, len); + if (encryption == NULLPTR) { + mem_buffer_->getBuffer(buffer, len); + } + else { + uint8_t* out_buffer; + uint32_t out_length; + mem_buffer_->getBuffer(&out_buffer, &out_length); + // encrypt + std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); + int cipher_buffer_len = parquet_encryption::Encrypt( + encryption, true, out_buffer, out_length, cipher_buffer.data()); + if (cipher_buffer_len > cipher_buffer.size()) { + std::stringstream ss; + ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + throw ParquetException(ss.str()); + } + *len = cipher_buffer_len; + *buffer = cipher_buffer.data(); + } } template From 2ba89f4c8f248d0186d5408e72b8096062b92340 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 4 Mar 2019 12:56:49 +0700 Subject: [PATCH 011/125] footer plaintext mode example --- .../parquet/low-level-api/encryption-reader-writer.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index b762f46ddfb..c26d9eee2dd 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -74,11 +74,20 @@ int main(int argc, char** argv) { std::make_shared(FOOTER_ENCRYPTION_KEY); decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + // plain mode footer = unencrypted footer + parquet::FileEncryptionProperties::Builder file_encryption_builder_3; + 
file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); + + std::shared_ptr decryption_properties_3 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); + file_encryption_properties.push_back(file_encryption_builder_3.build()); file_decryption_properties.push_back(decryption_properties_1); file_decryption_properties.push_back(decryption_properties_2); + file_decryption_properties.push_back(decryption_properties_3); for (int i = 0; i < file_encryption_properties.size(); ++i) { /********************************************************************************** From 331f94bab4005f17f6f4d1ee62530ab630283ea6 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 7 Mar 2019 14:19:29 +0700 Subject: [PATCH 012/125] fix compiling issue --- .../parquet/low-level-api/encryption-reader-writer.cc | 2 +- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 9 +++++---- cpp/src/parquet/thrift.h | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index c26d9eee2dd..76ba264780f 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); - for (int i = 0; i < file_encryption_properties.size(); ++i) { + for (unsigned i = 0; i < file_encryption_properties.size(); ++i) { /********************************************************************************** PARQUET WRITER EXAMPLE **********************************************************************************/ diff --git a/cpp/src/parquet/file_reader.cc 
b/cpp/src/parquet/file_reader.cc index f061e1790de..f8e13c9214d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -273,7 +273,7 @@ class SerializedFile : public ParquetFileReader::Contents { } } - uint32_t read_metadata_len; + uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 882b20afcc3..2272181b2d2 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -416,12 +416,12 @@ class FileMetaData::FileMetaDataImpl { const void* tail, uint32_t tail_len) { // re-encrypt the footer uint8_t* encrypted_file_metadata; - uint32_t encrypted_file_metadata_len; + uint32_t encrypted_file_metadata_len = metadata_len_; ThriftSerializer serializer; serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); - // compare - if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len, + // compare (not count 4 bytes at the end for length) + if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len - 4, reinterpret_cast(tail), tail_len)) { return false; } @@ -460,7 +460,8 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) - dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-28, 28); + // (not count 4 bytes at the end for length) + dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-32, 28); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index a1d6493a6c6..01d163b52a2 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -271,7 +271,7 @@ class ThriftSerializer { mem_buffer_->getBuffer(&out_buffer, &out_length); // encrypt std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - int cipher_buffer_len = parquet_encryption::Encrypt( + unsigned cipher_buffer_len = parquet_encryption::Encrypt( encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; @@ -302,7 +302,7 @@ class ThriftSerializer { return static_cast(out_length); } else { std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - int cipher_buffer_len = parquet_encryption::Encrypt( + unsigned cipher_buffer_len = parquet_encryption::Encrypt( encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; From 2f741e7ee8968ca6d3f9a8038d4c445145c7709f Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 8 Mar 2019 19:00:31 +0700 Subject: [PATCH 013/125] fix plaintext mode verification --- cpp/src/parquet/metadata.cc | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 2272181b2d2..03fa6f53465 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -414,15 +414,23 @@ class FileMetaData::FileMetaDataImpl { bool verify(std::shared_ptr encryption, const void* tail, uint32_t tail_len) { - // re-encrypt the footer - uint8_t* 
encrypted_file_metadata; - uint32_t encrypted_file_metadata_len = metadata_len_; + // serialize the footer + uint8_t* serialized_data; + uint32_t serialized_len = metadata_len_; ThriftSerializer serializer; - serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, - &encrypted_file_metadata, encryption); - // compare (not count 4 bytes at the end for length) - if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len - 4, - reinterpret_cast(tail), tail_len)) { + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt with nonce + uint8_t* nonce = const_cast(reinterpret_cast(tail)); + uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; + + std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); + uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt( + serialized_data, serialized_len, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); + + if (0 != memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16)) { return false; } return true; @@ -459,9 +467,10 @@ class FileMetaData::FileMetaDataImpl { ThriftSerializer serializer; serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); - // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) - // (not count 4 bytes at the end for length) - dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-32, 28); + // write nonce + dst->Write(encrypted_file_metadata + 4, 12); + // write tag + dst->Write(encrypted_file_metadata + encrypted_file_metadata_len - 16, 16); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); From 1fdd1e278619931d320a1990a211e298e0b64453 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 12 Mar 2019 14:32:27 +0700 Subject: [PATCH 014/125] fix memory issue when serializing plaintext mode footer --- cpp/src/parquet/metadata.cc | 29 +++++++++++++++-------------- cpp/src/parquet/thrift.h | 23 ++--------------------- 2 files changed, 17 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 03fa6f53465..25a1cabdc7f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -29,7 +29,7 @@ #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/thrift.h" - +#include #include // IWYU pragma: keep namespace parquet { @@ -430,10 +430,7 @@ class FileMetaData::FileMetaDataImpl { encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), nonce, 12, encrypted_buffer.data()); - if (0 != memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16)) { - return false; - } - return true; + return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } inline uint32_t size() const { return metadata_len_; } @@ -460,17 +457,21 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; if (is_plaintext_mode()) { - serializer.Serialize(metadata_.get(), dst); - // 1. 
encrypt the footer key - uint8_t* encrypted_file_metadata; - uint32_t encrypted_file_metadata_len; - ThriftSerializer serializer; - serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, - &encrypted_file_metadata, encryption); + uint8_t* serialized_data; + uint32_t serialized_len; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector encrypted_data(encryption->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( + encryption, true, serialized_data, serialized_len, encrypted_data.data()); + + // write unencrypted footer + dst->Write(serialized_data, serialized_len); // write nonce - dst->Write(encrypted_file_metadata + 4, 12); + dst->Write(encrypted_data.data() + 4, 12); // write tag - dst->Write(encrypted_file_metadata + encrypted_file_metadata_len - 16, 16); + dst->Write(encrypted_data.data() + encrypted_len - 16, 16); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 01d163b52a2..8eb872abbb0 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -259,28 +259,9 @@ class ThriftSerializer { /// memory returned is owned by this object and will be invalid when another object /// is serialized. 
template - void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer, - const std::shared_ptr& encryption = NULLPTR) { + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { SerializeObject(obj); - if (encryption == NULLPTR) { - mem_buffer_->getBuffer(buffer, len); - } - else { - uint8_t* out_buffer; - uint32_t out_length; - mem_buffer_->getBuffer(&out_buffer, &out_length); - // encrypt - std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - unsigned cipher_buffer_len = parquet_encryption::Encrypt( - encryption, true, out_buffer, out_length, cipher_buffer.data()); - if (cipher_buffer_len > cipher_buffer.size()) { - std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; - throw ParquetException(ss.str()); - } - *len = cipher_buffer_len; - *buffer = cipher_buffer.data(); - } + mem_buffer_->getBuffer(buffer, len); } template From 0bbe128f20030b2336a1a7a4d7f94fcf12f8a1d8 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 14 Mar 2019 18:53:06 +0700 Subject: [PATCH 015/125] protection of sensitive metadata --- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 96 ++++++++++++++++++++++++---------- cpp/src/parquet/metadata.h | 8 +-- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index f8e13c9214d..45200ff2d88 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -99,7 +99,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i); + auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption()); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index 25a1cabdc7f..0bb1a16cb4f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -162,10 +162,38 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { - const format::ColumnMetaData& meta_data = column->meta_data; - for (auto encoding : meta_data.encodings) { + + metadata_ = column->meta_data; + + if (column->__isset.crypto_metadata) { + format::ColumnCryptoMetaData ccmd = column->crypto_metadata; + + if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (file_decryption == NULLPTR) { + throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); + } + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); + if (key.empty()) { + throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); + } + + // TODO: get algorithm from FileCryptoMetadata??? 
+ auto encryption = std::make_shared(Encryption::AES_GCM_V1, key); + + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, encryption, false); + } + } + + for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; @@ -176,12 +204,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() const { return FromThrift(column_->meta_data.type); } + inline Type::type type() const { return FromThrift(metadata_.type); } - inline int64_t num_values() const { return column_->meta_data.num_values; } + inline int64_t num_values() const { return metadata_.num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(column_->meta_data.path_in_schema); + return std::make_shared(metadata_.path_in_schema); } // Check if statistics are set and are valid @@ -191,12 +219,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!column_->meta_data.__isset.statistics || + if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(column_->meta_data, descr_); + possible_stats_ = MakeColumnStats(metadata_, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -208,35 +236,35 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } inline Compression::type compression() const { - return FromThrift(column_->meta_data.codec); + return FromThrift(metadata_.codec); } const std::vector& encodings() const { return encodings_; } inline 
bool has_dictionary_page() const { - return column_->meta_data.__isset.dictionary_page_offset; + return metadata_.__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return column_->meta_data.dictionary_page_offset; + return metadata_.dictionary_page_offset; } - inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; } + inline int64_t data_page_offset() const { return metadata_.data_page_offset; } inline bool has_index_page() const { - return column_->meta_data.__isset.index_page_offset; + return metadata_.__isset.index_page_offset; } inline int64_t index_page_offset() const { - return column_->meta_data.index_page_offset; + return metadata_.index_page_offset; } inline int64_t total_compressed_size() const { - return column_->meta_data.total_compressed_size; + return metadata_.total_compressed_size; } inline int64_t total_uncompressed_size() const { - return column_->meta_data.total_uncompressed_size; + return metadata_.total_uncompressed_size; } inline std::unique_ptr crypto_metadata() const { @@ -252,23 +280,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; + format::ColumnMetaData metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) { + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption) : 
impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - writer_version))} {} + writer_version, file_decryption))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -345,7 +376,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i) { + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -353,7 +384,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_); + writer_version_, file_decryption); } private: @@ -384,8 +415,8 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const { - return impl_->ColumnChunk(i); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption) const { + return impl_->ColumnChunk(i, file_decryption); } // file metadata @@ -901,12 +932,22 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__isset.meta_data = false; // Thrift-serialize the ColumnMetaData structure, - // encrypt it with the column key, and write the result to the output stream - // (first length, then buffer) + // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - serializer.Serialize(&column_metadata_, sink, encrypt_props); + uint8_t* serialized_data; + uint32_t serialized_len; + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector 
encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( + encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); + // TODO + const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); + std::string encrypted_column_metadata(temp, encrypted_len); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. column_chunk_->__set_file_offset(metadata_start); @@ -924,6 +965,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; + column_metadata_ = column_chunk_->meta_data; column_metadata_.__set_type(ToThrift(column_->physical_type())); column_metadata_.__set_path_in_schema(column_->path()->ToDotVector()); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 97563570085..1548da2b39f 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -119,7 +119,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR); ~ColumnChunkMetaData(); @@ -148,7 +149,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -169,7 +171,7 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this 
object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i) const; + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, From 23130e73944af65f870d805192aa9477f06b6511 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 18 Mar 2019 13:16:41 +0700 Subject: [PATCH 016/125] fix duplication of variable i --- .../parquet/low-level-api/encryption-reader-writer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 76ba264780f..66c2d22f584 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); - for (unsigned i = 0; i < file_encryption_properties.size(); ++i) { + for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** PARQUET WRITER EXAMPLE **********************************************************************************/ @@ -110,7 +110,7 @@ int main(int argc, char** argv) { // Add writer properties parquet::WriterProperties::Builder builder; builder.compression(parquet::Compression::SNAPPY); - builder.encryption(file_encryption_properties[i]); + builder.encryption(file_encryption_properties[example_id]); std::shared_ptr props = builder.build(); @@ -226,7 +226,7 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption(file_decryption_properties[i]); + 
reader_properties.file_decryption(file_decryption_properties[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = @@ -453,7 +453,7 @@ int main(int argc, char** argv) { std::cerr << "Parquet read error: " << e.what() << std::endl; } - std::cout << "Example [" << (i+1) << "] Parquet Writing and Reading Complete" << std::endl; + std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; } return 0; } \ No newline at end of file From b221b96a98c28fdfdcf9a66b8fc83b5b8c0111f7 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 19 Mar 2019 16:25:13 +0700 Subject: [PATCH 017/125] column metadata encryption: read algorithm, aad from FileCryptoMetadata or FileMetadata --- cpp/src/parquet/file_reader.cc | 10 +++++++++- cpp/src/parquet/metadata.cc | 31 ++++++++++++++++++------------- cpp/src/parquet/metadata.h | 11 ++++++++--- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 45200ff2d88..17be7c5c70c 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -98,8 +98,16 @@ class SerializedRowGroup : public RowGroupReader::Contents { const ReaderProperties* properties() const override { return &properties_; } std::unique_ptr GetColumnPageReader(int i) override { + EncryptionAlgorithm algorithm; + if (file_crypto_metadata_) { + algorithm = file_crypto_metadata_->encryption_algorithm(); + } + else if (file_metadata_->is_plaintext_mode()) { + algorithm = file_metadata_->encryption_algorithm(); + } // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption()); + auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption(), + &algorithm); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 
0bb1a16cb4f..e261199cf2e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -163,7 +163,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR) + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -184,8 +185,9 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); } - // TODO: get algorithm from FileCryptoMetadata??? - auto encryption = std::make_shared(Encryption::AES_GCM_V1, key); + DCHECK(algorithm != NULLPTR); + // TODO: AAD + auto encryption = std::make_shared(algorithm->algorithm, key); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), @@ -288,18 +290,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption) { + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption)); + new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption, algorithm)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption) + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( 
reinterpret_cast(metadata), descr, - writer_version, file_decryption))} {} + writer_version, file_decryption, algorithm))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -376,7 +380,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) { + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -384,7 +389,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryption); + writer_version_, file_decryption, algorithm); } private: @@ -415,8 +420,9 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption) const { - return impl_->ColumnChunk(i, file_decryption); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) const { + return impl_->ColumnChunk(i, file_decryption, algorithm); } // file metadata @@ -940,11 +946,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); - // encrypt the footer key std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); unsigned encrypted_len = parquet_encryption::Encrypt( encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); - // TODO + const char* temp = 
const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 1548da2b39f..4ca7d056017 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -114,13 +114,16 @@ class PARQUET_EXPORT ColumnCryptoMetaData { std::unique_ptr impl_; }; +class FileCryptoMetaData; + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR); ~ColumnChunkMetaData(); @@ -150,7 +153,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -171,7 +175,8 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) const; + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, From 21b6dda0141fd1eca7cd1483d6904899057b7820 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 20 Mar 2019 19:28:46 
+0700 Subject: [PATCH 018/125] keep redacted metadata version for old readers --- cpp/src/parquet/metadata.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e261199cf2e..bc0520faa16 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -934,13 +934,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // key if ((footer_encryption == nullptr && encrypt_md->encrypted()) || !encrypt_md->encrypted_with_footer_key()) { - // don't set meta_data - column_chunk_->__isset.meta_data = false; - // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); - uint64_t metadata_start = sink->Tell(); uint8_t* serialized_data; uint32_t serialized_len; @@ -954,8 +950,22 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. 
- column_chunk_->__set_file_offset(metadata_start); + // Keep redacted metadata version for old readers + if (footer_encryption == nullptr) { + format::ColumnMetaData metadata_redacted = column_metadata_; + if (metadata_redacted.__isset.statistics) { + metadata_redacted.__isset.statistics = false; + } + if (metadata_redacted.__isset.encoding_stats) { + metadata_redacted.__isset.encoding_stats = false; + } + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); + } + else { + // don't set meta_data + column_chunk_->__isset.meta_data = true; + } } else { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); From 42e4b48e5933a7229890ec3a630ce08fc68a6a29 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 20 Mar 2019 20:08:22 +0700 Subject: [PATCH 019/125] hidden column exception --- cpp/src/parquet/encryption.h | 10 +++++++++- cpp/src/parquet/metadata.cc | 8 ++++---- cpp/src/parquet/properties.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 1dbf0d20f39..d4d30076cd0 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -18,10 +18,12 @@ #ifndef PARQUET_ENCRYPTION_H #define PARQUET_ENCRYPTION_H -#include #include #include +#include +#include "parquet/exception.h" + namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { @@ -50,6 +52,12 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { std::map key_map_; }; +class PARQUET_EXPORT HiddenColumnException : public ParquetException { + public: + HiddenColumnException(const std::string columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index bc0520faa16..69c094e7e94 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -179,11 
+179,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); - if (key.empty()) { - throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; std::cout << 1 << std::endl; + if (!file_decryption->HasColumnKey(path, key_metadata)) { + throw HiddenColumnException(path->ToDotString()); } + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); std::cout << 2 << std::endl; DCHECK(algorithm != NULLPTR); // TODO: AAD diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a6a744b1b35..1a1cab362ab 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -130,6 +130,18 @@ class PARQUET_EXPORT FileDecryptionProperties { column_keys_[columnPath.ToDotString()] = key; } + bool HasColumnKey(const std::shared_ptr& columnPath, + const std::string& key_metadata = "") { + if (key_metadata.empty()) { + auto search = column_keys_.find(columnPath->ToDotString()); + return search != column_keys_.end(); + } + if (key_retriever_ == NULLPTR) { + return false; + } + return key_retriever_->GetKey(key_metadata).empty(); + } + const std::string& GetColumnKey(const std::shared_ptr& columnPath, const std::string& key_metadata = "") { if (key_metadata.empty()) { From 593a45430ec3497f07c32a8b39e681316a41a84e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 21 Mar 2019 18:24:42 +0700 Subject: [PATCH 020/125] remove log --- cpp/src/parquet/encryption.h | 2 +- cpp/src/parquet/metadata.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index d4d30076cd0..028da9d28ae 
100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -54,7 +54,7 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { class PARQUET_EXPORT HiddenColumnException : public ParquetException { public: - HiddenColumnException(const std::string columnPath) + HiddenColumnException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 69c094e7e94..c89db8292eb 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -179,11 +179,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; std::cout << 1 << std::endl; + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; if (!file_decryption->HasColumnKey(path, key_metadata)) { throw HiddenColumnException(path->ToDotString()); } - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); std::cout << 2 << std::endl; + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); DCHECK(algorithm != NULLPTR); // TODO: AAD From 916b3620f4e5698302dfbe97f9be7c9846a7aa6b Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 21 Mar 2019 18:35:15 +0700 Subject: [PATCH 021/125] add example for hidden column --- .../low-level-api/encryption-reader-writer.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 66c2d22f584..27e179cfd7f 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -74,20 +74,30 @@ int main(int argc, char** argv) { 
std::make_shared(FOOTER_ENCRYPTION_KEY); decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); - // plain mode footer = unencrypted footer + // plaintext mode footer = unencrypted footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3; file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); std::shared_ptr decryption_properties_3 = std::make_shared(FOOTER_ENCRYPTION_KEY); + // plaintext mode footer, hidden column + parquet::FileEncryptionProperties::Builder file_encryption_builder_4; + file_encryption_builder_4.footer_key(FOOTER_ENCRYPTION_KEY, false); + file_encryption_builder_4.column_properties(encryption_cols, true); // reusing encryption_cols + + std::shared_ptr decryption_properties_4 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); file_encryption_properties.push_back(file_encryption_builder_3.build()); + file_encryption_properties.push_back(file_encryption_builder_4.build()); file_decryption_properties.push_back(decryption_properties_1); file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); + file_decryption_properties.push_back(decryption_properties_4); for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** @@ -449,6 +459,8 @@ int main(int argc, char** argv) { i++; } } + } catch (const parquet::HiddenColumnException& e) { + std::cerr << "Parquet read error: hidden column: " << e.what() << std::endl; } catch (const std::exception& e) { std::cerr << "Parquet read error: " << e.what() << std::endl; } From 991cf35162cef7a5d1d74e1a5bea2460ee841d26 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 29 Mar 2019 18:22:47 +0700 Subject: [PATCH 022/125] handle row group 
file_offset and total_compressed_size --- cpp/src/parquet/metadata.cc | 29 +++++++++++++++++++++++++++++ cpp/src/parquet/metadata.h | 1 + 2 files changed, 30 insertions(+) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c89db8292eb..b9f0dc71d36 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -378,6 +378,10 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + inline int64_t file_offset() const { return row_group_->file_offset; } + + inline int64_t total_compressed_size() const { return row_group_->total_compressed_size; } + inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, @@ -976,6 +980,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } const ColumnDescriptor* descr() const { return column_; } + int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } private: void Init(format::ColumnChunk* column_chunk) { @@ -1049,6 +1054,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { + return impl_->total_compressed_size(); +} + class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, @@ -1097,6 +1106,26 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { // columns"; // row_group_->__set_total_byte_size(total_byte_size); + + int64_t file_offset = 0; + int64_t total_compressed_size = 0; + + for (int i = 0; i < schema_->num_columns(); i++) { + if (!(row_group_->columns[i].file_offset >= 0)) { + std::stringstream ss; + ss << "Column " << i << " is not complete."; + throw ParquetException(ss.str()); + } + if (i == 0) { + file_offset = 
row_group_->columns[0].file_offset; + } + // sometimes column metadata is encrypted and not available to read, + // so we must get total_compressed_size from column builder + total_compressed_size += column_builders_[i]->total_compressed_size(); + } + + row_group_->__set_file_offset(file_offset); + row_group_->__set_total_compressed_size(total_compressed_size); row_group_->__set_total_byte_size(total_bytes_written); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 4ca7d056017..bdd0faeede7 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -280,6 +280,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + int64_t total_compressed_size() const; // commit the metadata void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, From f7b265fb23a19c238e9b66969a4c02bd866c33b1 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 15 Apr 2019 13:55:28 +0300 Subject: [PATCH 023/125] Apply API changes --- .../low-level-api/encryption-reader-writer.cc | 69 +- cpp/src/parquet/encryption.h | 6 + cpp/src/parquet/file_reader.cc | 44 +- cpp/src/parquet/file_writer.cc | 22 +- cpp/src/parquet/metadata.cc | 60 +- cpp/src/parquet/properties.h | 759 ++++++++++++------ cpp/src/parquet/schema.h | 7 + cpp/src/parquet/thrift.h | 6 +- cpp/src/parquet/types.h | 18 +- 9 files changed, 649 insertions(+), 342 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 27e179cfd7f..666c1a07e76 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -53,51 +53,58 @@ int main(int argc, char** argv) { std::vector> file_decryption_properties; // uniform encryption - 
parquet::FileEncryptionProperties::Builder file_encryption_builder_1; - file_encryption_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); - - std::shared_ptr decryption_properties_1 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; + decryption_properties_builder_1.withFooterKey(FOOTER_ENCRYPTION_KEY); // non-uniform with column keys - std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0("ba_field", true); - encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; + std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); + encryption_col_builder_0.withKey(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder_0.build(); - encryption_cols[encryption_col0->path()] = encryption_col0; + encryption_cols[path_ptr] = encryption_col0; + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); + decryption_col_builder2.withKey(COLUMN_ENCRYPTION_KEY); + decryption_cols[path_ptr] = decryption_col_builder2.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_2; - file_encryption_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_2.column_properties(encryption_cols, true); + file_encryption_builder_2.withEncryptedColumns(encryption_cols); - std::shared_ptr decryption_properties_2 = - std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + 
parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; + decryption_properties_builder_2.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_2.withColumnKeys(decryption_cols); - // plaintext mode footer = unencrypted footer - parquet::FileEncryptionProperties::Builder file_encryption_builder_3; - file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); + // plain mode footer = unencrypted footer + parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_3.withPlaintextFooter(); - std::shared_ptr decryption_properties_3 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder decryption_properties_builder_3; + decryption_properties_builder_3.withFooterKey(FOOTER_ENCRYPTION_KEY); - // plaintext mode footer, hidden column - parquet::FileEncryptionProperties::Builder file_encryption_builder_4; - file_encryption_builder_4.footer_key(FOOTER_ENCRYPTION_KEY, false); - file_encryption_builder_4.column_properties(encryption_cols, true); // reusing encryption_cols + // plaintext mode footer, hidden column + parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - std::shared_ptr decryption_properties_4 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_4.withPlaintextFooter(); + file_encryption_builder_4.withEncryptedColumns(encryption_cols); // reusing encryption_cols + parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; + decryption_properties_builder_4.withFooterKey(FOOTER_ENCRYPTION_KEY); file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); file_encryption_properties.push_back(file_encryption_builder_3.build()); file_encryption_properties.push_back(file_encryption_builder_4.build()); - file_decryption_properties.push_back(decryption_properties_1); - 
file_decryption_properties.push_back(decryption_properties_2); - file_decryption_properties.push_back(decryption_properties_3); - file_decryption_properties.push_back(decryption_properties_4); + file_decryption_properties.push_back(decryption_properties_builder_1.build()); + file_decryption_properties.push_back(decryption_properties_builder_2.build()); + file_decryption_properties.push_back(decryption_properties_builder_3.build()); + file_decryption_properties.push_back(decryption_properties_builder_4.build()); for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** @@ -468,4 +475,4 @@ int main(int argc, char** argv) { std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; } return 0; -} \ No newline at end of file +} diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 028da9d28ae..3902c4f11bd 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -58,6 +58,12 @@ class PARQUET_EXPORT HiddenColumnException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { + public: + KeyAccessDeniedException(const std::string &columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 17be7c5c70c..d362ca4f68d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -157,18 +157,26 @@ class SerializedRowGroup : public RowGroupReader::Contents { ? 
file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - Encryption::type algorithm = file_metadata_->is_plaintext_mode() + ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; auto footer_encryption = std::make_shared( - algorithm, footer_key, file_decryption->GetAad()); + algorithm, footer_key); return PageReader::Open(stream, col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -180,16 +188,18 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key - std::string column_key = - file_decryption->GetColumnKey(column_path, column_key_metadata); - + std::string column_key; + if (column_key_metadata.empty()) + column_key = file_decryption->getColumnKey(column_path); + else if (file_decryption->getKeyRetriever() != nullptr) + column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" + column_path->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, column_key, - 
file_decryption->GetAad()); + file_crypto_metadata_->encryption_algorithm().algorithm, column_key); return PageReader::Open(stream, col->num_values(), col->compression(), column_encryption, properties_.memory_pool()); @@ -294,7 +304,12 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided. Could not verify plaintext footer metadata"); } - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + std::string footer_key; + if (footer_key_metadata.empty()) + footer_key = file_decryption->getFooterKey(); + else if (file_decryption->getKeyRetriever() != nullptr) + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key.empty()) { throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); } @@ -353,14 +368,19 @@ class SerializedFile : public ParquetFileReader::Contents { auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided. Could not read encrypted footer metadata"); - } - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + } + std::string footer_key; + if (footer_key_metadata.empty()) + footer_key = file_decryption->getFooterKey(); + else if (file_decryption->getKeyRetriever() != nullptr) + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key.size() == 0) { throw ParquetException("Invalid footer encryption key. 
Could not parse footer metadata"); } auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + file_decryption->getAADPrefix()); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_encryption); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 037e5145a79..faa3dfdab9e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -270,7 +270,7 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encrypt_footer()) { + if (file_encryption->encryptedFooter()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -278,23 +278,27 @@ class FileSerializer : public ParquetFileWriter::Contents { auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + // TODO: Fix AAD calculation std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + std::make_shared(algorithm, + file_encryption->getFooterEncryptionKey()); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - + sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; - signing_encryption.algorithm = Encryption::AES_GCM_V1; - // TODO: AAD - file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->footer_key_metadata()); - - std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; + file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->getFooterSigningKeyMetadata ()); 
+ // TODO: Fix AAD calculation + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + std::shared_ptr footer_encryption = + std::make_shared(algorithm, + file_encryption->getFooterSigningKey()); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index b9f0dc71d36..48331ed6ffe 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -178,29 +178,42 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } // should decrypt metadata std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - if (!file_decryption->HasColumnKey(path, key_metadata)) { - throw HiddenColumnException(path->ToDotString()); - } - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); - - DCHECK(algorithm != NULLPTR); - // TODO: AAD - auto encryption = std::make_shared(algorithm->algorithm, key); - - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, encryption, false); + std::string key = file_decryption->getColumnKey(path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + try { + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + } catch (KeyAccessDeniedException e) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + if (key.empty ()) + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + + if (key.empty()) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); + } + DCHECK(algorithm != NULLPTR); + + // TODO: AAD + auto encryption = std::make_shared(algorithm->algorithm, key); + + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, encryption, false); } } - + for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; } - + // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -910,7 +923,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const auto& encrypt_md = properties_->column_encryption_props(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->encrypted()) { + if (!encrypt_md || !encrypt_md->isEncrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); @@ -920,12 +933,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypted with footer key format::ColumnCryptoMetaData ccmd; - if (encrypt_md->encrypted_with_footer_key()) { + if (encrypt_md->isEncryptedWithFooterKey()) { ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted 
with column key format::EncryptionWithColumnKey eck; - eck.__set_key_metadata(encrypt_md->key_metadata()); + eck.__set_key_metadata(encrypt_md->getKeyMetaData()); eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); @@ -936,8 +949,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->encrypted()) || - !encrypt_md->encrypted_with_footer_key()) { + if ((footer_encryption == nullptr && encrypt_md->isEncrypted()) || + !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); @@ -1277,7 +1290,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { // TODO: aad metadata //encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); - std::string key_metadata = file_encryption->footer_key_metadata(); + std::string key_metadata; + if (file_encryption->encryptedFooter()) + key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); + else + key_metadata = file_encryption->getFooterSigningKeyMetadata(); + if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1a1cab362ab..d99383e49fd 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -29,7 +29,7 @@ #include "parquet/schema.h" #include "parquet/types.h" #include "arrow/util/logging.h" - +#include "arrow/util/utf8.h" namespace parquet { struct ParquetVersion { @@ -38,141 +38,361 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 0; static bool 
DEFAULT_USE_BUFFERED_STREAM = false; +static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static const std::string NULL_STRING = ""; class PARQUET_EXPORT ColumnEncryptionProperties { public: class Builder { public: - Builder(const std::string& path, bool encrypt) - : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypt) {} - - Builder* key(const std::string& key) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - DCHECK(encrypt_); - encrypted_with_footer_key_ = false; + // Convenience builder for regular (not nested) columns. + Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + // Convenience builder for encrypted columns. + Builder(const std::shared_ptr& path) : Builder(path, true) {} + + // Set a column-specific key. + // If key is not set on an encrypted column, the column will + // be encrypted with the footer key. + // keyBytes Key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); key_ = key; return this; } - Builder* key_metadata(const std::string& key_id) { - DCHECK(!key_id.empty()); - key_metadata_ = key_id; + + // Set a key retrieval metadata. + // use either withKeyMetaData or withKeyID, not both + Builder* withKeyMetaData(const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; return this; } - - Builder* key_id(uint32_t key_id) { - std::string key_metadata = std::string(reinterpret_cast(&key_id), 4); - this->key_metadata(key_metadata); + + // Set a key retrieval metadata (converted from String). + // use either withKeyMetaData or withKeyID, not both + // key_id will be converted to metadata (UTF-8 array). 
+ Builder* withKeyID(std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), + key_id.size())) + throw ParquetException("key id should be in UTF8 encoding"); + + DCHECK(!key_id.empty()); + this->withKeyMetaData(key_id); return this; } - + std::shared_ptr build() { - return std::make_shared( - path_, encrypt_, encrypted_with_footer_key_, key_, key_metadata_); + return + std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, + column_path_, + key_, + key_metadata_)); } - - private: - std::string path_; - bool encrypt_; - bool encrypted_with_footer_key_; + + + private: + const std::shared_ptr column_path_; + bool encrypted_; std::string key_; std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} }; - + + const std::shared_ptr& getPath() { return column_path_; } + bool isEncrypted() const { return encrypted_; } + bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } + const std::string& getKey() const { return key_; } + const std::string& getKeyMetaData() const { return key_metadata_; } + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - ColumnEncryptionProperties(const std::string& path, bool encrypt, - bool encrypted_with_footer_key, const std::string& key, - const std::string& key_metadata) - : path_(path), - encrypt_(encrypt), - encrypted_with_footer_key_(encrypted_with_footer_key), - key_(key), - key_metadata_(key_metadata) {} - - const std::string& path() const { return path_; } - bool encrypted() const { return encrypt_; } - bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } - const std::string& key() const { return key_; } - const std::string& key_metadata() const { 
return key_metadata_; } - + private: - std::string path_; - bool encrypt_; + const std::shared_ptr column_path_; + bool encrypted_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; + + explicit ColumnEncryptionProperties(bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata):column_path_(column_path){ + DCHECK(column_path != nullptr); + if (!encrypted) + DCHECK(key.empty() && key_metadata.empty()); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) + DCHECK(key_metadata.empty()); + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; + } }; - -class PARQUET_EXPORT FileDecryptionProperties { + +class PARQUET_EXPORT ColumnDecryptionProperties { public: - FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) { - DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 || - footer_key_.length() == 32); - } - - FileDecryptionProperties(const std::shared_ptr& key_retriever) - : key_retriever_(key_retriever) {} - - void SetAad(const std::string& aad) { aad_ = aad; } - - void SetColumnKey(const std::string& name, const std::string& key) { - SetColumnKey(std::vector({name}), key); - } - - void SetColumnKey(const std::vector& paths, const std::string& key) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - schema::ColumnPath columnPath(paths); - - column_keys_[columnPath.ToDotString()] = key; - } - - bool HasColumnKey(const std::shared_ptr& columnPath, - const std::string& key_metadata = "") { - if (key_metadata.empty()) { - auto search = column_keys_.find(columnPath->ToDotString()); - return search != column_keys_.end(); + class Builder { + public: + // convenience builder for regular (not nested) columns. 
+ Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name)); + } + + Builder(const std::shared_ptr& path) + : column_path_(path) {} + + // Set an explicit column key. If applied on a file that contains key metadata for this column - + // the metadata will be ignored, the column will be decrypted with this key. + // key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); + key_ = key; + return this; } - if (key_retriever_ == NULLPTR) { - return false; + + std::shared_ptr build() { + return + std::shared_ptr(new ColumnDecryptionProperties(column_path_, + key_)); } - return key_retriever_->GetKey(key_metadata).empty(); + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& getPath() { return column_path_; } + const std::string& getKey() const { return key_; } + + private: + const std::shared_ptr column_path_; + std::string key_; + + // This class is only required for setting explicit column decryption keys - + // to override key retriever (or to provide keys when key metadata and/or + // key retriever are not available) + explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ + DCHECK(column_path != nullptr); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + key_ = key; } - - const std::string& GetColumnKey(const std::shared_ptr& columnPath, - const std::string& key_metadata = "") { - if (key_metadata.empty()) { - return column_keys_.at(columnPath->ToDotString()); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + // Verifies identity (AAD Prefix) of individual file, or of 
file collection in a data set. + // Throws exception if an AAD prefix is wrong. + // In a data set, AAD Prefixes should be collected, and then checked for missing files. + virtual void check(std::string aad_prefix) = 0; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class Builder { + public: + Builder(){ + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } - if (key_retriever_ == NULLPTR) { - throw ParquetException("no key retriever is provided for column key metadata"); + + // Set an explicit footer key. If applied on a file that contains footer key metadata - + // the metadata will be ignored, the footer will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from key retriever. + //param footerKey Key length must be either 16, 24 or 32 bytes. + Builder* withFooterKey(std::string footer_key) { + if (footer_key.empty ()) { + return this; + } + DCHECK(!footer_key.empty()); + footer_key_ = footer_key; + return this; } - return key_retriever_->GetKey(key_metadata); - } - - const std::string& GetFooterKey(const std::string& footer_key_metadata = "") { - if (footer_key_metadata.empty()) { - return footer_key_; + + // Set explicit column keys (decryption properties). + // Its also possible to set a key retriever on this property object. Upon file decryption, + // availability of explicit keys is checked before invocation of the retriever callback. + // If an explicit key is available for a footer or a column, its key metadata will + // be ignored. + Builder* withColumnKeys(const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_properties) { + if (column_properties.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = column_properties; + return this; + } + + // Set a key retriever callback. 
Its also possible to + // set explicit footer or column keys on this file property object. Upon file decryption, + // availability of explicit keys is checked before invocation of the retriever callback. + // If an explicit key is available for a footer or a column, its key metadata will + // be ignored. + Builder* withKeyRetriever(const std::shared_ptr& + key_retriever) { + if (key_retriever == NULLPTR) + return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; + } + + // Skip integrity verification of plaintext footers. + // If not called, integrity of plaintext footers will be checked in runtime, + // and an exception will be thrown in the following situations: + // - footer signing key is not available (not passed, or not found by key retriever) + // - footer content and signature don't match + Builder* withoutFooterSignatureVerification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + // Explicitly supply the file AAD prefix. + // A must when a prefix is used for file encryption, but not stored in file. + // If AAD prefix is stored in file, it will be compared to the explicitly + // supplied value and an exception will be thrown if they differ. + Builder* withAADPrefix(std::string aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + + aad_prefix_ = aad_prefix; + return this; + } + + // Set callback for verification of AAD Prefixes stored in file. 
+ Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) + return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; } - if (key_retriever_ == NULLPTR) { - throw ParquetException("no key retriever is provided for footer key metadata"); + + std::shared_ptr build() { + return + std::shared_ptr(new FileDecryptionProperties(footer_key_, + key_retriever_, + check_plaintext_footer_integrity_, + aad_prefix_, + aad_prefix_verifier_, + column_property_map_)); + } - return key_retriever_->GetKey(footer_key_metadata); + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + }; + + const std::string& getColumnKey(const std::shared_ptr& column_path) { + if (column_property_map_.find(column_path) != column_property_map_.end()) { + auto column_prop = column_property_map_[column_path]; + if (column_prop != nullptr) + return column_prop->getKey(); + } + return NULL_STRING; } - const std::string& GetAad() { return aad_; } - + + const std::string& getFooterKey() { + return footer_key_; + } + + const std::string& getAADPrefix() { return aad_prefix_; } + + std::shared_ptr getKeyRetriever() { + return key_retriever_; + } + + bool checkFooterIntegrity() { + return check_plaintext_footer_integrity_; + } + + const std::shared_ptr &getAADPrefixVerifier() { + return aad_prefix_verifier_; + } + private: std::string footer_key_; - std::string aad_; - - std::map column_keys_; - + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + + + FileDecryptionProperties(const std::string& 
footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); + if (!footer_key.empty()) + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + if (footer_key.empty() && check_plaintext_footer_integrity) + DCHECK(NULLPTR != key_retriever); + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_property_map_ = column_property_map; + } + }; - + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) @@ -223,10 +443,9 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static constexpr Encryption::type DEFAULT_ENCRYPTION_ALGORITHM = Encryption::AES_GCM_V1; -static constexpr int32_t MAXIMAL_KEY_METADATA_LENGTH = 256; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPT_THE_REST = true; +static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; class PARQUET_EXPORT ColumnProperties { public: @@ -278,181 +497,182 @@ class PARQUET_EXPORT ColumnProperties { class PARQUET_EXPORT FileEncryptionProperties { public: class Builder { - public: - Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) {} - - Builder(const std::string& key) - : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) { - 
DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - footer_key_ = key; + public: + Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; } - - Builder* algorithm(Encryption::type algorithm) { - algorithm_ = algorithm; + + // Create files with plaintext footer. + // If not called, the files will be created with encrypted footer (default). + Builder* withPlaintextFooter() { + encrypted_footer_ = false; return this; } - - Builder* footer_key(const std::string& key, bool encrypt_footer=true) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - footer_key_ = key; - encrypt_footer_ = encrypt_footer; + + // Set encryption algorithm. + // If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; return this; } - - Builder* footer_key_metadata(const std::string& key_metadata) { - DCHECK(!footer_key_.empty()); - DCHECK(!key_metadata.empty() && - key_metadata.length() < MAXIMAL_KEY_METADATA_LENGTH); - footer_key_metadata_ = key_metadata; + + // Set a key retrieval metadata (converted from String). + // use either withFooterKeyMetaData or withFooterKeyID, not both. + Builder* withFooterKeyID(std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), + key_id.size())) + throw ParquetException("footer key id should be in UTF8 encoding"); + + if (key_id.empty()) + return this; + + return withFooterKeyMetadata(key_id); + } + + // Set a key retrieval metadata. + // use either withFooterKeyMetaData or withFooterKeyID, not both. 
+ Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) + return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; return this; } - - Builder* aad(const std::string& aad) { - DCHECK(!aad.empty()); - aad_ = aad; + + // Set the file AAD Prefix. + Builder* withAADPrefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) + return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; return this; } - - Builder* aad_metadata(const std::string& aad_metadata) { - DCHECK(!aad_.empty()); - DCHECK(!aad_metadata.empty() && - aad_metadata.length() < MAXIMAL_AAD_METADATA_LENGTH); - aad_metadata_ = aad_metadata; + + // Skip storing AAD Prefix in file. + // If not called, and if AAD Prefix is set, it will be stored. + Builder* withoutAADPrefixStorage() { + store_aad_prefix_in_file_ = false; return this; } - - /** - * encrypt_the_rest will define if other columns (not defined in columns argument) - * will be encrypted or not - * if encrypt_the_rest = true, other columns will be encrypted with footer key - * else, other columns will be unencrypted - */ - Builder* column_properties( - const std::map>& - column_properties, - bool encrypt_the_rest = DEFAULT_ENCRYPT_THE_REST) { - encrypt_the_rest_ = encrypt_the_rest; - column_properties_ = column_properties; - - if (!footer_key_.empty()) { - for (const auto& col : column_properties) { - if (col.second->key().compare(footer_key_) != 0) { - break; - } - } - } else { - if (encrypt_the_rest) { - throw ParquetException("Encrypt the rest with null footer key"); - } - bool all_are_unencrypted = true; - for (const auto& col : column_properties) { - if (col.second->encrypted()) { - if (col.second->key().empty()) { - throw ParquetException("Encrypt column with null footer key"); - } - all_are_unencrypted = false; - } - } - - if (all_are_unencrypted) { - throw ParquetException("Footer and all 
columns unencrypted"); - } - } + + // Set the list of encrypted columns and their properties (keys etc). + // If not called, all columns will be encrypted with the footer key. + // If called, the file columns not in the list will be left unencrypted. + Builder* withEncryptedColumns(const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + encryptedColumns){ + if (encryptedColumns.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = encryptedColumns; return this; } - + std::shared_ptr build() { - std::shared_ptr footer_encryption; - if (!footer_key_.empty()) { - footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); - } - return std::make_shared( - footer_encryption, encrypt_footer_, footer_key_metadata_, aad_metadata_, - column_properties_, encrypt_the_rest_); + return + std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, + footer_key_, + footer_key_metadata_, + encrypted_footer_, + aad_prefix_, + store_aad_prefix_in_file_, + column_property_map_)); } - - private: - Encryption::type algorithm_; + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; std::string footer_key_; - bool encrypt_footer_; std::string footer_key_metadata_; - - std::string aad_; - std::string aad_metadata_; - - std::map> column_properties_; - bool encrypt_the_rest_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; }; - - FileEncryptionProperties( - const std::shared_ptr& footer_encryption, bool encrypt_footer, - const std::string& footer_key_metadata, const std::string& aad_metadata, - const std::map>& - column_properties, - bool encrypt_the_rest) - : footer_encryption_(footer_encryption), - encrypt_footer_(encrypt_footer), - footer_key_metadata_(footer_key_metadata), - aad_metadata_(aad_metadata), - 
column_properties_(column_properties), - encrypt_the_rest_(encrypt_the_rest) {} - - std::shared_ptr GetFooterEncryptionProperties() { - return footer_encryption_; + + bool encryptedFooter() const { return encrypted_footer_; } + + const EncryptionAlgorithm getAlgorithm() { + return algorithm_; } - - bool encrypt_footer() const { return encrypt_footer_; } - - const std::string& footer_key_metadata() const { return footer_key_metadata_; } - - const std::string& aad_metadata() const { return aad_metadata_; } - - std::shared_ptr GetColumnCryptoMetaData( - const std::shared_ptr& path) { - // non-uniform encryption - std::string path_str = path->ToDotString(); - if (column_properties_.find(path_str) != column_properties_.end()) { - return column_properties_[path_str]; - } - - // encrypted with footer key - if (encrypt_the_rest_) { - return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); - } - - // uniform encryption or unencrypted - return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); + + const std::string& getFooterEncryptionKey() { + return (encrypted_footer_? footer_key_ : NULL_STRING); } - - std::shared_ptr GetColumnEncryptionProperties( - const std::shared_ptr& path) { - // non-uniform encryption - std::string path_str = path->ToDotString(); - if (column_properties_.find(path_str) != column_properties_.end()) { - return std::make_shared(footer_encryption_->algorithm(), - column_properties_[path_str]->key(), - footer_encryption_->aad()); - } - - // encrypted with footer key - if (encrypt_the_rest_) { - return footer_encryption_; + + const std::string& getFooterEncryptionKeyMetadata() { + return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); + } + + const std::string& getFooterSigningKey() { + return (encrypted_footer_? NULL_STRING : footer_key_); + } + + const std::string& getFooterSigningKeyMetadata() { + return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); + } + + const std::string& getFileAAD() const { return file_AAD_; } + + std::shared_ptr + getColumnProperties(const std::shared_ptr& column_path) { + if (column_property_map_.size () == 0){ + auto builder = + std::shared_ptr( + new ColumnEncryptionProperties::Builder (column_path)); + return builder->build(); } - - // uniform encryption or unencrypted + if (column_property_map_.find(column_path) != column_property_map_.end()) + return column_property_map_[column_path]; + return NULLPTR; } - + private: - std::shared_ptr footer_encryption_; - bool encrypt_footer_; + EncryptionAlgorithm algorithm_; // encryption algorithm + std::string footer_key_; // encryption key, should have 16, 24, 32-byte length std::string footer_key_metadata_; - std::string aad_metadata_; - - std::map> column_properties_; - bool encrypt_the_rest_; + bool encrypted_footer_; + std::string file_AAD_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + FileEncryptionProperties(ParquetCipher::type cipher, + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_property_map) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_property_map_(column_property_map){ + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. 
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + + } }; - + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -690,10 +910,16 @@ class PARQUET_EXPORT WriterProperties { if (parquet_file_encryption_ == NULLPTR) { return NULLPTR; } else { - return parquet_file_encryption_->GetFooterEncryptionProperties(); + std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); + //TODO: Fix AAD calculation + if (footer_key.empty()) + footer_key = parquet_file_encryption_->getFooterSigningKey (); + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + footer_key, parquet_file_encryption_->getFileAAD()); + } } - + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -737,24 +963,43 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props( - const std::shared_ptr& path) const { + std::shared_ptr column_encryption_props(const + std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->GetColumnCryptoMetaData(path); + return parquet_file_encryption_->getColumnProperties(path); } else { return NULLPTR; } } std::shared_ptr encryption( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->GetColumnEncryptionProperties(path); + auto column_prop = parquet_file_encryption_->getColumnProperties(path); + if (column_prop == NULLPTR) + return NULLPTR; + if (column_prop->isEncryptedWithFooterKey()) { + //TODO: Fix AAD calculation + if (parquet_file_encryption_->encryptedFooter ()) { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterEncryptionKey(), + parquet_file_encryption_->getFileAAD()); + } else { + return 
std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterSigningKey(), + parquet_file_encryption_->getFileAAD()); + } + } + + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + column_prop->getKey(), + parquet_file_encryption_->getFileAAD()); + } else { return NULLPTR; } } - + private: explicit WriterProperties( ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 740edbc4904..f31a27e78dd 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -90,6 +90,13 @@ class PARQUET_EXPORT ColumnPath { std::string ToDotString() const; const std::vector& ToDotVector() const; + struct CmpColumnPath { + bool operator()(const std::shared_ptr& a, + const std::shared_ptr& b) const { + return a->ToDotString() < b->ToDotString(); + } + }; + protected: std::vector path_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 8eb872abbb0..26cf7ea0bda 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -102,11 +102,11 @@ static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encrypt EncryptionAlgorithm encryption_algorithm; if (encryption.__isset.AES_GCM_V1) { - encryption_algorithm.algorithm = Encryption::AES_GCM_V1; + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); } else { - encryption_algorithm.algorithm = Encryption::AES_GCM_CTR_V1; + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); } return encryption_algorithm; @@ -180,7 +180,7 @@ static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; - if (encryption.algorithm == Encryption::AES_GCM_V1) { + if 
(encryption.algorithm == ParquetCipher::AES_GCM_V1) { encryption_algorithm.__isset.AES_GCM_V1 = true; encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); } else { diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 67aafb5ccb3..8e16309eec0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -443,7 +443,7 @@ struct Compression { PARQUET_EXPORT std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec); -struct Encryption { +struct ParquetCipher { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; @@ -454,7 +454,7 @@ struct AadMetadata { }; struct EncryptionAlgorithm { - Encryption::type algorithm; + ParquetCipher::type algorithm; AadMetadata aad; }; @@ -469,7 +469,7 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; - EncryptionProperties(Encryption::type algorithm, const std::string& key, + EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, const std::string& aad = "") : algorithm_(algorithm), key_(key), aad_(aad) {} @@ -482,31 +482,31 @@ class PARQUET_EXPORT EncryptionProperties { int aad_length() const { return static_cast(aad_.length()); } uint8_t* aad_bytes() const { return str2bytes(aad_); } - Encryption::type algorithm() const { return algorithm_; } + ParquetCipher::type algorithm() const { return algorithm_; } const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { + if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { return plain_len + 28 + 4; - } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { return plain_len + 16 + 4; } return plain_len; } uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == 
Encryption::AES_GCM_V1) { + if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { return cipher_len - 28 - 4; - } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { return cipher_len - 16 - 4; } return cipher_len; } private: - Encryption::type algorithm_; // encryption algorithm + ParquetCipher::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length std::string aad_; // encryption additional authenticated data }; From c58af32eb137ac4dac0aefade17c5904e1908967 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 16 Apr 2019 11:20:05 +0300 Subject: [PATCH 024/125] Add AAD calculation --- cpp/src/parquet/column_reader.cc | 88 +++++++++++- cpp/src/parquet/column_reader.h | 4 +- cpp/src/parquet/column_writer.cc | 66 +++++++-- cpp/src/parquet/column_writer.h | 3 +- cpp/src/parquet/encryption.h | 8 +- cpp/src/parquet/file_reader.cc | 227 +++++++++++++++++++++++-------- cpp/src/parquet/file_writer.cc | 52 +++++-- cpp/src/parquet/metadata.cc | 190 +++++++++++++++++--------- cpp/src/parquet/metadata.h | 40 ++++-- cpp/src/parquet/properties.h | 43 +++++- cpp/src/parquet/thrift.h | 15 +- cpp/src/parquet/types.h | 6 +- 12 files changed, 568 insertions(+), 174 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8a642f1e2ca..efdc2a03a1d 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -94,6 +94,14 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; + //reset column_map and fileAAD as default_reader_properties is static but + //can be used when reading parquet file with different reading options. 
+ if (default_reader_properties.column_map() != NULLPTR + && default_reader_properties.column_map()->size () != 0) + default_reader_properties.column_map()->clear(); + if (!default_reader_properties.fileAAD().empty()) + default_reader_properties.set_fileAAD (""); + return default_reader_properties; } @@ -108,16 +116,35 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, + bool column_has_dictionary, int16_t row_group_ordinal, + int16_t column_ordinal, const std::shared_ptr encryption, ::arrow::MemoryPool* pool) - : : stream_(stream), + : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + first_page_(true), + column_has_dictionary_ (column_has_dictionary), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_ordinal), + page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), encryption_(encryption), decryption_buffer_(AllocateBuffer(pool, 0)) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); + if (encryption != NULLPTR) { + DCHECK (!encryption_->fileAAD().empty()); + //prepare the AAD for quick update later + data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + } } // Implement the PageReader interface @@ -135,6 +162,14 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; + bool first_page_; + bool column_has_dictionary_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; + int16_t page_ordinal_; + std::string data_pageAAD_; + std::string data_page_headerAAD_; + // Maximum allowed page size 
uint32_t max_page_header_size_; @@ -152,9 +187,21 @@ class SerializedPageReader : public PageReader { std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with + bool current_page_is_dictionary = false; + if (column_has_dictionary_ ){ + if (first_page_) { + current_page_is_dictionary = true; + first_page_ = false; + } else + page_ordinal_++; + } else + page_ordinal_++; + + while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; + std::string aad; // Page headers can be very large because of page statistics // We try to deserialize a larger buffer progressively @@ -169,9 +216,20 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, - encryption_); + if (encryption_!= NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, encryption_); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -189,7 +247,20 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - + if (encryption_!= NULLPTR){ + DCHECK(!encryption_->fileAAD().empty()); + if (current_page_is_dictionary){ + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption_->aad(data_pageAAD_); + } + } + // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -277,10 +348,13 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, const std::shared_ptr encryption, + Compression::type codec, bool column_has_dictionary, + int16_t row_group_ordinal, int16_t column_ordinal, + const std::shared_ptr encryption, ::arrow::MemoryPool* pool) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); + new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, + row_group_ordinal, column_ordinal, encryption, pool)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index a5f1c7b6bd1..b513595c833 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -81,7 +81,9 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, const std::shared_ptr& encryption = NULLPTR, + Compression::type codec, bool column_has_dictionary = false, + int16_t 
row_group_ordinal = -1, int16_t column_ordinal = -1, + const std::shared_ptr& encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 44b759a585a..c926671b4a5 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -128,7 +128,8 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata,int16_t row_group_ordinal, + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), @@ -138,7 +139,21 @@ class SerializedPageWriter : public PageWriter { data_page_offset_(0), total_uncompressed_size_(0), total_compressed_size_(0), - encryption_(encryption) { + page_ordinal_(0), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_chunk_ordinal), + encryption_(encryption){ + if (encryption != NULLPTR) { + //prepare the add for quick update later + data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -165,12 +180,21 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (encryption_.get()) { + encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + encrypted_data_buffer = 
std::static_pointer_cast( AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); output_data_len = parquet_encryption::Encrypt( encryption_, false, compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); + encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -201,9 +225,15 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - + auto props = metadata_->get_encryption_props(metadata_->descr()->path()); + if (props != nullptr){ + props->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + } // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get()); + metadata_->WriteTo(sink_.get(), props); } /** @@ -230,7 +260,6 @@ class SerializedPageWriter : public PageWriter { int64_t WriteDataPage(const CompressedDataPage& page) override { int64_t uncompressed_size = page.uncompressed_size(); std::shared_ptr compressed_data = page.buffer(); - format::DataPageHeader data_page_header; data_page_header.__set_num_values(page.num_values()); data_page_header.__set_encoding(ToThrift(page.encoding())); @@ -245,11 +274,15 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (encryption_.get()) { + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption_->aad(data_pageAAD_); encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); output_data_len = parquet_encryption::Encrypt( encryption_, false, compressed_data->data(), output_data_len, 
encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); } format::PageHeader page_header; @@ -272,6 +305,7 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); + page_ordinal_++; int64_t current_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(¤t_pos)); return current_pos - start_pos; @@ -298,6 +332,11 @@ class SerializedPageWriter : public PageWriter { int64_t data_page_offset_; int64_t total_uncompressed_size_; int64_t total_compressed_size_; + int16_t page_ordinal_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; + std::string data_pageAAD_; + std::string data_page_headerAAD_; std::unique_ptr thrift_serializer_; @@ -311,12 +350,14 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t current_column_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, pool)); + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, + row_group_ordinal, current_column_ordinal, pool)); } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -361,14 +402,19 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, + ColumnChunkMetaDataBuilder* metadata, int16_t 
row_group_ordinal, + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, encryption, metadata, pool)); + new SerializedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 7c601fe7499..545fb571f61 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -84,7 +84,8 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, + int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3902c4f11bd..2147900b1c5 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -60,10 +60,16 @@ class PARQUET_EXPORT HiddenColumnException : public ParquetException { class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { public: - KeyAccessDeniedException(const std::string &columnPath) + KeyAccessDeniedException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { + public: + UnsupportedOperationException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc 
b/cpp/src/parquet/file_reader.cc index d362ca4f68d..95285915c92 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -39,6 +39,8 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/memory.h" +#include "parquet/util/crypto.h" namespace parquet { @@ -89,7 +91,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { : source_(source), file_metadata_(file_metadata), file_crypto_metadata_(file_crypto_metadata), - properties_(props) { + properties_(props), + row_group_ordinal_((int16_t)row_group_number){ row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -105,10 +108,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { else if (file_metadata_->is_plaintext_mode()) { algorithm = file_metadata_->encryption_algorithm(); } + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = properties_.column_map(); // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption(), - &algorithm); - + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, + properties_.file_decryption(), + &algorithm, properties_.fileAAD(), + column_map); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -144,11 +151,17 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i/*column_ordinal*/, nullptr, properties_.memory_pool()); } // the column is encrypted - + std::string aad = parquet_encryption::createModuleAAD(properties_.fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + (int16_t)i, (int16_t)-1); + auto file_decryption = properties_.file_decryption(); // the column is encrypted 
with footer key @@ -160,25 +173,27 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); } if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - + ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? 
file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; + auto footer_encryption = std::make_shared( - algorithm, footer_key); + algorithm, footer_key, properties_.fileAAD(), aad); return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, footer_encryption, properties_.memory_pool()); } @@ -189,19 +204,37 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key; - if (column_key_metadata.empty()) + // first look if we already got the key from before + if (column_map != NULLPTR && column_map->find(column_path) != column_map->end()) { + column_key = column_map->at(column_path); + } + else { column_key = file_decryption->getColumnKey(column_path); - else if (file_decryption->getKeyRetriever() != nullptr) - column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + file_decryption->getKeyRetriever() != nullptr){ + try { + column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << " HiddenColumnException, path=" + column_path->ToDotString(); + throw HiddenColumnException(ss.str()); + } + } + } if (column_key.empty()) { - throw ParquetException("column is encrypted with null key, path=" + - column_path->ToDotString()); + throw HiddenColumnException("column is encrypted with null key, path=" + + column_path->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, column_key); - + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key, + properties_.fileAAD(), aad); + return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, column_encryption, properties_.memory_pool()); } @@ -211,6 +244,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; ReaderProperties properties_; + int16_t row_group_ordinal_; }; // ---------------------------------------------------------------------- @@ -296,30 +330,72 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext mode footer."); + throw ParquetException("Invalid parquet file. Cannot verify plaintext" + "mode footer."); } // get footer key std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. 
Could not verify plaintext footer metadata"); + throw ParquetException("No decryption properties are provided. " + "Could not verify plaintext footer metadata"); } - std::string footer_key; - if (footer_key_metadata.empty()) - footer_key = file_decryption->getFooterKey(); - else if (file_decryption->getKeyRetriever() != nullptr) - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } if (footer_key.empty()) { - throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); + throw ParquetException("Footer key unavailable. 
Could not verify plaintext " + "footer metadata"); } - // TODO: aad + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) { + aad_prefix_verifier->check(aad_prefix); + } + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, but not stored in " + "file and not supplied in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + + properties_.set_fileAAD(fileAAD); + std::string aad = parquet_encryption::createFooterAAD(fileAAD); auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key - ); - if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext footer metadata"); + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext" + " footer metadata"); } } } @@ -349,40 +425,83 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } } - + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. Could not read " + "encrypted footer metadata"); + } + uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) { + aad_prefix_verifier->check(aad_prefix); + } + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, but not stored in file " + "and not supplied in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + //save fileAAD for later use + properties_.set_fileAAD(fileAAD); + std::string aad = parquet_encryption::createFooterAAD(fileAAD); + int64_t metadata_offset = file_size - kFooterSize - 
footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid encrypted parquet file. Could not read footer metadata bytes."); + throw ParquetException("Invalid encrypted parquet file. " + "Could not read footer metadata bytes."); } - + // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - auto file_decryption = properties_.file_decryption(); - if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read encrypted footer metadata"); - } - std::string footer_key; - if (footer_key_metadata.empty()) - footer_key = file_decryption->getFooterKey(); - else if (file_decryption->getKeyRetriever() != nullptr) - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + std::string footer_key = file_decryption->getFooterKey(); + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } - if (footer_key.size() == 0) { - throw ParquetException("Invalid footer encryption key. Could not parse footer metadata"); + if (footer_key.empty()) { + throw ParquetException("Invalid footer encryption key. 
" + "Could not parse footer metadata"); } auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->getAADPrefix()); + file_crypto_metadata_->encryption_algorithm().algorithm, + footer_key, + fileAAD, aad); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index faa3dfdab9e..347c4541372 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -24,6 +24,8 @@ #include "parquet/deprecated_io.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/util/memory.h" +#include "parquet/util/crypto.h" using arrow::MemoryPool; @@ -78,13 +80,16 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) { class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, - RowGroupMetaDataBuilder* metadata, + RowGroupMetaDataBuilder* metadata, + int16_t row_group_ordinal, const WriterProperties* properties, bool buffered_row_group = false) + : sink_(sink), metadata_(metadata), properties_(properties), total_bytes_written_(0), closed_(false), + row_group_ordinal_ (row_group_ordinal), current_column_index_(0), num_rows_(0), buffered_row_group_(buffered_row_group) { @@ -125,8 +130,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, // TODO - properties_->memory_pool()); + properties_->encryption(column_descr->path()), col_meta, + row_group_ordinal_, (int16_t)(current_column_index_-1), + properties_->memory_pool()); + column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return 
column_writers_[0].get(); } @@ -181,7 +188,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { // Ensures all columns have been written metadata_->set_num_rows(num_rows_); - metadata_->Finish(total_bytes_written_); + metadata_->Finish(total_bytes_written_, row_group_ordinal_); } } @@ -191,6 +198,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const WriterProperties* properties_; int64_t total_bytes_written_; bool closed_; + int16_t row_group_ordinal_; int current_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; @@ -222,9 +230,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), + PageWriter::Open(sink_, properties_->compression(column_descr->path()), properties_->encryption(column_descr->path()), col_meta, + (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, properties_->memory_pool(), buffered_row_group_); + column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -277,28 +288,37 @@ class FileSerializer : public ParquetFileWriter::Contents { uint64_t metadata_start = static_cast(sink_->Tell()); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; - // TODO: Fix AAD calculation + + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); std::shared_ptr footer_encryption = std::make_shared(algorithm, - file_encryption->getFooterEncryptionKey()); + file_encryption->getFooterEncryptionKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t 
footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; + EncryptionAlgorithm algo = file_encryption->getAlgorithm(); + + signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->getFooterSigningKeyMetadata ()); - // TODO: Fix AAD calculation - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + + file_metadata_ = metadata_->Finish(&signing_encryption, + file_encryption->getFooterSigningKeyMetadata ()); + ParquetCipher::type algorithm = algo.algorithm; + std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); std::shared_ptr footer_encryption = std::make_shared(algorithm, - file_encryption->getFooterSigningKey()); + file_encryption->getFooterSigningKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } @@ -324,7 +344,9 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, properties_.get(), buffered_row_group)); + sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), + buffered_row_group)); + row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 48331ed6ffe..d8c91b8d384 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -162,11 +162,15 @@ class 
ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { - + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -177,43 +181,60 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); } // should decrypt metadata - std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::shared_ptr path = + std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key = file_decryption->getColumnKey(path); - // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ - try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); - } catch (KeyAccessDeniedException e) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + std::string key; + if (column_map != NULLPTR && (column_map->find(path) != column_map->end())) { + //First retrieve the key in column_map if exists + key = column_map->at(path); + } else { + key = file_decryption->getColumnKey(path); + // No explicit column key given via API. Retrieve via key metadata. + if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + try { + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + } catch (KeyAccessDeniedException &e) { + // Hidden column: encrypted, but key unavailable + std::stringstream ss; + ss << e.what(); + ss << " HiddenColumnException path=" + path->ToDotString(); + throw HiddenColumnException(ss.str()); + } + if (key.empty ()) + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + if (column_map != NULLPTR) { + // save column key for future use + (*column_map)[path]=key; } - if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); } - if (key.empty()) { // Hidden column: encrypted, but key unavailable throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); } - DCHECK(algorithm != NULLPTR); + DCHECK(algorithm != NULLPTR); + + std::string aad = parquet_encryption::createModuleAAD(fileAAD, + parquet_encryption::ColumnMetaData, + row_group_ordinal, + column_ordinal, (int16_t)-1); - // TODO: AAD - auto encryption = std::make_shared(algorithm->algorithm, key); + auto encryption = std::make_shared(algorithm->algorithm, + key, fileAAD, aad); uint32_t len = static_cast(column->encrypted_column_metadata.size()); 
DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), &len, &metadata_, encryption, false); } } - for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; } - + // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -302,23 +323,39 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) { + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption, algorithm)); + new ColumnChunkMetaData(metadata, descr, + row_group_ordinal, column_ordinal, + writer_version, file_decryption, + algorithm, fileAAD, column_map)); } - + ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) - : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), descr, - writer_version, file_decryption, algorithm))} {} + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map) +: impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), + descr, + row_group_ordinal, + column_ordinal, + writer_version, + file_decryption, algorithm, + fileAAD, 
column_map))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} - + // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -397,8 +434,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) { + std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -406,7 +448,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryption, algorithm); + row_group_ordinal, (int16_t)i, + writer_version_, file_decryption, algorithm, fileAAD, column_map); } private: @@ -437,9 +480,16 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) const { - return impl_->ColumnChunk(i, file_decryption, algorithm); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, + int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, + schema::ColumnPath::CmpColumnPath>> column_map) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, + fileAAD, column_map); } // file metadata @@ 
-479,11 +529,13 @@ class FileMetaData::FileMetaDataImpl { uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); - uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt( - serialized_data, serialized_len, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); - + uint32_t encrypted_len = + parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, + encryption->key_bytes(), + encryption->key_length(), + encryption->aad_bytes(), + encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } @@ -691,7 +743,13 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryption) const { + if (encryption != nullptr) + encryption->aad(parquet_encryption::createModuleAAD(encryption->fileAAD(), + parquet_encryption::Footer, + (int16_t)-1, (int16_t)-1, + (int16_t)-1)); return impl_->WriteTo(dst, encryption); } @@ -918,7 +976,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_metadata_.__set_encodings(thrift_encodings); } - void WriteTo(::arrow::io::OutputStream* sink) { + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -953,20 +1012,20 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata - auto encrypt_props = 
properties_->encryption(column_->path()); + auto encrypt_props = encryption; + uint8_t* serialized_data; + uint32_t serialized_len; - uint8_t* serialized_data; - uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + // encrypt the footer key std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); - const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Keep redacted metadata version for old readers if (footer_encryption == nullptr) { format::ColumnMetaData metadata_redacted = column_metadata_; @@ -992,6 +1051,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } + std::shared_ptr + get_encryption_props(const std::shared_ptr& path) { + return properties_->encryption(path); + } + const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } @@ -1045,6 +1109,11 @@ void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); } +std::shared_ptr +ColumnChunkMetaDataBuilder::get_encryption_props(const std::shared_ptr& path) { + return impl_->get_encryption_props(path); +} + void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, @@ -1055,10 +1124,11 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); } -void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { - impl_->WriteTo(sink); +void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption) { + 
impl_->WriteTo(sink, encryption); } - + const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } @@ -1097,7 +1167,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int current_column() { return current_column_; } - void Finish(int64_t total_bytes_written) { + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { if (!(current_column_ == schema_->num_columns())) { std::stringstream ss; ss << "Only " << current_column_ - 1 << " out of " << schema_->num_columns() @@ -1119,10 +1189,8 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { // columns"; // row_group_->__set_total_byte_size(total_byte_size); - int64_t file_offset = 0; int64_t total_compressed_size = 0; - for (int i = 0; i < schema_->num_columns(); i++) { if (!(row_group_->columns[i].file_offset >= 0)) { std::stringstream ss; @@ -1140,6 +1208,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { row_group_->__set_file_offset(file_offset); row_group_->__set_total_compressed_size(total_compressed_size); row_group_->__set_total_byte_size(total_bytes_written); + row_group_->__set_ordinal(row_group_ordinal); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -1187,8 +1256,8 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { - impl_->Finish(total_bytes_written); +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { + impl_->Finish(total_bytes_written, row_group_ordinal); } // file metadata @@ -1284,12 +1353,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption = properties_->file_encryption(); auto footer_encryption = properties_->footer_encryption(); - // build format::FileCryptoMetaData - EncryptionAlgorithm encryption_algorithm; - encryption_algorithm.algorithm = footer_encryption->algorithm(); - // TODO: aad 
metadata - //encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); std::string key_metadata; if (file_encryption->encryptedFooter()) key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index bdd0faeede7..65f3f417a5f 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -22,11 +22,13 @@ #include #include #include - +#include + #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "parquet/platform.h" +#include "parquet/schema.h" #include "parquet/properties.h" #include "parquet/types.h" @@ -36,6 +38,7 @@ class ColumnDescriptor; class EncodedStatistics; class Statistics; class SchemaDescriptor; +class FileCryptoMetaData; namespace schema { @@ -114,16 +117,18 @@ class PARQUET_EXPORT ColumnCryptoMetaData { std::unique_ptr impl_; }; -class FileCryptoMetaData; - class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string &fileAAD = "", + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR); ~ColumnChunkMetaData(); @@ -152,9 +157,14 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = 
NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR); + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -175,8 +185,14 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) const; + std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string &fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -280,6 +296,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + int64_t total_compressed_size() const; // commit the metadata void Finish(int64_t num_values, int64_t dictonary_page_offset, @@ -291,7 +308,10 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { const void* contents() const; // For writing metadata at end of column chunk - void WriteTo(::arrow::io::OutputStream* sink); + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption = NULLPTR); + std::shared_ptr + get_encryption_props(const std::shared_ptr& path); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, @@ -320,7 +340,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { void set_num_rows(int64_t num_rows); 
// commit the metadata - void Finish(int64_t total_bytes_written); + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1); private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d99383e49fd..ba0b57a201c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "parquet/encryption.h" #include "parquet/exception.h" @@ -399,6 +401,10 @@ class PARQUET_EXPORT ReaderProperties { : pool_(pool) { buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM; buffer_size_ = DEFAULT_BUFFER_SIZE; + column_map_ = std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>()); } ::arrow::MemoryPool* memory_pool() const { return pool_; } @@ -416,6 +422,15 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { + return column_map_; + } + + const std::string& fileAAD() { return fileAAD_; } + + void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } + void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -427,6 +442,10 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size_; bool buffered_stream_enabled_; std::shared_ptr file_decryption_; + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between + //ColumnPath and their encryption keys + std::string fileAAD_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -446,6 +465,7 @@ static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOM static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t 
MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; +static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnProperties { public: @@ -669,8 +689,27 @@ class PARQUET_EXPORT FileEncryptionProperties { DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + + uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; + memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); + RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH) ; - } + bool supply_aad_prefix = false; + if (aad_prefix.empty()) + file_AAD_ = aad_file_unique_str; + else { + file_AAD_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } + } }; class PARQUET_EXPORT WriterProperties { @@ -911,7 +950,6 @@ class PARQUET_EXPORT WriterProperties { return NULLPTR; } else { std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); - //TODO: Fix AAD calculation if (footer_key.empty()) footer_key = parquet_file_encryption_->getFooterSigningKey (); return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, @@ -979,7 +1017,6 @@ class PARQUET_EXPORT WriterProperties { if (column_prop == NULLPTR) return NULLPTR; if (column_prop->isEncryptedWithFooterKey()) { - //TODO: Fix AAD calculation if (parquet_file_encryption_->encryptedFooter ()) { return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, parquet_file_encryption_->getFooterEncryptionKey(), diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h 
index 26cf7ea0bda..4189d2c9079 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -164,17 +164,17 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; - aesGcmV1.aad_prefix = aad.aad_prefix; - aesGcmV1.aad_file_unique = aad.aad_file_unique; - aesGcmV1.supply_aad_prefix = aad.supply_aad_prefix; + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); return aesGcmV1; } static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; - aesGcmCtrV1.aad_prefix = aad.aad_prefix; - aesGcmCtrV1.aad_file_unique = aad.aad_file_unique; - aesGcmCtrV1.supply_aad_prefix = aad.supply_aad_prefix; + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); return aesGcmCtrV1; } @@ -237,9 +237,10 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } + *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); + } } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 8e16309eec0..8f4ba5f212a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -470,8 +470,8 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, - const std::string& aad = "") - : algorithm_(algorithm), key_(key), aad_(aad) {} + const std::string& file_aad, const std::string& aad = "") + : algorithm_(algorithm), 
key_(key), file_aad_(file_aad), aad_(aad) {} ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } @@ -486,6 +486,7 @@ class PARQUET_EXPORT EncryptionProperties { const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } + const std::string& fileAAD() const { return file_aad_; } uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { @@ -508,6 +509,7 @@ class PARQUET_EXPORT EncryptionProperties { private: ParquetCipher::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length + std::string file_aad_; std::string aad_; // encryption additional authenticated data }; From c8f1a8a11ea95e4a47abfa1ca9edbff5aea25e05 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 16 Apr 2019 11:31:00 +0300 Subject: [PATCH 025/125] Fix parquet tests to work with the changes required to support AAD --- cpp/src/parquet/column_writer-test.cc | 2 +- cpp/src/parquet/statistics-test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index a0b916db667..63179beab30 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -245,7 +245,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, -1, -1, &app_version); return metadata_accessor->is_stats_set(); } diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index fa1caa96d31..a218accb017 100644 --- a/cpp/src/parquet/statistics-test.cc +++ 
b/cpp/src/parquet/statistics-test.cc @@ -519,7 +519,7 @@ void AssertStatsSet(const ApplicationVersion& version, const ColumnDescriptor* column, bool expected_is_set) { auto metadata_builder = ColumnChunkMetaDataBuilder::Make(props, column); auto column_chunk = - ColumnChunkMetaData::Make(metadata_builder->contents(), column, &version); + ColumnChunkMetaData::Make(metadata_builder->contents(), column, -1, -1, &version); EncodedStatistics stats; stats.set_is_signed(false); metadata_builder->SetStatistics(stats); From 681209e7b1dbf6e45fa945c07bd8eb5e4bf095e8 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 19 Apr 2019 17:05:47 +0700 Subject: [PATCH 026/125] verify plaintext footer depends on config of decryption properties --- cpp/src/parquet/file_reader.cc | 78 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 95285915c92..8167242872e 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -329,37 +329,11 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { - if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext" - "mode footer."); - } - // get footer key - std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. 
" - "Could not verify plaintext footer metadata"); - } - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. Could not verify plaintext " - "footer metadata"); + throw ParquetException("No decryption properties are provided"); } + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); bool supply_aad_prefix = algo.aad.supply_aad_prefix; std::string aad_file_unique = algo.aad.aad_file_unique; @@ -387,15 +361,45 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - properties_.set_fileAAD(fileAAD); - std::string aad = parquet_encryption::createFooterAAD(fileAAD); - auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); - if (! file_metadata_->verify(encryption, metadata_buffer->data() - + read_metadata_len, 28)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + properties_.set_fileAAD(fileAAD); + if (file_decryption->checkFooterIntegrity()) { + if (metadata_len - read_metadata_len != 28) { + throw ParquetException("Invalid parquet file. 
Cannot verify plaintext" + "mode footer."); + } + + // get footer key + std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Footer key unavailable. Could not verify plaintext " + "footer metadata"); + } + + std::string aad = parquet_encryption::createFooterAAD(fileAAD); + auto encryption = std::make_shared( + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext" + " footer metadata"); + } } } } From 0a2351d87dfaccbc766f84e737688574292db046 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 22 Apr 2019 07:54:49 +0300 Subject: [PATCH 027/125] Fix code style --- cpp/src/parquet/column_reader.cc | 71 +++---- cpp/src/parquet/column_writer.cc | 71 ++++--- cpp/src/parquet/encryption.h | 2 +- cpp/src/parquet/file_reader.cc | 215 ++++++++++---------- cpp/src/parquet/file_writer.cc | 67 ++++--- cpp/src/parquet/metadata.cc | 222 +++++++++++---------- cpp/src/parquet/metadata.h | 20 +- cpp/src/parquet/properties.h | 326 +++++++++++++++---------------- cpp/src/parquet/schema.h | 2 +- cpp/src/parquet/types.h | 15 +- 10 files changed, 514 insertions(+), 497 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index efdc2a03a1d..e7f3a5d90a1 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -94,14 +94,14 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; - //reset column_map and fileAAD as default_reader_properties is static but - //can be used when reading parquet file with different reading options. + // reset column_map and fileAAD as default_reader_properties is static but + // can be used when reading parquet file with different reading options. 
if (default_reader_properties.column_map() != NULLPTR && default_reader_properties.column_map()->size () != 0) default_reader_properties.column_map()->clear(); - if (!default_reader_properties.fileAAD().empty()) + if (!default_reader_properties.fileAAD().empty()) default_reader_properties.set_fileAAD (""); - + return default_reader_properties; } @@ -136,14 +136,16 @@ class SerializedPageReader : public PageReader { if (encryption != NULLPTR) { DCHECK (!encryption_->fileAAD().empty()); //prepare the AAD for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); } } @@ -192,7 +194,7 @@ std::shared_ptr SerializedPageReader::NextPage() { if (first_page_) { current_page_is_dictionary = true; first_page_ = false; - } else + } else page_ordinal_++; } else page_ordinal_++; @@ -216,20 +218,20 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - if (encryption_!= NULLPTR) { - if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); - } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - encryption_->aad(data_page_headerAAD_); - } 
- } - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, encryption_); + if (encryption_!= NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, encryption_); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -250,17 +252,18 @@ std::shared_ptr SerializedPageReader::NextPage() { if (encryption_!= NULLPTR){ DCHECK(!encryption_->fileAAD().empty()); if (current_page_is_dictionary){ - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); + aad = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); } else { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); encryption_->aad(data_pageAAD_); } } - + // Read the compressed data page. 
std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index c926671b4a5..fcb7768619a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -129,7 +129,7 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata,int16_t row_group_ordinal, - int16_t column_chunk_ordinal, + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), @@ -145,14 +145,16 @@ class SerializedPageWriter : public PageWriter { encryption_(encryption){ if (encryption != NULLPTR) { //prepare the add for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -180,10 +182,11 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (encryption_.get()) { - encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption_->aad( + 
parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); encrypted_data_buffer = std::static_pointer_cast( AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); @@ -191,10 +194,11 @@ class SerializedPageWriter : public PageWriter { encryption_, false, compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption_->aad( + parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -227,10 +231,11 @@ class SerializedPageWriter : public PageWriter { fallback); auto props = metadata_->get_encryption_props(metadata_->descr()->path()); if (props != nullptr){ - props->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + props->aad( + parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), props); @@ -281,7 +286,8 @@ class SerializedPageWriter : public PageWriter { encryption_, false, compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); encryption_->aad(data_page_headerAAD_); } @@ -353,11 +359,12 @@ class BufferedPageWriter : public PageWriter { 
ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : final_sink_(sink), metadata_(metadata) { - in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, - row_group_ordinal, current_column_ordinal, pool)); + : final_sink_(sink), + metadata_(metadata) { + in_memory_sink_ = CreateOutputStream(pool); + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, + row_group_ordinal, current_column_ordinal, pool)); } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -407,14 +414,14 @@ std::unique_ptr PageWriter::Open( bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, - row_group_ordinal, column_chunk_ordinal, - pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } else { return std::unique_ptr( new SerializedPageWriter(sink, codec, encryption, metadata, - row_group_ordinal, column_chunk_ordinal, - pool)); + row_group_ordinal, column_chunk_ordinal, + pool)); } } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2147900b1c5..3a4481bd4aa 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -69,7 +69,7 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { UnsupportedOperationException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; - + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8167242872e..cd06801e35f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -109,13 +109,14 @@ class SerializedRowGroup 
: public RowGroupReader::Contents { algorithm = file_metadata_->encryption_algorithm(); } std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map = properties_.column_map(); // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, properties_.file_decryption(), - &algorithm, properties_.fileAAD(), - column_map); + &algorithm, + properties_.fileAAD(), + column_map); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -157,10 +158,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad = parquet_encryption::createModuleAAD(properties_.fileAAD(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - (int16_t)i, (int16_t)-1); + std::string aad = parquet_encryption::createModuleAAD( + properties_.fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + (int16_t)i, (int16_t)-1); auto file_decryption = properties_.file_decryption(); @@ -173,22 +175,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey( + footer_key_metadata); } if 
(footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - auto footer_encryption = std::make_shared( algorithm, footer_key, properties_.fileAAD(), aad); @@ -197,7 +198,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { footer_encryption, properties_.memory_pool()); } - // file is non-uniform encrypted and the column is encrypted with its own key + // file is non-uniform encrypted and the column + // is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = @@ -205,28 +207,29 @@ class SerializedRowGroup : public RowGroupReader::Contents { // encrypted with column key std::string column_key; // first look if we already got the key from before - if (column_map != NULLPTR && column_map->find(column_path) != column_map->end()) { + if (column_map != NULLPTR + && column_map->find(column_path) != column_map->end()) { column_key = column_map->at(column_path); - } - else { + } else { column_key = file_decryption->getColumnKey(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - file_decryption->getKeyRetriever() != nullptr){ - try { - column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << " HiddenColumnException, path=" + column_path->ToDotString(); - throw HiddenColumnException(ss.str()); - } - } + file_decryption->getKeyRetriever() != nullptr){ + try { + column_key = file_decryption->getKeyRetriever()->GetKey( + column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + + column_path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } + } } - if (column_key.empty()) { throw HiddenColumnException("column is encrypted with null key, path=" + - column_path->ToDotString()); + column_path->ToDotString()); } auto column_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, @@ -234,7 +237,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.fileAAD(), aad); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, column_encryption, properties_.memory_pool()); } @@ -334,33 +337,34 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } - EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if 
(file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties is not the same"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); - if (aad_prefix_verifier != NULLPTR) { - aad_prefix_verifier->check(aad_prefix); + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and " + "in properties is not the same"); } - } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { - throw ParquetException("AAD prefix used for file encryption, but not stored in " - "file and not supplied in decryption properties"); - } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) + aad_prefix_verifier->check(aad_prefix); + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + properties_.set_fileAAD(fileAAD); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { @@ -373,32 +377,31 @@ class SerializedFile : 
public ParquetFileReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); if (file_decryption->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + footer_key = + file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; + ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. Could not verify plaintext " - "footer metadata"); + throw ParquetException("Footer key unavailable. Could not verify " + "plaintext footer metadata"); } - std::string aad = parquet_encryption::createFooterAAD(fileAAD); auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + " footer metadata"); } } } @@ -431,40 +434,40 @@ class SerializedFile : public ParquetFileReader::Contents { } auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read " - "encrypted footer metadata"); + throw ParquetException("No decryption properties are provided. 
Could not read " + "encrypted footer metadata"); } - uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); bool supply_aad_prefix = algo.aad.supply_aad_prefix; std::string aad_file_unique = algo.aad.aad_file_unique; std::string aad_prefix = algo.aad.aad_prefix; if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); + throw ParquetException("Unsupported algorithm"); if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties is not the same"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); - if (aad_prefix_verifier != NULLPTR) { - aad_prefix_verifier->check(aad_prefix); - } + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties " + "is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) + aad_prefix_verifier->check(aad_prefix); } if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { - throw ParquetException("AAD prefix used for file encryption, but not stored in file " - "and not supplied in decryption properties"); + throw ParquetException("AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string fileAAD; if (!supply_aad_prefix) fileAAD = aad_prefix + aad_file_unique; else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - //save fileAAD for later use + // save fileAAD for later use 
properties_.set_fileAAD(fileAAD); std::string aad = parquet_encryption::createFooterAAD(fileAAD); @@ -475,37 +478,37 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { throw ParquetException("Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + "Could not read footer metadata bytes."); } - + // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); std::string footer_key = file_decryption->getFooterKey(); if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; - throw ParquetException(ss.str()); - } + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = + file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n";; + throw ParquetException(ss.str()); + } } - if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); + throw ParquetException("Invalid footer encryption key. 
" + "Could not parse footer metadata"); } auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key, - fileAAD, aad); - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_key, + fileAAD, aad); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), + &metadata_len, + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 347c4541372..3111f745222 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -82,8 +82,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false) - + const WriterProperties* properties, + bool buffered_row_group = false) : sink_(sink), metadata_(metadata), properties_(properties), @@ -130,10 +130,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, - row_group_ordinal_, (int16_t)(current_column_index_-1), - properties_->memory_pool()); - + properties_->encryption(column_descr->path()), + col_meta, row_group_ordinal_, + (int16_t)(current_column_index_-1), + properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -230,12 +230,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, - 
(int16_t)row_group_ordinal_, - (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_); - + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), + col_meta, (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, + properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -278,9 +277,8 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*file_metadata_, sink_.get()); - } - else { + WriteFileMetaData(*metadata, sink_.get()); + } else { if (file_encryption->encryptedFooter()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -288,37 +286,38 @@ class FileSerializer : public ParquetFileWriter::Contents { uint64_t metadata_start = static_cast(sink_->Tell()); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = - std::make_shared(algorithm, - file_encryption->getFooterEncryptionKey(), - file_encryption->getFileAAD(), aad); + + ParquetCipher::type algorithm = + file_encryption->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD( + file_encryption->getFileAAD()); + std::shared_ptr footer_encryption = std::make_shared( + algorithm, + file_encryption->getFooterEncryptionKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); 
sink_->Write(PARQUET_EMAGIC, 4); - } - else { + } else { // footer plain mode EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption->getAlgorithm(); - signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - - file_metadata_ = metadata_->Finish(&signing_encryption, - file_encryption->getFooterSigningKeyMetadata ()); - ParquetCipher::type algorithm = algo.algorithm; - std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = - std::make_shared(algorithm, - file_encryption->getFooterSigningKey(), - file_encryption->getFileAAD(), aad); + file_metadata_ = metadata_->Finish( + &signing_encryption, + file_encryption->getFooterSigningKeyMetadata ()); + ParquetCipher::type algorithm = algo.algorithm; + std::string aad = parquet_encryption::createFooterAAD( + file_encryption->getFileAAD()); + std::shared_ptr footer_encryption = std::make_shared( + algorithm, + file_encryption->getFooterSigningKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d8c91b8d384..5b745382310 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -160,17 +160,18 @@ const std::string& ColumnCryptoMetaData::key_metadata() const { // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: - explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = 
NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { + explicit ColumnChunkMetaDataImpl( + const format::ColumnChunk* column, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -182,51 +183,60 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } // should decrypt metadata std::shared_ptr path = - std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key; - if (column_map != NULLPTR && (column_map->find(path) != column_map->end())) { + std::string key; + if (column_map != NULLPTR + && (column_map->find(path) != column_map->end())) { //First retrieve the key in column_map if exists - key = column_map->at(path); - } else { + key = column_map->at(path); + } else { key = file_decryption->getColumnKey(path); // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + if (key.empty() && !key_metadata.empty() + && file_decryption->getKeyRetriever() != nullptr){ try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); } catch (KeyAccessDeniedException &e) { // Hidden column: encrypted, but key unavailable - std::stringstream ss; - ss << e.what(); - ss << " HiddenColumnException path=" + path->ToDotString(); - throw HiddenColumnException(ss.str()); - } + std::stringstream ss; + ss << "HiddenColumnException path=" + path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + throw HiddenColumnException("HiddenColumnException path=" + + path->ToDotString()); + } + if (column_map != NULLPTR) { + // save column key for future use + (*column_map)[path]=key; } - if (column_map != NULLPTR) { - // save column key for future use - (*column_map)[path]=key; - } - } - if (key.empty()) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); - } + } + if (key.empty()) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path= " + + path->ToDotString()); + } DCHECK(algorithm != NULLPTR); - std::string aad = parquet_encryption::createModuleAAD(fileAAD, - parquet_encryption::ColumnMetaData, - row_group_ordinal, - column_ordinal, (int16_t)-1); - - auto encryption = std::make_shared(algorithm->algorithm, - key, fileAAD, aad); - - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, encryption, false); + std::string aad = parquet_encryption::createModuleAAD( + fileAAD, + 
parquet_encryption::ColumnMetaData, + row_group_ordinal, + column_ordinal, (int16_t)-1); + auto encryption = std::make_shared( + algorithm->algorithm, + key, fileAAD, aad); + uint32_t len = + static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + reinterpret_cast( + column->encrypted_column_metadata.c_str()), + &len, &metadata_, + encryption, + false); } } for (auto encoding : metadata_.encodings) { @@ -234,7 +244,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } possible_stats_ = nullptr; } - // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -331,31 +340,32 @@ std::unique_ptr ColumnChunkMetaData::Make( std::shared_ptr, std::string, schema::ColumnPath::CmpColumnPath>> column_map) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, - row_group_ordinal, column_ordinal, - writer_version, file_decryption, - algorithm, fileAAD, column_map)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, + column_ordinal, writer_version, + file_decryption, algorithm, fileAAD, + column_map)); } - -ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) -: impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + +ColumnChunkMetaData::ColumnChunkMetaData( + const void* metadata, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> 
column_map) + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, + file_decryption, algorithm, fileAAD, column_map))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} - // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -434,22 +444,25 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, fileAAD, column_map); + return ColumnChunkMetaData::Make( + &row_group_->columns[i], schema_->Column(i), + row_group_ordinal, (int16_t)i, + writer_version_, file_decryption, algorithm, fileAAD, + column_map); } private: @@ -480,16 +493,13 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, - int16_t row_group_ordinal, - 
FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, - schema::ColumnPath::CmpColumnPath>> column_map) const { +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, const std::string& fileAAD, + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - fileAAD, column_map); + fileAAD, column_map); } // file metadata @@ -497,8 +507,9 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = nullptr) + explicit FileMetaDataImpl( + const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, @@ -531,11 +542,11 @@ class FileMetaData::FileMetaDataImpl { std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, - encryption->key_bytes(), - encryption->key_length(), - encryption->aad_bytes(), - encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); + encryption->key_bytes(), + encryption->key_length(), + encryption->aad_bytes(), + encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } @@ -560,7 +571,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { + void WriteTo(::arrow::io::OutputStream* dst, 
+ const std::shared_ptr& encryption) const { ThriftSerializer serializer; if (is_plaintext_mode()) { uint8_t* serialized_data; @@ -665,9 +677,10 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len, - const std::shared_ptr& encryption) { +std::shared_ptr FileMetaData::Make( + const void* metadata, + uint32_t* metadata_len, + const std::shared_ptr& encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( new FileMetaData(metadata, metadata_len, encryption)); @@ -744,12 +757,13 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { + const std::shared_ptr& encryption) const { if (encryption != nullptr) - encryption->aad(parquet_encryption::createModuleAAD(encryption->fileAAD(), - parquet_encryption::Footer, - (int16_t)-1, (int16_t)-1, - (int16_t)-1)); + encryption->aad( + parquet_encryption::createModuleAAD(encryption->fileAAD(), + parquet_encryption::Footer, + (int16_t)-1, (int16_t)-1, + (int16_t)-1)); return impl_->WriteTo(dst, encryption); } @@ -977,7 +991,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryption) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -1013,8 +1027,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = encryption; - uint8_t* serialized_data; - uint32_t serialized_len; + uint8_t* serialized_data; + uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, 
&serialized_data); // encrypt the footer key @@ -1125,10 +1139,10 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryption) { impl_->WriteTo(sink, encryption); } - + const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } @@ -1359,7 +1373,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); else key_metadata = file_encryption->getFooterSigningKeyMetadata(); - + if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 65f3f417a5f..c74ca88ae95 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -23,7 +23,7 @@ #include #include #include - + #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" @@ -124,7 +124,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string &fileAAD = "", std::shared_ptr, @@ -157,14 +157,14 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string& fileAAD = "", std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR); + 
std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -186,13 +186,13 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - FileDecryptionProperties* file_decryption = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string &fileAAD = "", std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR) const; + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -309,7 +309,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); std::shared_ptr get_encryption_props(const std::shared_ptr& path); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index ba0b57a201c..2e62927ba6d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -54,20 +54,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Convenience builder for encrypted columns. Builder(const std::shared_ptr& path) : Builder(path, true) {} - + // Set a column-specific key. // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. Builder* withKey(const std::string& key) { - if (key.empty ()) + if (key.empty ()) return this; - + DCHECK(!key.empty()); key_ = key; return this; } - + // Set a key retrieval metadata. 
// use either withKeyMetaData or withKeyID, not both Builder* withKeyMetaData(const std::string& key_metadata) { @@ -76,7 +76,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { key_metadata_ = key_metadata; return this; } - + // Set a key retrieval metadata (converted from String). // use either withKeyMetaData or withKeyID, not both // key_id will be converted to metadata (UTF-8 array). @@ -84,14 +84,14 @@ class PARQUET_EXPORT ColumnEncryptionProperties { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) - throw ParquetException("key id should be in UTF8 encoding"); - + key_id.size())) + throw ParquetException("key id should be in UTF8 encoding"); + DCHECK(!key_id.empty()); this->withKeyMetaData(key_id); return this; } - + std::shared_ptr build() { return std::shared_ptr(new ColumnEncryptionProperties( @@ -100,35 +100,32 @@ class PARQUET_EXPORT ColumnEncryptionProperties { key_, key_metadata_)); } - - - private: + + private: const std::shared_ptr column_path_; bool encrypted_; std::string key_; std::string key_metadata_; - + Builder(const std::shared_ptr& path, bool encrypted) : column_path_(path), encrypted_(encrypted) {} }; - const std::shared_ptr& getPath() { return column_path_; } bool isEncrypted() const { return encrypted_; } bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } const std::string& getKey() const { return key_; } const std::string& getKeyMetaData() const { return key_metadata_; } - + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - + private: const std::shared_ptr column_path_; bool encrypted_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; - explicit ColumnEncryptionProperties(bool encrypted, const std::shared_ptr& column_path, 
const std::string& key, @@ -136,20 +133,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { DCHECK(column_path != nullptr); if (!encrypted) DCHECK(key.empty() && key_metadata.empty()); - + if (!key.empty()) DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - + encrypted_with_footer_key_ = (encrypted && key.empty()); if (encrypted_with_footer_key_) DCHECK(key_metadata.empty()); - + encrypted_ = encrypted; key_metadata_ = key_metadata; key_ = key; } }; - + class PARQUET_EXPORT ColumnDecryptionProperties { public: class Builder { @@ -158,58 +155,57 @@ class PARQUET_EXPORT ColumnDecryptionProperties { Builder(const std::string name) { Builder(schema::ColumnPath::FromDotString(name)); } - + Builder(const std::shared_ptr& path) : column_path_(path) {} - + // Set an explicit column key. If applied on a file that contains key metadata for this column - // the metadata will be ignored, the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. + // key length must be either 16, 24 or 32 bytes. 
Builder* withKey(const std::string& key) { - if (key.empty ()) + if (key.empty ()) return this; - + DCHECK(!key.empty()); key_ = key; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new ColumnDecryptionProperties(column_path_, - key_)); + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); } - + private: const std::shared_ptr column_path_; std::string key_; }; - + ColumnDecryptionProperties() = default; ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - + const std::shared_ptr& getPath() { return column_path_; } const std::string& getKey() const { return key_; } - + private: const std::shared_ptr column_path_; std::string key_; - + // This class is only required for setting explicit column decryption keys - // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); - + if (!key.empty()) DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - + key_ = key; } }; - + class PARQUET_EXPORT AADPrefixVerifier { public: // Verifies identity (AAD Prefix) of individual file, or of file collection in a data set. @@ -217,7 +213,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // In a data set, AAD Prefixes should be collected, and then checked for missing files. virtual void check(std::string aad_prefix) = 0; }; - + class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { @@ -225,7 +221,7 @@ class PARQUET_EXPORT FileDecryptionProperties { Builder(){ check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } - + // Set an explicit footer key. If applied on a file that contains footer key metadata - // the metadata will be ignored, the footer will be decrypted/verified with this key. 
// If explicit key is not set, footer key will be fetched from key retriever. @@ -238,7 +234,7 @@ class PARQUET_EXPORT FileDecryptionProperties { footer_key_ = footer_key; return this; } - + // Set explicit column keys (decryption properties). // Its also possible to set a key retriever on this property object. Upon file decryption, // availability of explicit keys is checked before invocation of the retriever callback. @@ -250,29 +246,29 @@ class PARQUET_EXPORT FileDecryptionProperties { column_properties) { if (column_properties.size () == 0) return this; - + if (column_property_map_.size () != 0) throw ParquetException("Column properties already set"); - + column_property_map_ = column_properties; return this; } - + // Set a key retriever callback. Its also possible to // set explicit footer or column keys on this file property object. Upon file decryption, // availability of explicit keys is checked before invocation of the retriever callback. // If an explicit key is available for a footer or a column, its key metadata will // be ignored. Builder* withKeyRetriever(const std::shared_ptr& - key_retriever) { + key_retriever) { if (key_retriever == NULLPTR) - return this; - + return this; + DCHECK(key_retriever_ == NULLPTR); key_retriever_ = key_retriever; - return this; + return this; } - + // Skip integrity verification of plaintext footers. // If not called, integrity of plaintext footers will be checked in runtime, // and an exception will be thrown in the following situations: @@ -282,107 +278,105 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_ = false; return this; } - + // Explicitly supply the file AAD prefix. // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. 
Builder* withAADPrefix(std::string aad_prefix) { if (aad_prefix.empty()) { - return this; + return this; } DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; return this; } - + // Set callback for verification of AAD Prefixes stored in file. Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; - + DCHECK(aad_prefix_verifier_ == NULLPTR); aad_prefix_verifier_ = aad_prefix_verifier; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_property_map_)); - + return std::shared_ptr( + new FileDecryptionProperties(footer_key_, + key_retriever_, + check_plaintext_footer_integrity_, + aad_prefix_, + aad_prefix_verifier_, + column_property_map_)); } - + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - }; - + bool check_plaintext_footer_integrity_; + }; + const std::string& getColumnKey(const std::shared_ptr& column_path) { if (column_property_map_.find(column_path) != column_property_map_.end()) { auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); - } + if (column_prop != nullptr) + return column_prop->getKey(); + } return NULL_STRING; } - + const std::string& getFooterKey() { return footer_key_; } - + const std::string& getAADPrefix() { return aad_prefix_; } - + std::shared_ptr getKeyRetriever() { return key_retriever_; } - + bool checkFooterIntegrity() { return check_plaintext_footer_integrity_; } - + const std::shared_ptr &getAADPrefixVerifier() { return aad_prefix_verifier_; } - + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr 
aad_prefix_verifier_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; - - + + FileDecryptionProperties(const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); if (!footer_key.empty()) DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); + footer_key.length() == 32); if (footer_key.empty() && check_plaintext_footer_integrity) DCHECK(NULLPTR != key_retriever); aad_prefix_verifier_ = aad_prefix_verifier; @@ -392,9 +386,9 @@ class PARQUET_EXPORT FileDecryptionProperties { aad_prefix_ = aad_prefix; column_property_map_ = column_property_map; } - + }; - + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) @@ -426,11 +420,11 @@ class PARQUET_EXPORT ReaderProperties { std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { return column_map_; } - + const std::string& fileAAD() { return fileAAD_; } void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } - + void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -444,7 +438,7 @@ class PARQUET_EXPORT ReaderProperties { std::shared_ptr file_decryption_; std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between - //ColumnPath and their encryption keys 
+ //ColumnPath and their encryption keys std::string fileAAD_; }; @@ -523,65 +517,65 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } - + // Create files with plaintext footer. // If not called, the files will be created with encrypted footer (default). Builder* withPlaintextFooter() { encrypted_footer_ = false; return this; } - + // Set encryption algorithm. // If not called, files will be encrypted with AES_GCM_V1 (default). Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } - + // Set a key retrieval metadata (converted from String). // use either withFooterKeyMetaData or withFooterKeyID, not both. Builder* withFooterKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + key_id.size())) throw ParquetException("footer key id should be in UTF8 encoding"); - - if (key_id.empty()) + + if (key_id.empty()) return this; - + return withFooterKeyMetadata(key_id); } - + // Set a key retrieval metadata. // use either withFooterKeyMetaData or withFooterKeyID, not both. Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; - + DCHECK(footer_key_metadata_.empty()); footer_key_metadata_ = footer_key_metadata; return this; } - + // Set the file AAD Prefix. Builder* withAADPrefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; - + DCHECK(aad_prefix_.empty()); aad_prefix_ = aad_prefix; store_aad_prefix_in_file_ = true; return this; } - + // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. Builder* withoutAADPrefixStorage() { store_aad_prefix_in_file_ = false; return this; } - + // Set the list of encrypted columns and their properties (keys etc). 
// If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. @@ -591,63 +585,62 @@ class PARQUET_EXPORT FileEncryptionProperties { encryptedColumns){ if (encryptedColumns.size () == 0) return this; - + if (column_property_map_.size () != 0) throw ParquetException("Column properties already set"); - + column_property_map_ = encryptedColumns; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new FileEncryptionProperties( - parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_property_map_)); + return std::shared_ptr( + new FileEncryptionProperties(parquet_cipher_, + footer_key_, + footer_key_metadata_, + encrypted_footer_, + aad_prefix_, + store_aad_prefix_in_file_, + column_property_map_)); } - + private: ParquetCipher::type parquet_cipher_; bool encrypted_footer_; std::string footer_key_; std::string footer_key_metadata_; - + std::string aad_prefix_; bool store_aad_prefix_in_file_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; }; - + bool encryptedFooter() const { return encrypted_footer_; } - + const EncryptionAlgorithm getAlgorithm() { return algorithm_; } - + const std::string& getFooterEncryptionKey() { return (encrypted_footer_? footer_key_ : NULL_STRING); } - + const std::string& getFooterEncryptionKeyMetadata() { return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); } - + const std::string& getFooterSigningKey() { return (encrypted_footer_? NULL_STRING : footer_key_); } - + const std::string& getFooterSigningKeyMetadata() { return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); } - + const std::string& getFileAAD() const { return file_AAD_; } - + std::shared_ptr getColumnProperties(const std::shared_ptr& column_path) { if (column_property_map_.size () == 0){ @@ -658,34 +651,34 @@ class PARQUET_EXPORT FileEncryptionProperties { } if (column_property_map_.find(column_path) != column_property_map_.end()) return column_property_map_[column_path]; - + return NULLPTR; } - + private: EncryptionAlgorithm algorithm_; // encryption algorithm std::string footer_key_; // encryption key, should have 16, 24, 32-byte length std::string footer_key_metadata_; bool encrypted_footer_; std::string file_AAD_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_property_map) + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_property_map) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ + column_property_map_(column_property_map){ DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. 
DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); @@ -695,7 +688,7 @@ class PARQUET_EXPORT FileEncryptionProperties { RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), AAD_FILE_UNIQUE_LENGTH) ; - + bool supply_aad_prefix = false; if (aad_prefix.empty()) file_AAD_ = aad_file_unique_str; @@ -709,9 +702,9 @@ class PARQUET_EXPORT FileEncryptionProperties { if (!aad_prefix.empty() && store_aad_prefix_in_file) { algorithm_.aad.aad_prefix = aad_prefix; } - } + } }; - + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -951,13 +944,13 @@ class PARQUET_EXPORT WriterProperties { } else { std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); if (footer_key.empty()) - footer_key = parquet_file_encryption_->getFooterSigningKey (); + footer_key = parquet_file_encryption_->getFooterSigningKey (); return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - footer_key, parquet_file_encryption_->getFileAAD()); - + footer_key, parquet_file_encryption_->getFileAAD()); + } } - + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -1002,7 +995,7 @@ class PARQUET_EXPORT WriterProperties { } std::shared_ptr column_encryption_props(const - std::shared_ptr& path) const { + std::shared_ptr& path) const { if (parquet_file_encryption_) { return parquet_file_encryption_->getColumnProperties(path); } else { @@ -1011,32 +1004,31 @@ class PARQUET_EXPORT WriterProperties { } std::shared_ptr encryption( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { auto column_prop = parquet_file_encryption_->getColumnProperties(path); if (column_prop == NULLPTR) - return NULLPTR; + return NULLPTR; if (column_prop->isEncryptedWithFooterKey()) { - if 
(parquet_file_encryption_->encryptedFooter ()) { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterEncryptionKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterSigningKey(), - parquet_file_encryption_->getFileAAD()); - } + if (parquet_file_encryption_->encryptedFooter ()) { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterEncryptionKey(), + parquet_file_encryption_->getFileAAD()); + } else { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterSigningKey(), + parquet_file_encryption_->getFileAAD()); + } } - + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - column_prop->getKey(), - parquet_file_encryption_->getFileAAD()); - + column_prop->getKey(), + parquet_file_encryption_->getFileAAD()); } else { return NULLPTR; } } - + private: explicit WriterProperties( ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index f31a27e78dd..50a19342a10 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -92,7 +92,7 @@ class PARQUET_EXPORT ColumnPath { struct CmpColumnPath { bool operator()(const std::shared_ptr& a, - const std::shared_ptr& b) const { + const std::shared_ptr& b) const { return a->ToDotString() < b->ToDotString(); } }; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 8f4ba5f212a..79d6f8a752e 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -459,14 +459,6 @@ struct EncryptionAlgorithm { }; class PARQUET_EXPORT EncryptionProperties { - private: - static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return 
reinterpret_cast(cbytes); - } - public: EncryptionProperties() = default; EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, @@ -511,6 +503,13 @@ class PARQUET_EXPORT EncryptionProperties { std::string key_; // encryption key, should have 16, 24, 32-byte length std::string file_aad_; std::string aad_; // encryption additional authenticated data + static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); + } + }; // parquet::PageType From 6fbfc52639593932d8eb412a08fab463d222f84d Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 22 Apr 2019 22:55:19 +0300 Subject: [PATCH 028/125] Code style fixes in properties.h --- cpp/src/parquet/properties.h | 142 ++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 2e62927ba6d..1651505d27b 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -53,7 +53,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) : Builder(path, true) {} + Builder(const std::shared_ptr& path) + : Builder(path, true) {} // Set a column-specific key. 
// If key is not set on an encrypted column, the column will @@ -83,8 +84,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { Builder* withKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + const uint8_t *data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) throw ParquetException("key id should be in UTF8 encoding"); DCHECK(!key_id.empty()); @@ -93,12 +94,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } std::shared_ptr build() { - return - std::shared_ptr(new ColumnEncryptionProperties( - encrypted_, - column_path_, - key_, - key_metadata_)); + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, + column_path_, + key_, + key_metadata_)); } private: @@ -126,10 +126,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; - explicit ColumnEncryptionProperties(bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path){ + explicit ColumnEncryptionProperties( + bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata):column_path_(column_path){ DCHECK(column_path != nullptr); if (!encrypted) DCHECK(key.empty() && key_metadata.empty()); @@ -159,8 +160,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { Builder(const std::shared_ptr& path) : column_path_(path) {} - // Set an explicit column key. If applied on a file that contains key metadata for this column - - // the metadata will be ignored, the column will be decrypted with this key. + // Set an explicit column key. If applied on a file that contains + // key metadata for this column the metadata will be ignored, + // the column will be decrypted with this key. 
// key length must be either 16, 24 or 32 bytes. Builder* withKey(const std::string& key) { if (key.empty ()) @@ -195,8 +197,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // This class is only required for setting explicit column decryption keys - // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) - explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); if (!key.empty()) @@ -208,9 +211,11 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class PARQUET_EXPORT AADPrefixVerifier { public: - // Verifies identity (AAD Prefix) of individual file, or of file collection in a data set. + // Verifies identity (AAD Prefix) of individual file, + // or of file collection in a data set. // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, and then checked for missing files. + // In a data set, AAD Prefixes should be collected, + // and then checked for missing files. virtual void check(std::string aad_prefix) = 0; }; @@ -222,10 +227,12 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } - // Set an explicit footer key. If applied on a file that contains footer key metadata - - // the metadata will be ignored, the footer will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from key retriever. - //param footerKey Key length must be either 16, 24 or 32 bytes. + // Set an explicit footer key. If applied on a file that contains + // footer key metadata the metadata will be ignored, the footer + // will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from + // key retriever. 
+ // param footerKey Key length must be either 16, 24 or 32 bytes. Builder* withFooterKey(std::string footer_key) { if (footer_key.empty ()) { return this; @@ -236,10 +243,11 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. Upon file decryption, - // availability of explicit keys is checked before invocation of the retriever callback. - // If an explicit key is available for a footer or a column, its key metadata will - // be ignored. + // Its also possible to set a key retriever on this property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. Builder* withColumnKeys(const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& @@ -255,10 +263,11 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. Upon file decryption, - // availability of explicit keys is checked before invocation of the retriever callback. - // If an explicit key is available for a footer or a column, its key metadata will - // be ignored. + // set explicit footer or column keys on this file property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. Builder* withKeyRetriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) @@ -272,7 +281,8 @@ class PARQUET_EXPORT FileDecryptionProperties { // Skip integrity verification of plaintext footers. 
// If not called, integrity of plaintext footers will be checked in runtime, // and an exception will be thrown in the following situations: - // - footer signing key is not available (not passed, or not found by key retriever) + // - footer signing key is not available + // (not passed, or not found by key retriever) // - footer content and signature don't match Builder* withoutFooterSignatureVerification() { check_plaintext_footer_integrity_ = false; @@ -293,7 +303,8 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. - Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { + Builder* withAADPrefixVerifier( + std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -325,7 +336,8 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity_; }; - const std::string& getColumnKey(const std::shared_ptr& column_path) { + const std::string& getColumnKey( + const std::shared_ptr& column_path) { if (column_property_map_.find(column_path) != column_property_map_.end()) { auto column_prop = column_property_map_[column_path]; if (column_prop != nullptr) @@ -365,15 +377,18 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity_; - FileDecryptionProperties(const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { - DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) 
{ + DCHECK(!footer_key.empty() || + NULLPTR != key_retriever || + 0 != column_property_map.size()); if (!footer_key.empty()) DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); @@ -398,7 +413,7 @@ class PARQUET_EXPORT ReaderProperties { column_map_ = std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, std::string, - parquet::schema::ColumnPath::CmpColumnPath>()); + schema::ColumnPath::CmpColumnPath>()); } ::arrow::MemoryPool* memory_pool() const { return pool_; } @@ -513,7 +528,8 @@ class PARQUET_EXPORT FileEncryptionProperties { class Builder { public: Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } @@ -537,8 +553,8 @@ class PARQUET_EXPORT FileEncryptionProperties { Builder* withFooterKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) throw ParquetException("footer key id should be in UTF8 encoding"); if (key_id.empty()) @@ -579,10 +595,11 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the list of encrypted columns and their properties (keys etc). // If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. 
- Builder* withEncryptedColumns(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ + Builder* withEncryptedColumns( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + encryptedColumns){ if (encryptedColumns.size () == 0) return this; @@ -641,12 +658,11 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::string& getFileAAD() const { return file_AAD_; } - std::shared_ptr - getColumnProperties(const std::shared_ptr& column_path) { + std::shared_ptr getColumnProperties( + const std::shared_ptr& column_path) { if (column_property_map_.size () == 0){ - auto builder = - std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder (column_path)); return builder->build(); } if (column_property_map_.find(column_path) != column_property_map_.end()) @@ -657,7 +673,7 @@ class PARQUET_EXPORT FileEncryptionProperties { private: EncryptionAlgorithm algorithm_; // encryption algorithm - std::string footer_key_; // encryption key, should have 16, 24, 32-byte length + std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_AAD_; @@ -670,7 +686,8 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key, std::string footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::string& aad_prefix, + bool store_aad_prefix_in_file, const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& @@ -681,13 +698,16 @@ class PARQUET_EXPORT FileEncryptionProperties { column_property_map_(column_property_map){ DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. 
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + DCHECK(footer_key.length() == 16 + || footer_key.length() == 24 + || footer_key.length() == 32); uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; + std::string aad_file_unique_str( + reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH) ; bool supply_aad_prefix = false; if (aad_prefix.empty()) From 6187d40da388a10bdc6474c7d62accade63592c5 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Wed, 24 Apr 2019 17:28:51 +0700 Subject: [PATCH 029/125] revert change in parquet.thrift --- cpp/src/parquet/parquet.thrift | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 288c72a60a6..000b74dde1c 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -570,7 +570,7 @@ struct PageHeader { /** Uncompressed page size in bytes (not including this header) **/ 2: required i32 uncompressed_page_size - /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ + /** Compressed page size in bytes (not including this header) **/ 3: required i32 compressed_page_size /** 32bit crc for the data below. This allows for disabling checksumming in HDFS @@ -955,7 +955,6 @@ struct FileMetaData { 9: optional binary footer_signing_key_metadata } -/** Crypto metadata for files with encrypted footer **/ struct FileCryptoMetaData { /** * Encryption algorithm. 
This field is only used for files From c14ff0a5750ee2ee4c503be5b24c44942efcf702 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 29 Apr 2019 08:40:28 +0300 Subject: [PATCH 030/125] Move all encrypted related classes from properties.h to new files: encryption_properties.h and encryption_properties.cc --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/encryption_properties.cc | 176 +++++++ cpp/src/parquet/encryption_properties.h | 498 ++++++++++++++++++++ cpp/src/parquet/properties.h | 574 +---------------------- 4 files changed, 677 insertions(+), 572 deletions(-) create mode 100644 cpp/src/parquet/encryption_properties.cc create mode 100644 cpp/src/parquet/encryption_properties.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 20f1906f59d..56c79f0c084 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -167,6 +167,7 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc + encryption_properties.cc file_reader.cc file_writer.cc metadata.cc diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc new file mode 100644 index 00000000000..1eb3847ec71 --- /dev/null +++ b/cpp/src/parquet/encryption_properties.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption_properties.h"
+
+#include <openssl/rand.h>
+
+#include <cstring>
+
+#include "arrow/util/utf8.h"
+
+namespace parquet {
+
+// Stores a textual key id as the column's key retrieval metadata.
+// Throws ParquetException when key_id is not valid UTF-8.
+// Returns this builder so that calls can be chained.
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::withKeyID(
+    std::string key_id) {
+  // key_id is expected to be in UTF8 encoding
+  ::arrow::util::InitializeUTF8();
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+  if (!::arrow::util::ValidateUTF8(data, key_id.size()))
+    throw ParquetException("key id should be in UTF8 encoding");
+
+  DCHECK(!key_id.empty());
+  this->withKeyMetaData(key_id);
+  return this;
+}
+
+// Builds the immutable per-column encryption settings.
+// A non-empty key must be 16, 24 or 32 bytes long. An encrypted column
+// without its own key is marked as encrypted with the footer key, and in
+// that case no key metadata may be supplied.
+ColumnEncryptionProperties::ColumnEncryptionProperties(
+    bool encrypted, const std::shared_ptr<schema::ColumnPath>& column_path,
+    const std::string& key, const std::string& key_metadata)
+    : column_path_(column_path) {
+  DCHECK(column_path != nullptr);
+  if (!encrypted) DCHECK(key.empty() && key_metadata.empty());
+
+  if (!key.empty())
+    DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+
+  encrypted_with_footer_key_ = (encrypted && key.empty());
+  if (encrypted_with_footer_key_) DCHECK(key_metadata.empty());
+
+  encrypted_ = encrypted;
+  key_metadata_ = key_metadata;
+  key_ = key;
+}
+
+// Builds the per-column decryption settings.
+// A non-empty key must be 16, 24 or 32 bytes long.
+ColumnDecryptionProperties::ColumnDecryptionProperties(
+    const std::shared_ptr<schema::ColumnPath>& column_path, const std::string& key)
+    : column_path_(column_path) {
+  DCHECK(column_path != nullptr);
+
+  if (!key.empty())
+    DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+
+  key_ = key;
+}
+
+// Returns the explicit key registered for column_path, or NULL_STRING when
+// the column has no entry (or a null entry) in the column property map.
+const std::string& FileDecryptionProperties::getColumnKey(
+    const std::shared_ptr<schema::ColumnPath>& column_path) {
+  // Single lookup; the returned reference points into an object owned by the map.
+  auto it = column_property_map_.find(column_path);
+  if (it != column_property_map_.end() && it->second != nullptr)
+    return it->second->getKey();
+  return NULL_STRING;
+}
+
+// Builds the file decryption settings.
+// At least one of footer key, key retriever or explicit column keys must be
+// given. A non-empty footer key must be 16, 24 or 32 bytes long. Checking the
+// integrity of plaintext footers without an explicit footer key requires a
+// key retriever.
+FileDecryptionProperties::FileDecryptionProperties(
+    const std::string& footer_key,
+    const std::shared_ptr<DecryptionKeyRetriever>& key_retriever,
+    bool check_plaintext_footer_integrity, std::string aad_prefix,
+    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+    std::map<std::shared_ptr<schema::ColumnPath>,
+             std::shared_ptr<ColumnDecryptionProperties>,
+             schema::ColumnPath::CmpColumnPath>
+        column_property_map) {
+  DCHECK(!footer_key.empty() || NULLPTR != key_retriever ||
+         0 != column_property_map.size());
+
+  if (!footer_key.empty())
+    DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+           footer_key.length() == 32);
+  if (footer_key.empty() && check_plaintext_footer_integrity)
+    DCHECK(NULLPTR != key_retriever);
+  aad_prefix_verifier_ = aad_prefix_verifier;
+  footer_key_ = footer_key;
+  check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
+  key_retriever_ = key_retriever;
+  aad_prefix_ = aad_prefix;
+  column_property_map_ = column_property_map;
+}
+
+// Stores a textual key id as the footer key retrieval metadata.
+// Throws ParquetException when key_id is not valid UTF-8; an empty id is
+// ignored. Returns this builder so that calls can be chained.
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::withFooterKeyID(
+    std::string key_id) {
+  // key_id is expected to be in UTF8 encoding
+  ::arrow::util::InitializeUTF8();
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+  if (!::arrow::util::ValidateUTF8(data, key_id.size()))
+    throw ParquetException("footer key id should be in UTF8 encoding");
+
+  if (key_id.empty()) return this;
+
+  return withFooterKeyMetadata(key_id);
+}
+
+// Returns the encryption settings for column_path.
+// When no per-column settings were registered, a fresh default object is
+// built for the column (encrypted, no column key, i.e. footer key). When
+// settings were registered but not for this column, NULLPTR is returned.
+std::shared_ptr<ColumnEncryptionProperties> FileEncryptionProperties::getColumnProperties(
+    const std::shared_ptr<schema::ColumnPath>& column_path) {
+  if (column_property_map_.empty()) {
+    // The builder is only needed for this call, so a temporary is enough.
+    return ColumnEncryptionProperties::Builder(column_path).build();
+  }
+  auto it = column_property_map_.find(column_path);
+  if (it != column_property_map_.end()) return it->second;
+
+  return NULLPTR;
+}
+
+// Builds the file encryption settings and derives the file AAD.
+// The file AAD is the optional aad_prefix followed by AAD_FILE_UNIQUE_LENGTH
+// random bytes. The prefix is recorded in the algorithm description only when
+// store_aad_prefix_in_file is set; otherwise readers are flagged to supply it.
+// Throws ParquetException when the random bytes cannot be generated.
+FileEncryptionProperties::FileEncryptionProperties(
+    ParquetCipher::type cipher, std::string footer_key,
+    std::string footer_key_metadata, bool encrypted_footer,
+    const std::string& aad_prefix, bool store_aad_prefix_in_file,
+    const std::map<std::shared_ptr<schema::ColumnPath>,
+                   std::shared_ptr<ColumnEncryptionProperties>,
+                   schema::ColumnPath::CmpColumnPath>& column_property_map)
+    : footer_key_(footer_key),
+      footer_key_metadata_(footer_key_metadata),
+      encrypted_footer_(encrypted_footer),
+      column_property_map_(column_property_map) {
+  DCHECK(!footer_key.empty());
+  // footer_key must be either 16, 24 or 32 bytes.
+  DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+         footer_key.length() == 32);
+
+  uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH];
+  memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH);
+  // Request AAD_FILE_UNIQUE_LENGTH bytes. sizeof(AAD_FILE_UNIQUE_LENGTH) is the
+  // size of the int32_t constant (4) and would leave half the buffer zeroed.
+  if (RAND_bytes(aad_file_unique, AAD_FILE_UNIQUE_LENGTH) != 1)
+    throw ParquetException("failed to generate random bytes for the file AAD");
+  std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
+                                  AAD_FILE_UNIQUE_LENGTH);
+
+  bool supply_aad_prefix = false;
+  if (aad_prefix.empty()) {
+    file_AAD_ = aad_file_unique_str;
+  } else {
+    file_AAD_ = aad_prefix + aad_file_unique_str;
+    if (!store_aad_prefix_in_file) supply_aad_prefix = true;
+  }
+  algorithm_.algorithm = cipher;
+  algorithm_.aad.aad_file_unique = aad_file_unique_str;
+  algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
+  if (!aad_prefix.empty() && store_aad_prefix_in_file) {
+    algorithm_.aad.aad_prefix = aad_prefix;
+  }
+}
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h
new file mode 100644
index 00000000000..f3ed589260f
--- /dev/null
+++ b/cpp/src/parquet/encryption_properties.h
@@ -0,0 +1,498 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_ENCRYPTION_PROPERTIES_H
+#define PARQUET_ENCRYPTION_PROPERTIES_H
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "parquet/encryption.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+#include "arrow/util/logging.h"
+#include "parquet/util/visibility.h"
+
+namespace parquet {
+
+// Empty string handed out by the getters below that return a
+// const std::string& when they have no value to report.
+static const std::string NULL_STRING = "";
+static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM
+    = ParquetCipher::AES_GCM_V1;
+static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256;
+static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true;
+static constexpr bool DEFAULT_CHECK_SIGNATURE = true;
+// Number of random bytes appended to the AAD prefix to form the file AAD.
+static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8;
+
+// Encryption settings of a single column: its path, whether it is encrypted,
+// and the key / key metadata to use. Instances are created through Builder.
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+  class Builder {
+   public:
+    // Convenience builder for regular (not nested) columns.
+    // Delegates to the private constructor; building a temporary Builder in
+    // the constructor body would leave column_path_ of this object unset.
+    Builder(const std::string& name)
+        : Builder(schema::ColumnPath::FromDotString(name), true) {}
+
+    // Convenience builder for encrypted columns.
+    Builder(const std::shared_ptr<schema::ColumnPath>& path)
+        : Builder(path, true) {}
+
+    // Set a column-specific key.
+    // If key is not set on an encrypted column, the column will
+    // be encrypted with the footer key.
+    // Key length must be either 16, 24 or 32 bytes.
+    // An empty key is ignored.
+    Builder* withKey(const std::string& key) {
+      if (key.empty()) return this;
+
+      DCHECK(!key.empty());
+      key_ = key;
+      return this;
+    }
+
+    // Set a key retrieval metadata.
+    // use either withKeyMetaData or withKeyID, not both
+    Builder* withKeyMetaData(const std::string& key_metadata) {
+      DCHECK(!key_metadata.empty());
+      DCHECK(key_metadata_.empty());
+      key_metadata_ = key_metadata;
+      return this;
+    }
+
+    // Set a key retrieval metadata (converted from String).
+    // use either withKeyMetaData or withKeyID, not both
+    // key_id will be converted to metadata (UTF-8 array).
+    Builder* withKeyID(std::string key_id);
+
+    // Creates the properties object from the values collected so far.
+    // The target constructor is private, hence the explicit new.
+    std::shared_ptr<ColumnEncryptionProperties> build() {
+      return std::shared_ptr<ColumnEncryptionProperties>(
+          new ColumnEncryptionProperties(encrypted_,
+                                         column_path_,
+                                         key_,
+                                         key_metadata_));
+    }
+
+   private:
+    const std::shared_ptr<schema::ColumnPath> column_path_;
+    bool encrypted_;
+    std::string key_;
+    std::string key_metadata_;
+
+    Builder(const std::shared_ptr<schema::ColumnPath>& path, bool encrypted)
+        : column_path_(path), encrypted_(encrypted) {}
+  };
+
+  const std::shared_ptr<schema::ColumnPath>& getPath() { return column_path_; }
+  bool isEncrypted() const { return encrypted_; }
+  bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; }
+  const std::string& getKey() const { return key_; }
+  const std::string& getKeyMetaData() const { return key_metadata_; }
+
+  ColumnEncryptionProperties() = default;
+  ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
+  ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
+
+ private:
+  const std::shared_ptr<schema::ColumnPath> column_path_;
+  bool encrypted_;
+  bool encrypted_with_footer_key_;
+  std::string key_;
+  std::string key_metadata_;
+  explicit ColumnEncryptionProperties(
+      bool encrypted,
+      const std::shared_ptr<schema::ColumnPath>& column_path,
+      const std::string& key,
+      const std::string& key_metadata);
+};
+
+// Explicit decryption key of a single column.
+// Instances are created through Builder.
+class PARQUET_EXPORT ColumnDecryptionProperties {
+ public:
+  class Builder {
+   public:
+    // Convenience builder for regular (not nested) columns.
+    // Delegates to the path constructor; building a temporary Builder in
+    // the constructor body would leave column_path_ of this object unset.
+    Builder(const std::string& name)
+        : Builder(schema::ColumnPath::FromDotString(name)) {}
+
+    Builder(const std::shared_ptr<schema::ColumnPath>& path)
+        : column_path_(path) {}
+
+    // Set an explicit column key. If applied on a file that contains
+    // key metadata for this column the metadata will be ignored,
+    // the column will be decrypted with this key.
+    // key length must be either 16, 24 or 32 bytes.
+    // An empty key is ignored.
+    Builder* withKey(const std::string& key) {
+      if (key.empty()) return this;
+
+      DCHECK(!key.empty());
+      key_ = key;
+      return this;
+    }
+
+    // Creates the properties object from the values collected so far.
+    // The target constructor is private, hence the explicit new.
+    std::shared_ptr<ColumnDecryptionProperties> build() {
+      return std::shared_ptr<ColumnDecryptionProperties>(
+          new ColumnDecryptionProperties(column_path_, key_));
+    }
+
+   private:
+    const std::shared_ptr<schema::ColumnPath> column_path_;
+    std::string key_;
+  };
+
+  ColumnDecryptionProperties() = default;
+  ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
+  ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
+
+  const std::shared_ptr<schema::ColumnPath>& getPath() { return column_path_; }
+  const std::string& getKey() const { return key_; }
+
+ private:
+  const std::shared_ptr<schema::ColumnPath> column_path_;
+  std::string key_;
+
+  // This class is only required for setting explicit column decryption keys -
+  // to override key retriever (or to provide keys when key metadata and/or
+  // key retriever are not available)
+  explicit ColumnDecryptionProperties(
+      const std::shared_ptr<schema::ColumnPath>& column_path,
+      const std::string& key);
+};
+
+// Callback interface for checking the AAD prefix of a file.
+class PARQUET_EXPORT AADPrefixVerifier {
+ public:
+  // Virtual so that destroying an implementation through a pointer to this
+  // interface is well defined.
+  virtual ~AADPrefixVerifier() = default;
+
+  // Verifies identity (AAD Prefix) of individual file,
+  // or of file collection in a data set.
+  // Throws exception if an AAD prefix is wrong.
+  // In a data set, AAD Prefixes should be collected,
+  // and then checked for missing files.
+  virtual void check(std::string aad_prefix) = 0;
+};
+
+// Decryption settings of a whole file: footer key, key retriever, explicit
+// column keys, AAD prefix and its verifier. Instances are created through
+// Builder.
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+  class Builder {
+   public:
+    Builder() {
+      check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE;
+    }
+
+    // Set an explicit footer key. If applied on a file that contains
+    // footer key metadata the metadata will be ignored, the footer
+    // will be decrypted/verified with this key.
+    // If explicit key is not set, footer key will be fetched from
+    // key retriever.
+    // Key length must be either 16, 24 or 32 bytes.
+    // An empty key is ignored.
+    Builder* withFooterKey(std::string footer_key) {
+      if (footer_key.empty()) {
+        return this;
+      }
+      DCHECK(!footer_key.empty());
+      footer_key_ = footer_key;
+      return this;
+    }
+
+    // Set explicit column keys (decryption properties).
+    // It is also possible to set a key retriever on this property object.
+    // Upon file decryption, availability of explicit keys is checked before
+    // invocation of the retriever callback.
+    // If an explicit key is available for a footer or a column,
+    // its key metadata will be ignored.
+    // An empty map is ignored; throws ParquetException if column
+    // properties were already set.
+    Builder* withColumnKeys(
+        const std::map<std::shared_ptr<schema::ColumnPath>,
+                       std::shared_ptr<ColumnDecryptionProperties>,
+                       schema::ColumnPath::CmpColumnPath>& column_properties) {
+      if (column_properties.size() == 0) return this;
+
+      if (column_property_map_.size() != 0)
+        throw ParquetException("Column properties already set");
+
+      column_property_map_ = column_properties;
+      return this;
+    }
+
+    // Set a key retriever callback. It is also possible to
+    // set explicit footer or column keys on this file property object.
+    // Upon file decryption, availability of explicit keys is checked before
+    // invocation of the retriever callback.
+    // If an explicit key is available for a footer or a column,
+    // its key metadata will be ignored.
+    // A null retriever is ignored.
+    Builder* withKeyRetriever(
+        const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
+      if (key_retriever == NULLPTR) return this;
+
+      DCHECK(key_retriever_ == NULLPTR);
+      key_retriever_ = key_retriever;
+      return this;
+    }
+
+    // Skip integrity verification of plaintext footers.
+    // If not called, integrity of plaintext footers will be checked in runtime,
+    // and an exception will be thrown in the following situations:
+    // - footer signing key is not available
+    //   (not passed, or not found by key retriever)
+    // - footer content and signature don't match
+    Builder* withoutFooterSignatureVerification() {
+      check_plaintext_footer_integrity_ = false;
+      return this;
+    }
+
+    // Explicitly supply the file AAD prefix.
+    // A must when a prefix is used for file encryption, but not stored in file.
+    // If AAD prefix is stored in file, it will be compared to the explicitly
+    // supplied value and an exception will be thrown if they differ.
+    // An empty prefix is ignored.
+    Builder* withAADPrefix(std::string aad_prefix) {
+      if (aad_prefix.empty()) {
+        return this;
+      }
+      DCHECK(aad_prefix_.empty());
+      aad_prefix_ = aad_prefix;
+      return this;
+    }
+
+    // Set callback for verification of AAD Prefixes stored in file.
+    // A null verifier is ignored.
+    Builder* withAADPrefixVerifier(
+        std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
+      if (aad_prefix_verifier == NULLPTR) return this;
+
+      DCHECK(aad_prefix_verifier_ == NULLPTR);
+      aad_prefix_verifier_ = aad_prefix_verifier;
+      return this;
+    }
+
+    // Creates the properties object from the values collected so far.
+    // The target constructor is private, hence the explicit new.
+    std::shared_ptr<FileDecryptionProperties> build() {
+      return std::shared_ptr<FileDecryptionProperties>(
+          new FileDecryptionProperties(footer_key_,
+                                       key_retriever_,
+                                       check_plaintext_footer_integrity_,
+                                       aad_prefix_,
+                                       aad_prefix_verifier_,
+                                       column_property_map_));
+    }
+
+   private:
+    std::string footer_key_;
+    std::string aad_prefix_;
+    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+
+    std::map<std::shared_ptr<schema::ColumnPath>,
+             std::shared_ptr<ColumnDecryptionProperties>,
+             schema::ColumnPath::CmpColumnPath> column_property_map_;
+
+    std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+    bool check_plaintext_footer_integrity_;
+  };
+
+  const std::string& getColumnKey(
+      const std::shared_ptr<schema::ColumnPath>& column_path);
+
+  const std::string& getFooterKey() {
+    return footer_key_;
+  }
+
+  const std::string& getAADPrefix() { return aad_prefix_; }
+  std::shared_ptr<DecryptionKeyRetriever> getKeyRetriever() {
+    return key_retriever_;
+  }
+
+  bool checkFooterIntegrity() {
+    return check_plaintext_footer_integrity_;
+  }
+
+  const std::shared_ptr<AADPrefixVerifier>& getAADPrefixVerifier() {
+    return aad_prefix_verifier_;
+  }
+
+ private:
+  std::string footer_key_;
+  std::string aad_prefix_;
+  std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+
+  std::map<std::shared_ptr<schema::ColumnPath>,
+           std::shared_ptr<ColumnDecryptionProperties>,
+           schema::ColumnPath::CmpColumnPath> column_property_map_;
+
+  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+  bool check_plaintext_footer_integrity_;
+
+  FileDecryptionProperties(
+      const std::string& footer_key,
+      const std::shared_ptr<DecryptionKeyRetriever>& key_retriever,
+      bool check_plaintext_footer_integrity,
+      std::string aad_prefix,
+      std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+      std::map<std::shared_ptr<schema::ColumnPath>,
+               std::shared_ptr<ColumnDecryptionProperties>,
+               schema::ColumnPath::CmpColumnPath> column_property_map);
+};
+
+// Encryption settings of a whole file: cipher, footer key and metadata,
+// footer mode, file AAD and per-column settings. Instances are created
+// through Builder.
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+  class Builder {
+   public:
+    Builder(const std::string& footer_key)
+        : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM),
+          encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) {
+      footer_key_ = footer_key;
+      store_aad_prefix_in_file_ = false;
+    }
+
+    // Create files with plaintext footer.
+    // If not called, the files will be created with encrypted footer (default).
+    Builder* withPlaintextFooter() {
+      encrypted_footer_ = false;
+      return this;
+    }
+
+    // Set encryption algorithm.
+    // If not called, files will be encrypted with AES_GCM_V1 (default).
+    Builder* withAlgorithm(ParquetCipher::type parquet_cipher) {
+      parquet_cipher_ = parquet_cipher;
+      return this;
+    }
+
+    // Set a key retrieval metadata (converted from String).
+    // use either withFooterKeyMetaData or withFooterKeyID, not both.
+    Builder* withFooterKeyID(std::string key_id);
+
+    // Set a key retrieval metadata.
+    // use either withFooterKeyMetaData or withFooterKeyID, not both.
+    // Empty metadata is ignored.
+    Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) {
+      if (footer_key_metadata.empty()) return this;
+
+      DCHECK(footer_key_metadata_.empty());
+      footer_key_metadata_ = footer_key_metadata;
+      return this;
+    }
+
+    // Set the file AAD Prefix.
+    // An empty prefix is ignored. Setting a prefix also turns on storing
+    // it in the file; call withoutAADPrefixStorage() afterwards to undo that.
+    Builder* withAADPrefix(const std::string& aad_prefix) {
+      if (aad_prefix.empty()) return this;
+
+      DCHECK(aad_prefix_.empty());
+      aad_prefix_ = aad_prefix;
+      store_aad_prefix_in_file_ = true;
+      return this;
+    }
+
+    // Skip storing AAD Prefix in file.
+    // If not called, and if AAD Prefix is set, it will be stored.
+    Builder* withoutAADPrefixStorage() {
+      store_aad_prefix_in_file_ = false;
+      return this;
+    }
+
+    // Set the list of encrypted columns and their properties (keys etc).
+    // If not called, all columns will be encrypted with the footer key.
+    // If called, the file columns not in the list will be left unencrypted.
+    // An empty map is ignored; throws ParquetException if column
+    // properties were already set.
+    Builder* withEncryptedColumns(
+        const std::map<std::shared_ptr<schema::ColumnPath>,
+                       std::shared_ptr<ColumnEncryptionProperties>,
+                       schema::ColumnPath::CmpColumnPath>& encryptedColumns) {
+      if (encryptedColumns.size() == 0) return this;
+
+      if (column_property_map_.size() != 0)
+        throw ParquetException("Column properties already set");
+
+      column_property_map_ = encryptedColumns;
+      return this;
+    }
+
+    // Creates the properties object from the values collected so far.
+    // The target constructor is private, hence the explicit new.
+    std::shared_ptr<FileEncryptionProperties> build() {
+      return std::shared_ptr<FileEncryptionProperties>(
+          new FileEncryptionProperties(parquet_cipher_,
+                                       footer_key_,
+                                       footer_key_metadata_,
+                                       encrypted_footer_,
+                                       aad_prefix_,
+                                       store_aad_prefix_in_file_,
+                                       column_property_map_));
+    }
+
+   private:
+    ParquetCipher::type parquet_cipher_;
+    bool encrypted_footer_;
+    std::string footer_key_;
+    std::string footer_key_metadata_;
+
+    std::string aad_prefix_;
+    bool store_aad_prefix_in_file_;
+    std::map<std::shared_ptr<schema::ColumnPath>,
+             std::shared_ptr<ColumnEncryptionProperties>,
+             schema::ColumnPath::CmpColumnPath> column_property_map_;
+  };
+  bool encryptedFooter() const { return encrypted_footer_; }
+
+  const EncryptionAlgorithm getAlgorithm() {
+    return algorithm_;
+  }
+
+  // The footer key acts as the encryption key when the footer is encrypted
+  // and as the signing key when the footer is plaintext; the getter for the
+  // role that is not in use returns NULL_STRING.
+  const std::string& getFooterEncryptionKey() {
+    return (encrypted_footer_ ? footer_key_ : NULL_STRING);
+  }
+
+  const std::string& getFooterEncryptionKeyMetadata() {
+    return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING);
+  }
+
+  const std::string& getFooterSigningKey() {
+    return (encrypted_footer_ ? NULL_STRING : footer_key_);
+  }
+
+  const std::string& getFooterSigningKeyMetadata() {
+    return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_);
+  }
+
+  const std::string& getFileAAD() const { return file_AAD_; }
+
+  std::shared_ptr<ColumnEncryptionProperties> getColumnProperties(
+      const std::shared_ptr<schema::ColumnPath>& column_path);
+
+ private:
+  EncryptionAlgorithm algorithm_;
+  std::string footer_key_;
+  std::string footer_key_metadata_;
+  bool encrypted_footer_;
+  std::string file_AAD_;
+
+  std::map<std::shared_ptr<schema::ColumnPath>,
+           std::shared_ptr<ColumnEncryptionProperties>,
+           schema::ColumnPath::CmpColumnPath> column_property_map_;
+
+  FileEncryptionProperties(
+      ParquetCipher::type cipher,
+      std::string footer_key,
+      std::string footer_key_metadata,
+      bool encrypted_footer,
+      const std::string& aad_prefix,
+      bool store_aad_prefix_in_file,
+      const std::map<std::shared_ptr<schema::ColumnPath>,
+                     std::shared_ptr<ColumnEncryptionProperties>,
+                     schema::ColumnPath::CmpColumnPath>& column_property_map);
+};
+
+}  // namespace parquet
+
+#endif  // PARQUET_ENCRYPTION_PROPERTIES_H
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1651505d27b..faed20baf57 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -21,10 +21,7 @@ #include #include #include -#include -#include -#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -32,6 +29,8 @@ #include "parquet/types.h" #include "arrow/util/logging.h" #include "arrow/util/utf8.h" +#include "parquet/encryption_properties.h" + namespace parquet { struct ParquetVersion { @@ -40,369 +39,6 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 0; static bool DEFAULT_USE_BUFFERED_STREAM = false; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static const std::string NULL_STRING = ""; - -class PARQUET_EXPORT ColumnEncryptionProperties { - public: - class Builder { - public: - // Convenience builder for regular (not nested) columns.
- Builder(const std::string name) { - Builder(schema::ColumnPath::FromDotString(name), true); - } - - // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) - : Builder(path, true) {} - - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. - // keyBytes Key length must be either 16, 24 or 32 bytes. - Builder* withKey(const std::string& key) { - if (key.empty ()) - return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - // Set a key retrieval metadata. - // use either withKeyMetaData or withKeyID, not both - Builder* withKeyMetaData(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either withKeyMetaData or withKeyID, not both - // key_id will be converted to metadata (UTF-8 array). - Builder* withKeyID(std::string key_id) { - //key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t *data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) - throw ParquetException("key id should be in UTF8 encoding"); - - DCHECK(!key_id.empty()); - this->withKeyMetaData(key_id); - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, - column_path_, - key_, - key_metadata_)); - } - - private: - const std::shared_ptr column_path_; - bool encrypted_; - std::string key_; - std::string key_metadata_; - - Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} - }; - const std::shared_ptr& getPath() { return column_path_; } - bool isEncrypted() const { return encrypted_; } - bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } - const std::string& getKey() const { return key_; } - 
const std::string& getKeyMetaData() const { return key_metadata_; } - - ColumnEncryptionProperties() = default; - ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; - ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - private: - const std::shared_ptr column_path_; - bool encrypted_; - bool encrypted_with_footer_key_; - std::string key_; - std::string key_metadata_; - explicit ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path){ - DCHECK(column_path != nullptr); - if (!encrypted) - DCHECK(key.empty() && key_metadata.empty()); - - if (!key.empty()) - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) - DCHECK(key_metadata.empty()); - - encrypted_ = encrypted; - key_metadata_ = key_metadata; - key_ = key; - } -}; - -class PARQUET_EXPORT ColumnDecryptionProperties { - public: - class Builder { - public: - // convenience builder for regular (not nested) columns. - Builder(const std::string name) { - Builder(schema::ColumnPath::FromDotString(name)); - } - - Builder(const std::shared_ptr& path) - : column_path_(path) {} - - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { - if (key.empty ()) - return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } - - private: - const std::shared_ptr column_path_; - std::string key_; - }; - - ColumnDecryptionProperties() = default; - ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; - ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - - const std::shared_ptr& getPath() { return column_path_; } - const std::string& getKey() const { return key_; } - - private: - const std::shared_ptr column_path_; - std::string key_; - - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) - explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ - DCHECK(column_path != nullptr); - - if (!key.empty()) - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - key_ = key; - } -}; - -class PARQUET_EXPORT AADPrefixVerifier { - public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. - virtual void check(std::string aad_prefix) = 0; -}; - -class PARQUET_EXPORT FileDecryptionProperties { - public: - class Builder { - public: - Builder(){ - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - } - - // Set an explicit footer key. If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. 
- // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* withFooterKey(std::string footer_key) { - if (footer_key.empty ()) { - return this; - } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; - return this; - } - - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* withColumnKeys(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) { - if (column_properties.size () == 0) - return this; - - if (column_property_map_.size () != 0) - throw ParquetException("Column properties already set"); - - column_property_map_ = column_properties; - return this; - } - - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* withKeyRetriever(const std::shared_ptr& - key_retriever) { - if (key_retriever == NULLPTR) - return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } - - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match - Builder* withoutFooterSignatureVerification() { - check_plaintext_footer_integrity_ = false; - return this; - } - - // Explicitly supply the file AAD prefix. 
- // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. - Builder* withAADPrefix(std::string aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } - - // Set callback for verification of AAD Prefixes stored in file. - Builder* withAADPrefixVerifier( - std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) - return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_property_map_)); - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - }; - - const std::string& getColumnKey( - const std::shared_ptr& column_path) { - if (column_property_map_.find(column_path) != column_property_map_.end()) { - auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); - } - return NULL_STRING; - } - - const std::string& getFooterKey() { - return footer_key_; - } - - const std::string& getAADPrefix() { return aad_prefix_; } - - std::shared_ptr getKeyRetriever() { - return key_retriever_; - } - - bool checkFooterIntegrity() { - return check_plaintext_footer_integrity_; - } - - const std::shared_ptr &getAADPrefixVerifier() { - return aad_prefix_verifier_; - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - 
std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - - - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { - DCHECK(!footer_key.empty() || - NULLPTR != key_retriever || - 0 != column_property_map.size()); - if (!footer_key.empty()) - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - if (footer_key.empty() && check_plaintext_footer_integrity) - DCHECK(NULLPTR != key_retriever); - aad_prefix_verifier_ = aad_prefix_verifier; - footer_key_ = footer_key; - check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; - key_retriever_ = key_retriever; - aad_prefix_ = aad_prefix; - column_property_map_ = column_property_map; - } - -}; class PARQUET_EXPORT ReaderProperties { public: @@ -471,10 +107,6 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnProperties { public: @@ -523,208 +155,6 @@ class PARQUET_EXPORT ColumnProperties { size_t max_stats_size_; }; -class PARQUET_EXPORT FileEncryptionProperties { - public: - class Builder { - public: - Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { - 
footer_key_ = footer_key; - store_aad_prefix_in_file_ = false; - } - - // Create files with plaintext footer. - // If not called, the files will be created with encrypted footer (default). - Builder* withPlaintextFooter() { - encrypted_footer_ = false; - return this; - } - - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { - parquet_cipher_ = parquet_cipher; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyID(std::string key_id) { - //key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) - throw ParquetException("footer key id should be in UTF8 encoding"); - - if (key_id.empty()) - return this; - - return withFooterKeyMetadata(key_id); - } - - // Set a key retrieval metadata. - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) - return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } - - // Set the file AAD Prefix. - Builder* withAADPrefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) - return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } - - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. - Builder* withoutAADPrefixStorage() { - store_aad_prefix_in_file_ = false; - return this; - } - - // Set the list of encrypted columns and their properties (keys etc). - // If not called, all columns will be encrypted with the footer key. 
- // If called, the file columns not in the list will be left unencrypted. - Builder* withEncryptedColumns( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ - if (encryptedColumns.size () == 0) - return this; - - if (column_property_map_.size () != 0) - throw ParquetException("Column properties already set"); - - column_property_map_ = encryptedColumns; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new FileEncryptionProperties(parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_property_map_)); - } - - private: - ParquetCipher::type parquet_cipher_; - bool encrypted_footer_; - std::string footer_key_; - std::string footer_key_metadata_; - - std::string aad_prefix_; - bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - }; - - bool encryptedFooter() const { return encrypted_footer_; } - - const EncryptionAlgorithm getAlgorithm() { - return algorithm_; - } - - const std::string& getFooterEncryptionKey() { - return (encrypted_footer_? footer_key_ : NULL_STRING); - } - - const std::string& getFooterEncryptionKeyMetadata() { - return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); - } - - const std::string& getFooterSigningKey() { - return (encrypted_footer_? NULL_STRING : footer_key_); - } - - const std::string& getFooterSigningKeyMetadata() { - return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); - } - - const std::string& getFileAAD() const { return file_AAD_; } - - std::shared_ptr getColumnProperties( - const std::shared_ptr& column_path) { - if (column_property_map_.size () == 0){ - auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); - return builder->build(); - } - if (column_property_map_.find(column_path) != column_property_map_.end()) - return column_property_map_[column_path]; - - return NULLPTR; - } - - private: - EncryptionAlgorithm algorithm_; // encryption algorithm - std::string footer_key_; - std::string footer_key_metadata_; - bool encrypted_footer_; - std::string file_AAD_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_property_map) - : footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ - DCHECK(!footer_key.empty()); - // footer_key must be either 16, 24 or 32 bytes. 
- DCHECK(footer_key.length() == 16 - || footer_key.length() == 24 - || footer_key.length() == 32); - - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str( - reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; - - bool supply_aad_prefix = false; - if (aad_prefix.empty()) - file_AAD_ = aad_file_unique_str; - else { - file_AAD_ = aad_prefix + aad_file_unique_str; - if (!store_aad_prefix_in_file) supply_aad_prefix = true; - } - algorithm_.algorithm = cipher; - algorithm_.aad.aad_file_unique = aad_file_unique_str; - algorithm_.aad.supply_aad_prefix = supply_aad_prefix; - if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; - } - } -}; - class PARQUET_EXPORT WriterProperties { public: class Builder { From 447a11869102f94f45bb78a56cd2f2cd312c233a Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 25 Apr 2019 23:01:31 +0700 Subject: [PATCH 031/125] update crypto API change --- cpp/src/parquet/CMakeLists.txt | 2 + cpp/src/parquet/column_reader.cc | 91 ++++---- cpp/src/parquet/column_reader.h | 6 +- cpp/src/parquet/column_writer.cc | 104 +++++---- cpp/src/parquet/column_writer.h | 6 +- cpp/src/parquet/file_reader.cc | 162 ++++---------- cpp/src/parquet/file_writer.cc | 76 ++++--- cpp/src/parquet/file_writer.h | 2 +- cpp/src/parquet/internal_file_decryptor.cc | 246 +++++++++++++++++++++ cpp/src/parquet/internal_file_decryptor.h | 116 ++++++++++ cpp/src/parquet/internal_file_encryptor.cc | 144 ++++++++++++ cpp/src/parquet/internal_file_encryptor.h | 61 +++++ cpp/src/parquet/metadata.cc | 180 +++++---------- cpp/src/parquet/metadata.h | 34 ++- cpp/src/parquet/thrift.h | 27 ++- 15 files changed, 864 insertions(+), 393 deletions(-) create mode 100644 cpp/src/parquet/internal_file_decryptor.cc create mode 100644 cpp/src/parquet/internal_file_decryptor.h create mode 100644 
cpp/src/parquet/internal_file_encryptor.cc create mode 100644 cpp/src/parquet/internal_file_encryptor.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 56c79f0c084..1d2e8e45dd4 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -168,6 +168,8 @@ set(PARQUET_SRCS encoding.cc encryption.cc encryption_properties.cc + internal_file_decryptor.cc + internal_file_encryptor.cc file_reader.cc file_writer.cc metadata.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e7f3a5d90a1..cb98e353858 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -31,6 +31,7 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" @@ -118,8 +119,9 @@ class SerializedPageReader : public PageReader { int64_t total_num_rows, Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, - const std::shared_ptr encryption, - ::arrow::MemoryPool* pool) + ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), @@ -129,20 +131,24 @@ class SerializedPageReader : public PageReader { page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - encryption_(encryption), - decryption_buffer_(AllocateBuffer(pool, 0)) { + decryption_buffer_(AllocateBuffer(pool, 0)), + meta_decryptor_(meta_decryptor), + data_decryptor_(data_decryptor) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (encryption != NULLPTR) { - DCHECK (!encryption_->fileAAD().empty()); + if (data_decryptor_ != NULLPTR) { + DCHECK (!data_decryptor_->fileAAD().empty()); //prepare the AAD for quick update later data_pageAAD_ = 
parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_decryptor_->fileAAD(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); + } + if (meta_decryptor_ != NULLPTR) { + DCHECK (!meta_decryptor_->fileAAD().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + meta_decryptor_->fileAAD(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -182,8 +188,9 @@ class SerializedPageReader : public PageReader { int64_t total_num_rows_; // Encryption - std::shared_ptr encryption_; std::shared_ptr decryption_buffer_; + std::shared_ptr meta_decryptor_; + std::shared_ptr data_decryptor_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -218,20 +225,23 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - if (encryption_!= NULLPTR) { - if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); - } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - encryption_->aad(data_page_headerAAD_); - } - } - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, encryption_); + if (meta_decryptor_ != NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD( + meta_decryptor_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + meta_decryptor_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); + meta_decryptor_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, + meta_decryptor_); break; } catch 
(std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -249,18 +259,18 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - if (encryption_!= NULLPTR){ - DCHECK(!encryption_->fileAAD().empty()); + if (data_decryptor_ != NULLPTR){ + DCHECK(!data_decryptor_->fileAAD().empty()); if (current_page_is_dictionary){ aad = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_decryptor_->fileAAD(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); - encryption_->aad(aad); + data_decryptor_->aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - encryption_->aad(data_pageAAD_); + data_decryptor_->aad(data_pageAAD_); } } @@ -275,10 +285,10 @@ std::shared_ptr SerializedPageReader::NextPage() { } // Decrypt it if we need to - if (encryption_ != nullptr) { - decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); - compressed_len = parquet_encryption::Decrypt( - encryption_, false, buffer, compressed_len, decryption_buffer_->mutable_data()); + if (data_decryptor_ != nullptr) { + decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); + compressed_len = data_decryptor_->Decrypt( + buffer, compressed_len, decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } @@ -349,15 +359,18 @@ std::shared_ptr SerializedPageReader::NextPage() { return std::shared_ptr(nullptr); } -std::unique_ptr PageReader::Open( - const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary, - int16_t row_group_ordinal, int16_t column_ordinal, - const std::shared_ptr encryption, - ::arrow::MemoryPool* pool) { +std::unique_ptr PageReader::Open(const std::shared_ptr& stream, + int64_t total_num_rows, + Compression::type 
codec, + bool column_has_dictionary, + int16_t row_group_ordinal, + int16_t column_ordinal, + ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, - row_group_ordinal, column_ordinal, encryption, pool)); + row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index b513595c833..f6abd435858 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -45,6 +45,7 @@ namespace parquet { class DictionaryPage; class Page; +class Decryptor; // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -83,8 +84,9 @@ class PARQUET_EXPORT PageReader { const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, bool column_has_dictionary = false, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - const std::shared_ptr& encryption = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_decryptor = NULLPTR, + std::shared_ptr data_decryptor = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fcb7768619a..fee0db9f1b7 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -33,6 +33,7 @@ #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" @@ -127,10 +128,11 @@ int LevelEncoder::Encode(int batch_size, const 
int16_t* levels) { class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata,int16_t row_group_ordinal, int16_t column_chunk_ordinal, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), pool_(pool), @@ -142,16 +144,19 @@ class SerializedPageWriter : public PageWriter { page_ordinal_(0), row_group_ordinal_(row_group_ordinal), column_ordinal_(column_chunk_ordinal), - encryption_(encryption){ - if (encryption != NULLPTR) { + meta_encryptor_(meta_encryptor), + data_encryptor_(data_encryptor){ + if (data_encryptor_ != NULLPTR) { //prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_encryptor_->fileAAD(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); + } + if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + meta_encryptor_->fileAAD(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -181,24 +186,17 @@ class SerializedPageWriter : public PageWriter { int32_t output_data_len = static_cast(compressed_data->size()); std::shared_ptr encrypted_data_buffer = nullptr; - if (encryption_.get()) { - encryption_->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), + if (data_encryptor_.get()) { + data_encryptor_->aad( + parquet_encryption::createModuleAAD(data_encryptor_->fileAAD(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1)); - encrypted_data_buffer = std::static_pointer_cast( - AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); - output_data_len = 
parquet_encryption::Encrypt( - encryption_, false, compressed_data->data(), output_data_len, - encrypted_data_buffer->mutable_data()); + AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - encryption_->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -213,7 +211,15 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); + + if (meta_encryptor_) { + meta_encryptor_->aad( + parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + } + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -229,16 +235,15 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - auto props = metadata_->get_encryption_props(metadata_->descr()->path()); - if (props != nullptr){ - props->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), + if (meta_encryptor_ != nullptr){ + meta_encryptor_->aad( + parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk - 
metadata_->WriteTo(sink_.get(), props); + metadata_->WriteTo(sink_.get(), meta_encryptor_); } /** @@ -278,17 +283,14 @@ class SerializedPageWriter : public PageWriter { int32_t output_data_len = static_cast(compressed_data->size()); std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); - if (encryption_.get()) { + if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - encryption_->aad(data_pageAAD_); - encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); - output_data_len = parquet_encryption::Encrypt( - encryption_, false, compressed_data->data(), output_data_len, + data_encryptor_->aad(data_pageAAD_); + encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); + output_data_len = data_encryptor_->Encrypt( + compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); - encryption_->aad(data_page_headerAAD_); } format::PageHeader page_header; @@ -304,7 +306,12 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); + if (meta_encryptor_) { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); + meta_encryptor_->aad(data_page_headerAAD_); + } + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -348,24 +355,28 @@ class SerializedPageWriter : public PageWriter { // Compression codec to use. 
std::unique_ptr<::arrow::util::Codec> compressor_; - std::shared_ptr encryption_; + + std::shared_ptr meta_encryptor_; + std::shared_ptr data_encryptor_; }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, - row_group_ordinal, current_column_ordinal, pool)); - } // TODO: nullptr for EncryptionProperties + new SerializedPageWriter(in_memory_sink_, codec, metadata, + row_group_ordinal, current_column_ordinal, pool, + meta_encryptor, data_encryptor)); + } int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -408,20 +419,21 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, - bool buffered_row_group) { + bool buffered_row_group, + std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, + new BufferedPageWriter(sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, - pool)); + pool, meta_encryptor, data_encryptor)); } else { return std::unique_ptr( - new 
SerializedPageWriter(sink, codec, encryption, metadata, + new SerializedPageWriter(sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, - pool)); + pool, meta_encryptor, data_encryptor)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 545fb571f61..8d638182d8b 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -46,6 +46,7 @@ namespace parquet { class ColumnChunkMetaDataBuilder; class WriterProperties; +class Encryptor; class PARQUET_EXPORT LevelEncoder { public: @@ -83,11 +84,12 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool buffered_row_group = false); + bool buffered_row_group = false, + std::shared_ptr header_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR); // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index cd06801e35f..8c1a06c3dc1 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -34,6 +34,7 @@ #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -87,12 +88,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, FileCryptoMetaData* file_crypto_metadata, int row_group_number, - const ReaderProperties& props) + const ReaderProperties& props, InternalFileDecryptor* file_decryptor) : 
source_(source), file_metadata_(file_metadata), file_crypto_metadata_(file_crypto_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number){ + row_group_ordinal_((int16_t)row_group_number), + file_decryptor_(file_decryptor){ row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -108,15 +110,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { else if (file_metadata_->is_plaintext_mode()) { algorithm = file_metadata_->encryption_algorithm(); } - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = properties_.column_map(); // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, properties_.file_decryption(), - &algorithm, - properties_.fileAAD(), - column_map); + &algorithm, file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -153,8 +150,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i/*column_ordinal*/, - nullptr, properties_.memory_pool()); + row_group_ordinal_, + (int16_t)i/* column_ordinal */, + properties_.memory_pool()); } // the column is encrypted @@ -164,38 +162,25 @@ class SerializedRowGroup : public RowGroupReader::Contents { row_group_ordinal_, (int16_t)i, (int16_t)-1); - auto file_decryption = properties_.file_decryption(); - // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() ? 
file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey( - footer_key_metadata); - } - if (footer_key.empty()) { - throw ParquetException("column is encrypted with null footer key"); - } ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - auto footer_encryption = std::make_shared( - algorithm, footer_key, properties_.fileAAD(), aad); + auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( + algorithm, footer_key_metadata, aad); + auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( + algorithm, footer_key_metadata, aad); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - footer_encryption, properties_.memory_pool()); + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i, properties_.memory_pool(), + meta_decryptor, data_decryptor); } // file is non-uniform encrypted and the column @@ -204,41 +189,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - // encrypted with column key - std::string column_key; - // first look if we already got the key from before - if (column_map != NULLPTR - && column_map->find(column_path) != column_map->end()) { - column_key = column_map->at(column_path); - } else { - 
column_key = file_decryption->getColumnKey(column_path); - // No explicit column key given via API. Retrieve via key metadata. - if (column_key.empty() && !column_key_metadata.empty() && - file_decryption->getKeyRetriever() != nullptr){ - try { - column_key = file_decryption->getKeyRetriever()->GetKey( - column_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "HiddenColumnException, path=" + - column_path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } - } - } - if (column_key.empty()) { - throw HiddenColumnException("column is encrypted with null key, path=" + - column_path->ToDotString()); - } - auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, - column_key, - properties_.fileAAD(), aad); - return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - column_encryption, properties_.memory_pool()); + auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( + column_path, + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key_metadata, aad); + auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( + column_path, + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key_metadata, aad); + + return PageReader::Open(stream, col->num_values(), + col->compression(), + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i, properties_.memory_pool(), + meta_decryptor, data_decryptor); } private: @@ -248,6 +213,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; + InternalFileDecryptor* file_decryptor_; }; // ---------------------------------------------------------------------- @@ -267,7 +233,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { 
std::unique_ptr contents( new SerializedRowGroup(source_, file_metadata_.get(), - file_crypto_metadata_.get(), i, properties_)); + file_crypto_metadata_.get(), i, properties_, + file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -333,6 +300,7 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { auto file_decryption = properties_.file_decryption(); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided"); } @@ -365,41 +333,19 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - properties_.set_fileAAD(fileAAD); + file_decryptor_->file_aad(fileAAD); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); } - // get footer key std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = - file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. 
Could not verify " - "plaintext footer metadata"); - } - std::string aad = parquet_encryption::createFooterAAD(fileAAD); - auto encryption = std::make_shared( + auto encryptor = file_decryptor_->GetFooterSigningEncryptor( file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); - if (! file_metadata_->verify(encryption, metadata_buffer->data() - + read_metadata_len, 28)) { + footer_key_metadata); + if (! file_metadata_->verify(encryptor, metadata_buffer->data() + + read_metadata_len)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" " footer metadata"); } @@ -437,6 +383,7 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided. Could not read " "encrypted footer metadata"); } + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); @@ -468,8 +415,7 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; // save fileAAD for later use - properties_.set_fileAAD(fileAAD); - std::string aad = parquet_encryption::createFooterAAD(fileAAD); + file_decryptor_->file_aad(fileAAD); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; @@ -483,32 +429,13 @@ class SerializedFile : public ParquetFileReader::Contents { // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = - 
file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n";; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); - } - auto footer_encryption = std::make_shared( + + auto footer_decryptor = file_decryptor_->GetFooterDecryptor( file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key, - fileAAD, aad); + footer_key_metadata); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_decryptor); } } @@ -517,6 +444,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr file_metadata_; std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; + std::unique_ptr file_decryptor_; }; // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 3111f745222..2d1f0784d3b 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -23,6 +23,7 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" #include "parquet/platform.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" #include "parquet/util/memory.h" #include "parquet/util/crypto.h" @@ -83,7 +84,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, const WriterProperties* properties, - bool buffered_row_group = false) + bool buffered_row_group = false, + InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), properties_(properties), @@ -92,7 +94,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_ (row_group_ordinal), current_column_index_(0), num_rows_(0), - 
buffered_row_group_(buffered_row_group) { + buffered_row_group_(buffered_row_group), + file_encryptor_(file_encryptor) { if (buffered_row_group) { InitColumns(); } else { @@ -128,12 +131,19 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++current_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); + auto meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, row_group_ordinal_, (int16_t)(current_column_index_-1), - properties_->memory_pool()); + properties_->memory_pool(), false, + meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -202,6 +212,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int current_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; + InternalFileEncryptor* file_encryptor_; void CheckRowsWritten() const { // verify when only one column is written at a time @@ -229,12 +240,18 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); + auto meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = file_encryptor_ + ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, (int16_t)row_group_ordinal_, (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_); + properties_->memory_pool(), buffered_row_group_, + meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -287,15 +304,8 @@ class FileSerializer : public ParquetFileWriter::Contents { auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - ParquetCipher::type algorithm = - file_encryption->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD( - file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = std::make_shared( - algorithm, - file_encryption->getFooterEncryptionKey(), - file_encryption->getFileAAD(), aad); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); + auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(PARQUET_EMAGIC, 4); @@ -311,14 +321,8 @@ class FileSerializer : public ParquetFileWriter::Contents { file_metadata_ = metadata_->Finish( &signing_encryption, file_encryption->getFooterSigningKeyMetadata ()); - ParquetCipher::type algorithm = algo.algorithm; - std::string aad = parquet_encryption::createFooterAAD( - file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = std::make_shared( - algorithm, - file_encryption->getFooterSigningKey(), - file_encryption->getFileAAD(), aad); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); + auto 
footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } } @@ -344,7 +348,7 @@ class FileSerializer : public ParquetFileWriter::Contents { auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), - buffered_row_group)); + buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); @@ -385,12 +389,22 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; + std::unique_ptr file_encryptor_; + void StartFile() { - if (properties_->file_encryption() == nullptr) { - // Parquet files always start with PAR1 + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); } else { - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); + if (file_encryption->encryptedFooter()) { + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + } + else { + // plaintext mode footer + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + } } } }; @@ -428,9 +442,9 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& footer_encryption, + const std::shared_ptr& encryptor, bool encrypt_footer) { - if (footer_encryption == nullptr) { + if (encryptor == nullptr) { // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -446,11 +460,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin } else { if (encrypt_footer) { // encrypt and 
write to sink - file_metadata.WriteTo(sink, footer_encryption); + file_metadata.WriteTo(sink, encryptor); } else { uint32_t metadata_len = static_cast(sink->Tell()); - file_metadata.WriteTo(sink, footer_encryption); + file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; sink->Write(reinterpret_cast(&metadata_len), 4); diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 5fb7f3575d1..db9008fff56 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -108,7 +108,7 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryption_properties = NULLPTR, + const std::shared_ptr& encryptor = NULLPTR, bool encrypt_footer = false); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc new file mode 100644 index 00000000000..91a1730a238 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -0,0 +1,246 @@ +#include "parquet/encryption_properties.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/util/crypto.h" + +namespace parquet { + +// FooterSigningEncryptor +static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +FooterSigningEncryptor::FooterSigningEncryptor( + ParquetCipher::type algorithm, const std::string& key, + const std::string& file_aad, const std::string& aad) + : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { + aes_encryptor_.reset(new parquet_encryption::AesEncryptor( + algorithm, static_cast(key_.size()), true)); +} + +int FooterSigningEncryptor::CiphertextSizeDelta() { + return 
aes_encryptor_->CiphertextSizeDelta(); +} + +int FooterSigningEncryptor::SignedFooterEncrypt( + const uint8_t* footer, int footer_len, + uint8_t* nonce, uint8_t* encrypted_footer) { + return aes_encryptor_->SignedFooterEncrypt( + footer, footer_len, str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), nonce, encrypted_footer); +} + +// Decryptor +Decryptor::Decryptor( + parquet_encryption::AesDecryptor* aes_decryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad) + : aes_decryptor_(aes_decryptor), key_(key) + , file_aad_(file_aad), aad_(aad) {} + +int Decryptor::CiphertextSizeDelta() { + return aes_decryptor_->CiphertextSizeDelta(); +} + +int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext) { + return aes_decryptor_->Decrypt( + ciphertext, ciphertext_len, + str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), plaintext); +} + +// InternalFileDecryptor +InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) + : properties_(properties) {} + +std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) +{ + std::string footer_key = properties_->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = + properties_->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Footer key unavailable. 
Could not verify " + "plaintext footer metadata"); + } + + std::string aad = parquet_encryption::createFooterAAD(file_aad_); + + return std::make_shared( + algorithm, footer_key, file_aad_, aad); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) { + std::string aad = parquet_encryption::createFooterAAD(file_aad_); + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad) { + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad) { + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad, bool metadata) { + std::string footer_key = properties_->getFooterKey(); + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = properties_->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n";; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Invalid footer encryption key. " + "Could not parse footer metadata"); + } + + auto aes_decryptor = metadata + ? 
GetMetaAesDecryptor(algorithm, footer_key.size()) + : GetDataAesDecryptor(algorithm, footer_key.size()); + return std::make_shared(aes_decryptor, footer_key, file_aad_, aad); +} + +std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad) { + return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad) { + return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad, bool metadata) { + std::string column_key; + // first look if we already got the key from before + if (column_map_ != NULLPTR + && column_map_->find(column_path) != column_map_->end()) { + column_key = column_map_->at(column_path); + } else { + column_key = properties_->getColumnKey(column_path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + properties_->getKeyRetriever() != nullptr){ + try { + column_key = properties_->getKeyRetriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + + column_path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } + } + } + if (column_key.empty()) { + throw HiddenColumnException("column is encrypted with null key, path=" + + column_path->ToDotString()); + } + + if (column_map_ != NULLPTR) { + // save column key for future use + (*column_map_)[column_path] = column_key; + } + + auto aes_decryptor = metadata + ? GetMetaAesDecryptor(algorithm, column_key.size()) + : GetDataAesDecryptor(algorithm, column_key.size()); + + return std::make_shared(aes_decryptor, column_key, file_aad_, aad); +} + +parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (meta_decryptor_128_ == NULLPTR) { + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_128_.get(); + } + else if (key_len == 24) { + if (meta_decryptor_196_ == NULLPTR) { + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_196_.get(); + } + else if (key_len == 32) { + if (meta_decryptor_256_ == NULLPTR) { + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (data_decryptor_128_ == NULLPTR) { + 
data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_128_.get(); + } + else if (key_len == 24) { + if (data_decryptor_196_ == NULLPTR) { + data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_196_.get(); + } + else if (key_len == 32) { + if (data_decryptor_256_ == NULLPTR) { + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h new file mode 100644 index 00000000000..c92fc08c873 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -0,0 +1,116 @@ +#ifndef INTERNAL_FILE_DECRYPTOR_H +#define INTERNAL_FILE_DECRYPTOR_H + +#include +#include "parquet/schema.h" + +namespace parquet_encryption { + class AesDecryptor; + class AesEncryptor; +} + +namespace parquet { + +class FileDecryptionProperties; + +class FooterSigningEncryptor { + public: + FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, + const std::string& file_aad, const std::string& aad); + int CiphertextSizeDelta(); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, + uint8_t* nonce, uint8_t* encrypted_footer); + + private: + ParquetCipher::type algorithm_; + std::string key_; + std::string file_aad_; + std::string aad_; + + std::shared_ptr aes_encryptor_; +}; + +class Decryptor { + public: + Decryptor(parquet_encryption::AesDecryptor* decryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad); + + const std::string& fileAAD() const { return file_aad_; } + void aad(const std::string& aad) { aad_ = aad; } + + int CiphertextSizeDelta(); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* 
plaintext); + + private: + parquet_encryption::AesDecryptor* aes_decryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; +}; + +class InternalFileDecryptor { + public: + explicit InternalFileDecryptor(FileDecryptionProperties* propperties); + + void file_aad(const std::string& file_aad) { file_aad_ = file_aad; } + std::string& file_aad() { return file_aad_; } + + std::shared_ptr GetFooterSigningEncryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); + + std::shared_ptr GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); + std::shared_ptr GetFooterDecryptorForColumnMeta( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad); + std::shared_ptr GetFooterDecryptorForColumnData( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad); + std::shared_ptr GetColumnMetaDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad); + std::shared_ptr GetColumnDataDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad); + + private: + FileDecryptionProperties* properties_; + std::string file_aad_; + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + + std::unique_ptr meta_decryptor_128_; + std::unique_ptr meta_decryptor_196_; + std::unique_ptr meta_decryptor_256_; + std::unique_ptr data_decryptor_128_; + std::unique_ptr data_decryptor_196_; + std::unique_ptr data_decryptor_256_; + + std::shared_ptr GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad, bool metadata); + std::shared_ptr GetColumnDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const 
std::string& aad, bool metadata = false); + + parquet_encryption::AesDecryptor* GetMetaAesDecryptor(ParquetCipher::type algorithm, + size_t key_size); + parquet_encryption::AesDecryptor* GetDataAesDecryptor(ParquetCipher::type algorithm, + size_t key_size); +}; + +} + +#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc new file mode 100644 index 00000000000..7f2195cc20b --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -0,0 +1,144 @@ +#include "parquet/encryption_properties.h" +#include "parquet/internal_file_encryptor.h" +#include "parquet/util/crypto.h" + +namespace parquet { + +static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +// Encryptor +Encryptor::Encryptor( + parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) + : aes_encryptor_(aes_encryptor), key_(key) + , file_aad_(file_aad), aad_(aad) {} + +int Encryptor::CiphertextSizeDelta() { + return aes_encryptor_->CiphertextSizeDelta(); +} + +int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { + return aes_encryptor_->Encrypt( + plaintext, plaintext_len, str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), ciphertext); +} + +// InternalFileEncryptor +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) + : properties_(properties) {} + +std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); + std::string footer_key = properties_->getFooterEncryptionKey(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, 
footer_key.size()); + + return std::make_shared(aes_encryptor, footer_key, + properties_->getFileAAD(), aad); +} + +std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); + std::string footer_signing_key = properties_->getFooterSigningKey(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); + + return std::make_shared(aes_encryptor, footer_signing_key, + properties_->getFileAAD(), aad); +} + +std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, true); +} + +std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, false); +} + +std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::shared_ptr& column_path, + bool metadata) { + auto column_prop = properties_->getColumnProperties(column_path); + if (column_prop == NULLPTR) { + return NULLPTR; + } + + std::string key; + if (column_prop->isEncryptedWithFooterKey()) { + if (properties_->encryptedFooter()) { + key = properties_->getFooterEncryptionKey(); + } else { + key = properties_->getFooterSigningKey(); + } + } + else { + key = column_prop->getKey(); + } + + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + auto aes_encryptor = metadata + ? 
GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + + std::string file_aad = properties_->getFileAAD(); + + // TODO: aad + return std::make_shared(aes_encryptor, key, file_aad, ""); +} + +parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (meta_encryptor_128_ == NULLPTR) { + meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_128_.get(); + } + else if (key_len == 24) { + if (meta_encryptor_196_ == NULLPTR) { + meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_196_.get(); + } + else if (key_len == 32) { + if (meta_encryptor_256_ == NULLPTR) { + meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (data_encryptor_128_ == NULLPTR) { + data_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_128_.get(); + } + else if (key_len == 24) { + if (data_encryptor_196_ == NULLPTR) { + data_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_196_.get(); + } + else if (key_len == 32) { + if (data_encryptor_256_ == NULLPTR) { + data_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +} // namespace parquet \ No newline 
at end of file diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h new file mode 100644 index 00000000000..2e3a3df0408 --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -0,0 +1,61 @@ +#ifndef INTERNAL_FILE_ENCRYPTOR_H +#define INTERNAL_FILE_ENCRYPTOR_H + +namespace parquet_encryption { + class AesEncryptor; +} + +namespace parquet { + +class FileEncryptionProperties; + +class Encryptor { + public: + Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad); + const std::string& fileAAD() { return file_aad_; } + void aad(const std::string& aad) { aad_ = aad; } + + int CiphertextSizeDelta(); + int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + + private: + parquet_encryption::AesEncryptor* aes_encryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; +}; + +class InternalFileEncryptor { + public: + explicit InternalFileEncryptor(FileEncryptionProperties* propperties); + + std::shared_ptr GetFooterEncryptor(); + std::shared_ptr GetFooterSigningEncryptor(); + std::shared_ptr GetColumnMetaEncryptor(const std::shared_ptr& column_path); + std::shared_ptr GetColumnDataEncryptor(const std::shared_ptr& column_path); + + private: + FileEncryptionProperties* properties_; + + std::unique_ptr meta_encryptor_128_; + std::unique_ptr meta_encryptor_196_; + std::unique_ptr meta_encryptor_256_; + std::unique_ptr data_encryptor_128_; + std::unique_ptr data_encryptor_196_; + std::unique_ptr data_encryptor_256_; + + std::shared_ptr GetColumnEncryptor( + const std::shared_ptr& column_path, + bool metadata); + + parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + parquet_encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); +}; + +} + +#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file diff 
--git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5b745382310..bd5d3959a5b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -23,6 +23,7 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/schema-internal.h" @@ -168,9 +169,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -186,56 +185,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key; - if (column_map != NULLPTR - && (column_map->find(path) != column_map->end())) { - //First retrieve the key in column_map if exists - key = column_map->at(path); - } else { - key = file_decryption->getColumnKey(path); - // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() - && file_decryption->getKeyRetriever() != nullptr){ - try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); - } catch (KeyAccessDeniedException &e) { - // Hidden column: encrypted, but key unavailable - std::stringstream ss; - ss << "HiddenColumnException path=" + path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } - if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" - + path->ToDotString()); - } - if (column_map != NULLPTR) { - // save column key for future use - (*column_map)[path]=key; - } - } - if (key.empty()) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path= " - + path->ToDotString()); - } + DCHECK(algorithm != NULLPTR); - + DCHECK(file_decryptor != NULLPTR); + std::string aad = parquet_encryption::createModuleAAD( - fileAAD, + file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); - auto encryption = std::make_shared( - algorithm->algorithm, - key, fileAAD, aad); - uint32_t len = - static_cast(column->encrypted_column_metadata.size()); + auto decryptor = file_decryptor->GetColumnMetaDecryptor( + path, algorithm->algorithm, + key_metadata, aad); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast( column->encrypted_column_metadata.c_str()), &len, &metadata_, - encryption, + decryptor, false); } } @@ -336,14 +303,11 @@ std::unique_ptr ColumnChunkMetaData::Make( const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) { + InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - 
file_decryption, algorithm, fileAAD, - column_map)); + file_decryption, algorithm, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -354,9 +318,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, @@ -364,7 +326,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( column_ordinal, writer_version, file_decryption, algorithm, - fileAAD, column_map))} {} + file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -448,10 +410,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { + InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -461,8 +420,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, fileAAD, - column_map); + writer_version_, file_decryption, algorithm, + file_decryptor); } private: @@ -495,11 +454,9 @@ const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema( std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, const std::string& fileAAD, - 
std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map) const { + const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - fileAAD, column_map); + file_decryptor); } // file metadata @@ -509,11 +466,11 @@ class FileMetaData::FileMetaDataImpl { explicit FileMetaDataImpl( const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = nullptr) + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption, false); + metadata_.get(), decryptor, false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -527,8 +484,7 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len) { + bool verify(std::shared_ptr encryptor, const void* tail) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -537,17 +493,16 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; + uint8_t* tag = const_cast(reinterpret_cast(tail)) + + parquet_encryption::NonceLength; - std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); + std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); uint32_t encrypted_len = - parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, - encryption->key_bytes(), - encryption->key_length(), - encryption->aad_bytes(), - encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); - return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); + encryptor->SignedFooterEncrypt(serialized_data, serialized_len, + nonce, 
encrypted_buffer.data()); + return 0 == memcmp( + encrypted_buffer.data() + encrypted_len - parquet_encryption::GCMTagLength, + tag, parquet_encryption::GCMTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -572,7 +527,7 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { + const std::shared_ptr& encryptor) const { ThriftSerializer serializer; if (is_plaintext_mode()) { uint8_t* serialized_data; @@ -580,19 +535,20 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); // encrypt the footer key - std::vector encrypted_data(encryption->CalculateCipherSize(serialized_len)); - unsigned encrypted_len = parquet_encryption::Encrypt( - encryption, true, serialized_data, serialized_len, encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); + unsigned encrypted_len = encryptor->Encrypt(serialized_data, serialized_len, + encrypted_data.data()); // write unencrypted footer dst->Write(serialized_data, serialized_len); // write nonce - dst->Write(encrypted_data.data() + 4, 12); + dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); // write tag - dst->Write(encrypted_data.data() + encrypted_len - 16, 16); + dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, + parquet_encryption::GCMTagLength); } else { - serializer.Serialize(metadata_.get(), dst, encryption, false); + serializer.Serialize(metadata_.get(), dst, encryptor, false); } } @@ -680,16 +636,16 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption) { + const std::shared_ptr& decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return 
std::shared_ptr( - new FileMetaData(metadata, metadata_len, encryption)); + new FileMetaData(metadata, metadata_len, decryptor)); } FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption) + const std::shared_ptr& decryptor) : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len, encryption))} {} + new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -700,9 +656,8 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len) { - return impl_->verify(encryption, tail, tail_len); +bool FileMetaData::verify(std::shared_ptr encryptor, const void* tail) { + return impl_->verify(encryptor, tail); } uint32_t FileMetaData::size() const { return impl_->size(); } @@ -757,14 +712,8 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { - if (encryption != nullptr) - encryption->aad( - parquet_encryption::createModuleAAD(encryption->fileAAD(), - parquet_encryption::Footer, - (int16_t)-1, (int16_t)-1, - (int16_t)-1)); - return impl_->WriteTo(dst, encryption); + const std::shared_ptr& encryptor) const { + return impl_->WriteTo(dst, encryptor); } class FileCryptoMetaData::FileCryptoMetaDataImpl { @@ -991,7 +940,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryptor) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -1018,30 +967,28 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - auto footer_encryption = 
properties_->footer_encryption(); + // TODO: check file_encryption() is null or not + auto footer_key = properties_->file_encryption()->getFooterEncryptionKey(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->isEncrypted()) || + if ((footer_key.empty() && encrypt_md->isEncrypted()) || !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata - auto encrypt_props = encryption; uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); - // encrypt the footer key - std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); + unsigned encrypted_len = encryptor->Encrypt( + serialized_data, serialized_len, encrypted_data.data()); - unsigned encrypted_len = parquet_encryption::Encrypt( - encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers - if (footer_encryption == nullptr) { + if (footer_key.empty()) { format::ColumnMetaData metadata_redacted = column_metadata_; if (metadata_redacted.__isset.statistics) { metadata_redacted.__isset.statistics = false; @@ -1065,11 +1012,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } - std::shared_ptr - get_encryption_props(const std::shared_ptr& path) { - return properties_->encryption(path); - } - const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } @@ -1123,11 
+1065,6 @@ void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); } -std::shared_ptr -ColumnChunkMetaDataBuilder::get_encryption_props(const std::shared_ptr& path) { - return impl_->get_encryption_props(path); -} - void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, @@ -1139,8 +1076,8 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { - impl_->WriteTo(sink, encryption); + const std::shared_ptr& encryptor) { + impl_->WriteTo(sink, encryptor); } const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { @@ -1283,7 +1220,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->footer_encryption() != nullptr) { + if (props->file_encryption() != nullptr) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1365,7 +1302,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } auto file_encryption = properties_->file_encryption(); - auto footer_encryption = properties_->footer_encryption(); crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); std::string key_metadata; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index c74ca88ae95..f4b14a397c8 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -39,6 +39,10 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; class FileCryptoMetaData; +class InternalFileDecryptor; +class Decryptor; +class Encryptor; +class FooterSigningEncryptor; namespace schema { @@ -126,9 +130,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const ApplicationVersion* writer_version = NULLPTR, 
FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string &fileAAD = "", - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR); + InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -161,10 +163,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR); + InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -188,11 +187,7 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string &fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR) const; + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -209,12 +204,12 @@ class PARQUET_EXPORT FileMetaData { // API convenience to get a MetaData accessor static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); - bool verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len); + bool verify(std::shared_ptr encryptor, + const void* tail); // file metadata uint32_t size() const; int num_columns() const; @@ -230,7 +225,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& 
encryption = NULLPTR) const; + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -246,7 +241,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& decryptor = NULLPTR); // PIMPL Idiom FileMetaData(); @@ -309,10 +304,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption = NULLPTR); - std::shared_ptr - get_encryption_props(const std::shared_ptr& path); - + const std::shared_ptr& encryptor = NULLPTR); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 4189d2c9079..b7dc43be4f6 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -43,6 +43,8 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/platform.h" +#include "parquet/internal_file_encryptor.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/statistics.h" #include "parquet/util/crypto.h" @@ -200,9 +202,9 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& encryption = NULLPTR, + const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (encryption == NULLPTR) { + if (decryptor == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -231,13 +233,13 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } // decrypt const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; - std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); - uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption, true, cipherBuf, 0, decrypted_buffer.data()); + std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); + uint32_t decrypted_buffer_len = decryptor->Decrypt( + cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } - *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); + *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); @@ -273,20 +275,21 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryption = NULLPTR, + const std::shared_ptr& encryptor = NULLPTR, bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryption == NULLPTR) { + if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); } else { - std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - unsigned cipher_buffer_len = parquet_encryption::Encrypt( - encryption, true, out_buffer, out_length, cipher_buffer.data()); - if (cipher_buffer_len > cipher_buffer.size()) { + std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); + int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, + cipher_buffer.data()); + + if (cipher_buffer_len > static_cast(cipher_buffer.size())) { std::stringstream ss; ss << "cipher length is greater than cipher buffer capacity: " << 
cipher_buffer_len << cipher_buffer.size() << "\n"; throw ParquetException(ss.str()); From 6d6a68554ea4143af8cc23144279048f26e2e675 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 May 2019 08:53:35 +0700 Subject: [PATCH 032/125] fix issue when column is encrypted in footer plaintext mode --- cpp/src/parquet/file_reader.cc | 18 ++++++++---------- cpp/src/parquet/metadata.cc | 3 ++- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8c1a06c3dc1..72f3ed82092 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -161,6 +161,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { parquet_encryption::ColumnMetaData, row_group_ordinal_, (int16_t)i, (int16_t)-1); + + ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() + ? file_metadata_->encryption_algorithm().algorithm + : file_crypto_metadata_->encryption_algorithm().algorithm; // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { @@ -168,14 +172,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { ? file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() - ? 
file_metadata_->encryption_algorithm().algorithm - : file_crypto_metadata_->encryption_algorithm().algorithm; - auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -191,12 +191,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, - file_crypto_metadata_->encryption_algorithm().algorithm, + column_path, footer_algorithm, column_key_metadata, aad); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, - file_crypto_metadata_->encryption_algorithm().algorithm, + column_path, footer_algorithm, column_key_metadata, aad); return PageReader::Open(stream, col->num_values(), diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index bd5d3959a5b..8b8485e73e9 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1220,7 +1220,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->file_encryption() != nullptr) { + if (props->file_encryption() != nullptr + && props->file_encryption()->getFooterSigningKey() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From daec63778d10dcbc4c756c458035959d291a3d1b Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 May 2019 08:59:33 +0700 Subject: [PATCH 033/125] remove EncryptionProperties --- cpp/src/parquet/properties.h | 39 
-------------------------- cpp/src/parquet/types.h | 54 ------------------------------------ 2 files changed, 93 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index faed20baf57..7bce487ad81 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -388,19 +388,6 @@ class PARQUET_EXPORT WriterProperties { return parquet_file_encryption_.get(); } - inline std::shared_ptr footer_encryption() const { - if (parquet_file_encryption_ == NULLPTR) { - return NULLPTR; - } else { - std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); - if (footer_key.empty()) - footer_key = parquet_file_encryption_->getFooterSigningKey (); - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - footer_key, parquet_file_encryption_->getFileAAD()); - - } - } - inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -453,32 +440,6 @@ class PARQUET_EXPORT WriterProperties { } } - std::shared_ptr encryption( - const std::shared_ptr& path) const { - if (parquet_file_encryption_) { - auto column_prop = parquet_file_encryption_->getColumnProperties(path); - if (column_prop == NULLPTR) - return NULLPTR; - if (column_prop->isEncryptedWithFooterKey()) { - if (parquet_file_encryption_->encryptedFooter ()) { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterEncryptionKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterSigningKey(), - parquet_file_encryption_->getFileAAD()); - } - } - - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - column_prop->getKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return NULLPTR; - } - } - private: explicit WriterProperties( 
::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 79d6f8a752e..92d006965bd 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -458,60 +458,6 @@ struct EncryptionAlgorithm { AadMetadata aad; }; -class PARQUET_EXPORT EncryptionProperties { - public: - EncryptionProperties() = default; - EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, - const std::string& file_aad, const std::string& aad = "") - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) {} - - ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } - - int key_length() const { return static_cast(key_.length()); } - uint8_t* key_bytes() const { return str2bytes(key_); } - - void aad(const std::string& aad) { aad_ = aad; } - int aad_length() const { return static_cast(aad_.length()); } - uint8_t* aad_bytes() const { return str2bytes(aad_); } - - ParquetCipher::type algorithm() const { return algorithm_; } - - const std::string& key() const { return key_; } - const std::string& aad() const { return aad_; } - const std::string& fileAAD() const { return file_aad_; } - - uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { - return plain_len + 28 + 4; - } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { - return plain_len + 16 + 4; - } - return plain_len; - } - - uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { - return cipher_len - 28 - 4; - } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { - return cipher_len - 16 - 4; - } - return cipher_len; - } - - private: - ParquetCipher::type algorithm_; // encryption algorithm - std::string key_; // encryption key, should have 16, 24, 32-byte length - std::string file_aad_; - std::string aad_; // 
encryption additional authenticated data - static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); - } - -}; - // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; From 1eb25e23a2f816ebbd32c5ad15e2d387838d625a Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:34:58 +0300 Subject: [PATCH 034/125] Change HiddenColumnExceptio message --- cpp/src/parquet/internal_file_decryptor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 91a1730a238..68e2526929c 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -179,7 +179,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } } if (column_key.empty()) { - throw HiddenColumnException("column is encrypted with null key, path=" + + throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } From 3cc99fbd832a65b8de092d0997dae9361608ed55 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:37:53 +0300 Subject: [PATCH 035/125] Fix indentation in encryption_properties.cc --- cpp/src/parquet/encryption_properties.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 1eb3847ec71..ba893ce1816 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -169,7 +169,7 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, algorithm_.aad.aad_file_unique = aad_file_unique_str; algorithm_.aad.supply_aad_prefix = supply_aad_prefix; if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; + algorithm_.aad.aad_prefix = 
aad_prefix; } } From 0e29c5414f01ee4b55b335231a87db3f0a87e5cb Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:44:36 +0300 Subject: [PATCH 036/125] Rename functions in DecryptionKeyRetriever --- cpp/src/parquet/encryption.cc | 8 ++++---- cpp/src/parquet/encryption.h | 10 +++++----- cpp/src/parquet/internal_file_decryptor.cc | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 0a2d9ef939a..54297253856 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,11 +22,11 @@ namespace parquet { // integer key retriever -void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { +void IntegerKeyIdRetriever::putKey(uint32_t key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { +const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata) { uint32_t key_id; memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); @@ -34,11 +34,11 @@ const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata } // string key retriever -void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { +void StringKeyIdRetriever::putKey(const std::string& key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { +const std::string& StringKeyIdRetriever::getKey(const std::string& key_id) { return key_map_[key_id]; } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3a4481bd4aa..60b7b8c3300 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -28,15 +28,15 @@ namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual const std::string& GetKey(const std::string& key_metadata) = 0; + 
virtual const std::string& getKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void PutKey(uint32_t key_id, const std::string& key); - const std::string& GetKey(const std::string& key_metadata); + void putKey(uint32_t key_id, const std::string& key); + const std::string& getKey(const std::string& key_metadata); private: std::map key_map_; @@ -45,8 +45,8 @@ class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { public: - void PutKey(const std::string& key_id, const std::string& key); - const std::string& GetKey(const std::string& key_metadata); + void putKey(const std::string& key_id, const std::string& key); + const std::string& getKey(const std::string& key_metadata); private: std::map key_map_; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 68e2526929c..d00b5a035eb 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -68,7 +68,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->GetKey(footer_key_metadata); + properties_->getKeyRetriever()->getKey(footer_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -118,7 +118,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->GetKey(footer_key_metadata); + footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata); } catch (KeyAccessDeniedException &e) 
{ std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -168,7 +168,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( if (column_key.empty() && !column_key_metadata.empty() && properties_->getKeyRetriever() != nullptr){ try { - column_key = properties_->getKeyRetriever()->GetKey(column_key_metadata); + column_key = properties_->getKeyRetriever()->getKey(column_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "HiddenColumnException, path=" + From cc73f2c47da2aa72b3ff42ef8f9cbe86a02f04ff Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:45:18 +0300 Subject: [PATCH 037/125] Add check for aad_prefix to withoutAADPrefixStorage --- cpp/src/parquet/encryption_properties.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index f3ed589260f..d2e4246e8a6 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -398,6 +398,8 @@ class PARQUET_EXPORT FileEncryptionProperties { // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. 
Builder* withoutAADPrefixStorage() { + DCHECK(!aad_prefix_.empty()); + store_aad_prefix_in_file_ = false; return this; } From cf6104f6749340ea7e085d533ce2a8715d2d1856 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:47:14 +0300 Subject: [PATCH 038/125] Add exception to FromThrift in thrift.h --- cpp/src/parquet/thrift.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b7dc43be4f6..4a5a98fff8c 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -106,10 +106,11 @@ static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encrypt if (encryption.__isset.AES_GCM_V1) { encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); - - } else { + } else if (encryption.__isset.AES_GCM_CTR_V1) { encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); + } else { + throw ParquetException("Unsupported algorithm"); } return encryption_algorithm; } From 24d304fb6941a9d9993b2c0d4ca7d943a8117a57 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:55:00 +0300 Subject: [PATCH 039/125] Fix prefix aad calculation --- cpp/src/parquet/file_reader.cc | 65 +++++++++++++++------------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 72f3ed82092..1928c86d2a2 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -156,7 +156,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad = parquet_encryption::createModuleAAD( + std::string aad_column_meta_data = parquet_encryption::createModuleAAD( properties_.fileAAD(), parquet_encryption::ColumnMetaData, row_group_ordinal_, @@ -173,9 +173,9 @@ class SerializedRowGroup : public 
RowGroupReader::Contents { : file_crypto_metadata_->key_metadata(); auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - footer_algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad_column_meta_data); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - footer_algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad_column_meta_data); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -192,10 +192,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( column_path, footer_algorithm, - column_key_metadata, aad); + column_key_metadata, aad_column_meta_data); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( column_path, footer_algorithm, - column_key_metadata, aad); + column_key_metadata, aad_column_meta_data); return PageReader::Open(stream, col->num_values(), col->compression(), @@ -303,33 +303,28 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } + std::string aad_prefix = file_decryption->getAADPrefix(); + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and " - "in properties is not the same"); + if (!algo.aad.aad_prefix.empty()) { + if (!aad_prefix.empty()) { + if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + throw ParquetException("ADD Prefix 
in file and " + "in properties is not the same"); + } } + aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException("AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(fileAAD); if (file_decryption->checkFooterIntegrity()) { @@ -386,32 +381,28 @@ class SerializedFile : public ParquetFileReader::Contents { file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties " - "is not the same"); + + std::string aad_prefix = file_decryption->getAADPrefix(); + + if (!algo.aad.aad_prefix.empty()) { + if (!aad_prefix.empty()) { + if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties " + "is not the same"); + } } + aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption->getAADPrefixVerifier(); if 
(aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException("AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; // save fileAAD for later use file_decryptor_->file_aad(fileAAD); From eacef66f653de7637159d90db2b2b944237e55b7 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 17:31:55 +0300 Subject: [PATCH 040/125] Remove fileAAD from ReaderProperties --- cpp/src/parquet/column_reader.cc | 2 -- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/properties.h | 5 ----- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index cb98e353858..ca142d9e464 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -100,8 +100,6 @@ ReaderProperties default_reader_properties() { if (default_reader_properties.column_map() != NULLPTR && default_reader_properties.column_map()->size () != 0) default_reader_properties.column_map()->clear(); - if (!default_reader_properties.fileAAD().empty()) - default_reader_properties.set_fileAAD (""); return default_reader_properties; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1928c86d2a2..18a64297545 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -157,7 +157,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // the column is encrypted std::string aad_column_meta_data = parquet_encryption::createModuleAAD( - properties_.fileAAD(), + file_decryptor_->file_aad(), 
parquet_encryption::ColumnMetaData, row_group_ordinal_, (int16_t)i, (int16_t)-1); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 7bce487ad81..153c83b26fc 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -72,10 +72,6 @@ class PARQUET_EXPORT ReaderProperties { return column_map_; } - const std::string& fileAAD() { return fileAAD_; } - - void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } - void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -90,7 +86,6 @@ class PARQUET_EXPORT ReaderProperties { std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between //ColumnPath and their encryption keys - std::string fileAAD_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From 4f3112c19a684c028acbd14bc498dd8bf3a3ae00 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 17:39:31 +0300 Subject: [PATCH 041/125] Remove column_map from ReaderProperties --- cpp/src/parquet/column_reader.cc | 5 ----- cpp/src/parquet/internal_file_decryptor.h | 4 +++- cpp/src/parquet/properties.h | 12 ------------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ca142d9e464..708d5665f7e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -95,11 +95,6 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; - // reset column_map and fileAAD as default_reader_properties is static but - // can be used when reading parquet file with different reading options. 
- if (default_reader_properties.column_map() != NULLPTR - && default_reader_properties.column_map()->size () != 0) - default_reader_properties.column_map()->clear(); return default_reader_properties; } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index c92fc08c873..09508ce8870 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -84,7 +84,9 @@ class InternalFileDecryptor { private: FileDecryptionProperties* properties_; + // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; + // A map between ColumnPath and their encryption keys: std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; @@ -113,4 +115,4 @@ class InternalFileDecryptor { } -#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 153c83b26fc..d60b1130e4a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -46,10 +46,6 @@ class PARQUET_EXPORT ReaderProperties { : pool_(pool) { buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM; buffer_size_ = DEFAULT_BUFFER_SIZE; - column_map_ = std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, - std::string, - schema::ColumnPath::CmpColumnPath>()); } ::arrow::MemoryPool* memory_pool() const { return pool_; } @@ -67,11 +63,6 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { - return column_map_; - } - void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -83,9 +74,6 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size_; bool buffered_stream_enabled_; std::shared_ptr file_decryption_; - std::shared_ptr, - std::string, 
parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between - //ColumnPath and their encryption keys }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From 986331f95efcd34e73da9465779cbef79eff22a2 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 21:32:49 +0300 Subject: [PATCH 042/125] Fix check for encryption and the existence of file_decryption in file_reader.cc --- cpp/src/parquet/file_reader.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 18a64297545..4cc04256974 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -143,7 +143,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if ((!file_crypto_metadata_ && !file_metadata_->is_plaintext_mode()) || !crypto_metadata) { + if (!crypto_metadata) { encrypted = false; } @@ -165,7 +165,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() ?
file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - + // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() @@ -183,13 +183,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { meta_decryptor, data_decryptor); } - // file is non-uniform encrypted and the column - // is encrypted with its own key + // file is encrypted and the column is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - + auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( column_path, footer_algorithm, column_key_metadata, aad_column_meta_data); @@ -298,10 +297,10 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { auto file_decryption = properties_.file_decryption(); - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); - if (file_decryption == nullptr) { + if (file_decryption == NULLPTR) { throw ParquetException("No decryption properties are provided"); } + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); std::string aad_prefix = file_decryption->getAADPrefix(); From 99c95b2ac960792bc16f36a0ea0e999ccdffd050 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 04:19:26 +0300 Subject: [PATCH 043/125] Save footer_key_metadata, algorithm, footer_decryptor and footer_signing_encryptor in InternalFileDecryptor and remove redundant AAD calculation in file_reader.cc --- cpp/src/parquet/file_reader.cc | 54 ++++------------ cpp/src/parquet/internal_file_decryptor.cc | 74 ++++++++++------------ cpp/src/parquet/internal_file_decryptor.h | 47 +++++++------- cpp/src/parquet/metadata.cc | 28 ++++---- cpp/src/parquet/metadata.h | 3 - 5 files changed, 81 insertions(+), 125 deletions(-) diff 
--git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 4cc04256974..9492931bf3a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -103,17 +103,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { const ReaderProperties* properties() const override { return &properties_; } std::unique_ptr GetColumnPageReader(int i) override { - EncryptionAlgorithm algorithm; - if (file_crypto_metadata_) { - algorithm = file_crypto_metadata_->encryption_algorithm(); - } - else if (file_metadata_->is_plaintext_mode()) { - algorithm = file_metadata_->encryption_algorithm(); - } // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, properties_.file_decryption(), - &algorithm, file_decryptor_); + file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -156,26 +149,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad_column_meta_data = parquet_encryption::createModuleAAD( - file_decryptor_->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - (int16_t)i, (int16_t)-1); - - ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() - ? file_metadata_->encryption_algorithm().algorithm - : file_crypto_metadata_->encryption_algorithm().algorithm; // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { - const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() - ? 
file_metadata_->footer_signing_key_metadata() - : file_crypto_metadata_->key_metadata(); - - auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - footer_algorithm, footer_key_metadata, aad_column_meta_data); - auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - footer_algorithm, footer_key_metadata, aad_column_meta_data); + auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); + auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -190,11 +168,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, footer_algorithm, - column_key_metadata, aad_column_meta_data); + column_path, + column_key_metadata); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, footer_algorithm, - column_key_metadata, aad_column_meta_data); + column_path, + column_key_metadata); return PageReader::Open(stream, col->num_values(), col->compression(), @@ -326,16 +304,15 @@ class SerializedFile : public ParquetFileReader::Contents { std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(fileAAD); + file_decryptor_->algorithm(algo.algorithm); + file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); } - std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); - auto encryptor = file_decryptor_->GetFooterSigningEncryptor( - file_metadata_->encryption_algorithm().algorithm, - footer_key_metadata); + auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (! 
file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" @@ -404,7 +381,9 @@ class SerializedFile : public ParquetFileReader::Contents { std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; // save fileAAD for later use file_decryptor_->file_aad(fileAAD); - + file_decryptor_->algorithm(algo.algorithm); + file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; @@ -415,12 +394,7 @@ class SerializedFile : public ParquetFileReader::Contents { "Could not read footer metadata bytes."); } - // get footer key metadata - std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - - auto footer_decryptor = file_decryptor_->GetFooterDecryptor( - file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key_metadata); + auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d00b5a035eb..9f71ac03de6 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -55,20 +55,20 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* p InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) : properties_(properties) {} -std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) +std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) + return footer_signing_encryptor_; std::string footer_key = 
properties_->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) + if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->getKey(footer_key_metadata); + properties_->getKeyRetriever()->getKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -82,43 +82,39 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE std::string aad = parquet_encryption::createFooterAAD(file_aad_); - return std::make_shared( - algorithm, footer_key, file_aad_, aad); + footer_signing_encryptor_ = std::make_shared( + algorithm_, footer_key, file_aad_, aad); + return footer_signing_encryptor_; } -std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) { +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); + return GetFooterDecryptor(aad, true); } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad) { - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); + const std::string& aad) +{ + return GetFooterDecryptor(aad, true); } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad) { - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, false); + return GetFooterDecryptor(aad, false); } std::shared_ptr 
InternalFileDecryptor::GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad, bool metadata) { + if (footer_decryptor_ != NULLPTR) + return footer_decryptor_; std::string footer_key = properties_->getFooterKey(); if (footer_key.empty()) { - if (footer_key_metadata.empty()) + if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata); + footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -131,30 +127,28 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( } auto aes_decryptor = metadata - ? GetMetaAesDecryptor(algorithm, footer_key.size()) - : GetDataAesDecryptor(algorithm, footer_key.size()); - return std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + ? 
GetMetaAesDecryptor(footer_key.size()) + : GetDataAesDecryptor(footer_key.size()); + footer_decryptor_ = std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + return footer_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad) { - return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, true); + return GetColumnDecryptor(column_path, column_key_metadata, aad, true); } std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad) { - return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, false); + return GetColumnDecryptor(column_path, column_key_metadata, aad, false); } std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; @@ -189,30 +183,30 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } auto aes_decryptor = metadata - ? GetMetaAesDecryptor(algorithm, column_key.size()) - : GetDataAesDecryptor(algorithm, column_key.size()); + ? 
GetMetaAesDecryptor(column_key.size()) + : GetDataAesDecryptor(column_key.size()); return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( - ParquetCipher::type algorithm, size_t key_size) { + size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_256_.get(); } @@ -220,23 +214,23 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( - ParquetCipher::type algorithm, size_t key_size) { + size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_196_.reset(new 
parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 09508ce8870..6601e9a2b7c 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -51,44 +51,46 @@ class Decryptor { class InternalFileDecryptor { public: - explicit InternalFileDecryptor(FileDecryptionProperties* propperties); + explicit InternalFileDecryptor(FileDecryptionProperties* properties); void file_aad(const std::string& file_aad) { file_aad_ = file_aad; } std::string& file_aad() { return file_aad_; } - std::shared_ptr GetFooterSigningEncryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + void algorithm(ParquetCipher::type algorithm) { algorithm_ = algorithm; } + ParquetCipher::type algorithm() { return algorithm_; } - std::shared_ptr GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + void footer_key_metadata(const std::string& footer_key_metadata) { + footer_key_metadata_ = footer_key_metadata; + } + std::string& footer_key_metadata() { return footer_key_metadata_; } + + std::shared_ptr GetFooterSigningEncryptor(); + + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr GetFooterDecryptorForColumnData( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr 
GetColumnMetaDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr GetColumnDataDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, - const std::string& aad); + const std::string& aad = ""); private: FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - // A map between ColumnPath and their encryption keys: + // A map between ColumnPath and their encryption keys std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + ParquetCipher::type algorithm_; + std::string footer_key_metadata_; + std::shared_ptr footer_decryptor_; + std::shared_ptr footer_signing_encryptor_; std::unique_ptr meta_decryptor_128_; std::unique_ptr meta_decryptor_196_; @@ -98,19 +100,14 @@ class InternalFileDecryptor { std::unique_ptr data_decryptor_256_; std::shared_ptr GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad, bool metadata = false); - parquet_encryption::AesDecryptor* GetMetaAesDecryptor(ParquetCipher::type algorithm, - size_t key_size); - parquet_encryption::AesDecryptor* GetDataAesDecryptor(ParquetCipher::type algorithm, - size_t key_size); + parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8b8485e73e9..32c13b12ade 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -168,7 +168,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { int16_t 
column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -178,25 +177,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (file_decryption == NULLPTR) { - throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); + throw ParquetException("Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); } // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - - DCHECK(algorithm != NULLPTR); + DCHECK(file_decryptor != NULLPTR); - - std::string aad = parquet_encryption::createModuleAAD( + + std::string aad_column_metadata = parquet_encryption::createModuleAAD( file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); auto decryptor = file_decryptor->GetColumnMetaDecryptor( - path, algorithm->algorithm, - key_metadata, aad); + path, key_metadata, aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast( @@ -302,12 +300,11 @@ std::unique_ptr ColumnChunkMetaData::Make( int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, file_decryptor)); + file_decryption, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -317,7 +314,6 @@ 
ColumnChunkMetaData::ColumnChunkMetaData( int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), @@ -325,7 +321,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, + file_decryption, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -409,7 +405,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; @@ -420,7 +415,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, + writer_version_, file_decryption, file_decryptor); } @@ -454,9 +449,8 @@ const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema( std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) const { - return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - file_decryptor); + InternalFileDecryptor* file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, file_decryptor); } // file metadata diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index f4b14a397c8..a51b20374d1 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -129,7 +129,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { int16_t row_group_ordinal = -1, 
int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -162,7 +161,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -186,7 +184,6 @@ class PARQUET_EXPORT RowGroupMetaData { const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) const; private: From 701274543868701ce50810ebc9faa41dc789c885 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 08:21:27 +0300 Subject: [PATCH 044/125] Rename file_decryption to file_decryption_properties in properties.h --- .../low-level-api/encryption-reader-writer.cc | 2 +- cpp/src/parquet/file_reader.cc | 24 +++++++++---------- cpp/src/parquet/properties.h | 11 +++++---- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 666c1a07e76..9396202ed00 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -243,7 +243,7 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption(file_decryption_properties[example_id]); + 
reader_properties.file_decryption_properties(file_decryption_properties[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 9492931bf3a..c8daa2e5520 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -105,7 +105,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - properties_.file_decryption(), + properties_.file_decryption_properties(), file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && @@ -274,13 +274,13 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { - auto file_decryption = properties_.file_decryption(); - if (file_decryption == NULLPTR) { + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); - std::string aad_prefix = file_decryption->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->getAADPrefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); if (!algo.aad.aad_prefix.empty()) { @@ -292,7 +292,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); + file_decryption_properties->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } @@ 
-306,7 +306,7 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_->file_aad(fileAAD); file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); - if (file_decryption->checkFooterIntegrity()) { + if (file_decryption_properties->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); @@ -347,18 +347,18 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } } - auto file_decryption = properties_.file_decryption(); - if (file_decryption == nullptr) { + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == nullptr) { throw ParquetException("No decryption properties are provided. Could not read " "encrypted footer metadata"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - std::string aad_prefix = file_decryption->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->getAADPrefix(); if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { @@ -369,7 +369,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); + file_decryption_properties->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d60b1130e4a..93b6fd19a0b 100644 --- 
a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -63,17 +63,20 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - void file_decryption(const std::shared_ptr& decryption) { - file_decryption_ = decryption; + void file_decryption_properties( + const std::shared_ptr& decryption) { + file_decryption_properties_ = decryption; } - FileDecryptionProperties* file_decryption() { return file_decryption_.get(); } + FileDecryptionProperties* file_decryption_properties() { + return file_decryption_properties_.get(); + } private: ::arrow::MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; - std::shared_ptr file_decryption_; + std::shared_ptr file_decryption_properties_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From bbec428dd7ec398fa74d004a9433e90779f3bb66 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 08:35:09 +0300 Subject: [PATCH 045/125] Do not pass file_decryption as function parameter --- cpp/src/parquet/file_reader.cc | 1 - cpp/src/parquet/internal_file_decryptor.h | 2 ++ cpp/src/parquet/metadata.cc | 17 +++++------------ cpp/src/parquet/metadata.h | 3 --- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index c8daa2e5520..d0c8b57acaf 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -105,7 +105,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - properties_.file_decryption_properties(), file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 6601e9a2b7c..9646ebbb652 100644 --- 
a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -66,6 +66,8 @@ class InternalFileDecryptor { std::shared_ptr GetFooterSigningEncryptor(); + FileDecryptionProperties* properties() { return properties_; } + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta( const std::string& aad = ""); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 32c13b12ade..d765b69b8f7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -167,7 +167,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -176,7 +175,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { - if (file_decryption == NULLPTR) { + if (file_decryptor->properties() == NULLPTR) { throw ParquetException("Cannot decrypt ColumnMetadata. 
" "FileDecryptionProperties must be provided."); } @@ -299,12 +298,11 @@ std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, file_decryptor)); + file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -313,7 +311,6 @@ ColumnChunkMetaData::ColumnChunkMetaData( int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), @@ -321,7 +318,6 @@ ColumnChunkMetaData::ColumnChunkMetaData( row_group_ordinal, column_ordinal, writer_version, - file_decryption, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -404,7 +400,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; @@ -415,8 +410,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, - file_decryptor); + writer_version_, file_decryptor); } private: @@ -448,9 +442,8 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } std::unique_ptr RowGroupMetaData::ColumnChunk( - int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - 
InternalFileDecryptor* file_decryptor) const { - return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, file_decryptor); + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } // file metadata diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index a51b20374d1..1f1916fe0dd 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -128,7 +128,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -160,7 +159,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -183,7 +181,6 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) const; private: From a73ebf14cb4a715fe99042800c937585dd78725b Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 8 May 2019 10:27:44 +0300 Subject: [PATCH 046/125] Rename is_plaintext_mode to is_encryption_algorithm_set --- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 6 +++--- cpp/src/parquet/metadata.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index d0c8b57acaf..eff0f24cb3b 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -272,7 +272,7 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); - if (file_metadata_->is_plaintext_mode()) { + if (file_metadata_->is_encryption_algorithm_set()) { auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d765b69b8f7..d5f6e851cfc 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -503,7 +503,7 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } - inline bool is_plaintext_mode() const { return metadata_->__isset.encryption_algorithm; } + inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } inline EncryptionAlgorithm encryption_algorithm() { return FromThrift(metadata_->encryption_algorithm); } @@ -516,7 +516,7 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; - if (is_plaintext_mode()) { + if (is_encryption_algorithm_set()) { uint8_t* serialized_data; uint32_t serialized_len; serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); @@ -655,7 +655,7 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } -bool FileMetaData::is_plaintext_mode() const { return impl_->is_plaintext_mode(); } +bool FileMetaData::is_encryption_algorithm_set() const { return 
impl_->is_encryption_algorithm_set(); } EncryptionAlgorithm FileMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 1f1916fe0dd..d42016627d1 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -209,7 +209,7 @@ class PARQUET_EXPORT FileMetaData { int num_columns() const; int64_t num_rows() const; int num_row_groups() const; - bool is_plaintext_mode() const; + bool is_encryption_algorithm_set() const; EncryptionAlgorithm encryption_algorithm() const; const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; From 9b0dca715acd5830be95bcb66fa6ab84ccec050d Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 9 May 2019 13:56:39 +0700 Subject: [PATCH 047/125] fix function naming --- .../low-level-api/encryption-reader-writer.cc | 22 ++-- cpp/src/parquet/column_reader.cc | 14 +-- cpp/src/parquet/column_writer.cc | 10 +- cpp/src/parquet/encryption.cc | 8 +- cpp/src/parquet/encryption.h | 10 +- cpp/src/parquet/encryption_properties.cc | 80 +++++++------ cpp/src/parquet/encryption_properties.h | 112 +++++++++--------- cpp/src/parquet/file_reader.cc | 14 +-- cpp/src/parquet/file_writer.cc | 10 +- cpp/src/parquet/internal_file_decryptor.cc | 18 +-- cpp/src/parquet/internal_file_decryptor.h | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 32 ++--- cpp/src/parquet/internal_file_encryptor.h | 2 +- cpp/src/parquet/metadata.cc | 22 ++-- cpp/src/parquet/properties.h | 10 +- 15 files changed, 189 insertions(+), 177 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 9396202ed00..e0d44d60667 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -55,7 +55,7 @@ int main(int argc, char** argv) { // uniform encryption 
parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; - decryption_properties_builder_1.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); // non-uniform with column keys std::map, @@ -63,7 +63,7 @@ int main(int argc, char** argv) { parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_col_builder_0.withKey(COLUMN_ENCRYPTION_KEY); + encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder_0.build(); encryption_cols[path_ptr] = encryption_col0; @@ -72,29 +72,29 @@ int main(int argc, char** argv) { std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_col_builder2.withKey(COLUMN_ENCRYPTION_KEY); + decryption_col_builder2.key(COLUMN_ENCRYPTION_KEY); decryption_cols[path_ptr] = decryption_col_builder2.build(); - file_encryption_builder_2.withEncryptedColumns(encryption_cols); + file_encryption_builder_2.column_properties(encryption_cols); parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; - decryption_properties_builder_2.withFooterKey(FOOTER_ENCRYPTION_KEY); - decryption_properties_builder_2.withColumnKeys(decryption_cols); + decryption_properties_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_2.column_properties(decryption_cols); // plain mode footer = unencrypted footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_3.withPlaintextFooter(); + file_encryption_builder_3.enable_plaintext_footer(); parquet::FileDecryptionProperties::Builder 
decryption_properties_builder_3; - decryption_properties_builder_3.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_3.footer_key(FOOTER_ENCRYPTION_KEY); // plaintext mode footer, hidden column parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_4.withPlaintextFooter(); - file_encryption_builder_4.withEncryptedColumns(encryption_cols); // reusing encryption_cols + file_encryption_builder_4.enable_plaintext_footer(); + file_encryption_builder_4.column_properties(encryption_cols); // reusing encryption_cols parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; - decryption_properties_builder_4.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_4.footer_key(FOOTER_ENCRYPTION_KEY); file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 708d5665f7e..97e7bc8c4e4 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -130,18 +130,18 @@ class SerializedPageReader : public PageReader { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); if (data_decryptor_ != NULLPTR) { - DCHECK (!data_decryptor_->fileAAD().empty()); + DCHECK (!data_decryptor_->file_aad().empty()); //prepare the AAD for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->fileAAD(), + data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { - DCHECK (!meta_decryptor_->fileAAD().empty()); + DCHECK (!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->fileAAD(), + meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, row_group_ordinal_, 
column_ordinal_, (int16_t)-1); @@ -221,7 +221,7 @@ std::shared_ptr SerializedPageReader::NextPage() { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - meta_decryptor_->fileAAD(), + meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -253,10 +253,10 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; if (data_decryptor_ != NULLPTR){ - DCHECK(!data_decryptor_->fileAAD().empty()); + DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary){ aad = parquet_encryption::createModuleAAD( - data_decryptor_->fileAAD(), + data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fee0db9f1b7..aff2cc86c14 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -149,14 +149,14 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_ != NULLPTR) { //prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->fileAAD(), + data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->fileAAD(), + meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -188,7 +188,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { data_encryptor_->aad( - parquet_encryption::createModuleAAD(data_encryptor_->fileAAD(), + 
parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1)); @@ -214,7 +214,7 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); @@ -237,7 +237,7 @@ class SerializedPageWriter : public PageWriter { fallback); if (meta_encryptor_ != nullptr){ meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 54297253856..0a2d9ef939a 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,11 +22,11 @@ namespace parquet { // integer key retriever -void IntegerKeyIdRetriever::putKey(uint32_t key_id, const std::string& key) { +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata) { +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { uint32_t key_id; memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); @@ -34,11 +34,11 @@ const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata } // string key retriever -void StringKeyIdRetriever::putKey(const std::string& key_id, const std::string& key) { +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& StringKeyIdRetriever::getKey(const std::string& key_id) { +const 
std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 60b7b8c3300..3a4481bd4aa 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -28,15 +28,15 @@ namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual const std::string& getKey(const std::string& key_metadata) = 0; + virtual const std::string& GetKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void putKey(uint32_t key_id, const std::string& key); - const std::string& getKey(const std::string& key_metadata); + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; @@ -45,8 +45,8 @@ class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { public: - void putKey(const std::string& key_id, const std::string& key); - const std::string& getKey(const std::string& key_metadata); + void PutKey(const std::string& key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index ba893ce1816..36cccdb672d 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -23,16 +23,17 @@ namespace parquet { -ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::withKeyID( +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t *data = 
reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("key id should be in UTF8 encoding"); + } DCHECK(!key_id.empty()); - this->withKeyMetaData(key_id); + this->key_metadata(key_id); return this; } @@ -40,17 +41,20 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, - const std::string& key_metadata):column_path_(column_path){ + const std::string& key_metadata):column_path_(column_path) { DCHECK(column_path != nullptr); - if (!encrypted) + if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); + } - if (!key.empty()) + if (!key.empty()) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) + if (encrypted_with_footer_key_){ DCHECK(key_metadata.empty()); + } encrypted_ = encrypted; key_metadata_ = key_metadata; @@ -58,22 +62,24 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); - if (!key.empty()) + if (!key.empty()) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } key_ = key; } -const std::string& FileDecryptionProperties::getColumnKey( +const std::string& FileDecryptionProperties::column_key( const std::shared_ptr& column_path) { - if (column_property_map_.find(column_path) != column_property_map_.end()) { - auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); + if (column_properties_.find(column_path) != column_properties_.end()) { + auto column_prop = 
column_properties_[column_path]; + if (column_prop != nullptr) { + return column_prop->key(); + } } return NULL_STRING; } @@ -86,47 +92,52 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { + schema::ColumnPath::CmpColumnPath> column_properties) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || - 0 != column_property_map.size()); + 0 != column_properties.size()); - if (!footer_key.empty()) + if (!footer_key.empty()) { DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); - if (footer_key.empty() && check_plaintext_footer_integrity) + } + if (footer_key.empty() && check_plaintext_footer_integrity) { DCHECK(NULLPTR != key_retriever); + } aad_prefix_verifier_ = aad_prefix_verifier; footer_key_ = footer_key; check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; key_retriever_ = key_retriever; aad_prefix_ = aad_prefix; - column_property_map_ = column_property_map; + column_properties_ = column_properties; } -FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::withFooterKeyID( +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("footer key id should be in UTF8 encoding"); + } - if (key_id.empty()) + if (key_id.empty()) { return this; + } - return withFooterKeyMetadata(key_id); + return footer_key_metadata(key_id); } -std::shared_ptr FileEncryptionProperties::getColumnProperties( +std::shared_ptr FileEncryptionProperties::column_properties( const std::shared_ptr& column_path) { - if (column_property_map_.size () == 0){ + if 
(column_properties_.size () == 0) { auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); + new ColumnEncryptionProperties::Builder(column_path)); return builder->build(); } - if (column_property_map_.find(column_path) != column_property_map_.end()) - return column_property_map_[column_path]; + if (column_properties_.find(column_path) != column_properties_.end()) { + return column_properties_[column_path]; + } return NULLPTR; } @@ -140,11 +151,11 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - column_property_map) + column_properties) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ + column_properties_(column_properties) { DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 @@ -159,10 +170,11 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, AAD_FILE_UNIQUE_LENGTH) ; bool supply_aad_prefix = false; - if (aad_prefix.empty()) - file_AAD_ = aad_file_unique_str; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { - file_AAD_ = aad_prefix + aad_file_unique_str; + file_aad_ = aad_prefix + aad_file_unique_str; if (!store_aad_prefix_in_file) supply_aad_prefix = true; } algorithm_.algorithm = cipher; diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index d2e4246e8a6..738a762d0c6 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -56,7 +56,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { + Builder* key(const std::string& key) { if (key.empty ()) return this; @@ -67,7 +67,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Set a key retrieval metadata. // use either withKeyMetaData or withKeyID, not both - Builder* withKeyMetaData(const std::string& key_metadata) { + Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); key_metadata_ = key_metadata; @@ -77,7 +77,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Set a key retrieval metadata (converted from String). // use either withKeyMetaData or withKeyID, not both // key_id will be converted to metadata (UTF-8 array). - Builder* withKeyID(std::string key_id); + Builder* key_id(std::string key_id); std::shared_ptr build() { return std::shared_ptr( @@ -97,11 +97,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { : column_path_(path), encrypted_(encrypted) {} }; - const std::shared_ptr& getPath() { return column_path_; } - bool isEncrypted() const { return encrypted_; } - bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } - const std::string& getKey() const { return key_; } - const std::string& getKeyMetaData() const { return key_metadata_; } + const std::shared_ptr& column_path() { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; @@ -136,7 +136,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // key metadata for this column the metadata will be ignored, // the column will be decrypted with this key. // key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { + Builder* key(const std::string& key) { if (key.empty ()) return this; @@ -159,8 +159,8 @@ class PARQUET_EXPORT ColumnDecryptionProperties { ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - const std::shared_ptr& getPath() { return column_path_; } - const std::string& getKey() const { return key_; } + const std::shared_ptr& column_path() { return column_path_; } + const std::string& key() const { return key_; } private: const std::shared_ptr column_path_; @@ -188,7 +188,7 @@ class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { public: - Builder(){ + Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } @@ -198,7 +198,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* withFooterKey(std::string footer_key) { + Builder* footer_key(std::string footer_key) { if (footer_key.empty ()) { return this; } @@ -213,17 +213,17 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* withColumnKeys(const std::map, + Builder* column_properties(const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& column_properties) { if (column_properties.size () == 0) return this; - if (column_property_map_.size () != 0) + if (column_properties_.size () != 0) throw ParquetException("Column properties already set"); - column_property_map_ = column_properties; + column_properties_ = column_properties; return this; } @@ -233,7 +233,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. 
// If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* withKeyRetriever(const std::shared_ptr& + Builder* key_retriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) return this; @@ -249,7 +249,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // - footer signing key is not available // (not passed, or not found by key retriever) // - footer content and signature don't match - Builder* withoutFooterSignatureVerification() { + Builder* disable_footer_signature_verification() { check_plaintext_footer_integrity_ = false; return this; } @@ -258,7 +258,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. - Builder* withAADPrefix(std::string aad_prefix) { + Builder* aad_prefix(std::string aad_prefix) { if (aad_prefix.empty()) { return this; } @@ -268,7 +268,7 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. 
- Builder* withAADPrefixVerifier( + Builder* aad_prefix_verifier( std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -285,7 +285,7 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_, aad_prefix_, aad_prefix_verifier_, - column_property_map_)); + column_properties_)); } private: @@ -295,29 +295,29 @@ class PARQUET_EXPORT FileDecryptionProperties { std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; }; - const std::string& getColumnKey( + const std::string& column_key( const std::shared_ptr& column_path); - const std::string& getFooterKey() { + const std::string& footer_key() { return footer_key_; } - const std::string& getAADPrefix() { return aad_prefix_; } - std::shared_ptr getKeyRetriever() { + const std::string& aad_prefix() { return aad_prefix_; } + std::shared_ptr key_retriever() { return key_retriever_; } - bool checkFooterIntegrity() { + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } - const std::shared_ptr &getAADPrefixVerifier() { + const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; } @@ -328,7 +328,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -341,7 +341,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map); + schema::ColumnPath::CmpColumnPath> column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { @@ -357,25 +357,25 @@ class PARQUET_EXPORT FileEncryptionProperties { // Create files with plaintext footer. 
// If not called, the files will be created with encrypted footer (default). - Builder* withPlaintextFooter() { + Builder* enable_plaintext_footer() { encrypted_footer_ = false; return this; } // Set encryption algorithm. // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { + Builder* algorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } // Set a key retrieval metadata (converted from String). - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyID(std::string key_id); + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(std::string key_id); // Set a key retrieval metadata. - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_metadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; @@ -385,7 +385,7 @@ class PARQUET_EXPORT FileEncryptionProperties { } // Set the file AAD Prefix. - Builder* withAADPrefix(const std::string& aad_prefix) { + Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; @@ -397,7 +397,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. - Builder* withoutAADPrefixStorage() { + Builder* disable_store_aad_prefix_storage() { DCHECK(!aad_prefix_.empty()); store_aad_prefix_in_file_ = false; @@ -407,18 +407,18 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the list of encrypted columns and their properties (keys etc). // If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. 
- Builder* withEncryptedColumns( + Builder* column_properties( const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ - if (encryptedColumns.size () == 0) + column_properties){ + if (column_properties.size () == 0) return this; - if (column_property_map_.size () != 0) + if (column_properties_.size () != 0) throw ParquetException("Column properties already set"); - column_property_map_ = encryptedColumns; + column_properties_ = column_properties; return this; } @@ -430,7 +430,7 @@ class PARQUET_EXPORT FileEncryptionProperties { encrypted_footer_, aad_prefix_, store_aad_prefix_in_file_, - column_property_map_)); + column_properties_)); } private: @@ -443,33 +443,33 @@ class PARQUET_EXPORT FileEncryptionProperties { bool store_aad_prefix_in_file_; std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; }; - bool encryptedFooter() const { return encrypted_footer_; } + bool encrypted_footer() const { return encrypted_footer_; } - const EncryptionAlgorithm getAlgorithm() { + const EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& getFooterEncryptionKey() { + const std::string& footer_encryption_key() { return (encrypted_footer_? footer_key_ : NULL_STRING); } - const std::string& getFooterEncryptionKeyMetadata() { + const std::string& footer_encryption_key_metadata() { return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); } - const std::string& getFooterSigningKey() { + const std::string& footer_signing_key() { return (encrypted_footer_? NULL_STRING : footer_key_); } - const std::string& getFooterSigningKeyMetadata() { + const std::string& footer_signing_key_metadata() { return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); } - const std::string& getFileAAD() const { return file_AAD_; } + const std::string& file_aad() const { return file_aad_; } - std::shared_ptr getColumnProperties( + std::shared_ptr column_properties( const std::shared_ptr& column_path); private: @@ -477,11 +477,11 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; - std::string file_AAD_; + std::string file_aad_; std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; FileEncryptionProperties(ParquetCipher::type cipher, std::string footer_key, @@ -492,7 +492,7 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - column_property_map); + column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index eff0f24cb3b..3bc4fd41304 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -279,7 +279,7 @@ class SerializedFile : public ParquetFileReader::Contents { } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); - std::string aad_prefix = file_decryption_properties->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); if (!algo.aad.aad_prefix.empty()) { @@ -291,7 +291,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->getAADPrefixVerifier(); + file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } @@ -300,12 +300,12 @@ class SerializedFile : public ParquetFileReader::Contents { "but not stored in file and not supplied " "in decryption properties"); 
} - std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; + std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_->file_aad(fileAAD); + file_decryptor_->file_aad(file_aad); file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); - if (file_decryption_properties->checkFooterIntegrity()) { + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); @@ -357,7 +357,7 @@ class SerializedFile : public ParquetFileReader::Contents { FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - std::string aad_prefix = file_decryption_properties->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->aad_prefix(); if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { @@ -368,7 +368,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->getAADPrefixVerifier(); + file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 2d1f0784d3b..d0ddc436b86 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -294,9 +294,9 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*metadata, sink_.get()); + WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encryptedFooter()) { + if (file_encryption->encrypted_footer()) { // encrypted footer 
file_metadata_ = metadata_->Finish(); @@ -312,7 +312,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->getAlgorithm(); + EncryptionAlgorithm algo = file_encryption->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) @@ -320,7 +320,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( &signing_encryption, - file_encryption->getFooterSigningKeyMetadata ()); + file_encryption->footer_signing_key_metadata ()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } @@ -398,7 +398,7 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); } else { file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); - if (file_encryption->encryptedFooter()) { + if (file_encryption->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); } else { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 9f71ac03de6..5082cbe8839 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -59,16 +59,16 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE { if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; - std::string footer_key = properties_->getFooterKey(); + std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); - if 
(properties_->getKeyRetriever() == nullptr) + if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->getKey(footer_key_metadata_); + properties_->key_retriever()->GetKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -107,14 +107,14 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { if (footer_decryptor_ != NULLPTR) return footer_decryptor_; - std::string footer_key = properties_->getFooterKey(); + std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); - if (properties_->getKeyRetriever() == nullptr) + if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata_); + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -157,12 +157,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( && column_map_->find(column_path) != column_map_->end()) { column_key = column_map_->at(column_path); } else { - column_key = properties_->getColumnKey(column_path); + column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - properties_->getKeyRetriever() != nullptr){ + properties_->key_retriever() != nullptr){ try { - column_key = properties_->getKeyRetriever()->getKey(column_key_metadata); + column_key = properties_->key_retriever()->GetKey(column_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "HiddenColumnException, path=" + diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 9646ebbb652..41740dc182f 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -36,7 +36,7 @@ class Decryptor { const std::string& key, const std::string& file_aad, const std::string& aad); - const std::string& fileAAD() const { return file_aad_; } + const std::string& file_aad() const { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 7f2195cc20b..fcd224339a8 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -33,23 +33,23 @@ InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* propertie : properties_(properties) {} std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); - std::string footer_key = properties_->getFooterEncryptionKey(); + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); return std::make_shared(aes_encryptor, footer_key, - properties_->getFileAAD(), aad); + 
properties_->file_aad(), aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); - std::string footer_signing_key = properties_->getFooterSigningKey(); + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); return std::make_shared(aes_encryptor, footer_signing_key, - properties_->getFileAAD(), aad); + properties_->file_aad(), aad); } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ -65,29 +65,29 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { - auto column_prop = properties_->getColumnProperties(column_path); + auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; } std::string key; - if (column_prop->isEncryptedWithFooterKey()) { - if (properties_->encryptedFooter()) { - key = properties_->getFooterEncryptionKey(); + if (column_prop->is_encrypted_with_footer_key()) { + if (properties_->encrypted_footer()) { + key = properties_->footer_encryption_key(); } else { - key = properties_->getFooterSigningKey(); + key = properties_->footer_signing_key(); } } else { - key = column_prop->getKey(); + key = column_prop->key(); } - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + ParquetCipher::type algorithm = properties_->algorithm().algorithm; auto aes_encryptor = metadata ? 
GetMetaAesEncryptor(algorithm, key.size()) : GetDataAesEncryptor(algorithm, key.size()); - std::string file_aad = properties_->getFileAAD(); + std::string file_aad = properties_->file_aad(); // TODO: aad return std::make_shared(aes_encryptor, key, file_aad, ""); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 2e3a3df0408..75c167b5339 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -14,7 +14,7 @@ class Encryptor { Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); - const std::string& fileAAD() { return file_aad_; } + const std::string& file_aad() { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d5f6e851cfc..044932b89de 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -932,7 +932,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const auto& encrypt_md = properties_->column_encryption_props(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->isEncrypted()) { + if (!encrypt_md || !encrypt_md->is_encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); @@ -942,12 +942,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypted with footer key format::ColumnCryptoMetaData ccmd; - if (encrypt_md->isEncryptedWithFooterKey()) { + if (encrypt_md->is_encrypted_with_footer_key()) { ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key format::EncryptionWithColumnKey eck; - eck.__set_key_metadata(encrypt_md->getKeyMetaData()); + eck.__set_key_metadata(encrypt_md->key_metadata()); 
eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); @@ -955,12 +955,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__set_crypto_metadata(ccmd); // TODO: check file_encryption() is null or not - auto footer_key = properties_->file_encryption()->getFooterEncryptionKey(); + auto footer_key = properties_->file_encryption()->footer_encryption_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_key.empty() && encrypt_md->isEncrypted()) || - !encrypt_md->isEncryptedWithFooterKey()) { + if ((footer_key.empty() && encrypt_md->is_encrypted()) || + !encrypt_md->is_encrypted_with_footer_key()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; @@ -1208,7 +1208,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption() != nullptr - && props->file_encryption()->getFooterSigningKey() == NULL_STRING) { + && props->file_encryption()->footer_signing_key() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1291,12 +1291,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption = properties_->file_encryption(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->algorithm())); std::string key_metadata; - if (file_encryption->encryptedFooter()) - key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); + if (file_encryption->encrypted_footer()) + key_metadata = file_encryption->footer_encryption_key_metadata(); else - key_metadata = 
file_encryption->getFooterSigningKeyMetadata(); + key_metadata = file_encryption->footer_signing_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 93b6fd19a0b..585700a0497 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -371,7 +371,7 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } inline FileEncryptionProperties* file_encryption() const { - return parquet_file_encryption_.get(); + return file_encryption_.get(); } inline Encoding::type dictionary_index_encoding() const { @@ -419,8 +419,8 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr column_encryption_props(const std::shared_ptr& path) const { - if (parquet_file_encryption_) { - return parquet_file_encryption_->getColumnProperties(path); + if (file_encryption_) { + return file_encryption_->column_properties(path); } else { return NULLPTR; } @@ -441,7 +441,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), - parquet_file_encryption_(file_encryption), + file_encryption_(file_encryption), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -452,7 +452,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - std::shared_ptr parquet_file_encryption_; + std::shared_ptr file_encryption_; ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; From 250e1ed883322dbfc33be710614f0451b97e18cf Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 9 May 2019 14:15:11 +0700 Subject: [PATCH 048/125] fix const& --- cpp/src/parquet/encryption_properties.cc | 14 ++++++------ cpp/src/parquet/encryption_properties.h | 28 ++++++++++++------------ 2 files changed, 21 
insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 36cccdb672d..258fe642efb 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -24,7 +24,7 @@ namespace parquet { ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( - std::string key_id) { + const std::string& key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t *data = reinterpret_cast(key_id.c_str()); @@ -88,11 +88,11 @@ FileDecryptionProperties::FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, - std::string aad_prefix, + const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - std::map, + const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties) { + schema::ColumnPath::CmpColumnPath>& column_properties) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -113,7 +113,7 @@ FileDecryptionProperties::FileDecryptionProperties( } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( - std::string key_id) { + const std::string& key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); @@ -143,8 +143,8 @@ std::shared_ptr FileEncryptionProperties::column_pro } FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, + const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index 738a762d0c6..e1b4221ae68 100644 --- a/cpp/src/parquet/encryption_properties.h 
+++ b/cpp/src/parquet/encryption_properties.h @@ -44,7 +44,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class Builder { public: // Convenience builder for regular (not nested) columns. - Builder(const std::string name) { + Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } @@ -66,7 +66,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Set a key retrieval metadata. - // use either withKeyMetaData or withKeyID, not both + // use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); @@ -75,9 +75,9 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Set a key retrieval metadata (converted from String). - // use either withKeyMetaData or withKeyID, not both + // use either key_metadata() or key_id(), not both // key_id will be converted to metadata (UTF-8 array). - Builder* key_id(std::string key_id); + Builder* key_id(const std::string& key_id); std::shared_ptr build() { return std::shared_ptr( @@ -125,7 +125,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class Builder { public: // convenience builder for regular (not nested) columns. - Builder(const std::string name) { + Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name)); } @@ -181,7 +181,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // Throws exception if an AAD prefix is wrong. // In a data set, AAD Prefixes should be collected, // and then checked for missing files. - virtual void check(std::string aad_prefix) = 0; + virtual void check(const std::string& aad_prefix) = 0; }; class PARQUET_EXPORT FileDecryptionProperties { @@ -198,7 +198,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. 
- Builder* footer_key(std::string footer_key) { + Builder* footer_key(const std::string& footer_key) { if (footer_key.empty ()) { return this; } @@ -258,7 +258,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. - Builder* aad_prefix(std::string aad_prefix) { + Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) { return this; } @@ -337,11 +337,11 @@ class PARQUET_EXPORT FileDecryptionProperties { const std::string& footer_key, const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, - std::string aad_prefix, + const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - std::map, + const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties); + schema::ColumnPath::CmpColumnPath>& column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { @@ -371,7 +371,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set a key retrieval metadata (converted from String). // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_id(std::string key_id); + Builder* footer_key_id(const std::string& key_id); // Set a key retrieval metadata. // use either footer_key_metadata or footer_key_id, not both. 
@@ -484,8 +484,8 @@ class PARQUET_EXPORT FileEncryptionProperties { schema::ColumnPath::CmpColumnPath> column_properties_; FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, + const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, From a0385a546f33b4ccefd2158f0ca4878222a789a2 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 10 May 2019 19:00:35 +0700 Subject: [PATCH 049/125] make format --- cpp/src/parquet/column_reader.cc | 65 +++---- cpp/src/parquet/column_writer-test.cc | 10 +- cpp/src/parquet/column_writer.cc | 70 ++++---- cpp/src/parquet/encryption.cc | 3 +- cpp/src/parquet/encryption.h | 12 +- cpp/src/parquet/encryption_properties.cc | 70 ++++---- cpp/src/parquet/encryption_properties.h | 199 +++++++++------------ cpp/src/parquet/file_reader.cc | 112 ++++++------ cpp/src/parquet/file_writer.cc | 69 ++++--- cpp/src/parquet/internal_file_decryptor.cc | 164 +++++++++-------- cpp/src/parquet/internal_file_decryptor.h | 63 ++++--- cpp/src/parquet/internal_file_encryptor.cc | 94 +++++----- cpp/src/parquet/internal_file_encryptor.h | 42 +++-- cpp/src/parquet/metadata.cc | 187 +++++++++---------- cpp/src/parquet/metadata.h | 32 ++-- cpp/src/parquet/parquet.pc | 30 ++++ cpp/src/parquet/parquet_version.h | 24 +++ cpp/src/parquet/properties.h | 9 +- cpp/src/parquet/thrift.h | 32 ++-- 19 files changed, 652 insertions(+), 635 deletions(-) create mode 100644 cpp/src/parquet/parquet.pc create mode 100644 cpp/src/parquet/parquet_version.h diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 97e7bc8c4e4..94e9279fd03 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/buffer.h" #include "arrow/util/bit-stream-utils.h" @@ -118,7 +119,7 @@ class SerializedPageReader : public 
PageReader { : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), - column_has_dictionary_ (column_has_dictionary), + column_has_dictionary_(column_has_dictionary), row_group_ordinal_(row_group_ordinal), column_ordinal_(column_ordinal), page_ordinal_(-1), @@ -130,21 +131,17 @@ class SerializedPageReader : public PageReader { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); if (data_decryptor_ != NULLPTR) { - DCHECK (!data_decryptor_->file_aad().empty()); - //prepare the AAD for quick update later + DCHECK(!data_decryptor_->file_aad().empty()); + // prepare the AAD for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), - parquet_encryption::DataPage, - row_group_ordinal_, + data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { - DCHECK (!meta_decryptor_->file_aad().empty()); + DCHECK(!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } } @@ -190,15 +187,16 @@ std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with bool current_page_is_dictionary = false; - if (column_has_dictionary_ ){ + if (column_has_dictionary_) { if (first_page_) { current_page_is_dictionary = true; first_page_ = false; - } else + } else { page_ordinal_++; - } else + } + } else { page_ordinal_++; - + } while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; @@ -221,14 +219,11 @@ std::shared_ptr SerializedPageReader::NextPage() { if (meta_decryptor_ != NULLPTR) { 
if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); meta_decryptor_->aad(aad); } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_decryptor_->aad(data_page_headerAAD_); } } @@ -252,14 +247,12 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - if (data_decryptor_ != NULLPTR){ + if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); - if (current_page_is_dictionary){ + if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); data_decryptor_->aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); @@ -280,8 +273,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Decrypt it if we need to if (data_decryptor_ != nullptr) { decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); - compressed_len = data_decryptor_->Decrypt( - buffer, compressed_len, decryption_buffer_->mutable_data()); + compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, + decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } @@ -352,15 +345,11 @@ std::shared_ptr SerializedPageReader::NextPage() { return std::shared_ptr(nullptr); } -std::unique_ptr PageReader::Open(const std::shared_ptr& 
stream, - int64_t total_num_rows, - Compression::type codec, - bool column_has_dictionary, - int16_t row_group_ordinal, - int16_t column_ordinal, - ::arrow::MemoryPool* pool, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) { +std::unique_ptr PageReader::Open( + const std::shared_ptr& stream, int64_t total_num_rows, + Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, + int16_t column_ordinal, ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index 63179beab30..95593b72d0b 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -107,7 +107,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); std::unique_ptr pager = - PageWriter::Open(sink_, column_properties.compression(), nullptr, metadata_.get()); + PageWriter::Open(sink_, column_properties.compression(), metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); @@ -244,8 +244,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = ColumnChunkMetaData::Make( + metadata_->contents(), this->descr_, -1, -1, &app_version); return 
metadata_accessor->is_stats_set(); } @@ -254,8 +254,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); + auto metadata_accessor = ColumnChunkMetaData::Make( + metadata_->contents(), this->descr_, -1, -1, &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); return {encoded_stats.has_min, encoded_stats.has_max}; } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index aff2cc86c14..9733df12a52 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "arrow/buffer-builder.h" @@ -30,10 +31,10 @@ #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" @@ -128,7 +129,7 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata,int16_t row_group_ordinal, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, @@ -145,21 +146,17 @@ class SerializedPageWriter : public PageWriter { row_group_ordinal_(row_group_ordinal), column_ordinal_(column_chunk_ordinal), 
meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor){ + data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { - //prepare the add for quick update later + // prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->file_aad(), - parquet_encryption::DataPage, - row_group_ordinal_, + data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -213,13 +210,12 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1)); } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -235,12 +231,10 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - if (meta_encryptor_ != nullptr){ - 
meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + if (meta_encryptor_ != nullptr) { + meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, + row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), meta_encryptor_); @@ -286,10 +280,10 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_encryptor_->aad(data_pageAAD_); - encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); - output_data_len = data_encryptor_->Encrypt( - compressed_data->data(), output_data_len, - encrypted_data_buffer->mutable_data()); + encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + + output_data_len); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } @@ -307,11 +301,11 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_encryptor_->aad(data_page_headerAAD_); } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -355,7 +349,7 @@ class SerializedPageWriter : public PageWriter { // Compression codec to use. 
std::unique_ptr<::arrow::util::Codec> compressor_; - + std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; }; @@ -425,15 +419,13 @@ std::unique_ptr PageWriter::Open( std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor) { if (buffered_row_group) { - return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, - row_group_ordinal, column_chunk_ordinal, - pool, meta_encryptor, data_encryptor)); + return std::unique_ptr(new BufferedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } else { - return std::unique_ptr( - new SerializedPageWriter(sink, codec, metadata, - row_group_ordinal, column_chunk_ordinal, - pool, meta_encryptor, data_encryptor)); + return std::unique_ptr(new SerializedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } } diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 0a2d9ef939a..cc8501e8891 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-#include "encryption.h" +#include "parquet/encryption.h" #include +#include namespace parquet { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3a4481bd4aa..aff37839c8c 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -54,20 +54,20 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { class PARQUET_EXPORT HiddenColumnException : public ParquetException { public: - HiddenColumnException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit HiddenColumnException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { public: - KeyAccessDeniedException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit KeyAccessDeniedException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { public: - UnsupportedOperationException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit UnsupportedOperationException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; } // namespace parquet diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 258fe642efb..f2385819470 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -18,6 +18,7 @@ #include "parquet/encryption_properties.h" #include +#include #include "arrow/util/utf8.h" @@ -25,9 +26,9 @@ namespace parquet { ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( const std::string& key_id) { - //key_id is expected to be in UTF8 encoding + // key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - const uint8_t *data = reinterpret_cast(key_id.c_str()); + const uint8_t* data = 
reinterpret_cast(key_id.c_str()); if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("key id should be in UTF8 encoding"); } @@ -38,10 +39,9 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id } ColumnEncryptionProperties::ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path) { + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata) + : column_path_(column_path) { DCHECK(column_path != nullptr); if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); @@ -52,7 +52,7 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_){ + if (encrypted_with_footer_key_) { DCHECK(key_metadata.empty()); } @@ -62,8 +62,8 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + const std::shared_ptr& column_path, const std::string& key) + : column_path_(column_path) { DCHECK(column_path != nullptr); if (!key.empty()) { @@ -87,14 +87,12 @@ const std::string& FileDecryptionProperties::column_key( FileDecryptionProperties::FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - const std::string& aad_prefix, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - DCHECK(!footer_key.empty() || - NULLPTR != key_retriever || + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != 
column_properties.size()); if (!footer_key.empty()) { @@ -114,7 +112,7 @@ FileDecryptionProperties::FileDecryptionProperties( FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( const std::string& key_id) { - //key_id is expected to be in UTF8 encoding + // key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); if (!::arrow::util::ValidateUTF8(data, key_id.size())) { @@ -130,7 +128,7 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key std::shared_ptr FileEncryptionProperties::column_properties( const std::shared_ptr& column_path) { - if (column_properties_.size () == 0) { + if (column_properties_.size() == 0) { auto builder = std::shared_ptr( new ColumnEncryptionProperties::Builder(column_path)); return builder->build(); @@ -142,38 +140,32 @@ std::shared_ptr FileEncryptionProperties::column_pro return NULLPTR; } -FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, - const std::string& footer_key, - const std::string& footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) -: footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_properties_(column_properties) { +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_properties_(column_properties) { DCHECK(!footer_key.empty()); // footer_key 
must be either 16, 24 or 32 bytes. - DCHECK(footer_key.length() == 16 - || footer_key.length() == 24 - || footer_key.length() == 32); + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str( - reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH); bool supply_aad_prefix = false; if (aad_prefix.empty()) { file_aad_ = aad_file_unique_str; - } - else { + } else { file_aad_ = aad_prefix + aad_file_unique_str; if (!store_aad_prefix_in_file) supply_aad_prefix = true; } diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index e1b4221ae68..5848ad3fa68 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -18,22 +18,23 @@ #ifndef PARQUET_ENCRYPTION_PROPERTIES_H #define PARQUET_ENCRYPTION_PROPERTIES_H +#include #include #include #include +#include "arrow/util/logging.h" #include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "arrow/util/logging.h" #include "parquet/util/visibility.h" namespace parquet { static const std::string NULL_STRING = ""; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM - = ParquetCipher::AES_GCM_V1; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = + ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; static constexpr bool DEFAULT_CHECK_SIGNATURE = true; @@ -44,21 +45,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class Builder { public: // Convenience builder for regular (not nested) columns. 
- Builder(const std::string& name) { + explicit Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) - : Builder(path, true) {} + explicit Builder(const std::shared_ptr& path) + : Builder(path, true) {} // Set a column-specific key. // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. Builder* key(const std::string& key) { - if (key.empty ()) - return this; + if (key.empty()) return this; DCHECK(!key.empty()); key_ = key; @@ -81,10 +81,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::shared_ptr build() { return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, - column_path_, - key_, - key_metadata_)); + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); } private: @@ -94,7 +91,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string key_metadata_; Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} + : column_path_(path), encrypted_(encrypted) {} }; const std::shared_ptr& column_path() { return column_path_; } @@ -114,10 +111,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string key_; std::string key_metadata_; explicit ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata); + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata); }; class PARQUET_EXPORT ColumnDecryptionProperties { @@ -125,20 +120,18 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class Builder { public: // convenience builder for regular (not nested) columns. 
- Builder(const std::string& name) { - Builder(schema::ColumnPath::FromDotString(name)); - } + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name)) {} - Builder(const std::shared_ptr& path) - : column_path_(path) {} + explicit Builder(const std::shared_ptr& path) + : column_path_(path) {} // Set an explicit column key. If applied on a file that contains // key metadata for this column the metadata will be ignored, // the column will be decrypted with this key. // key length must be either 16, 24 or 32 bytes. Builder* key(const std::string& key) { - if (key.empty ()) - return this; + if (key.empty()) return this; DCHECK(!key.empty()); key_ = key; @@ -150,7 +143,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { new ColumnDecryptionProperties(column_path_, key_)); } - private: + private: const std::shared_ptr column_path_; std::string key_; }; @@ -170,8 +163,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key); + const std::shared_ptr& column_path, const std::string& key); }; class PARQUET_EXPORT AADPrefixVerifier { @@ -187,10 +179,8 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { - public: - Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - } + public: + Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } // Set an explicit footer key. If applied on a file that contains // footer key metadata the metadata will be ignored, the footer @@ -199,7 +189,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. 
Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty ()) { + if (footer_key.empty()) { return this; } DCHECK(!footer_key.empty()); @@ -213,14 +203,13 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* column_properties(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) { - if (column_properties.size () == 0) - return this; + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; - if (column_properties_.size () != 0) + if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); column_properties_ = column_properties; @@ -233,10 +222,8 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* key_retriever(const std::shared_ptr& - key_retriever) { - if (key_retriever == NULLPTR) - return this; + Builder* key_retriever(const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; DCHECK(key_retriever_ == NULLPTR); key_retriever_ = key_retriever; @@ -268,10 +255,8 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. 
- Builder* aad_prefix_verifier( - std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) - return this; + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; DCHECK(aad_prefix_verifier_ == NULLPTR); aad_prefix_verifier_ = aad_prefix_verifier; @@ -279,43 +264,33 @@ class PARQUET_EXPORT FileDecryptionProperties { } std::shared_ptr build() { - return std::shared_ptr( - new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_properties_)); + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_properties_)); } - private: + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; }; - const std::string& column_key( - const std::shared_ptr& column_path); + const std::string& column_key(const std::shared_ptr& column_path); - const std::string& footer_key() { - return footer_key_; - } + const std::string& footer_key() { return footer_key_; } const std::string& aad_prefix() { return aad_prefix_; } - std::shared_ptr key_retriever() { - return key_retriever_; - } + std::shared_ptr key_retriever() { return key_retriever_; } - bool check_plaintext_footer_integrity() { - return check_plaintext_footer_integrity_; - } + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; @@ -327,8 +302,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier_; std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -336,21 +311,20 @@ class PARQUET_EXPORT FileDecryptionProperties { FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - const std::string& aad_prefix, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { public: class Builder { - public: - Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } @@ -376,8 +350,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set a key retrieval metadata. // use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) - return this; + if (footer_key_metadata.empty()) return this; DCHECK(footer_key_metadata_.empty()); footer_key_metadata_ = footer_key_metadata; @@ -386,8 +359,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the file AAD Prefix. 
Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) - return this; + if (aad_prefix.empty()) return this; DCHECK(aad_prefix_.empty()); aad_prefix_ = aad_prefix; @@ -409,13 +381,11 @@ class PARQUET_EXPORT FileEncryptionProperties { // If called, the file columns not in the list will be left unencrypted. Builder* column_properties( const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties){ - if (column_properties.size () == 0) - return this; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; - if (column_properties_.size () != 0) + if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); column_properties_ = column_properties; @@ -423,17 +393,12 @@ class PARQUET_EXPORT FileEncryptionProperties { } std::shared_ptr build() { - return std::shared_ptr( - new FileEncryptionProperties(parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_properties_)); + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, column_properties_)); } - private: + private: ParquetCipher::type parquet_cipher_; bool encrypted_footer_; std::string footer_key_; @@ -442,29 +407,28 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool store_aad_prefix_in_file_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; }; bool encrypted_footer() const { return encrypted_footer_; } - const EncryptionAlgorithm algorithm() { - return algorithm_; - } + const EncryptionAlgorithm algorithm() { return algorithm_; } const std::string& footer_encryption_key() { - return (encrypted_footer_? 
footer_key_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_ : NULL_STRING); } const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); } const std::string& footer_signing_key() { - return (encrypted_footer_? NULL_STRING : footer_key_); + return (encrypted_footer_ ? NULL_STRING : footer_key_); } const std::string& footer_signing_key_metadata() { - return (encrypted_footer_? NULL_STRING : footer_key_metadata_); + return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); } const std::string& file_aad() const { return file_aad_; } @@ -480,19 +444,16 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string file_aad_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; - - FileEncryptionProperties(ParquetCipher::type cipher, - const std::string& footer_key, - const std::string& footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties); + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3bc4fd41304..797176c19f3 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -40,8 +40,8 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/memory.h" #include "parquet/util/crypto.h" +#include "parquet/util/memory.h" 
namespace parquet { @@ -94,7 +94,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { file_crypto_metadata_(file_crypto_metadata), properties_(props), row_group_ordinal_((int16_t)row_group_number), - file_decryptor_(file_decryptor){ + file_decryptor_(file_decryptor) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -104,8 +104,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - file_decryptor_); + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -141,10 +140,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), - row_group_ordinal_, - (int16_t)i/* column_ordinal */, - properties_.memory_pool()); + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i/* column_ordinal */, properties_.memory_pool()); } // the column is encrypted @@ -155,9 +152,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i, properties_.memory_pool(), - meta_decryptor, data_decryptor); + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + properties_.memory_pool(), meta_decryptor, data_decryptor); } // file is encrypted and the column is encrypted with its own key @@ -166,18 +162,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - 
auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, - column_key_metadata); - auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, - column_key_metadata); - - return PageReader::Open(stream, col->num_values(), - col->compression(), - col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i, properties_.memory_pool(), - meta_decryptor, data_decryptor); + auto meta_decryptor = + file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); + auto data_decryptor = + file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + + return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + properties_.memory_pool(), meta_decryptor, data_decryptor); } private: @@ -205,10 +197,9 @@ class SerializedFile : public ParquetFileReader::Contents { void Close() override {} std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), - file_crypto_metadata_.get(), i, properties_, - file_decryptor_.get())); + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), file_crypto_metadata_.get(), i, properties_, + file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -285,49 +276,51 @@ class SerializedFile : public ParquetFileReader::Contents { if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and " - "in properties is not the same"); + throw ParquetException( + "ADD Prefix in file and " + "in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) - aad_prefix_verifier->check(aad_prefix); + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != 
NULLPTR) aad_prefix_verifier->check(aad_prefix); } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException("AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(file_aad); file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); + file_decryptor_->footer_key_metadata( + file_metadata_->footer_signing_key_metadata()); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext" - "mode footer."); + throw ParquetException( + "Invalid parquet file. Cannot verify plaintext" + "mode footer."); } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (! file_metadata_->verify(encryptor, metadata_buffer->data() - + read_metadata_len)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + if (!file_metadata_->verify(encryptor, + metadata_buffer->data() + read_metadata_len)) { + throw ParquetException( + "Invalid parquet file. Could not verify plaintext" + " footer metadata"); } } } - } - // encryption with encrypted footer - else { + } else { + // encryption with encrypted footer // both metadata & crypto metadata length uint32_t footer_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + footer_len > file_size) { throw ParquetException( "Invalid parquet file. 
File is less than " @@ -348,13 +341,14 @@ class SerializedFile : public ParquetFileReader::Contents { } auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read " - "encrypted footer metadata"); + throw ParquetException( + "No decryption properties are provided. Could not read " + "encrypted footer metadata"); } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); @@ -362,20 +356,21 @@ class SerializedFile : public ParquetFileReader::Contents { if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties " - "is not the same"); + throw ParquetException( + "ADD Prefix in file and in properties " + "is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) - aad_prefix_verifier->check(aad_prefix); + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException("AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string fileAAD = aad_prefix 
+ algo.aad.aad_file_unique; // save fileAAD for later use @@ -383,7 +378,8 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); - int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + int64_t metadata_offset = + file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d0ddc436b86..8e02ccc5e5a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -25,8 +25,8 @@ #include "parquet/platform.h" #include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" -#include "parquet/util/memory.h" #include "parquet/util/crypto.h" +#include "parquet/util/memory.h" using arrow::MemoryPool; @@ -82,8 +82,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, - int16_t row_group_ordinal, - const WriterProperties* properties, + int16_t row_group_ordinal, const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), @@ -91,7 +90,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { properties_(properties), total_bytes_written_(0), closed_(false), - row_group_ordinal_ (row_group_ordinal), + row_group_ordinal_(row_group_ordinal), current_column_index_(0), num_rows_(0), buffered_row_group_(buffered_row_group), @@ -131,19 +130,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++current_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); - auto meta_encryptor = file_encryptor_ - ? 
file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; - - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, row_group_ordinal_, - (int16_t)(current_column_index_-1), - properties_->memory_pool(), false, - meta_encryptor, data_encryptor); + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, (int16_t)(current_column_index_ - 1), + properties_->memory_pool(), false, meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -240,18 +237,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); - auto meta_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, (int16_t)row_group_ordinal_, - (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_, - meta_encryptor, data_encryptor); + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + col_meta, (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -306,23 +302,24 @@ class FileSerializer : public ParquetFileWriter::Contents { auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); - uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + uint32_t footer_and_crypto_len = + static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->algorithm(); + EncryptionAlgorithm algo = file_encryption->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, - file_encryption->footer_signing_key_metadata ()); + &signing_encryption, file_encryption->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); + WriteFileMetaData(*file_metadata_, sink_.get(), 
footer_signing_encryptor, + false); } } @@ -442,8 +439,7 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& encryptor, - bool encrypt_footer) { + const std::shared_ptr& encryptor, bool encrypt_footer) { if (encryptor == nullptr) { // Write MetaData int64_t position = -1; @@ -461,8 +457,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin if (encrypt_footer) { // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } - else { + } else { uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 5082cbe8839..1935093bbd9 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -1,5 +1,22 @@ -#include "parquet/encryption_properties.h" +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ #include "parquet/internal_file_decryptor.h" +#include "parquet/encryption_properties.h" #include "parquet/util/crypto.h" namespace parquet { @@ -12,10 +29,11 @@ static inline uint8_t* str2bytes(const std::string& str) { return reinterpret_cast(cbytes); } -FooterSigningEncryptor::FooterSigningEncryptor( - ParquetCipher::type algorithm, const std::string& key, - const std::string& file_aad, const std::string& aad) - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { +FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, + const std::string& key, + const std::string& file_aad, + const std::string& aad) + : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( algorithm, static_cast(key_.size()), true)); } @@ -24,41 +42,36 @@ int FooterSigningEncryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } -int FooterSigningEncryptor::SignedFooterEncrypt( - const uint8_t* footer, int footer_len, - uint8_t* nonce, uint8_t* encrypted_footer) { +int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + uint8_t* nonce, + uint8_t* encrypted_footer) { return aes_encryptor_->SignedFooterEncrypt( - footer, footer_len, str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), nonce, encrypted_footer); + footer, footer_len, str2bytes(key_), static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), nonce, encrypted_footer); } // Decryptor -Decryptor::Decryptor( - parquet_encryption::AesDecryptor* aes_decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) - : aes_decryptor_(aes_decryptor), key_(key) - , file_aad_(file_aad), aad_(aad) {} - -int Decryptor::CiphertextSizeDelta() { - return aes_decryptor_->CiphertextSizeDelta(); -} +Decryptor::Decryptor(parquet_encryption::AesDecryptor* aes_decryptor, + const std::string& key, const 
std::string& file_aad, + const std::string& aad) + : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} -int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext) { - return aes_decryptor_->Decrypt( - ciphertext, ciphertext_len, - str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), plaintext); +int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } + +int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, + uint8_t* plaintext) { + return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), plaintext); } // InternalFileDecryptor InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) : properties_(properties) {} -std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() -{ - if (footer_signing_encryptor_ != NULLPTR) - return footer_signing_encryptor_; +std::shared_ptr +InternalFileDecryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -67,23 +80,23 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = - properties_->key_retriever()->GetKey(footer_key_metadata_); - } catch (KeyAccessDeniedException &e) { + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); + } catch (KeyAccessDeniedException& e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. 
Could not verify " - "plaintext footer metadata"); + throw ParquetException( + "Footer key unavailable. Could not verify " + "plaintext footer metadata"); } std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = std::make_shared( - algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -93,8 +106,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( - const std::string& aad) -{ + const std::string& aad) { return GetFooterDecryptor(aad, true); } @@ -105,8 +117,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnDat std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { - if (footer_decryptor_ != NULLPTR) - return footer_decryptor_; + if (footer_decryptor_ != NULLPTR) return footer_decryptor_; std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) @@ -115,58 +126,54 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( throw ParquetException("No footer key or key retriever"); try { footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); - } catch (KeyAccessDeniedException &e) { + } catch (KeyAccessDeniedException& e) { std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n";; + ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); + throw ParquetException( + "Invalid footer encryption key. " + "Could not parse footer metadata"); } - auto aes_decryptor = metadata - ? 
GetMetaAesDecryptor(footer_key.size()) - : GetDataAesDecryptor(footer_key.size()); - footer_decryptor_ = std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) + : GetDataAesDecryptor(footer_key.size()); + footer_decryptor_ = + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); return footer_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad) { + const std::string& column_key_metadata, const std::string& aad) { return GetColumnDecryptor(column_path, column_key_metadata, aad, true); } std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad) { + const std::string& column_key_metadata, const std::string& aad) { return GetColumnDecryptor(column_path, column_key_metadata, aad, false); } std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad, bool metadata) { + const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; // first look if we already got the key from before - if (column_map_ != NULLPTR - && column_map_->find(column_path) != column_map_->end()) { + if (column_map_ != NULLPTR && column_map_->find(column_path) != column_map_->end()) { column_key = column_map_->at(column_path); } else { column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - properties_->key_retriever() != nullptr){ + properties_->key_retriever() != nullptr) { try { column_key = properties_->key_retriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException &e) { + } catch (KeyAccessDeniedException& e) { std::stringstream ss; - ss << "HiddenColumnException, path=" + - column_path->ToDotString() + " " + ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " << e.what() << "\n"; throw HiddenColumnException(ss.str()); } @@ -182,9 +189,8 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( (*column_map_)[column_path] = column_key; } - auto aes_decryptor = metadata - ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) + : GetDataAesDecryptor(column_key.size()); return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } @@ -194,19 +200,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_128_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_196_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_256_.reset( + new 
parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_256_.get(); } @@ -218,23 +225,24 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_128_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_196_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_256_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -} // namespace parquet +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 41740dc182f..987c8ec77b9 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -1,13 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H #include +#include +#include + #include "parquet/schema.h" namespace parquet_encryption { - class AesDecryptor; - class AesEncryptor; -} +class AesDecryptor; +class AesEncryptor; +} // namespace parquet_encryption namespace parquet { @@ -18,8 +38,8 @@ class FooterSigningEncryptor { FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, const std::string& file_aad, const std::string& aad); int CiphertextSizeDelta(); - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, - uint8_t* nonce, uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* nonce, + uint8_t* encrypted_footer); private: ParquetCipher::type algorithm_; @@ -32,9 +52,8 @@ class FooterSigningEncryptor { class Decryptor { public: - Decryptor(parquet_encryption::AesDecryptor* decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad); + Decryptor(parquet_encryption::AesDecryptor* decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } @@ -69,26 +88,23 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } std::shared_ptr GetFooterDecryptor(); - std::shared_ptr GetFooterDecryptorForColumnMeta( - const std::string& aad = ""); - std::shared_ptr GetFooterDecryptorForColumnData( - const std::string& aad = ""); + std::shared_ptr 
GetFooterDecryptorForColumnMeta(const std::string& aad = ""); + std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); std::shared_ptr GetColumnMetaDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad = ""); + const std::string& column_key_metadata, const std::string& aad = ""); std::shared_ptr GetColumnDataDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad = ""); + const std::string& column_key_metadata, const std::string& aad = ""); private: FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; // A map between ColumnPath and their encryption keys - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + std::shared_ptr, std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_decryptor_; @@ -101,17 +117,16 @@ class InternalFileDecryptor { std::unique_ptr data_decryptor_196_; std::unique_ptr data_decryptor_256_; - std::shared_ptr GetFooterDecryptor( - const std::string& aad, bool metadata); + std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad, bool metadata = false); + const std::string& column_key_metadata, const std::string& aad, + bool metadata = false); parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; -} +} // namespace parquet -#endif // INTERNAL_FILE_ENCRYPTORS_H +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index fcd224339a8..c2127f06dd1 100644 --- 
a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -1,5 +1,22 @@ -#include "parquet/encryption_properties.h" +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include "parquet/internal_file_encryptor.h" +#include "parquet/encryption_properties.h" #include "parquet/util/crypto.h" namespace parquet { @@ -12,20 +29,17 @@ static inline uint8_t* str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor( - parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_encryptor_(aes_encryptor), key_(key) - , file_aad_(file_aad), aad_(aad) {} - -int Encryptor::CiphertextSizeDelta() { - return aes_encryptor_->CiphertextSizeDelta(); -} +Encryptor::Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad) + : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + +int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { - return aes_encryptor_->Encrypt( - plaintext, 
plaintext_len, str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), ciphertext); + return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), ciphertext); } // InternalFileEncryptor @@ -38,8 +52,8 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - return std::make_shared(aes_encryptor, footer_key, - properties_->file_aad(), aad); + return std::make_shared(aes_encryptor, footer_key, properties_->file_aad(), + aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { @@ -62,9 +76,9 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( return GetColumnEncryptor(column_path, false); } -std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( - const std::shared_ptr& column_path, - bool metadata) { +std::shared_ptr +InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::shared_ptr& column_path, bool metadata) { auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; @@ -77,16 +91,14 @@ std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColu } else { key = properties_->footer_signing_key(); } - } - else { + } else { key = column_prop->key(); } - + ParquetCipher::type algorithm = properties_->algorithm().algorithm; - auto aes_encryptor = metadata - ? GetMetaAesEncryptor(algorithm, key.size()) - : GetDataAesEncryptor(algorithm, key.size()); - + auto aes_encryptor = metadata ? 
GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + std::string file_aad = properties_->file_aad(); // TODO: aad @@ -98,19 +110,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_128_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_196_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_256_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_256_.get(); } @@ -122,23 +135,24 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_128_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_196_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_196_.get(); - } - else if (key_len 
== 32) { + } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_256_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -} // namespace parquet \ No newline at end of file +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 75c167b5339..ccef1315f60 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -1,8 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include +#include + +#include "parquet/schema.h" + namespace parquet_encryption { - class AesEncryptor; +class AesEncryptor; } namespace parquet { @@ -11,9 +33,8 @@ class FileEncryptionProperties; class Encryptor { public: - Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad); + Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } @@ -33,8 +54,10 @@ class InternalFileEncryptor { std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); - std::shared_ptr GetColumnMetaEncryptor(const std::shared_ptr& column_path); - std::shared_ptr GetColumnDataEncryptor(const std::shared_ptr& column_path); + std::shared_ptr GetColumnMetaEncryptor( + const std::shared_ptr& column_path); + std::shared_ptr GetColumnDataEncryptor( + const std::shared_ptr& column_path); private: FileEncryptionProperties* properties_; @@ -47,8 +70,7 @@ class InternalFileEncryptor { std::unique_ptr data_encryptor_256_; std::shared_ptr GetColumnEncryptor( - const std::shared_ptr& column_path, - bool metadata); + const std::shared_ptr& column_path, bool metadata); parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, size_t key_len); @@ -56,6 +78,6 @@ class InternalFileEncryptor { size_t key_len); }; -} +} // namespace parquet -#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 044932b89de..1132dd0f704 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -22,6 +22,8 @@ #include "arrow/util/logging.h" +#include +#include // IWYU pragma: keep #include 
"parquet/exception.h" #include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" @@ -30,8 +32,6 @@ #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/thrift.h" -#include -#include // IWYU pragma: keep namespace parquet { @@ -76,7 +76,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); + metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior return TypedStatistics::Make( @@ -161,14 +161,12 @@ const std::string& ColumnCryptoMetaData::key_metadata() const { // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: - explicit ColumnChunkMetaDataImpl( - const format::ColumnChunk* column, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { + explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -176,31 +174,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (file_decryptor->properties() == NULLPTR) { - throw ParquetException("Cannot decrypt ColumnMetadata. " - "FileDecryptionProperties must be provided."); + throw ParquetException( + "Cannot decrypt ColumnMetadata. 
" + "FileDecryptionProperties must be provided."); } // should decrypt metadata - std::shared_ptr path = - std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; DCHECK(file_decryptor != NULLPTR); std::string aad_column_metadata = parquet_encryption::createModuleAAD( - file_decryptor->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal, - column_ordinal, (int16_t)-1); - auto decryptor = file_decryptor->GetColumnMetaDecryptor( - path, key_metadata, aad_column_metadata); + file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, + row_group_ordinal, column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, + aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( - reinterpret_cast( - column->encrypted_column_metadata.c_str()), - &len, &metadata_, - decryptor, - false); + reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, decryptor, false); } } for (auto encoding : metadata_.encodings) { @@ -228,8 +221,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!metadata_.__isset.statistics || - descr_->sort_order() == SortOrder::UNKNOWN) { + if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { @@ -244,9 +236,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return is_stats_set() ? 
possible_stats_ : nullptr; } - inline Compression::type compression() const { - return FromThrift(metadata_.codec); - } + inline Compression::type compression() const { return FromThrift(metadata_.codec); } const std::vector& encodings() const { return encodings_; } @@ -260,17 +250,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline int64_t data_page_offset() const { return metadata_.data_page_offset; } - inline bool has_index_page() const { - return metadata_.__isset.index_page_offset; - } + inline bool has_index_page() const { return metadata_.__isset.index_page_offset; } - inline int64_t index_page_offset() const { - return metadata_.index_page_offset; - } + inline int64_t index_page_offset() const { return metadata_.index_page_offset; } - inline int64_t total_compressed_size() const { - return metadata_.total_compressed_size; - } + inline int64_t total_compressed_size() const { return metadata_.total_compressed_size; } inline int64_t total_uncompressed_size() const { return metadata_.total_uncompressed_size; @@ -295,30 +279,23 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, + const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, + int16_t column_ordinal, const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, row_group_ordinal, - column_ordinal, writer_version, - file_decryptor)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, + writer_version, file_decryptor)); } -ColumnChunkMetaData::ColumnChunkMetaData( - const void* metadata, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* 
file_decryptor) - : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), - descr, - row_group_ordinal, - column_ordinal, - writer_version, - file_decryptor))} {} +ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), descr, + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -394,23 +371,23 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t file_offset() const { return row_group_->file_offset; } - inline int64_t total_compressed_size() const { return row_group_->total_compressed_size; } + inline int64_t total_compressed_size() const { + return row_group_->total_compressed_size; + } inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, int16_t row_group_ordinal, - InternalFileDecryptor* file_decryptor = NULLPTR) { + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make( - &row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, - writer_version_, file_decryptor); + return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), + row_group_ordinal, (int16_t)i, writer_version_, + file_decryptor); } private: @@ -451,9 +428,8 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl( - const void* metadata, 
uint32_t* metadata_len, - const std::shared_ptr& decryptor = nullptr) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, @@ -480,16 +456,16 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) - + parquet_encryption::NonceLength; - - std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); - uint32_t encrypted_len = - encryptor->SignedFooterEncrypt(serialized_data, serialized_len, - nonce, encrypted_buffer.data()); - return 0 == memcmp( - encrypted_buffer.data() + encrypted_len - parquet_encryption::GCMTagLength, - tag, parquet_encryption::GCMTagLength); + uint8_t* tag = const_cast(reinterpret_cast(tail)) + + parquet_encryption::NonceLength; + + std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + + serialized_len); + uint32_t encrypted_len = encryptor->SignedFooterEncrypt( + serialized_data, serialized_len, nonce, encrypted_buffer.data()); + return 0 == memcmp(encrypted_buffer.data() + encrypted_len - + parquet_encryption::GCMTagLength, + tag, parquet_encryption::GCMTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -503,7 +479,9 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } - inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } + inline bool is_encryption_algorithm_set() const { + return metadata_->__isset.encryption_algorithm; + } inline EncryptionAlgorithm encryption_algorithm() { return FromThrift(metadata_->encryption_algorithm); } @@ -522,9 +500,10 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, 
&serialized_data); // encrypt the footer key - std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); - unsigned encrypted_len = encryptor->Encrypt(serialized_data, serialized_len, - encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); // write unencrypted footer dst->Write(serialized_data, serialized_len); @@ -532,9 +511,8 @@ class FileMetaData::FileMetaDataImpl { dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, - parquet_encryption::GCMTagLength); - } - else { + parquet_encryption::GCMTagLength); + } else { serializer.Serialize(metadata_.get(), dst, encryptor, false); } } @@ -621,8 +599,7 @@ class FileMetaData::FileMetaDataImpl { }; std::shared_ptr FileMetaData::Make( - const void* metadata, - uint32_t* metadata_len, + const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( @@ -643,7 +620,8 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryptor, const void* tail) { +bool FileMetaData::verify(std::shared_ptr encryptor, + const void* tail) { return impl_->verify(encryptor, tail); } @@ -655,7 +633,9 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } -bool FileMetaData::is_encryption_algorithm_set() const { return impl_->is_encryption_algorithm_set(); } +bool FileMetaData::is_encryption_algorithm_set() const { + return impl_->is_encryption_algorithm_set(); +} EncryptionAlgorithm FileMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); @@ 
-965,13 +945,16 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, + &serialized_data); - std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); - unsigned encrypted_len = encryptor->Encrypt( - serialized_data, serialized_len, encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); - const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); + const char* temp = + const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers @@ -985,8 +968,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(metadata_redacted); - } - else { + } else { // don't set meta_data column_chunk_->__isset.meta_data = true; } @@ -1194,7 +1176,8 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, + int16_t row_group_ordinal) { impl_->Finish(total_bytes_written, row_group_ordinal); } @@ -1207,8 +1190,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if 
(props->file_encryption() != nullptr - && props->file_encryption()->footer_signing_key() == NULL_STRING) { + if (props->file_encryption() != nullptr && + props->file_encryption()->footer_signing_key() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index d42016627d1..13d2161c7d0 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -19,17 +19,17 @@ #define PARQUET_FILE_METADATA_H #include +#include #include #include #include -#include #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "parquet/platform.h" -#include "parquet/schema.h" #include "parquet/properties.h" +#include "parquet/schema.h" #include "parquet/types.h" namespace parquet { @@ -125,9 +125,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - const ApplicationVersion* writer_version = NULLPTR, + const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -180,8 +179,9 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - InternalFileDecryptor* file_decryptor = NULLPTR) const; + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -196,14 +196,13 @@ class FileMetaDataBuilder; class PARQUET_EXPORT 
FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len, - const std::shared_ptr& decryptor = NULLPTR); + static std::shared_ptr Make( + const void* serialized_metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); - bool verify(std::shared_ptr encryptor, - const void* tail); + bool verify(std::shared_ptr encryptor, const void* tail); // file metadata uint32_t size() const; int num_columns() const; @@ -219,7 +218,8 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -299,6 +299,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor = NULLPTR); + private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); @@ -349,8 +350,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm = NULLPTR, - const std::string& footer_signing_key_metadata = ""); + std::unique_ptr Finish( + const EncryptionAlgorithm* signing_algorithm = NULLPTR, + const std::string& footer_signing_key_metadata = ""); // crypto metadata std::unique_ptr GetCryptoMetaData(); diff --git a/cpp/src/parquet/parquet.pc b/cpp/src/parquet/parquet.pc new file mode 100644 index 00000000000..e46eea65b72 --- /dev/null +++ b/cpp/src/parquet/parquet.pc @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) 
under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=/usr/local +libdir=${prefix}/lib +includedir=${prefix}/include + +so_version=13 +abi_version=13 +full_so_version=13.0.0 + +Name: Apache Parquet +Description: Apache Parquet is a columnar storage format. +Version: 1.5.1-SNAPSHOT +Libs: -L${libdir} -lparquet +Cflags: -I${includedir} diff --git a/cpp/src/parquet/parquet_version.h b/cpp/src/parquet/parquet_version.h new file mode 100644 index 00000000000..dd83e45028d --- /dev/null +++ b/cpp/src/parquet/parquet_version.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_VERSION_H +#define PARQUET_VERSION_H + +// define the parquet created by version +#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT" + +#endif // PARQUET_VERSION_H diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 585700a0497..9b1949e3947 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -21,15 +21,14 @@ #include #include #include +#include +#include "parquet/encryption_properties.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "arrow/util/logging.h" -#include "arrow/util/utf8.h" -#include "parquet/encryption_properties.h" namespace parquet { @@ -417,8 +416,8 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props(const - std::shared_ptr& path) const { + std::shared_ptr column_encryption_props( + const std::shared_ptr& path) const { if (file_encryption_) { return file_encryption_->column_properties(path); } else { diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 4a5a98fff8c..1e2068c9646 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -28,6 +28,7 @@ #include #endif #include +#include // TCompactProtocol requires some #defines to work right. 
#define SIGNED_RIGHT_SHIFT_IS 1 @@ -43,8 +44,8 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/platform.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/statistics.h" #include "parquet/util/crypto.h" @@ -85,19 +86,13 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) } static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { - return AadMetadata { - aesGcmV1.aad_prefix, - aesGcmV1.aad_file_unique, - aesGcmV1.supply_aad_prefix - }; + return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix}; } static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { - return AadMetadata { - aesGcmCtrV1.aad_prefix, - aesGcmCtrV1.aad_file_unique, - aesGcmCtrV1.supply_aad_prefix - }; + return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix}; } static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { @@ -228,22 +223,20 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali uint8_t clenBytes[4]; memcpy(clenBytes, buf, 4); clen = *(reinterpret_cast(clenBytes)); - } - else { + } else { clen = *len; } // decrypt const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); - uint32_t decrypted_buffer_len = decryptor->Decrypt( - cipherBuf, 0, decrypted_buffer.data()); + uint32_t decrypted_buffer_len = + decryptor->Decrypt(cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - } } @@ -287,12 +280,13 @@ class ThriftSerializer { return static_cast(out_length); } else { std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); - int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, - cipher_buffer.data()); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > static_cast(cipher_buffer.size())) { std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + ss << "cipher length is greater than cipher buffer capacity: " + << cipher_buffer_len << cipher_buffer.size() << "\n"; throw ParquetException(ss.str()); } From 71a279b62cbd33497b41860143e9f3cc39fab27c Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:26:56 +0300 Subject: [PATCH 050/125] Add plaintext_files_allowed --- cpp/src/parquet/encryption_properties.cc | 4 +++- cpp/src/parquet/encryption_properties.h | 26 +++++++++++++++++++++--- cpp/src/parquet/file_reader.cc | 9 +++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index f2385819470..cf34908a1c1 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -91,7 +91,8 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, const std::map, std::shared_ptr, 
- schema::ColumnPath::CmpColumnPath>& column_properties) { + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -108,6 +109,7 @@ FileDecryptionProperties::FileDecryptionProperties( key_retriever_ = key_retriever; aad_prefix_ = aad_prefix; column_properties_ = column_properties; + plaintext_files_allowed_ = plaintext_files_allowed; } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index 5848ad3fa68..b40acb34167 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -38,6 +38,7 @@ static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnEncryptionProperties { @@ -180,7 +181,10 @@ class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { public: - Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } + Builder() { + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + } // Set an explicit footer key. If applied on a file that contains // footer key metadata the metadata will be ignored, the footer @@ -263,10 +267,21 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } + // By default, reading plaintext (unencrypted) files is not + // allowed when using a decryptor + // - in order to detect files that were not encrypted by mistake. + // However, the default behavior can be overriden by calling this method. 
+ // The caller should use then a different method to ensure encryption + // of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + std::shared_ptr build() { return std::shared_ptr(new FileDecryptionProperties( footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, - aad_prefix_verifier_, column_properties_)); + aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); } private: @@ -281,6 +296,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; }; const std::string& column_key(const std::shared_ptr& column_path); @@ -292,6 +308,8 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } + bool plaintext_files_allowed() { return plaintext_files_allowed_; } + const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; } @@ -307,6 +325,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; FileDecryptionProperties( const std::string& footer_key, @@ -315,7 +334,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed); }; class PARQUET_EXPORT FileEncryptionProperties { diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 797176c19f3..fe87734a51c 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -263,7 +263,14 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); - 
if (file_metadata_->is_encryption_algorithm_set()) { + if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } + } else { auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); From a844c10f53d4d5274a13b4f1bff79fc264bd44b3 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:47:25 +0300 Subject: [PATCH 051/125] Remove file_crypto_metadata_ field from SerializedRowGroup and SerializedFile classes --- cpp/src/parquet/file_reader.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index fe87734a51c..7a5fbc1b851 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -86,12 +86,10 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, - FileMetaData* file_metadata, - FileCryptoMetaData* file_crypto_metadata, int row_group_number, + FileMetaData* file_metadata, int row_group_number, const ReaderProperties& props, InternalFileDecryptor* file_decryptor) : source_(source), file_metadata_(file_metadata), - file_crypto_metadata_(file_crypto_metadata), properties_(props), row_group_ordinal_((int16_t)row_group_number), file_decryptor_(file_decryptor) { @@ -175,7 +173,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { private: std::shared_ptr source_; FileMetaData* file_metadata_; - FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; 
ReaderProperties properties_; int16_t row_group_ordinal_; @@ -198,7 +195,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), file_crypto_metadata_.get(), i, properties_, + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -354,7 +351,7 @@ class SerializedFile : public ParquetFileReader::Contents { } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; - file_crypto_metadata_ = + std::shared_ptr file_crypto_metadata = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); @@ -383,7 +380,7 @@ class SerializedFile : public ParquetFileReader::Contents { // save fileAAD for later use file_decryptor_->file_aad(fileAAD); file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); + file_decryptor_->footer_key_metadata(file_crypto_metadata->key_metadata()); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; @@ -406,7 +403,6 @@ class SerializedFile : public ParquetFileReader::Contents { private: std::shared_ptr source_; std::shared_ptr file_metadata_; - std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; std::unique_ptr file_decryptor_; }; From 674866a352ce011b096b54ad6bd1490df8b269c6 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:58:56 +0300 Subject: [PATCH 052/125] Pass file_aad, algorithm and key_metadata to InternalFileDecryptor constructor --- cpp/src/parquet/file_reader.cc | 22 ++++++++++------------ cpp/src/parquet/internal_file_decryptor.cc | 8 ++++++-- cpp/src/parquet/internal_file_decryptor.h | 10 ++++------ 3 files changed, 20 insertions(+), 20 
deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 7a5fbc1b851..965bd37f658 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -260,19 +260,17 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file - auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw ParquetException("Applying decryption properties on plaintext file"); } } } else { - auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); std::string aad_prefix = file_decryption_properties->aad_prefix(); @@ -298,10 +296,10 @@ class SerializedFile : public ParquetFileReader::Contents { } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_->file_aad(file_aad); - file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata( - file_metadata_->footer_signing_key_metadata()); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, + file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( @@ -376,11 +374,11 @@ class SerializedFile : public ParquetFileReader::Contents { "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD = aad_prefix + 
algo.aad.aad_file_unique; - // save fileAAD for later use - file_decryptor_->file_aad(fileAAD); - file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_crypto_metadata->key_metadata()); + std::string file_aad = aad_prefix + algo.aad.aad_file_unique; + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, + file_aad, algo.algorithm, + file_crypto_metadata->key_metadata())); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 1935093bbd9..5b6faa5c27b 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -66,8 +66,12 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, } // InternalFileDecryptor -InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) - : properties_(properties) {} + InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) + : properties_(properties), file_add_(file_aad), + algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 987c8ec77b9..320c3546ed4 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -70,17 +70,15 @@ class Decryptor { class InternalFileDecryptor { public: - explicit InternalFileDecryptor(FileDecryptionProperties* properties); + explicit InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); - void file_aad(const std::string& file_aad) { file_aad_ = file_aad; } 
std::string& file_aad() { return file_aad_; } - void algorithm(ParquetCipher::type algorithm) { algorithm_ = algorithm; } ParquetCipher::type algorithm() { return algorithm_; } - void footer_key_metadata(const std::string& footer_key_metadata) { - footer_key_metadata_ = footer_key_metadata; - } std::string& footer_key_metadata() { return footer_key_metadata_; } std::shared_ptr GetFooterSigningEncryptor(); From 9826a6d2befd31545d9e2732eae77c80fbcc2b01 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 15:12:07 +0300 Subject: [PATCH 053/125] Fixes to previous commits --- cpp/src/parquet/file_reader.cc | 3 +-- cpp/src/parquet/internal_file_decryptor.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 965bd37f658..267aca67cd7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -347,11 +347,10 @@ class SerializedFile : public ParquetFileReader::Contents { "No decryption properties are provided. 
Could not read " "encrypted footer metadata"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; std::shared_ptr file_crypto_metadata = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 5b6faa5c27b..7514d37d924 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -70,7 +70,7 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, const std::string& file_aad, ParquetCipher::type algorithm, const std::string& footer_key_metadata) - : properties_(properties), file_add_(file_aad), + : properties_(properties), file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} std::shared_ptr From 3ddcd8c47bcb41bb32ec791d9c35cb8acb948727 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 15:30:19 +0300 Subject: [PATCH 054/125] Put encryption_properties.h/cc content in encryption.h/cc --- cpp/src/parquet/encryption.cc | 159 +++++++ cpp/src/parquet/encryption.h | 454 ++++++++++++++++++- cpp/src/parquet/encryption_properties.cc | 182 -------- cpp/src/parquet/encryption_properties.h | 481 --------------------- cpp/src/parquet/internal_file_decryptor.cc | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 2 +- cpp/src/parquet/properties.h | 2 +- 7 files changed, 615 insertions(+), 667 deletions(-) delete mode 100644 cpp/src/parquet/encryption_properties.cc delete mode 100644 cpp/src/parquet/encryption_properties.h diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index cc8501e8891..f7c4ab30ceb 100644 --- 
a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -19,6 +19,10 @@ #include #include +#include +#include + +#include "arrow/util/utf8.h" namespace parquet { @@ -43,4 +47,159 @@ const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("key id should be in UTF8 encoding"); + } + + DCHECK(!key_id.empty()); + this->key_metadata(key_id); + return this; +} + +ColumnEncryptionProperties::ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata) + : column_path_(column_path) { + DCHECK(column_path != nullptr); + if (!encrypted) { + DCHECK(key.empty() && key_metadata.empty()); + } + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) { + DCHECK(key_metadata.empty()); + } + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; +} + +ColumnDecryptionProperties::ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key) + : column_path_(column_path) { + DCHECK(column_path != nullptr); + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + key_ = key; +} + +const std::string& FileDecryptionProperties::column_key( + const std::shared_ptr& column_path) { + if (column_properties_.find(column_path) != column_properties_.end()) { + auto column_prop = column_properties_[column_path]; + if (column_prop != nullptr) { + return column_prop->key(); + } + } + return 
NULL_STRING; +} + +FileDecryptionProperties::FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || + 0 != column_properties.size()); + + if (!footer_key.empty()) { + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + } + if (footer_key.empty() && check_plaintext_footer_integrity) { + DCHECK(NULLPTR != key_retriever); + } + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_properties_ = column_properties; + plaintext_files_allowed_ = plaintext_files_allowed; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("footer key id should be in UTF8 encoding"); + } + + if (key_id.empty()) { + return this; + } + + return footer_key_metadata(key_id); +} + +std::shared_ptr FileEncryptionProperties::column_properties( + const std::shared_ptr& column_path) { + if (column_properties_.size() == 0) { + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder(column_path)); + return builder->build(); + } + if (column_properties_.find(column_path) != column_properties_.end()) { + return column_properties_[column_path]; + } + + return NULLPTR; +} + +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const 
std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_properties_(column_properties) { + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + + uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; + memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); + RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH); + + bool supply_aad_prefix = false; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { + file_aad_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } +} + } // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index aff37839c8c..a54c4bb26ab 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -20,12 +20,28 @@ #include #include +#include +#include -#include +#include "arrow/util/logging.h" +#include "parquet/encryption.h" #include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "parquet/util/visibility.h" + namespace parquet { +static const std::string NULL_STRING = ""; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = + ParquetCipher::AES_GCM_V1; +static constexpr int32_t 
MAXIMAL_AAD_METADATA_LENGTH = 256; +static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; +static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; +static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; + class PARQUET_EXPORT DecryptionKeyRetriever { public: virtual const std::string& GetKey(const std::string& key_metadata) = 0; @@ -70,6 +86,442 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class Builder { + public: + // Convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + + // Convenience builder for encrypted columns. + explicit Builder(const std::shared_ptr& path) + : Builder(path, true) {} + + // Set a column-specific key. + // If key is not set on an encrypted column, the column will + // be encrypted with the footer key. + // keyBytes Key length must be either 16, 24 or 32 bytes. + Builder* key(const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + // Set a key retrieval metadata. + // use either key_metadata() or key_id(), not both + Builder* key_metadata(const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either key_metadata() or key_id(), not both + // key_id will be converted to metadata (UTF-8 array). 
+ Builder* key_id(const std::string& key_id); + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); + } + + private: + const std::shared_ptr column_path_; + bool encrypted_; + std::string key_; + std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} + }; + + const std::shared_ptr& column_path() { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + private: + const std::shared_ptr column_path_; + bool encrypted_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + explicit ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata); +}; + +class PARQUET_EXPORT ColumnDecryptionProperties { + public: + class Builder { + public: + // convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name)) {} + + explicit Builder(const std::shared_ptr& path) + : column_path_(path) {} + + // Set an explicit column key. If applied on a file that contains + // key metadata for this column the metadata will be ignored, + // the column will be decrypted with this key. + // key length must be either 16, 24 or 32 bytes. 
+ Builder* key(const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); + } + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& column_path() { return column_path_; } + const std::string& key() const { return key_; } + + private: + const std::shared_ptr column_path_; + std::string key_; + + // This class is only required for setting explicit column decryption keys - + // to override key retriever (or to provide keys when key metadata and/or + // key retriever are not available) + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + // Verifies identity (AAD Prefix) of individual file, + // or of file collection in a data set. + // Throws exception if an AAD prefix is wrong. + // In a data set, AAD Prefixes should be collected, + // and then checked for missing files. + virtual void check(const std::string& aad_prefix) = 0; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class Builder { + public: + Builder() { + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + } + + // Set an explicit footer key. If applied on a file that contains + // footer key metadata the metadata will be ignored, the footer + // will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from + // key retriever. + // param footerKey Key length must be either 16, 24 or 32 bytes. 
+ Builder* footer_key(const std::string& footer_key) { + if (footer_key.empty()) { + return this; + } + DCHECK(!footer_key.empty()); + footer_key_ = footer_key; + return this; + } + + // Set explicit column keys (decryption properties). + // Its also possible to set a key retriever on this property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + column_properties_ = column_properties; + return this; + } + + // Set a key retriever callback. Its also possible to + // set explicit footer or column keys on this file property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* key_retriever(const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; + } + + // Skip integrity verification of plaintext footers. + // If not called, integrity of plaintext footers will be checked in runtime, + // and an exception will be thrown in the following situations: + // - footer signing key is not available + // (not passed, or not found by key retriever) + // - footer content and signature don't match + Builder* disable_footer_signature_verification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + // Explicitly supply the file AAD prefix. + // A must when a prefix is used for file encryption, but not stored in file. 
+ // If AAD prefix is stored in file, it will be compared to the explicitly + // supplied value and an exception will be thrown if they differ. + Builder* aad_prefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; + } + + // Set callback for verification of AAD Prefixes stored in file. + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; + } + + // By default, reading plaintext (unencrypted) files is not + // allowed when using a decryptor + // - in order to detect files that were not encrypted by mistake. + // However, the default behavior can be overriden by calling this method. + // The caller should use then a different method to ensure encryption + // of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + }; + + const std::string& column_key(const std::shared_ptr& column_path); + + const std::string& footer_key() { return footer_key_; } + + const std::string& aad_prefix() { return aad_prefix_; } + std::shared_ptr key_retriever() { return key_retriever_; } + + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } + + bool plaintext_files_allowed() 
{ return plaintext_files_allowed_; } + + const std::shared_ptr& aad_prefix_verifier() { + return aad_prefix_verifier_; + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed); +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + class Builder { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + // Create files with plaintext footer. + // If not called, the files will be created with encrypted footer (default). + Builder* enable_plaintext_footer() { + encrypted_footer_ = false; + return this; + } + + // Set encryption algorithm. + // If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* algorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(const std::string& key_id); + + // Set a key retrieval metadata. + // use either footer_key_metadata or footer_key_id, not both. 
+ Builder* footer_key_metadata(const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; + } + + // Set the file AAD Prefix. + Builder* aad_prefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; + } + + // Skip storing AAD Prefix in file. + // If not called, and if AAD Prefix is set, it will be stored. + Builder* disable_store_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; + } + + // Set the list of encrypted columns and their properties (keys etc). + // If not called, all columns will be encrypted with the footer key. + // If called, the file columns not in the list will be left unencrypted. + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + column_properties_ = column_properties; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, column_properties_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; + }; + bool encrypted_footer() const { return encrypted_footer_; } + + const EncryptionAlgorithm algorithm() { return algorithm_; } + + const std::string& footer_encryption_key() { + return (encrypted_footer_ 
? footer_key_ : NULL_STRING); + } + + const std::string& footer_encryption_key_metadata() { + return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); + } + + const std::string& footer_signing_key() { + return (encrypted_footer_ ? NULL_STRING : footer_key_); + } + + const std::string& footer_signing_key_metadata() { + return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); + } + + const std::string& file_aad() const { return file_aad_; } + + std::shared_ptr column_properties( + const std::shared_ptr& column_path); + + private: + EncryptionAlgorithm algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + bool encrypted_footer_; + std::string file_aad_; + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); +}; + + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc deleted file mode 100644 index cf34908a1c1..00000000000 --- a/cpp/src/parquet/encryption_properties.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/encryption_properties.h" - -#include -#include - -#include "arrow/util/utf8.h" - -namespace parquet { - -ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( - const std::string& key_id) { - // key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) { - throw ParquetException("key id should be in UTF8 encoding"); - } - - DCHECK(!key_id.empty()); - this->key_metadata(key_id); - return this; -} - -ColumnEncryptionProperties::ColumnEncryptionProperties( - bool encrypted, const std::shared_ptr& column_path, - const std::string& key, const std::string& key_metadata) - : column_path_(column_path) { - DCHECK(column_path != nullptr); - if (!encrypted) { - DCHECK(key.empty() && key_metadata.empty()); - } - - if (!key.empty()) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - } - - encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) { - DCHECK(key_metadata.empty()); - } - - encrypted_ = encrypted; - key_metadata_ = key_metadata; - key_ = key; -} - -ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, const std::string& key) - : column_path_(column_path) { - DCHECK(column_path != nullptr); - - if (!key.empty()) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - } - - key_ = key; -} - -const std::string& FileDecryptionProperties::column_key( - 
const std::shared_ptr& column_path) { - if (column_properties_.find(column_path) != column_properties_.end()) { - auto column_prop = column_properties_[column_path]; - if (column_prop != nullptr) { - return column_prop->key(); - } - } - return NULL_STRING; -} - -FileDecryptionProperties::FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed) { - DCHECK(!footer_key.empty() || NULLPTR != key_retriever || - 0 != column_properties.size()); - - if (!footer_key.empty()) { - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - } - if (footer_key.empty() && check_plaintext_footer_integrity) { - DCHECK(NULLPTR != key_retriever); - } - aad_prefix_verifier_ = aad_prefix_verifier; - footer_key_ = footer_key; - check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; - key_retriever_ = key_retriever; - aad_prefix_ = aad_prefix; - column_properties_ = column_properties; - plaintext_files_allowed_ = plaintext_files_allowed; -} - -FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( - const std::string& key_id) { - // key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) { - throw ParquetException("footer key id should be in UTF8 encoding"); - } - - if (key_id.empty()) { - return this; - } - - return footer_key_metadata(key_id); -} - -std::shared_ptr FileEncryptionProperties::column_properties( - const std::shared_ptr& column_path) { - if (column_properties_.size() == 0) { - auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder(column_path)); - return 
builder->build(); - } - if (column_properties_.find(column_path) != column_properties_.end()) { - return column_properties_[column_path]; - } - - return NULLPTR; -} - -FileEncryptionProperties::FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) - : footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_properties_(column_properties) { - DCHECK(!footer_key.empty()); - // footer_key must be either 16, 24 or 32 bytes. - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH); - - bool supply_aad_prefix = false; - if (aad_prefix.empty()) { - file_aad_ = aad_file_unique_str; - } else { - file_aad_ = aad_prefix + aad_file_unique_str; - if (!store_aad_prefix_in_file) supply_aad_prefix = true; - } - algorithm_.algorithm = cipher; - algorithm_.aad.aad_file_unique = aad_file_unique_str; - algorithm_.aad.supply_aad_prefix = supply_aad_prefix; - if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; - } -} - -} // namespace parquet diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h deleted file mode 100644 index b40acb34167..00000000000 --- a/cpp/src/parquet/encryption_properties.h +++ /dev/null @@ -1,481 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_ENCRYPTION_PROPERTIES_H -#define PARQUET_ENCRYPTION_PROPERTIES_H - -#include -#include -#include -#include - -#include "arrow/util/logging.h" -#include "parquet/encryption.h" -#include "parquet/exception.h" -#include "parquet/schema.h" -#include "parquet/types.h" -#include "parquet/util/visibility.h" - -namespace parquet { - -static const std::string NULL_STRING = ""; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = - ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; - -class PARQUET_EXPORT ColumnEncryptionProperties { - public: - class Builder { - public: - // Convenience builder for regular (not nested) columns. - explicit Builder(const std::string& name) { - Builder(schema::ColumnPath::FromDotString(name), true); - } - - // Convenience builder for encrypted columns. - explicit Builder(const std::shared_ptr& path) - : Builder(path, true) {} - - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. 
- // keyBytes Key length must be either 16, 24 or 32 bytes. - Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - // Set a key retrieval metadata. - // use either key_metadata() or key_id(), not both - Builder* key_metadata(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either key_metadata() or key_id(), not both - // key_id will be converted to metadata (UTF-8 array). - Builder* key_id(const std::string& key_id); - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); - } - - private: - const std::shared_ptr column_path_; - bool encrypted_; - std::string key_; - std::string key_metadata_; - - Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} - }; - - const std::shared_ptr& column_path() { return column_path_; } - bool is_encrypted() const { return encrypted_; } - bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } - const std::string& key() const { return key_; } - const std::string& key_metadata() const { return key_metadata_; } - - ColumnEncryptionProperties() = default; - ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; - ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - private: - const std::shared_ptr column_path_; - bool encrypted_; - bool encrypted_with_footer_key_; - std::string key_; - std::string key_metadata_; - explicit ColumnEncryptionProperties( - bool encrypted, const std::shared_ptr& column_path, - const std::string& key, const std::string& key_metadata); -}; - -class PARQUET_EXPORT ColumnDecryptionProperties { - public: - class Builder { - public: - // convenience builder for 
regular (not nested) columns. - explicit Builder(const std::string& name) - : Builder(schema::ColumnPath::FromDotString(name)) {} - - explicit Builder(const std::shared_ptr& path) - : column_path_(path) {} - - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. - Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } - - private: - const std::shared_ptr column_path_; - std::string key_; - }; - - ColumnDecryptionProperties() = default; - ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; - ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - - const std::shared_ptr& column_path() { return column_path_; } - const std::string& key() const { return key_; } - - private: - const std::shared_ptr column_path_; - std::string key_; - - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) - explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, const std::string& key); -}; - -class PARQUET_EXPORT AADPrefixVerifier { - public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. 
- virtual void check(const std::string& aad_prefix) = 0; -}; - -class PARQUET_EXPORT FileDecryptionProperties { - public: - class Builder { - public: - Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; - } - - // Set an explicit footer key. If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. - // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty()) { - return this; - } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; - return this; - } - - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - column_properties_ = column_properties; - return this; - } - - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. 
- Builder* key_retriever(const std::shared_ptr& key_retriever) { - if (key_retriever == NULLPTR) return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } - - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match - Builder* disable_footer_signature_verification() { - check_plaintext_footer_integrity_ = false; - return this; - } - - // Explicitly supply the file AAD prefix. - // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } - - // Set callback for verification of AAD Prefixes stored in file. - Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } - - // By default, reading plaintext (unencrypted) files is not - // allowed when using a decryptor - // - in order to detect files that were not encrypted by mistake. - // However, the default behavior can be overriden by calling this method. - // The caller should use then a different method to ensure encryption - // of files with sensitive data. 
- Builder* plaintext_files_allowed() { - plaintext_files_allowed_ = true; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr(new FileDecryptionProperties( - footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, - aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - bool plaintext_files_allowed_; - }; - - const std::string& column_key(const std::shared_ptr& column_path); - - const std::string& footer_key() { return footer_key_; } - - const std::string& aad_prefix() { return aad_prefix_; } - std::shared_ptr key_retriever() { return key_retriever_; } - - bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } - - bool plaintext_files_allowed() { return plaintext_files_allowed_; } - - const std::shared_ptr& aad_prefix_verifier() { - return aad_prefix_verifier_; - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - bool plaintext_files_allowed_; - - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed); -}; - -class PARQUET_EXPORT FileEncryptionProperties { - public: - class Builder { - public: - explicit Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - 
encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { - footer_key_ = footer_key; - store_aad_prefix_in_file_ = false; - } - - // Create files with plaintext footer. - // If not called, the files will be created with encrypted footer (default). - Builder* enable_plaintext_footer() { - encrypted_footer_ = false; - return this; - } - - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* algorithm(ParquetCipher::type parquet_cipher) { - parquet_cipher_ = parquet_cipher; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_id(const std::string& key_id); - - // Set a key retrieval metadata. - // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } - - // Set the file AAD Prefix. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } - - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. - Builder* disable_store_aad_prefix_storage() { - DCHECK(!aad_prefix_.empty()); - - store_aad_prefix_in_file_ = false; - return this; - } - - // Set the list of encrypted columns and their properties (keys etc). - // If not called, all columns will be encrypted with the footer key. - // If called, the file columns not in the list will be left unencrypted. 
- Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - column_properties_ = column_properties; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr(new FileEncryptionProperties( - parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, - aad_prefix_, store_aad_prefix_in_file_, column_properties_)); - } - - private: - ParquetCipher::type parquet_cipher_; - bool encrypted_footer_; - std::string footer_key_; - std::string footer_key_metadata_; - - std::string aad_prefix_; - bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; - }; - bool encrypted_footer() const { return encrypted_footer_; } - - const EncryptionAlgorithm algorithm() { return algorithm_; } - - const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : NULL_STRING); - } - - const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); - } - - const std::string& footer_signing_key() { - return (encrypted_footer_ ? NULL_STRING : footer_key_); - } - - const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? 
NULL_STRING : footer_key_metadata_); - } - - const std::string& file_aad() const { return file_aad_; } - - std::shared_ptr column_properties( - const std::shared_ptr& column_path); - - private: - EncryptionAlgorithm algorithm_; - std::string footer_key_; - std::string footer_key_metadata_; - bool encrypted_footer_; - std::string file_aad_; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); -}; - -} // namespace parquet - -#endif // PARQUET_ENCRYPTION_PROPERTIES_H diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 7514d37d924..a386f643175 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -16,7 +16,7 @@ // under the License. #include "parquet/internal_file_decryptor.h" -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/util/crypto.h" namespace parquet { diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index c2127f06dd1..f786fc9c797 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -16,7 +16,7 @@ // under the License. 
#include "parquet/internal_file_encryptor.h" -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/util/crypto.h" namespace parquet { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 9b1949e3947..aee4dbcb2f9 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -23,7 +23,7 @@ #include #include -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" From 5998317bd905f74066f176d3d880e417cced6d2e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 21:45:14 +0300 Subject: [PATCH 055/125] Remove encryption_properties.cc from CMakeLists.txt --- cpp/src/parquet/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 1d2e8e45dd4..9452122b522 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -167,7 +167,6 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc - encryption_properties.cc internal_file_decryptor.cc internal_file_encryptor.cc file_reader.cc From b70b68fe31ddd68d819821faf3e47ee0daf66f57 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 21:49:22 +0300 Subject: [PATCH 056/125] Add column_metadata_map_, column_data_map_, footer_signing_encryptor_ and footer_encryptor_ to InternalFileEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 56 ++++++++++++++++++---- cpp/src/parquet/internal_file_encryptor.h | 13 +++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index f786fc9c797..cd0f648e7c2 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -44,26 +44,49 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // 
InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) - : properties_(properties) {} + : properties_(properties) { + column_data_map_ = std::shared_ptr, + std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr, + std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); +} std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + if (footer_encryptor_ != NULLPTR) { + return footer_encryptor_; + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), + aad); + footer_encryptor_ = encryptor; + return encryptor; - return std::make_shared(aes_encryptor, footer_key, properties_->file_aad(), - aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) { + return footer_signing_encryptor_; + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); - - return std::make_shared(aes_encryptor, footer_signing_key, - properties_->file_aad(), aad); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, footer_signing_key, + properties_->file_aad(), aad); + footer_signing_encryptor_ = encryptor; + return encryptor; } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ -79,6 
+102,16 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { + // first look if we already got the encryptor from before + if (metadata) { + if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { + return column_metadata_map_->at(column_path); + } + } else { + if (column_data_map_->find(column_path) != column_data_map_->end()) { + return column_data_map_->at(column_path); + } + } auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; @@ -100,9 +133,14 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( : GetDataAesEncryptor(algorithm, key.size()); std::string file_aad = properties_->file_aad(); - - // TODO: aad - return std::make_shared(aes_encryptor, key, file_aad, ""); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, key, file_aad, ""); + if (metadata) + (*column_metadata_map_)[column_path] = encryptor; + else + (*column_data_map_)[column_path] = encryptor; + + return encryptor; } parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index ccef1315f60..fcc85c473ca 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -20,6 +20,7 @@ #include #include +#include #include "parquet/schema.h" @@ -62,6 +63,18 @@ class InternalFileEncryptor { private: FileEncryptionProperties* properties_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; + + std::shared_ptr footer_signing_encryptor_; + std::shared_ptr footer_encryptor_; + std::unique_ptr meta_encryptor_128_; std::unique_ptr 
meta_encryptor_196_; std::unique_ptr meta_encryptor_256_; From 551649fb2f730f317fe1c5c91f14a4f26ea158e4 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:30:51 +0300 Subject: [PATCH 057/125] Add column_data_map_, column_metadata_map_, footer_data_decryptor_ and footer_metadata_decryptor_ to InternalFileDecryptor --- cpp/src/parquet/internal_file_decryptor.cc | 74 ++++++++++++++++------ cpp/src/parquet/internal_file_decryptor.h | 21 ++++-- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index a386f643175..81c551154af 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -67,11 +67,25 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, // InternalFileDecryptor InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) : properties_(properties), file_aad_(file_aad), - algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} + algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { + column_data_map_ = std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + } std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { @@ -100,7 +114,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); footer_signing_encryptor_ = - std::make_shared(algorithm_, footer_key, 
file_aad_, aad); + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -121,7 +135,12 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnDat std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { - if (footer_decryptor_ != NULLPTR) return footer_decryptor_; + if (metadata) { + if (footer_metadata_decryptor_ != NULLPTR) return footer_metadata_decryptor_; + } else { + if (footer_data_decryptor_ != NULLPTR) return footer_data_decryptor_; + } + std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) @@ -144,9 +163,15 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) : GetDataAesDecryptor(footer_key.size()); - footer_decryptor_ = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); - return footer_decryptor_; + std::shared_ptr decryptor = + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + + if (metadata) + footer_metadata_decryptor_ = decryptor; + else + footer_data_decryptor_ = decryptor; + + return decryptor; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -165,10 +190,17 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; - // first look if we already got the key from before - if (column_map_ != NULLPTR && column_map_->find(column_path) != column_map_->end()) { - column_key = column_map_->at(column_path); + // first look if we already got the decryptor from before + if (metadata) { + if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { + return column_metadata_map_->at(column_path); + } } else { + if (column_data_map_->find(column_path) != column_data_map_->end()) { + return column_data_map_->at(column_path); + } + 
} + column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. if (column_key.empty() && !column_key_metadata.empty() && @@ -182,21 +214,23 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( throw HiddenColumnException(ss.str()); } } - } if (column_key.empty()) { throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } - if (column_map_ != NULLPTR) { - // save column key for future use - (*column_map_)[column_path] = column_key; - } - auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + : GetDataAesDecryptor(column_key.size()); + + std::shared_ptr decryptor = std::make_shared( + aes_decryptor, column_key, file_aad_, aad); + if (metadata) + (*column_metadata_map_)[column_path] = decryptor; + else + (*column_data_map_)[column_path] = decryptor; + + return decryptor; - return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 320c3546ed4..0b0c41789db 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -71,9 +71,9 @@ class Decryptor { class InternalFileDecryptor { public: explicit InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); std::string& file_aad() { return file_aad_; } @@ -99,10 +99,17 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - // A map between ColumnPath and their encryption keys - std::shared_ptr, std::string, - 
parquet::schema::ColumnPath::CmpColumnPath>> - column_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; + + std::shared_ptr footer_metadata_decryptor_; + std::shared_ptr footer_data_decryptor_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_decryptor_; From 9b5e5bebc7dd9f597903664e8e40ca96d1f2b739 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:38:52 +0300 Subject: [PATCH 058/125] Rename aad to update_aad in Encryptor and Decryptor classes --- cpp/src/parquet/column_reader.cc | 8 ++++---- cpp/src/parquet/column_writer.cc | 10 +++++----- cpp/src/parquet/internal_file_decryptor.h | 2 +- cpp/src/parquet/internal_file_encryptor.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 94e9279fd03..868ee59abc4 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -221,10 +221,10 @@ std::shared_ptr SerializedPageReader::NextPage() { aad = parquet_encryption::createModuleAAD( meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); - meta_decryptor_->aad(aad); + meta_decryptor_->update_aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - meta_decryptor_->aad(data_page_headerAAD_); + meta_decryptor_->update_aad(data_page_headerAAD_); } } DeserializeThriftMsg(reinterpret_cast(buffer.data()), @@ -253,10 +253,10 @@ std::shared_ptr SerializedPageReader::NextPage() { aad = parquet_encryption::createModuleAAD( data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); - data_decryptor_->aad(aad); + data_decryptor_->update_aad(aad); } else { 
parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - data_decryptor_->aad(data_pageAAD_); + data_decryptor_->update_aad(data_pageAAD_); } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 9733df12a52..1e963187559 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -184,7 +184,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->aad( + data_encryptor_->update_aad( parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, @@ -210,7 +210,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } @@ -232,7 +232,7 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); if (meta_encryptor_ != nullptr) { - meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } @@ -279,7 +279,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - data_encryptor_->aad(data_pageAAD_); + data_encryptor_->update_aad(data_pageAAD_); encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -302,7 +302,7 @@ class SerializedPageWriter : public 
PageWriter { if (meta_encryptor_) { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - meta_encryptor_->aad(data_page_headerAAD_); + meta_encryptor_->update_aad(data_page_headerAAD_); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 0b0c41789db..733c448f4f7 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -56,7 +56,7 @@ class Decryptor { const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } - void aad(const std::string& aad) { aad_ = aad; } + void update_aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index fcc85c473ca..bfa7cca3e0d 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -37,7 +37,7 @@ class Encryptor { Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } - void aad(const std::string& aad) { aad_ = aad; } + void update_aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); From 60ef74143df11405a6267d253c321bc541bfa4c4 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:48:10 +0300 Subject: [PATCH 059/125] Move PARQUET_EMAGIC and PARQUET_MAGIC to file_writer.h and use it in file_reader.cc --- cpp/src/parquet/file_reader.cc | 3 +-- cpp/src/parquet/file_writer.cc | 16 ++++++---------- cpp/src/parquet/file_writer.h | 4 ++++ 3 files changed, 11 insertions(+), 12 
deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 267aca67cd7..17c7d8917f7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -30,6 +30,7 @@ #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" +#include "parquet/file_writer.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" @@ -48,8 +49,6 @@ namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; -static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 8e02ccc5e5a..2cbb5f760cf 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -34,10 +34,6 @@ using parquet::schema::GroupNode; namespace parquet { -// FIXME: copied from reader-internal.cc -static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; - // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -305,7 +301,7 @@ class FileSerializer : public ParquetFileWriter::Contents { uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(PARQUET_EMAGIC, 4); + sink_->Write(kParquetEMagic, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; @@ -392,15 +388,15 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { // Unencrypted parquet files always start with PAR1 - 
PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); if (file_encryption->encrypted_footer()) { - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { // plaintext mode footer - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } } @@ -452,7 +448,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } else { if (encrypt_footer) { // encrypt and write to sink @@ -463,7 +459,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin metadata_len = static_cast(sink->Tell()) - metadata_len; sink->Write(reinterpret_cast(&metadata_len), 4); - sink->Write(PARQUET_MAGIC, 4); + sink->Write(kParquetMagic, 4); } } } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index db9008fff56..c51e354bff1 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -44,6 +44,10 @@ namespace parquet { class ColumnWriter; class OutputStream; +// FIXME: copied from reader-internal.cc +static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; + class PARQUET_EXPORT RowGroupWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more From e5c2b481859c839070514a7524777e0e43983893 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:57:29 +0300 Subject: [PATCH 060/125] Rename file_encryption to file_encryption_properties in file_writer.cc --- cpp/src/parquet/file_writer.cc | 18 +++++++++--------- 1 file 
changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 2cbb5f760cf..e454e60a37a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -283,12 +283,12 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - auto file_encryption = properties_->file_encryption(); - if (file_encryption == nullptr) { + auto file_encryption_properties = properties_->file_encryption(); + if (file_encryption_properties == nullptr) { file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encrypted_footer()) { + if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -305,14 +305,14 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->algorithm(); + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption->footer_signing_key_metadata()); + &signing_encryption, file_encryption_properties->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); @@ -385,13 +385,13 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr file_encryptor_; void StartFile() { - auto file_encryption = properties_->file_encryption(); - if (file_encryption == nullptr) { + auto 
file_encryption_properties = properties_->file_encryption(); + if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { - file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); - if (file_encryption->encrypted_footer()) { + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); + if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { From 03d249533612ddb549e6d7396f55197a43828df0 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 13 May 2019 07:07:45 +0300 Subject: [PATCH 061/125] Remove unused footer_decryptor_ from InternalFileDecryptor class and fix indentation in GetFooterSigningEncryptor --- cpp/src/parquet/internal_file_decryptor.cc | 7 ++++--- cpp/src/parquet/internal_file_decryptor.h | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 81c551154af..0835f0686be 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -89,7 +89,8 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { - if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; + if (footer_signing_encryptor_ != NULLPTR) + return footer_signing_encryptor_; std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -113,8 +114,8 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = - std::make_shared(algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = std::make_shared( + algorithm_, footer_key, file_aad_, aad); return 
footer_signing_encryptor_; } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 733c448f4f7..3abdae2f493 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -112,7 +112,6 @@ class InternalFileDecryptor { std::shared_ptr footer_data_decryptor_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; - std::shared_ptr footer_decryptor_; std::shared_ptr footer_signing_encryptor_; std::unique_ptr meta_decryptor_128_; From 3537789ff5399673fe83b00d9d3e698d79cd3eb5 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 10:44:53 +0300 Subject: [PATCH 062/125] Fix format --- cpp/src/parquet/file_reader.cc | 21 ++++++++++----------- cpp/src/parquet/file_writer.cc | 3 ++- cpp/src/parquet/internal_file_decryptor.cc | 21 ++++++++++----------- cpp/src/parquet/internal_file_encryptor.cc | 1 - 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 17c7d8917f7..de493a56054 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -261,11 +261,11 @@ class SerializedFile : public ParquetFileReader::Contents { auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file - if (file_decryption_properties != NULLPTR) { - if (!file_decryption_properties->plaintext_files_allowed()) { - throw ParquetException("Applying decryption properties on plaintext file"); - } - } + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } } else { if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); @@ -295,10 +295,11 @@ class SerializedFile : public ParquetFileReader::Contents { } std::string 
file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, - file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); - + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, + file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( @@ -376,8 +377,6 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, file_crypto_metadata->key_metadata())); - - int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index e454e60a37a..1cfd0d8ebb3 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -312,7 +312,8 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption_properties->footer_signing_key_metadata()); + &signing_encryption, + file_encryption_properties->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 0835f0686be..03870baa0f4 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -73,18 +73,18 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, : properties_(properties), 
file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { column_data_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); column_metadata_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); } std::shared_ptr @@ -231,7 +231,6 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( (*column_data_map_)[column_path] = decryptor; return decryptor; - } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index cd0f648e7c2..3801cf6c70b 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -71,7 +71,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { aad); footer_encryptor_ = encryptor; return encryptor; - } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { From 9e18a00189f80790ec7aa10816e2c5c7dea7a0cb Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 11:36:37 +0300 Subject: [PATCH 063/125] Change implementation of NULL_STRING --- cpp/src/parquet/encryption.cc | 2 +- cpp/src/parquet/encryption.h | 13 ++++++++----- cpp/src/parquet/metadata.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index f7c4ab30ceb..7f3effb2540 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -104,7 +104,7 @@ const 
std::string& FileDecryptionProperties::column_key( return column_prop->key(); } } - return NULL_STRING; + return empty_string_; } FileDecryptionProperties::FileDecryptionProperties( diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index a54c4bb26ab..329845edf82 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -33,7 +33,6 @@ namespace parquet { -static const std::string NULL_STRING = ""; static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; @@ -364,6 +363,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; + const std::string empty_string_ = ""; + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; @@ -481,19 +482,19 @@ class PARQUET_EXPORT FileEncryptionProperties { const EncryptionAlgorithm algorithm() { return algorithm_; } const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_ : empty_string_); } const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_metadata_ : empty_string_); } const std::string& footer_signing_key() { - return (encrypted_footer_ ? NULL_STRING : footer_key_); + return (encrypted_footer_ ? empty_string_ : footer_key_); } const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); + return (encrypted_footer_ ? 
empty_string_ : footer_key_metadata_); } const std::string& file_aad() const { return file_aad_; } @@ -508,6 +509,8 @@ class PARQUET_EXPORT FileEncryptionProperties { bool encrypted_footer_; std::string file_aad_; + const std::string empty_string_ = ""; + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1132dd0f704..6d664ed3f0a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1191,7 +1191,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption() != nullptr && - props->file_encryption()->footer_signing_key() == NULL_STRING) { + props->file_encryption()->footer_signing_key().empty()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From 752e33c08fcab67e46018c3d286773e3f59a382f Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 11:45:01 +0300 Subject: [PATCH 064/125] Change ParquetException message format in file_reader.cc --- cpp/src/parquet/file_reader.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index de493a56054..3358764b370 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -278,8 +278,7 @@ class SerializedFile : public ParquetFileReader::Contents { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and " - "in properties is not the same"); + "ADD Prefix in file and in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; @@ -289,9 +288,8 @@ class SerializedFile : public ParquetFileReader::Contents { } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException( - "AAD prefix used for file encryption, 
" - "but not stored in file and not supplied " - "in decryption properties"); + "AAD prefix used for file encryption, but not stored in file" + "and not supplied in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; @@ -303,16 +301,14 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( - "Invalid parquet file. Cannot verify plaintext" - "mode footer."); + "Invalid parquet file. Cannot verify plaintext mode footer."); } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (!file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException( - "Invalid parquet file. Could not verify plaintext" - " footer metadata"); + throw ParquetException("Invalid parquet file. Could not verify plaintext " + "footer metadata"); } } } From a24f1d48e4a0cfe1fad9023cabdce70b84f398c1 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 12:12:38 +0300 Subject: [PATCH 065/125] Make format --- cpp/src/parquet/encryption.cc | 6 +- cpp/src/parquet/encryption.h | 8 +-- cpp/src/parquet/file_reader.cc | 39 ++++++----- cpp/src/parquet/internal_file_decryptor.cc | 79 +++++++++++----------- cpp/src/parquet/internal_file_decryptor.h | 16 ++--- cpp/src/parquet/internal_file_encryptor.cc | 32 ++++----- cpp/src/parquet/internal_file_encryptor.h | 18 ++--- 7 files changed, 96 insertions(+), 102 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 7f3effb2540..c53de674687 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -17,10 +17,10 @@ #include "parquet/encryption.h" -#include -#include #include +#include #include +#include #include "arrow/util/utf8.h" @@ -114,7 +114,7 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, const std::map, 
std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, + schema::ColumnPath::CmpColumnPath>& column_properties, bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 329845edf82..2c969bde2a9 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -19,8 +19,8 @@ #define PARQUET_ENCRYPTION_H #include -#include #include +#include #include #include "arrow/util/logging.h" @@ -30,7 +30,6 @@ #include "parquet/types.h" #include "parquet/util/visibility.h" - namespace parquet { static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = @@ -318,7 +317,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // The caller should use then a different method to ensure encryption // of files with sensitive data. Builder* plaintext_files_allowed() { - plaintext_files_allowed_ = true; + plaintext_files_allowed_ = true; return this; } @@ -380,7 +379,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, + schema::ColumnPath::CmpColumnPath>& column_properties, bool plaintext_files_allowed); }; @@ -524,7 +523,6 @@ class PARQUET_EXPORT FileEncryptionProperties { schema::ColumnPath::CmpColumnPath>& column_properties); }; - } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3358764b370..885fb4ceb1b 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -30,11 +30,11 @@ #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" -#include "parquet/file_writer.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" #include "parquet/exception.h" +#include "parquet/file_writer.h" #include 
"parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" @@ -60,9 +60,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -72,9 +72,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } @@ -194,8 +194,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, - file_decryptor_.get())); + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -260,7 +259,7 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); - if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw 
ParquetException("Applying decryption properties on plaintext file"); @@ -294,8 +293,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_.reset(new InternalFileDecryptor( - file_decryption_properties, - file_aad, algo.algorithm, + file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata())); if (file_decryption_properties->check_plaintext_footer_integrity()) { @@ -307,8 +305,9 @@ class SerializedFile : public ParquetFileReader::Contents { auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (!file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext " - "footer metadata"); + throw ParquetException( + "Invalid parquet file. Could not verify plaintext " + "footer metadata"); } } } @@ -370,9 +369,9 @@ class SerializedFile : public ParquetFileReader::Contents { "in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, - file_aad, algo.algorithm, - file_crypto_metadata->key_metadata())); + file_decryptor_.reset( + new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata())); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; @@ -480,9 +479,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) - << "The file only has " << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) << "The file only has " + << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return 
contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 03870baa0f4..6bc93a33884 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -66,31 +66,30 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, } // InternalFileDecryptor - InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) - : properties_(properties), file_aad_(file_aad), - algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { - column_data_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - } +InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) + : properties_(properties), + file_aad_(file_aad), + algorithm_(algorithm), + footer_key_metadata_(footer_key_metadata) { + column_data_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); +} std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { - if (footer_signing_encryptor_ != NULLPTR) - return footer_signing_encryptor_; + if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; 
std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -114,8 +113,8 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = std::make_shared( - algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -165,7 +164,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) : GetDataAesDecryptor(footer_key.size()); std::shared_ptr decryptor = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); if (metadata) footer_metadata_decryptor_ = decryptor; @@ -202,29 +201,29 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } } - column_key = properties_->column_key(column_path); - // No explicit column key given via API. Retrieve via key metadata. - if (column_key.empty() && !column_key_metadata.empty() && - properties_->key_retriever() != nullptr) { - try { - column_key = properties_->key_retriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException& e) { - std::stringstream ss; - ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } + column_key = properties_->column_key(column_path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + properties_->key_retriever() != nullptr) { + try { + column_key = properties_->key_retriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException& e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " << e.what() + << "\n"; + throw HiddenColumnException(ss.str()); } + } if (column_key.empty()) { throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + : GetDataAesDecryptor(column_key.size()); - std::shared_ptr decryptor = std::make_shared( - aes_decryptor, column_key, file_aad_, aad); + std::shared_ptr decryptor = + std::make_shared(aes_decryptor, column_key, file_aad_, aad); if (metadata) (*column_metadata_map_)[column_path] = decryptor; else diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 3abdae2f493..1e0a587b6f9 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -99,14 +99,14 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_data_map_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_metadata_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; std::shared_ptr footer_metadata_decryptor_; std::shared_ptr footer_data_decryptor_; diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 
3801cf6c70b..968b165785d 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -45,17 +45,17 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - column_data_map_ = std::shared_ptr, - std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr, - std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + column_data_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); } std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { @@ -67,8 +67,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), - aad); + aes_encryptor, footer_key, properties_->file_aad(), aad); footer_encryptor_ = encryptor; return encryptor; } @@ -82,8 +81,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_signing_key, - properties_->file_aad(), aad); + aes_encryptor, 
footer_signing_key, properties_->file_aad(), aad); footer_signing_encryptor_ = encryptor; return encryptor; } @@ -132,8 +130,8 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( : GetDataAesEncryptor(algorithm, key.size()); std::string file_aad = properties_->file_aad(); - std::shared_ptr encryptor = std::make_shared( - aes_encryptor, key, file_aad, ""); + std::shared_ptr encryptor = + std::make_shared(aes_encryptor, key, file_aad, ""); if (metadata) (*column_metadata_map_)[column_path] = encryptor; else diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index bfa7cca3e0d..7fcb943735c 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,9 +18,9 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include #include #include -#include #include "parquet/schema.h" @@ -63,14 +63,14 @@ class InternalFileEncryptor { private: FileEncryptionProperties* properties_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_data_map_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_metadata_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; From 194738198adecae5c708e84db4af2e6b67530c91 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 10:54:11 +0300 Subject: [PATCH 066/125] Add comments to encryption-reader-writer.cc example --- .../low-level-api/encryption-reader-writer.cc | 148 ++++++++++-------- 1 file changed, 81 insertions(+), 67 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc 
b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index e0d44d60667..a75c0352cda 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -22,24 +22,22 @@ #include -/* - * This example describes writing and reading Parquet Files in C++ and serves as a - * reference to the API. - * The file contains all the physical data types supported by Parquet. - * This example uses the RowGroupWriter API that supports writing RowGroups optimized for - *memory consumption - **/ -/* Parquet is a structured columnar file format - * Parquet File = "Parquet data" + "Parquet Metadata" - * "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a - * columnar layout - * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their - * Columns - * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a - * complex (nested) type (internal nodes) - * For specific details, please refer the format here: - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + +/* + * This example describes writing and reading Parquet Files in C++ with encrypted columns + * and serves as a reference to the Parquet Modular Encryption API. + * + * A detailed description of the Parquet Modular Encryption specification can be found here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The example contains writing and reading eight columns with the following four different + * encryption configurations: + * 1) uniform encryption - footer and all columns are encrypted with footer key. + * 2) non-uniform encryption - footer and ba_field column are encrypted with different keys. + * 3) plaintext footer mode where all columns are encrypted with footer key. + * 4) footer and ba_field column are encrypted with different keys. 
no column key + * is provided upon decryption and thus HiddenColumnException is thrown. **/ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; @@ -49,71 +47,83 @@ const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { - std::vector> file_encryption_properties; - std::vector> file_decryption_properties; + std::vector> vector_of_encryption_configurations; + std::vector> vector_of_decryption_configurations; - // uniform encryption + // encryption configuration #1 - uniform encryption - all columns and footer are + // encrypted with footer key. parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; - decryption_properties_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); - - // non-uniform with column keys + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_1.build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_1 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); + + // encryption configuration #2 - footer and ba_field column are encrypted with + // different keys. 
std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); - auto encryption_col0 = encryption_col_builder_0.build(); - encryption_cols[path_ptr] = encryption_col0; - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_col_builder2.key(COLUMN_ENCRYPTION_KEY); - decryption_cols[path_ptr] = decryption_col_builder2.build(); - - file_encryption_builder_2.column_properties(encryption_cols); + encryption_cols[path_ptr] = encryption_col_builder_0. + key(COLUMN_ENCRYPTION_KEY) + ->build(); - parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; - decryption_properties_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); - decryption_properties_builder_2.column_properties(decryption_cols); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); + decryption_cols[path_ptr] = decryption_col_builder2. 
+ key(COLUMN_ENCRYPTION_KEY) + ->build(); - // plain mode footer = unencrypted footer + parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_2 + .column_properties(encryption_cols) + ->build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_2 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->column_properties(decryption_cols) + ->build()); + + // encryption configuration #3 - plain mode footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_3.enable_plaintext_footer(); - - parquet::FileDecryptionProperties::Builder decryption_properties_builder_3; - decryption_properties_builder_3.footer_key(FOOTER_ENCRYPTION_KEY); - - // plaintext mode footer, hidden column + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_3 + .set_plaintext_footer() + ->build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_3 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); + + // encryption configuration #4 - footer and ba_field column are encrypted with different keys. + // no column key is provided upon decryption and thus HiddenColumnException is thrown. 
parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder file_decryption_builder_4; - file_encryption_builder_4.enable_plaintext_footer(); - file_encryption_builder_4.column_properties(encryption_cols); // reusing encryption_cols - parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; - decryption_properties_builder_4.footer_key(FOOTER_ENCRYPTION_KEY); + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_4 + .set_plaintext_footer() + ->column_properties(encryption_cols) + ->build()); - file_encryption_properties.push_back(file_encryption_builder_1.build()); - file_encryption_properties.push_back(file_encryption_builder_2.build()); - file_encryption_properties.push_back(file_encryption_builder_3.build()); - file_encryption_properties.push_back(file_encryption_builder_4.build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_4 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); - file_decryption_properties.push_back(decryption_properties_builder_1.build()); - file_decryption_properties.push_back(decryption_properties_builder_2.build()); - file_decryption_properties.push_back(decryption_properties_builder_3.build()); - file_decryption_properties.push_back(decryption_properties_builder_4.build()); - - for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { + for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); ++example_id) { /********************************************************************************** PARQUET WRITER EXAMPLE **********************************************************************************/ - // parquet::REQUIRED fields do not need definition and repetition level values - // parquet::OPTIONAL fields require only definition level values - // parquet::REPEATED fields 
require both definition and repetition level values - // setup for encryption + try { // Create a local file output stream instance. @@ -127,7 +137,9 @@ int main(int argc, char** argv) { // Add writer properties parquet::WriterProperties::Builder builder; builder.compression(parquet::Compression::SNAPPY); - builder.encryption(file_encryption_properties[example_id]); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); std::shared_ptr props = builder.build(); @@ -243,7 +255,9 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties(file_decryption_properties[example_id]); + + // Add the current decryption configuration to ReaderProperties. + reader_properties.file_decryption_properties(vector_of_decryption_configurations[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = From 30c886af740cff5b7ff63061c30c94d5c0022bf4 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 10:56:21 +0300 Subject: [PATCH 067/125] Rename enable_plaintext_footer to set_plaintext_footer --- cpp/src/parquet/encryption.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2c969bde2a9..e1f66b7fa8e 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -396,7 +396,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Create files with plaintext footer. // If not called, the files will be created with encrypted footer (default). 
- Builder* enable_plaintext_footer() { + Builder* set_plaintext_footer() { encrypted_footer_ = false; return this; } From 8f911c1af0a91bd7e47efabcdab97e827ad5b543 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:03:16 +0300 Subject: [PATCH 068/125] Rename aad variable in NextPage function --- cpp/src/parquet/column_reader.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 868ee59abc4..f7a1711dc37 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -201,7 +201,6 @@ std::shared_ptr SerializedPageReader::NextPage() { while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; - std::string aad; // Page headers can be very large because of page statistics // We try to deserialize a larger buffer progressively @@ -218,10 +217,11 @@ std::shared_ptr SerializedPageReader::NextPage() { try { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD( + std::string dictionary_page_header_aad; + dictionary_page_header_aad = parquet_encryption::createModuleAAD( meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); - meta_decryptor_->update_aad(aad); + meta_decryptor_->update_aad(dictionary_page_header_aad); } else { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_decryptor_->update_aad(data_page_headerAAD_); @@ -250,10 +250,11 @@ std::shared_ptr SerializedPageReader::NextPage() { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD( + std::string dictionary_page_aad; + dictionary_page_aad = parquet_encryption::createModuleAAD( data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, 
row_group_ordinal_, column_ordinal_, (int16_t)-1); - data_decryptor_->update_aad(aad); + data_decryptor_->update_aad(dictionary_page_aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_decryptor_->update_aad(data_pageAAD_); From 18b598fbe086da53382a02f3869d1e17f73f0ec3 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:07:26 +0300 Subject: [PATCH 069/125] Change comment in GetColumnPageReader --- cpp/src/parquet/file_reader.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 885fb4ceb1b..1117ae0f2d3 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -129,8 +129,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { bool encrypted = true; - // file is unencrypted - // or file is encrypted but column is unencrypted + // Column is encrypted only if crypto_metadata exists. if (!crypto_metadata) { encrypted = false; } From 8be2efe1ca517894f3326ee36fbf034c3134cbad Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:10:58 +0300 Subject: [PATCH 070/125] Change additional comments in GetColumnPageReader --- cpp/src/parquet/file_reader.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1117ae0f2d3..232bca8f82f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -140,9 +140,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { (int16_t)i/* column_ordinal */, properties_.memory_pool()); } - // the column is encrypted + // The column is encrypted - // the column is encrypted with footer key + // The column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); @@ 
-152,8 +152,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.memory_pool(), meta_decryptor, data_decryptor); } - // file is encrypted and the column is encrypted with its own key - + // The column is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); From 131cee692b4e3abfae5abfaf2632ef0a1144eace Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:18:20 +0300 Subject: [PATCH 071/125] Add comments in file_writer.cc --- cpp/src/parquet/file_writer.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 1cfd0d8ebb3..fae26f11b71 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -284,10 +284,12 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption(); - if (file_encryption_properties == nullptr) { + + if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { + } else { // Encrypted file + //Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -302,8 +304,7 @@ class FileSerializer : public ParquetFileWriter::Contents { static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); - } else { - // footer plain mode + } else { // Encrypted file with plaintext footer EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; @@ -437,7 +438,7 @@ std::unique_ptr 
ParquetFileWriter::Open( void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { + if (encryptor == nullptr) { // plaintext regular file // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -450,11 +451,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { - if (encrypt_footer) { + } else { // Encrypted file + if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } else { + } else { // Encrypted file with plaintext footer uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; From cb552f67baca53417064f57bca6e39640b7774ad Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:33:30 +0300 Subject: [PATCH 072/125] Create both data and metadata decryptors to avoid redundant retrieval of key from the key_retriever --- cpp/src/parquet/internal_file_decryptor.cc | 46 +++++++++++++--------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 6bc93a33884..e49d66ce4cc 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -161,17 +161,22 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( "Could not parse footer metadata"); } - auto aes_decryptor = metadata ? 
GetMetaAesDecryptor(footer_key.size()) - : GetDataAesDecryptor(footer_key.size()); - std::shared_ptr decryptor = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + // Create both data and metadata decryptors to avoid redundant retrieval of key + // from the key_retriever. + auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - if (metadata) - footer_metadata_decryptor_ = decryptor; - else - footer_data_decryptor_ = decryptor; + std::shared_ptr footer_metadata_decryptor = + std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + std::shared_ptr footer_data_decryptor = + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + + footer_metadata_decryptor_ = footer_metadata_decryptor; + footer_data_decryptor_ = footer_data_decryptor; - return decryptor; + if (metadata) + return footer_metadata_decryptor; + return footer_data_decryptor; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -219,17 +224,22 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( column_path->ToDotString()); } - auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + // Create both data and metadata decryptors to avoid redundant retrieval of key + // using the key_retriever. 
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - std::shared_ptr decryptor = - std::make_shared(aes_decryptor, column_key, file_aad_, aad); - if (metadata) - (*column_metadata_map_)[column_path] = decryptor; - else - (*column_data_map_)[column_path] = decryptor; + std::shared_ptr metadata_decryptor = + std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); + std::shared_ptr data_decryptor = + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + + (*column_metadata_map_)[column_path] = metadata_decryptor; + (*column_data_map_)[column_path] = data_decryptor; - return decryptor; + if (metadata) + return metadata_decryptor; + return data_decryptor; } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( From 8cc235fc9c8ed5c4f04a002dc754fc653cb4b149 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:38:44 +0300 Subject: [PATCH 073/125] Fix metadata parameter sent to parquet_encryption::AesDecryptor --- cpp/src/parquet/internal_file_decryptor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index e49d66ce4cc..d08b807c9ad 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -248,19 +248,19 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { meta_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { meta_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, 
true)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { meta_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); } return meta_decryptor_256_.get(); } From 96623845512ae36de5ec2aff1f5ac74e558f1700 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:41:47 +0300 Subject: [PATCH 074/125] Rename aad in GetFooterEncryptor and GetFooterSigningEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 968b165785d..ac4b6aee388 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -63,11 +63,11 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { return footer_encryptor_; } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), aad); + aes_encryptor, footer_key, properties_->file_aad(), footer_aad); footer_encryptor_ = encryptor; return encryptor; } @@ -77,11 +77,11 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { return footer_signing_encryptor_; } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = 
properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_signing_key, properties_->file_aad(), aad); + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); footer_signing_encryptor_ = encryptor; return encryptor; } From 7bec5b311ae56140f7472007e75dbc53d7235898 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:49:13 +0300 Subject: [PATCH 075/125] Rename verify to verify_signature --- cpp/src/parquet/file_reader.cc | 5 +++-- cpp/src/parquet/metadata.cc | 7 ++++--- cpp/src/parquet/metadata.h | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 232bca8f82f..7b4df85e0a1 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -301,8 +301,9 @@ class SerializedFile : public ParquetFileReader::Contents { } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify(encryptor, - metadata_buffer->data() + read_metadata_len)) { + if (!file_metadata_->verify_signature(encryptor, + metadata_buffer->data() + + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 6d664ed3f0a..f33463e52ca 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -447,7 +447,8 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify(std::shared_ptr encryptor, const void* tail) { + bool verify_signature(std::shared_ptr encryptor, + const void* tail) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -620,9 +621,9 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryptor, +bool FileMetaData::verify_signature(std::shared_ptr encryptor, const void* tail) { - return impl_->verify(encryptor, tail); + return impl_->verify_signature(encryptor, tail); } uint32_t FileMetaData::size() const { return impl_->size(); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 13d2161c7d0..465afb7a9d5 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -202,7 +202,7 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); - bool verify(std::shared_ptr encryptor, const void* tail); + bool verify_signature(std::shared_ptr encryptor, const void* tail); // file metadata uint32_t size() const; int num_columns() const; From 4c1cfb3ba2f9f31c1385b08ab0d5fec065ad03e8 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:53:58 +0300 Subject: [PATCH 076/125] Add comments to void WriteTo --- cpp/src/parquet/metadata.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index f33463e52ca..9d0cf5e6c04 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -495,6 +495,8 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; + 
// Only in encrypted files with plaintext footers the + // encryption_algorithm is set in footer if (is_encryption_algorithm_set()) { uint8_t* serialized_data; uint32_t serialized_len; @@ -513,7 +515,8 @@ class FileMetaData::FileMetaDataImpl { // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); - } else { + } else { // either plaintext file (when encryptor is null) + // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); } } From 1441ca08275d340cd97422d57051540f8f07854f Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:57:19 +0300 Subject: [PATCH 077/125] Add additional comment in void WriteTo --- cpp/src/parquet/metadata.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9d0cf5e6c04..95136e8a1ea 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -510,9 +510,8 @@ class FileMetaData::FileMetaDataImpl { // write unencrypted footer dst->Write(serialized_data, serialized_len); - // write nonce + // Write signature (nonce and tag) dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); - // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); } else { // either plaintext file (when encryptor is null) From b219b8d33372469dba8f09a4f7924cdc57d1ee40 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:08:26 +0300 Subject: [PATCH 078/125] Rename file_encryption to file_encryption_properties in WriterProperties --- cpp/src/parquet/file_writer.cc | 4 ++-- cpp/src/parquet/metadata.cc | 18 +++++++++--------- cpp/src/parquet/properties.h | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 
fae26f11b71..c9f8bb3855c 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -283,7 +283,7 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - auto file_encryption_properties = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); @@ -387,7 +387,7 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr file_encryptor_; void StartFile() { - auto file_encryption_properties = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 95136e8a1ea..e7c42911807 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -937,8 +937,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - // TODO: check file_encryption() is null or not - auto footer_key = properties_->file_encryption()->footer_encryption_key(); + // TODO: check file_encryption_properties() is null or not + auto footer_key = properties_->file_encryption_properties()->footer_encryption_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key @@ -1193,8 +1193,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->file_encryption() != nullptr && - props->file_encryption()->footer_signing_key().empty()) { + if 
(props->file_encryption_properties() != nullptr && + props->file_encryption_properties()->footer_signing_key().empty()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1275,14 +1275,14 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return nullptr; } - auto file_encryption = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->algorithm())); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); std::string key_metadata; - if (file_encryption->encrypted_footer()) - key_metadata = file_encryption->footer_encryption_key_metadata(); + if (file_encryption_properties->encrypted_footer()) + key_metadata = file_encryption_properties->footer_encryption_key_metadata(); else - key_metadata = file_encryption->footer_signing_key_metadata(); + key_metadata = file_encryption_properties->footer_signing_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index aee4dbcb2f9..025a5d4c1e9 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -281,8 +281,8 @@ class PARQUET_EXPORT WriterProperties { } Builder* encryption( - const std::shared_ptr& file_encryption) { - file_encryption_ = file_encryption; + const std::shared_ptr& file_encryption_properties) { + file_encryption_properties_ = file_encryption_properties; return this; } @@ -333,7 +333,7 @@ class PARQUET_EXPORT WriterProperties { return std::shared_ptr(new WriterProperties( pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, - pagesize_, version_, created_by_, std::move(file_encryption_), + pagesize_, version_, created_by_, std::move(file_encryption_properties_), default_column_properties_, column_properties)); } @@ -345,7 +345,7 @@ class PARQUET_EXPORT 
WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; - std::shared_ptr file_encryption_; + std::shared_ptr file_encryption_properties_; // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -369,8 +369,8 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline FileEncryptionProperties* file_encryption() const { - return file_encryption_.get(); + inline FileEncryptionProperties* file_encryption_properties() const { + return file_encryption_properties_.get(); } inline Encoding::type dictionary_index_encoding() const { @@ -418,8 +418,8 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr column_encryption_props( const std::shared_ptr& path) const { - if (file_encryption_) { - return file_encryption_->column_properties(path); + if (file_encryption_properties_) { + return file_encryption_properties_->column_properties(path); } else { return NULLPTR; } @@ -430,7 +430,7 @@ class PARQUET_EXPORT WriterProperties { ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, - std::shared_ptr file_encryption, + std::shared_ptr file_encryption_properties, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -440,7 +440,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), - file_encryption_(file_encryption), + file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -451,7 +451,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - std::shared_ptr 
file_encryption_; + std::shared_ptr file_encryption_properties_; ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; From 88687ad29258aba2848c4204dac66876e066c523 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:13:26 +0300 Subject: [PATCH 079/125] Use encrypted_footer instead of footer_signing_key when checking for encrypted footer --- cpp/src/parquet/metadata.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e7c42911807..32d0f487b8d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1194,7 +1194,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption_properties() != nullptr && - props->file_encryption_properties()->footer_signing_key().empty()) { + props->file_encryption_properties()->encrypted_footer()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From 65ab86481ef5e1e2b3fa5cbce79f02c51f644277 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:18:24 +0300 Subject: [PATCH 080/125] Rename column_encryption_props to column_encryption_properties --- cpp/src/parquet/metadata.cc | 2 +- cpp/src/parquet/properties.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 32d0f487b8d..b94a01653e6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -912,7 +912,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor) { ThriftSerializer serializer; - const auto& encrypt_md = properties_->column_encryption_props(column_->path()); + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column 
is unencrypted if (!encrypt_md || !encrypt_md->is_encrypted()) { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 025a5d4c1e9..82e58bfd20b 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -416,7 +416,7 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props( + std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { return file_encryption_properties_->column_properties(path); From 49ff22b861cd355e1d660ced4079e13c2839fecd Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:58:43 +0300 Subject: [PATCH 081/125] Add comments in thrift.h --- cpp/src/parquet/thrift.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 1e2068c9646..10b0729d964 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,7 @@ template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (decryptor == NULLPTR) { + if (decryptor == NULLPTR) { // thrift message is not encrypted // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -216,7 +216,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; - } else { + } else { // thrift message is encrypted uint32_t clen; if (shouldReadLength) { // first 4 bytes for length @@ -275,10 +275,10 @@ class ThriftSerializer { uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryptor == NULLPTR) { + if (encryptor == NULLPTR) { // obj is not encrypted PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); - } else { + } else { // obj is encrypted std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); From aef0d7defa0570d5f6eae74f4d540e7452cf37e5 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:19:20 +0300 Subject: [PATCH 082/125] Change parameters order in ColumnChunkMetaData::Make --- cpp/src/parquet/column_writer-test.cc | 8 ++++---- cpp/src/parquet/metadata.cc | 12 +++++++----- cpp/src/parquet/metadata.h | 8 +++++--- cpp/src/parquet/statistics-test.cc | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index 95593b72d0b..dd0d65aa5cd 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -244,8 +244,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = ColumnChunkMetaData::Make( - metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = + 
ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); return metadata_accessor->is_stats_set(); } @@ -254,8 +254,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = ColumnChunkMetaData::Make( - metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); return {encoded_stats.has_min, encoded_stats.has_max}; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index b94a01653e6..b6d355b7635 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -279,9 +279,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, - int16_t column_ordinal, const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor) { + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor, + int16_t row_group_ordinal, + int16_t column_ordinal) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, file_decryptor)); @@ -386,8 +388,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, writer_version_, - file_decryptor); + writer_version_, file_decryptor, + row_group_ordinal, (int16_t)i); } private: diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h 
index 465afb7a9d5..02f0e613fd5 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -125,9 +125,11 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR); + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR, + InternalFileDecryptor* file_decryptor = NULLPTR, + int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1); ~ColumnChunkMetaData(); diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index a218accb017..fa1caa96d31 100644 --- a/cpp/src/parquet/statistics-test.cc +++ b/cpp/src/parquet/statistics-test.cc @@ -519,7 +519,7 @@ void AssertStatsSet(const ApplicationVersion& version, const ColumnDescriptor* column, bool expected_is_set) { auto metadata_builder = ColumnChunkMetaDataBuilder::Make(props, column); auto column_chunk = - ColumnChunkMetaData::Make(metadata_builder->contents(), column, -1, -1, &version); + ColumnChunkMetaData::Make(metadata_builder->contents(), column, &version); EncodedStatistics stats; stats.set_is_signed(false); metadata_builder->SetStatistics(stats); From b4debc4bbd54408b56ee99598e54e52133115532 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:33:23 +0300 Subject: [PATCH 083/125] Change parameters order in PageReader::Open --- cpp/src/parquet/column_reader.cc | 8 ++++---- cpp/src/parquet/column_reader.h | 5 +++-- cpp/src/parquet/file_reader.cc | 14 ++++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f7a1711dc37..2c37f97a480 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ 
-348,12 +348,12 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, ::arrow::MemoryPool* pool, + Compression::type codec, ::arrow::MemoryPool* pool, + bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, - row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); + new SerializedPageReader(stream, total_num_rows, codec, pool, column_has_dictionary, + row_group_ordinal, column_ordinal, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index f6abd435858..5472a28808e 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -82,9 +82,10 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary = false, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool column_has_dictionary = false, + int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, std::shared_ptr meta_decryptor = NULLPTR, std::shared_ptr data_decryptor = NULLPTR); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 7b4df85e0a1..4aa785393d7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -136,8 +136,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), 
- col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i/* column_ordinal */, properties_.memory_pool()); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i/* column_ordinal */, ); } // The column is encrypted @@ -148,8 +148,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - properties_.memory_pool(), meta_decryptor, data_decryptor); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i, meta_decryptor, + data_decryptor); } // The column is encrypted with its own key @@ -163,8 +164,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - properties_.memory_pool(), meta_decryptor, data_decryptor); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i, meta_decryptor, + data_decryptor); } private: From 11d696554304e42710f930355aa835ad30653045 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:57:56 +0300 Subject: [PATCH 084/125] Remove footer_encryption_key and footer_signing_key --- cpp/src/parquet/encryption.h | 18 ++++-------------- cpp/src/parquet/file_writer.cc | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 20 +++++++++++++------- cpp/src/parquet/metadata.cc | 10 +++------- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index e1f66b7fa8e..b0ed3cc608c 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -480,20 +480,12 @@ class PARQUET_EXPORT FileEncryptionProperties { const 
EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : empty_string_); + const std::string& footer_key() { + return footer_key_; } - const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : empty_string_); - } - - const std::string& footer_signing_key() { - return (encrypted_footer_ ? empty_string_ : footer_key_); - } - - const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? empty_string_ : footer_key_metadata_); + const std::string& footer_key_metadata() { + return footer_key_metadata_; } const std::string& file_aad() const { return file_aad_; } @@ -508,8 +500,6 @@ class PARQUET_EXPORT FileEncryptionProperties { bool encrypted_footer_; std::string file_aad_; - const std::string empty_string_ = ""; - std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c9f8bb3855c..e88e766f379 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -314,7 +314,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( &signing_encryption, - file_encryption_properties->footer_signing_key_metadata()); + file_encryption_properties->footer_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index ac4b6aee388..e39ecd95011 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -62,9 +62,14 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { if (footer_encryptor_ != NULLPTR) { return footer_encryptor_; 
} + + if (!properties_->encrypted_footer()) { + throw ParquetException("Requesting footer encryptor in file " + "with unencrypted footer"); + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); - std::string footer_key = properties_->footer_encryption_key(); + std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( aes_encryptor, footer_key, properties_->file_aad(), footer_aad); @@ -76,9 +81,14 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { if (footer_signing_encryptor_ != NULLPTR) { return footer_signing_encryptor_; } + + if (properties_->encrypted_footer()) { + throw ParquetException("Requesting signing footer encryptor in file " + "with encrypted footer"); + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); - std::string footer_signing_key = properties_->footer_signing_key(); + std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); @@ -116,11 +126,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::string key; if (column_prop->is_encrypted_with_footer_key()) { - if (properties_->encrypted_footer()) { - key = properties_->footer_encryption_key(); - } else { - key = properties_->footer_signing_key(); - } + key = properties_->footer_key(); } else { key = column_prop->key(); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index b6d355b7635..1182d2b0a32 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -939,8 +939,8 @@ class 
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - // TODO: check file_encryption_properties() is null or not - auto footer_key = properties_->file_encryption_properties()->footer_encryption_key(); + DCHECK(properties_->file_encryption_properties()); + auto footer_key = properties_->file_encryption_properties()->footer_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key @@ -1280,11 +1280,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption_properties = properties_->file_encryption_properties(); crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); - std::string key_metadata; - if (file_encryption_properties->encrypted_footer()) - key_metadata = file_encryption_properties->footer_encryption_key_metadata(); - else - key_metadata = file_encryption_properties->footer_signing_key_metadata(); + std::string key_metadata = file_encryption_properties->footer_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); From b4efaac38c0a5996acc213a6fc9851f414609548 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 14:28:03 +0300 Subject: [PATCH 085/125] Remove ParquetException in GetFooterSigningEncryptor and GetFooterEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index e39ecd95011..479da97ba34 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -63,10 +63,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { return footer_encryptor_; } - if (!properties_->encrypted_footer()) { - throw ParquetException("Requesting footer encryptor in file " - "with unencrypted footer"); - } ParquetCipher::type algorithm = 
properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_key(); @@ -82,10 +78,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { return footer_signing_encryptor_; } - if (properties_->encrypted_footer()) { - throw ParquetException("Requesting signing footer encryptor in file " - "with encrypted footer"); - } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); From 9fd10b1bb41c2bbcbc361253d2232e7ce5cb709c Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 15:20:08 +0300 Subject: [PATCH 086/125] make format --- cpp/src/parquet/column_reader.h | 8 +++----- cpp/src/parquet/encryption.h | 8 ++------ cpp/src/parquet/file_reader.cc | 5 ++--- cpp/src/parquet/file_writer.cc | 17 ++++++++--------- cpp/src/parquet/internal_file_decryptor.cc | 12 +++++------- cpp/src/parquet/metadata.cc | 17 ++++++++--------- cpp/src/parquet/metadata.h | 6 +++--- 7 files changed, 31 insertions(+), 42 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 5472a28808e..f3363106bc9 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -82,11 +82,9 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool column_has_dictionary = false, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - std::shared_ptr meta_decryptor = NULLPTR, + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool column_has_dictionary = false, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, std::shared_ptr meta_decryptor = 
NULLPTR, std::shared_ptr data_decryptor = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index b0ed3cc608c..5ad94e99934 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -480,13 +480,9 @@ class PARQUET_EXPORT FileEncryptionProperties { const EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& footer_key() { - return footer_key_; - } + const std::string& footer_key() { return footer_key_; } - const std::string& footer_key_metadata() { - return footer_key_metadata_; - } + const std::string& footer_key_metadata() { return footer_key_metadata_; } const std::string& file_aad() const { return file_aad_; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 4aa785393d7..8cbbb5f7ed5 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -303,9 +303,8 @@ class SerializedFile : public ParquetFileReader::Contents { } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature(encryptor, - metadata_buffer->data() - + read_metadata_len)) { + if (!file_metadata_->verify_signature( + encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index e88e766f379..233e96d3cea 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -285,11 +285,11 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); - if (file_encryption_properties == nullptr) { // plaintext regular file + if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { // Encrypted file - //Encrypted file with encrypted footer + } else { // Encrypted file + // Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -304,7 +304,7 @@ class FileSerializer : public ParquetFileWriter::Contents { static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); - } else { // Encrypted file with plaintext footer + } else { // Encrypted file with plaintext footer EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; @@ -313,8 +313,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, - file_encryption_properties->footer_key_metadata()); + &signing_encryption, file_encryption_properties->footer_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); @@ -438,7 +437,7 @@ std::unique_ptr 
ParquetFileWriter::Open( void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { // plaintext regular file + if (encryptor == nullptr) { // plaintext regular file // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -451,11 +450,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file + } else { // Encrypted file if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } else { // Encrypted file with plaintext footer + } else { // Encrypted file with plaintext footer uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d08b807c9ad..d2f4ea139c9 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -167,15 +167,14 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); std::shared_ptr footer_metadata_decryptor = - std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); std::shared_ptr footer_data_decryptor = - std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); footer_metadata_decryptor_ = footer_metadata_decryptor; footer_data_decryptor_ = footer_data_decryptor; - if (metadata) - return footer_metadata_decryptor; + if (metadata) return footer_metadata_decryptor; 
return footer_data_decryptor; } @@ -232,13 +231,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr metadata_decryptor = std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); std::shared_ptr data_decryptor = - std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); (*column_metadata_map_)[column_path] = metadata_decryptor; (*column_data_map_)[column_path] = data_decryptor; - if (metadata) - return metadata_decryptor; + if (metadata) return metadata_decryptor; return data_decryptor; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1182d2b0a32..97154c38006 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -280,10 +280,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor, - int16_t row_group_ordinal, - int16_t column_ordinal) { + const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor, + int16_t row_group_ordinal, int16_t column_ordinal) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, file_decryptor)); @@ -388,8 +386,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryptor, - row_group_ordinal, (int16_t)i); + writer_version_, file_decryptor, row_group_ordinal, + (int16_t)i); } private: @@ -516,7 +514,7 @@ class FileMetaData::FileMetaDataImpl { dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); - } else { // either plaintext file (when 
encryptor is null) + } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); } @@ -626,7 +624,7 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { } bool FileMetaData::verify_signature(std::shared_ptr encryptor, - const void* tail) { + const void* tail) { return impl_->verify_signature(encryptor, tail); } @@ -1279,7 +1277,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption_properties = properties_->file_encryption_properties(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); + crypto_metadata_->__set_encryption_algorithm( + ToThrift(file_encryption_properties->algorithm())); std::string key_metadata = file_encryption_properties->footer_key_metadata(); if (!key_metadata.empty()) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 02f0e613fd5..6e66c87e5a2 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -127,8 +127,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR, - int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1); ~ColumnChunkMetaData(); @@ -204,7 +203,8 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); - bool verify_signature(std::shared_ptr encryptor, const void* tail); + bool verify_signature(std::shared_ptr encryptor, + const void* tail); // file metadata uint32_t size() const; int num_columns() const; From 068743cc5712bdea28985dc5e2ec65affcf40c90 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 15:32:03 +0300 Subject: [PATCH 087/125] make format in thrift.h --- cpp/src/parquet/thrift.h | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 10b0729d964..497a0e0785c 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,8 @@ template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (decryptor == NULLPTR) { // thrift message is not encrypted + // thrift message is not encrypted + if (decryptor == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -216,7 +217,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; - } else { // thrift message is encrypted + } else { // thrift message is encrypted uint32_t clen; if (shouldReadLength) { // first 4 bytes for length @@ -275,10 +276,11 @@ class ThriftSerializer { uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryptor == NULLPTR) { // obj is not encrypted + // obj is not encrypted + if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); - } else { // obj is encrypted + } else { // obj is encrypted std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); From cb110401fbd51b68e691eb57328180e8796ba5b1 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Wed, 15 May 2019 09:49:41 +0700 Subject: [PATCH 088/125] fix rebase mistake in parquet.thrift --- cpp/src/parquet/parquet.thrift | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 000b74dde1c..b6795e3d487 100644 --- a/cpp/src/parquet/parquet.thrift +++ 
b/cpp/src/parquet/parquet.thrift @@ -570,7 +570,7 @@ struct PageHeader { /** Uncompressed page size in bytes (not including this header) **/ 2: required i32 uncompressed_page_size - /** Compressed page size in bytes (not including this header) **/ + /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ 3: required i32 compressed_page_size /** 32bit crc for the data below. This allows for disabling checksumming in HDFS @@ -874,7 +874,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -955,6 +955,7 @@ struct FileMetaData { 9: optional binary footer_signing_key_metadata } +/** Crypto metadata for files with encrypted footer **/ struct FileCryptoMetaData { /** * Encryption algorithm. This field is only used for files @@ -967,3 +968,5 @@ struct FileCryptoMetaData { * and (possibly) columns **/ 2: optional binary key_metadata } + + From 8ba072c3f35736080bdb0b150f3238be95d136f1 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Sun, 19 May 2019 16:29:12 +0300 Subject: [PATCH 089/125] Fix aad settings in thrift.h --- cpp/src/parquet/thrift.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 497a0e0785c..73b1ab007c5 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -162,17 +162,29 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; - aesGcmV1.__set_aad_prefix(aad.aad_prefix); + // aad_file_unique is always set + aesGcmV1.__isset.aad_file_unique = true; aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__isset.supply_aad_prefix = true; aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if 
(!aad.aad_prefix.empty()) { + aesGcmV1.__isset.aad_prefix = true; + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + } return aesGcmV1; } static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; - aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + // aad_file_unique is always set + aesGcmCtrV1.__isset.aad_file_unique = true; aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__isset.supply_aad_prefix = true; aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmCtrV1.__isset.aad_prefix = true; + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + } return aesGcmCtrV1; } From 04507d76bcfe1d1b4573a735b216a0009e8d277a Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 09:18:28 +0300 Subject: [PATCH 090/125] Port key erasure mechanism --- cpp/src/parquet/encryption.cc | 13 +++ cpp/src/parquet/encryption.h | 101 +++++++++++++++++++-- cpp/src/parquet/file_reader.cc | 11 ++- cpp/src/parquet/file_writer.cc | 3 + cpp/src/parquet/internal_file_decryptor.cc | 41 ++++++--- cpp/src/parquet/internal_file_decryptor.h | 4 + cpp/src/parquet/internal_file_encryptor.cc | 35 ++++--- cpp/src/parquet/internal_file_encryptor.h | 4 + 8 files changed, 177 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index c53de674687..9666e9ae2bb 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -65,6 +65,12 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata) : column_path_(column_path) { + // column encryption properties object (with a column key) can be used for writing only + // one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). 
+ utilized_ = false; + DCHECK(column_path != nullptr); if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); @@ -87,6 +93,7 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( ColumnDecryptionProperties::ColumnDecryptionProperties( const std::shared_ptr& column_path, const std::string& key) : column_path_(column_path) { + utilized_ = false; DCHECK(column_path != nullptr); if (!key.empty()) { @@ -133,6 +140,7 @@ FileDecryptionProperties::FileDecryptionProperties( aad_prefix_ = aad_prefix; column_properties_ = column_properties; plaintext_files_allowed_ = plaintext_files_allowed; + utilized_ = false; } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( @@ -176,6 +184,11 @@ FileEncryptionProperties::FileEncryptionProperties( footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), column_properties_(column_properties) { + // file encryption properties object can be used for writing only one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). + utilized_ = false; + DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 5ad94e99934..05f8da611b9 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/util/logging.h" #include "parquet/encryption.h" @@ -101,11 +102,14 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. 
- Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; + // The key is cloned, and will be wiped out (array values set to 0) upon completion of + // file reading. + // Caller is responsible for wiping out the input key array. + Builder* key(std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; return this; } @@ -143,6 +147,19 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } const std::string& key() const { return key_; } const std::string& key_metadata() const { return key_metadata_; } + void wipeout_encryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } + } + + bool is_utilized() { + if (key_.empty()) + return false; // can re-use column properties without encryption keys + return utilized_; + } + + void set_utilized() { utilized_ = true; } ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; @@ -154,6 +171,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; + bool utilized_; explicit ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata); @@ -198,10 +216,20 @@ class PARQUET_EXPORT ColumnDecryptionProperties { const std::shared_ptr& column_path() { return column_path_; } const std::string& key() const { return key_; } + bool is_utilized() { return utilized_; } + + void set_utilized() { utilized_ = true; } + + void wipeout_decryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } + } private: const std::shared_ptr column_path_; std::string key_; + bool utilized_; // This class is only required for setting explicit column decryption 
keys - // to override key retriever (or to provide keys when key metadata and/or @@ -235,12 +263,15 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty()) { + // The key is cloned, and will be wiped out (array values set to 0) upon completion of + // file reading. + // Caller is responsible for wiping out the input key array. + Builder* footer_key(const std::string column_key) { + if (column_key.empty()) { return this; } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; + DCHECK(footer_key_.empty()); + footer_key_ = column_key; return this; } @@ -259,6 +290,15 @@ class PARQUET_EXPORT FileDecryptionProperties { if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + column_properties_ = column_properties; return this; } @@ -357,6 +397,26 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } + void wipeout_decryption_keys() { + if (!footer_key_.empty()) + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_decryption_key(); + } + } + + bool is_utilized() { + if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + return false; + + return utilized_; + } + + void set_utilized() { utilized_ = true; } + private: std::string footer_key_; std::string aad_prefix_; @@ -371,6 +431,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; bool 
plaintext_files_allowed_; + bool utilized_; FileDecryptionProperties( const std::string& footer_key, @@ -453,6 +514,14 @@ class PARQUET_EXPORT FileEncryptionProperties { if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } column_properties_ = column_properties; return this; } @@ -489,12 +558,26 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr column_properties( const std::shared_ptr& column_path); + bool is_utilized() { return utilized_; } + + void set_utilized() { utilized_ = true; } + + void wipeout_encryption_keys() { + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_encryption_key(); + } + } + private: EncryptionAlgorithm algorithm_; std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_aad_; + bool utilized_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8cbbb5f7ed5..48272213f59 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -190,7 +190,16 @@ class SerializedFile : public ParquetFileReader::Contents { const ReaderProperties& props = default_reader_properties()) : source_(source), properties_(props) {} - void Close() override {} + ~SerializedFile() override { + try { + Close(); + } catch (...) 
{ + } + } + + void Close() override { + if (file_decryptor_) file_decryptor_->wipeout_decryption_keys(); + } std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 233e96d3cea..fd198b9536e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -318,6 +318,9 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } + if (file_encryptor_) { + file_encryptor_->wipeout_encryption_keys(); + } } sink_->Close(); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d2f4ea139c9..b112896c8fc 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -35,7 +35,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& aad) : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( - algorithm, static_cast(key_.size()), true)); + algorithm, static_cast(key_.size()), true, NULLPTR)); } int FooterSigningEncryptor::CiphertextSizeDelta() { @@ -74,6 +74,14 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { + if (properties_->is_utilized()) { + throw ParquetException( + "Re-using decryption properties with explicit keys for another file"); + } + properties_->set_utilized(); + + all_decryptors_ = std::shared_ptr>( + new std::list); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -87,6 +95,13 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie schema::ColumnPath::CmpColumnPath>()); } +void 
InternalFileDecryptor::wipeout_decryption_keys() { + properties_->wipeout_decryption_keys(); + for (auto const& i : *all_decryptors_) { + i->WipeOut(); + } +} + std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; @@ -245,20 +260,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); } @@ -270,20 +285,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + 
data_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 1e0a587b6f9..fd13a6f289f 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -18,6 +18,7 @@ #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H +#include #include #include #include @@ -85,6 +86,8 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } + void wipeout_decryption_keys(); + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); @@ -113,6 +116,7 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; + std::shared_ptr> all_decryptors_; std::unique_ptr meta_decryptor_128_; std::unique_ptr meta_decryptor_196_; diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 479da97ba34..99f18b38fbd 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -45,6 +45,9 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { + all_encryptors_ = std::shared_ptr>( + new std::list); + column_data_map_ = std::shared_ptr< 
std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -58,6 +61,14 @@ InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* propertie schema::ColumnPath::CmpColumnPath>()); } +void InternalFileEncryptor::wipeout_encryption_keys() { + properties_->wipeout_encryption_keys(); + + for (auto const& i : *all_encryptors_) { + i->WipeOut(); + } +} + std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { if (footer_encryptor_ != NULLPTR) { return footer_encryptor_; @@ -143,20 +154,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); } @@ -168,20 +179,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } return 
data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 7fcb943735c..40d00bb5778 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,6 +18,7 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include #include #include #include @@ -59,6 +60,7 @@ class InternalFileEncryptor { const std::shared_ptr& column_path); std::shared_ptr GetColumnDataEncryptor( const std::shared_ptr& column_path); + void wipeout_encryption_keys(); private: FileEncryptionProperties* properties_; @@ -75,6 +77,8 @@ class InternalFileEncryptor { std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; + std::shared_ptr> all_encryptors_; + std::unique_ptr meta_encryptor_128_; std::unique_ptr meta_encryptor_196_; std::unique_ptr meta_encryptor_256_; From 7636dd4035d845bd56ab584bfaa62624b9fea302 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 10:18:24 +0300 Subject: [PATCH 091/125] Fix columnMetaData --- cpp/src/parquet/internal_file_encryptor.h | 14 +++++ cpp/src/parquet/metadata.cc | 62 ++++++++++++++--------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 40d00bb5778..7b382de461e 
100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -23,6 +23,7 @@ #include #include +#include "parquet/encryption.h" #include "parquet/schema.h" namespace parquet_encryption { @@ -32,6 +33,7 @@ class AesEncryptor; namespace parquet { class FileEncryptionProperties; +class ColumnEncryptionProperties; class Encryptor { public: @@ -43,6 +45,18 @@ class Encryptor { int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + bool encryptColumnMetaData( + bool encrypted_footer, + const std::shared_ptr& column_encryption_properties) { + // if column is not encrypted then do not encrypt the column metadata + if (!column_encryption_properties || !column_encryption_properties->is_encrypted()) + return false; + // if plaintext footer then encrypt the column metadata + if (!encrypted_footer) return true; + // if column is not encrypted with footer key then encrypt the column metadata + return !column_encryption_properties->is_encrypted_with_footer_key(); + } + private: parquet_encryption::AesEncryptor* aes_encryptor_; std::string key_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 97154c38006..9eef469a7e9 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -912,20 +912,21 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor) { ThriftSerializer serializer; - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->is_encrypted()) { + if (encryptor == NULLPTR) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); serializer.Serialize(column_chunk_, sink); } else { // column is encrypted + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + bool encrypt_metadata = 
encryptor->encryptColumnMetaData( + properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); column_chunk_->__isset.crypto_metadata = true; - - // encrypted with footer key format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { + // encrypted with footer key ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key @@ -937,17 +938,15 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - DCHECK(properties_->file_encryption_properties()); - auto footer_key = properties_->file_encryption_properties()->footer_key(); - - // non-uniform: footer is unencrypted, or column is encrypted with a column-specific - // key - if ((footer_key.empty() && encrypt_md->is_encrypted()) || - !encrypt_md->is_encrypted_with_footer_key()) { + if (!encrypt_metadata) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + } else { // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; uint32_t serialized_len; + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); @@ -961,25 +960,40 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers - if (footer_key.empty()) { - format::ColumnMetaData metadata_redacted = column_metadata_; - if (metadata_redacted.__isset.statistics) { - metadata_redacted.__isset.statistics = false; + if (!properties_->file_encryption_properties()->encrypted_footer()) { + // metadata_redacted should be stripped of the column_metadata_ statistics. 
+ format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_metadata_.type); + metadata_redacted.__set_encodings(column_metadata_.encodings); + metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); + metadata_redacted.__set_codec(column_metadata_.codec); + metadata_redacted.__set_num_values(column_metadata_.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_metadata_.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_metadata_.total_compressed_size); + if (column_metadata_.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata( + column_metadata_.key_value_metadata); + } + metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); + if (column_metadata_.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); } - if (metadata_redacted.__isset.encoding_stats) { - metadata_redacted.__isset.encoding_stats = false; + if (column_metadata_.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_metadata_.dictionary_page_offset); } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(metadata_redacted); - } else { - // don't set meta_data - column_chunk_->__isset.meta_data = true; } - } else { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); } - serializer.Serialize(column_chunk_, sink); } } From ca2d8a6a0a00e5cf0b460e4b2efaad2c9247d568 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 10:49:33 +0300 Subject: [PATCH 092/125] Minor fixes to previous code --- cpp/src/parquet/encryption.h | 17 
++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 05f8da611b9..90d9361b2b8 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -103,7 +103,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file reading. + // file writing. // Caller is responsible for wiping out the input key array. Builder* key(std::string column_key) { if (column_key.empty()) return this; @@ -262,16 +262,19 @@ class PARQUET_EXPORT FileDecryptionProperties { // will be decrypted/verified with this key. // If explicit key is not set, footer key will be fetched from // key retriever. - // param footerKey Key length must be either 16, 24 or 32 bytes. - // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file reading. + // With explicit keys or AAD prefix, new encryption properties object must be created + // for each encrypted file. + // Explicit encryption keys (footer and column) are cloned. + // Upon completion of file reading, the cloned encryption keys in the properties will + // be wiped out (array values set to 0). // Caller is responsible for wiping out the input key array. - Builder* footer_key(const std::string column_key) { - if (column_key.empty()) { + // param footerKey Key length must be either 16, 24 or 32 bytes. 
+ Builder* footer_key(const std::string footer_key) { + if (footer_key.empty()) { return this; } DCHECK(footer_key_.empty()); - footer_key_ = column_key; + footer_key_ = footer_key; return this; } From 56204abfbce8ead583b5afae8c33f65ad840533d Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Fri, 17 May 2019 17:29:11 +0700 Subject: [PATCH 093/125] fix build issue on MacOS --- cpp/src/parquet/column_reader.cc | 3 ++- cpp/src/parquet/column_writer.cc | 4 ++-- cpp/src/parquet/encryption.h | 1 + cpp/src/parquet/internal_file_decryptor.cc | 2 +- cpp/src/parquet/internal_file_decryptor.h | 1 - 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 2c37f97a480..b7f64b2b46d 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -273,7 +273,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Decrypt it if we need to if (data_decryptor_ != nullptr) { - decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); + PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( + compressed_len - data_decryptor_->CiphertextSizeDelta())); compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, decryption_buffer_->mutable_data()); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 1e963187559..dfe30c17848 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -280,8 +280,8 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_encryptor_->update_aad(data_pageAAD_); - encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + - output_data_len); + PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( + data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, 
encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 90d9361b2b8..89541f75c26 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -246,6 +246,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // In a data set, AAD Prefixes should be collected, // and then checked for missing files. virtual void check(const std::string& aad_prefix) = 0; + virtual ~AADPrefixVerifier() {} }; class PARQUET_EXPORT FileDecryptionProperties { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index b112896c8fc..11e44b7415d 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -33,7 +33,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, const std::string& file_aad, const std::string& aad) - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { + : key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( algorithm, static_cast(key_.size()), true, NULLPTR)); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index fd13a6f289f..4a8e0e7b0d4 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -43,7 +43,6 @@ class FooterSigningEncryptor { uint8_t* encrypted_footer); private: - ParquetCipher::type algorithm_; std::string key_; std::string file_aad_; std::string aad_; From 595716f580954eb49e5f536bb470b822ee96ba6b Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 30 May 2019 10:32:16 +0700 Subject: [PATCH 094/125] apply change from crypto package --- cpp/src/parquet/CMakeLists.txt | 4 +-- cpp/src/parquet/column_reader.cc | 23 +++++++-------- cpp/src/parquet/column_writer.cc | 22 +++++++------- cpp/src/parquet/file_reader.cc | 2 +- 
cpp/src/parquet/file_writer.cc | 3 +- cpp/src/parquet/internal_file_decryptor.cc | 30 +++++++++---------- cpp/src/parquet/internal_file_decryptor.h | 34 +++++++++++----------- cpp/src/parquet/internal_file_encryptor.cc | 28 +++++++++--------- cpp/src/parquet/internal_file_encryptor.h | 31 ++++++++++---------- cpp/src/parquet/metadata.cc | 16 +++++----- cpp/src/parquet/thrift.h | 4 ++- 11 files changed, 99 insertions(+), 98 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9452122b522..f8d60432fa6 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -167,6 +167,7 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc + encryption_internal.cc internal_file_decryptor.cc internal_file_encryptor.cc file_reader.cc @@ -180,8 +181,7 @@ set(PARQUET_SRCS properties.cc schema.cc statistics.cc - types.cc - util/crypto.cc) + types.cc) # Ensure that thrift compilation is done before using its generated headers # in parquet code. 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index b7f64b2b46d..b00b9e99195 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -32,13 +32,12 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" -#include "parquet/util/crypto.h" - using arrow::MemoryPool; namespace parquet { @@ -133,14 +132,14 @@ class SerializedPageReader : public PageReader { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); // prepare the AAD for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, + data_pageAAD_ = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { DCHECK(!meta_decryptor_->file_aad().empty()); - data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); } } @@ -218,12 +217,12 @@ std::shared_ptr SerializedPageReader::NextPage() { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { std::string dictionary_page_header_aad; - dictionary_page_header_aad = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + dictionary_page_header_aad = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); meta_decryptor_->update_aad(dictionary_page_header_aad); } else { - 
parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); meta_decryptor_->update_aad(data_page_headerAAD_); } } @@ -251,12 +250,12 @@ std::shared_ptr SerializedPageReader::NextPage() { DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary) { std::string dictionary_page_aad; - dictionary_page_aad = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, + dictionary_page_aad = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); data_decryptor_->update_aad(dictionary_page_aad); } else { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); data_decryptor_->update_aad(data_pageAAD_); } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index dfe30c17848..e35786f3781 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -31,6 +31,7 @@ #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" @@ -38,7 +39,6 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" -#include "parquet/util/crypto.h" namespace parquet { @@ -149,13 +149,13 @@ class SerializedPageWriter : public PageWriter { data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { // prepare the add for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, + data_pageAAD_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if 
(meta_encryptor_ != NULLPTR) { - data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); @@ -210,8 +210,8 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + meta_encryptor_->update_aad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } int64_t header_size = @@ -232,8 +232,8 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); if (meta_encryptor_ != nullptr) { - meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, + meta_encryptor_->update_aad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk @@ -278,7 +278,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); data_encryptor_->update_aad(data_pageAAD_); PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( data_encryptor_->CiphertextSizeDelta() + output_data_len)); @@ -301,7 +301,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + 
encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); meta_encryptor_->update_aad(data_page_headerAAD_); } int64_t header_size = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 48272213f59..b66c90ac0ca 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -33,6 +33,7 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" +#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/file_writer.h" #include "parquet/internal_file_decryptor.h" @@ -41,7 +42,6 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/crypto.h" #include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index fd198b9536e..ba6f69543ac 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -23,10 +23,9 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" #include "parquet/platform.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" -#include "parquet/util/crypto.h" -#include "parquet/util/memory.h" using arrow::MemoryPool; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 11e44b7415d..b2232fa80c6 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -17,7 +17,7 @@ #include "parquet/internal_file_decryptor.h" #include "parquet/encryption.h" -#include "parquet/util/crypto.h" +#include "parquet/encryption_internal.h" namespace parquet { @@ -34,7 +34,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& file_aad, const std::string& aad) : key_(key), file_aad_(file_aad), aad_(aad) { - aes_encryptor_.reset(new parquet_encryption::AesEncryptor( 
+ aes_encryptor_.reset(encryption::AesEncryptor::Make( algorithm, static_cast(key_.size()), true, NULLPTR)); } @@ -51,7 +51,7 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int foote } // Decryptor -Decryptor::Decryptor(parquet_encryption::AesDecryptor* aes_decryptor, +Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, const std::string& file_aad, const std::string& aad) : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} @@ -80,8 +80,8 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie } properties_->set_utilized(); - all_decryptors_ = std::shared_ptr>( - new std::list); + all_decryptors_ = std::shared_ptr>( + new std::vector); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -126,7 +126,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { "plaintext footer metadata"); } - std::string aad = parquet_encryption::createFooterAAD(file_aad_); + std::string aad = encryption::CreateFooterAad(file_aad_); footer_signing_encryptor_ = std::make_shared(algorithm_, footer_key, file_aad_, aad); @@ -134,7 +134,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { } std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { - std::string aad = parquet_encryption::createFooterAAD(file_aad_); + std::string aad = encryption::CreateFooterAad(file_aad_); return GetFooterDecryptor(aad, true); } @@ -255,24 +255,24 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( return data_decryptor; } -parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( +encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + meta_decryptor_128_.reset(encryption::AesDecryptor::Make( algorithm_, 
key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + meta_decryptor_196_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + meta_decryptor_256_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); @@ -280,24 +280,24 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( +encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_128_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_196_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_256_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 4a8e0e7b0d4..3013183bf94 100644 --- 
a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -18,19 +18,19 @@ #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H -#include #include #include #include +#include #include "parquet/schema.h" -namespace parquet_encryption { +namespace parquet { + +namespace encryption { class AesDecryptor; class AesEncryptor; -} // namespace parquet_encryption - -namespace parquet { +} // namespace encryption class FileDecryptionProperties; @@ -47,12 +47,12 @@ class FooterSigningEncryptor { std::string file_aad_; std::string aad_; - std::shared_ptr aes_encryptor_; + std::shared_ptr aes_encryptor_; }; class Decryptor { public: - Decryptor(parquet_encryption::AesDecryptor* decryptor, const std::string& key, + Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } @@ -62,7 +62,7 @@ class Decryptor { int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); private: - parquet_encryption::AesDecryptor* aes_decryptor_; + encryption::AesDecryptor* aes_decryptor_; std::string key_; std::string file_aad_; std::string aad_; @@ -115,14 +115,14 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; - std::shared_ptr> all_decryptors_; + std::shared_ptr> all_decryptors_; - std::unique_ptr meta_decryptor_128_; - std::unique_ptr meta_decryptor_196_; - std::unique_ptr meta_decryptor_256_; - std::unique_ptr data_decryptor_128_; - std::unique_ptr data_decryptor_196_; - std::unique_ptr data_decryptor_256_; + std::unique_ptr meta_decryptor_128_; + std::unique_ptr meta_decryptor_196_; + std::unique_ptr meta_decryptor_256_; + std::unique_ptr data_decryptor_128_; + std::unique_ptr data_decryptor_196_; + std::unique_ptr data_decryptor_256_; std::shared_ptr GetFooterDecryptor(const std::string& aad, bool 
metadata); std::shared_ptr GetColumnDecryptor( @@ -130,8 +130,8 @@ class InternalFileDecryptor { const std::string& column_key_metadata, const std::string& aad, bool metadata = false); - parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); - parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 99f18b38fbd..dabc804b0ad 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -17,7 +17,7 @@ #include "parquet/internal_file_encryptor.h" #include "parquet/encryption.h" -#include "parquet/util/crypto.h" +#include "parquet/encryption_internal.h" namespace parquet { @@ -29,7 +29,7 @@ static inline uint8_t* str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad) : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} @@ -45,8 +45,8 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - all_encryptors_ = std::shared_ptr>( - new std::list); + all_encryptors_ = std::shared_ptr>( + new std::vector); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, @@ -75,7 +75,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = 
encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( @@ -90,7 +90,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( @@ -149,24 +149,24 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( return encryptor; } -parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( +encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_128_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_196_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_256_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); @@ -174,24 +174,24 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( throw 
ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( +encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_128_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_196_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_256_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 7b382de461e..b10367e45d0 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,26 +18,27 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H -#include #include #include #include +#include #include "parquet/encryption.h" #include "parquet/schema.h" -namespace parquet_encryption { -class AesEncryptor; -} namespace parquet { +namespace encryption { +class AesEncryptor; +} // namespace encryption + class FileEncryptionProperties; class ColumnEncryptionProperties; class Encryptor { public: - Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const 
std::string& aad); const std::string& file_aad() { return file_aad_; } void update_aad(const std::string& aad) { aad_ = aad; } @@ -58,7 +59,7 @@ class Encryptor { } private: - parquet_encryption::AesEncryptor* aes_encryptor_; + encryption::AesEncryptor* aes_encryptor_; std::string key_; std::string file_aad_; std::string aad_; @@ -91,21 +92,21 @@ class InternalFileEncryptor { std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; - std::shared_ptr> all_encryptors_; + std::shared_ptr> all_encryptors_; - std::unique_ptr meta_encryptor_128_; - std::unique_ptr meta_encryptor_196_; - std::unique_ptr meta_encryptor_256_; - std::unique_ptr data_encryptor_128_; - std::unique_ptr data_encryptor_196_; - std::unique_ptr data_encryptor_256_; + std::unique_ptr meta_encryptor_128_; + std::unique_ptr meta_encryptor_196_; + std::unique_ptr meta_encryptor_256_; + std::unique_ptr data_encryptor_128_; + std::unique_ptr data_encryptor_196_; + std::unique_ptr data_encryptor_256_; std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); - parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, size_t key_len); - parquet_encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9eef469a7e9..4c6c4f8b1f5 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -185,8 +185,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(file_decryptor != NULLPTR); - std::string aad_column_metadata = parquet_encryption::createModuleAAD( - file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, + std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), 
encryption::kColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, aad_column_metadata); @@ -458,15 +458,15 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); uint8_t* tag = const_cast(reinterpret_cast(tail)) + - parquet_encryption::NonceLength; + encryption::kNonceLength; std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); uint32_t encrypted_len = encryptor->SignedFooterEncrypt( serialized_data, serialized_len, nonce, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - - parquet_encryption::GCMTagLength, - tag, parquet_encryption::GCMTagLength); + encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -511,9 +511,9 @@ class FileMetaData::FileMetaDataImpl { // write unencrypted footer dst->Write(serialized_data, serialized_len); // Write signature (nonce and tag) - dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); - dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, - parquet_encryption::GCMTagLength); + dst->Write(encrypted_data.data() + 4, encryption::kNonceLength); + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 73b1ab007c5..9307ed193fb 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -42,12 +42,14 @@ #include #include "arrow/util/logging.h" +#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/internal_file_decryptor.h" #include 
"parquet/internal_file_encryptor.h" #include "parquet/statistics.h" -#include "parquet/util/crypto.h" +#include "parquet/types.h" +#include "parquet/util/memory.h" #include "parquet/parquet_types.h" // IYWU pragma: export From 7336685a7431dff3a73c81df1532c7916ca19754 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 30 May 2019 13:22:27 +0700 Subject: [PATCH 095/125] format code --- cpp/src/parquet/column_reader.cc | 14 ++++----- cpp/src/parquet/column_writer.cc | 21 +++++++------ cpp/src/parquet/file_reader.cc | 18 +++++------ cpp/src/parquet/internal_file_decryptor.cc | 35 ++++++++++------------ cpp/src/parquet/internal_file_encryptor.cc | 29 +++++++++--------- cpp/src/parquet/internal_file_encryptor.h | 5 ++-- cpp/src/parquet/metadata.cc | 10 +++---- 7 files changed, 63 insertions(+), 69 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index b00b9e99195..5441234b02b 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -132,15 +132,15 @@ class SerializedPageReader : public PageReader { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); // prepare the AAD for quick update later - data_pageAAD_ = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = + encryption::CreateModuleAad(data_decryptor_->file_aad(), encryption::kDataPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { DCHECK(!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDataPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, (int16_t)-1); } } @@ -251,8 +251,8 @@ std::shared_ptr SerializedPageReader::NextPage() { if (current_page_is_dictionary) { std::string 
dictionary_page_aad; dictionary_page_aad = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDictionaryPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1); data_decryptor_->update_aad(dictionary_page_aad); } else { encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e35786f3781..dbf8afdfc06 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -149,14 +149,14 @@ class SerializedPageWriter : public PageWriter { data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { // prepare the add for quick update later - data_pageAAD_ = encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = + encryption::CreateModuleAad(data_encryptor_->file_aad(), encryption::kDataPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDataPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -185,10 +185,9 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { data_encryptor_->update_aad( - parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); 
encrypted_data_buffer = std::static_pointer_cast( AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -233,8 +232,8 @@ class SerializedPageWriter : public PageWriter { fallback); if (meta_encryptor_ != nullptr) { meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kColumnMetaData, - row_group_ordinal_, column_ordinal_, (int16_t)-1)); + meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), meta_encryptor_); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b66c90ac0ca..35d789c701d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -60,9 +60,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -72,9 +72,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } @@ -488,9 +488,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } 
std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) << "The file only has " - << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) + << "The file only has " << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index b2232fa80c6..d46101a819b 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -51,9 +51,8 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int foote } // Decryptor -Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) +Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } @@ -255,50 +254,48 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( return data_decryptor; } -encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( - size_t key_size) { +encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + meta_decryptor_128_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + 
meta_decryptor_196_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + meta_decryptor_256_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( - size_t key_size) { +encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_128_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_196_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_256_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index dabc804b0ad..b558f5c7fb8 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -29,9 +29,8 @@ static inline uint8_t* 
str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } @@ -154,20 +153,20 @@ encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_128_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_196_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_256_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); } @@ -179,20 +178,20 @@ encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, all_encryptors_)); + data_encryptor_128_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return 
data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, all_encryptors_)); + data_encryptor_196_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, all_encryptors_)); + data_encryptor_256_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index b10367e45d0..9fc0227e45f 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -26,7 +26,6 @@ #include "parquet/encryption.h" #include "parquet/schema.h" - namespace parquet { namespace encryption { @@ -105,9 +104,9 @@ class InternalFileEncryptor { const std::shared_ptr& column_path, bool metadata); encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, - size_t key_len); + size_t key_len); encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, - size_t key_len); + size_t key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 4c6c4f8b1f5..1ba108f54d6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -186,8 +186,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(file_decryptor != NULLPTR); std::string aad_column_metadata = encryption::CreateModuleAad( - file_decryptor->file_aad(), encryption::kColumnMetaData, - row_group_ordinal, column_ordinal, (int16_t)-1); + file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); auto decryptor = 
file_decryptor->GetColumnMetaDecryptor(path, key_metadata, aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); @@ -464,9 +464,9 @@ class FileMetaData::FileMetaDataImpl { serialized_len); uint32_t encrypted_len = encryptor->SignedFooterEncrypt( serialized_data, serialized_len, nonce, encrypted_buffer.data()); - return 0 == memcmp(encrypted_buffer.data() + encrypted_len - - encryption::kGcmTagLength, - tag, encryption::kGcmTagLength); + return 0 == + memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); } inline uint32_t size() const { return metadata_len_; } From 22a5c1ec57f3a406bfb3635e860b41cbd083de0c Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 09:29:12 +0700 Subject: [PATCH 096/125] post-rebase change --- cpp/src/parquet/column_reader.cc | 23 +++++++++-------- cpp/src/parquet/column_writer.cc | 36 ++++++++++++--------------- cpp/src/parquet/encryption.h | 1 - cpp/src/parquet/file_reader.cc | 23 +++++++++-------- cpp/src/parquet/file_writer.cc | 42 ++++++++++++++++++++------------ cpp/src/parquet/file_writer.h | 8 ++++-- cpp/src/parquet/metadata.cc | 10 +++++--- cpp/src/parquet/metadata.h | 2 +- cpp/src/parquet/thrift.h | 9 +++---- 9 files changed, 81 insertions(+), 73 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 5441234b02b..bee77118bb1 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -111,8 +111,7 @@ class SerializedPageReader : public PageReader { SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, - ::arrow::MemoryPool* pool, + int16_t column_ordinal, ::arrow::MemoryPool* pool, std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) : stream_(stream), @@ -227,8 +226,7 @@ std::shared_ptr 
SerializedPageReader::NextPage() { } } DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, &current_page_header_, - meta_decryptor_); + &header_size, &current_page_header_, meta_decryptor_); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -274,10 +272,10 @@ std::shared_ptr SerializedPageReader::NextPage() { if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( compressed_len - data_decryptor_->CiphertextSizeDelta())); - compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, + compressed_len = data_decryptor_->Decrypt(page_buffer->data(), compressed_len, decryption_buffer_->mutable_data()); - buffer = decryption_buffer_->data(); + page_buffer = decryption_buffer_; } // Uncompress it if we need to @@ -348,12 +346,13 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool, - bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, - std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { - return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool, column_has_dictionary, - row_group_ordinal, column_ordinal, meta_decryptor, data_decryptor)); + Compression::type codec, ::arrow::MemoryPool* pool, bool column_has_dictionary, + int16_t row_group_ordinal, int16_t column_ordinal, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) { + return std::unique_ptr(new SerializedPageReader( + stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, + column_ordinal, pool, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index dbf8afdfc06..af91a4a2f73 100644 --- a/cpp/src/parquet/column_writer.cc +++
b/cpp/src/parquet/column_writer.cc @@ -128,9 +128,9 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { // and the page metadata. class SerializedPageWriter : public PageWriter { public: - SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, + SerializedPageWriter(const std::shared_ptr& sink, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) @@ -184,12 +184,11 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->update_aad( - encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); - encrypted_data_buffer = std::static_pointer_cast( - AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + data_encryptor_->update_aad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( + pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); @@ -356,19 +355,17 @@ class SerializedPageWriter : public PageWriter { // This implementation of the PageWriter writes to the final sink on Close . 
class BufferedPageWriter : public PageWriter { public: - BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t current_column_ordinal, + BufferedPageWriter(const std::shared_ptr& sink, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t current_column_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) - : final_sink_(sink), - metadata_(metadata) { + : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, - row_group_ordinal, current_column_ordinal, pool, - meta_encryptor, data_encryptor)); + pager_ = std::unique_ptr(new SerializedPageWriter( + in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, + meta_encryptor, data_encryptor)); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -413,8 +410,7 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, - bool buffered_row_group, + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group, std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor) { if (buffered_row_group) { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 89541f75c26..3d4ec2163b0 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -29,7 +29,6 @@ #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/visibility.h" namespace parquet { diff --git a/cpp/src/parquet/file_reader.cc 
b/cpp/src/parquet/file_reader.cc index 35d789c701d..94ca6985364 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -42,7 +42,6 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/memory.h" namespace parquet { @@ -125,7 +124,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); - std::unique_ptr crypto_meta_data = col->crypto_meta_data(); + std::unique_ptr crypto_metadata = col->crypto_metadata(); bool encrypted = true; @@ -137,7 +136,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i/* column_ordinal */, ); + row_group_ordinal_, (int16_t)i /* column_ordinal */); } // The column is encrypted @@ -234,7 +233,7 @@ class SerializedFile : public ParquetFileReader::Contents { // Check if all bytes are read. Check if last 4 bytes read have the magic bits if (footer_buffer->size() != footer_read_size || (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. 
Corrupt footer."); } @@ -324,8 +323,8 @@ class SerializedFile : public ParquetFileReader::Contents { // encryption with encrypted footer // both metadata & crypto metadata length uint32_t footer_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; if (kFooterSize + footer_len > file_size) { throw ParquetException( @@ -386,16 +385,16 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + throw ParquetException( + "Invalid encrypted parquet file. 
" + "Could not read footer metadata bytes."); } auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), - &metadata_len, - footer_decryptor); + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ba6f69543ac..be8c7e3c4d0 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -22,9 +22,9 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" -#include "parquet/platform.h" #include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" +#include "parquet/platform.h" #include "parquet/schema.h" using arrow::MemoryPool; @@ -76,9 +76,8 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) { class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, - RowGroupMetaDataBuilder* metadata, - int16_t row_group_ordinal, const WriterProperties* properties, - bool buffered_row_group = false, + RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, + const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), @@ -293,14 +292,17 @@ class FileSerializer : public ParquetFileWriter::Contents { // encrypted footer file_metadata_ = metadata_->Finish(); - uint64_t metadata_start = static_cast(sink_->Tell()); + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint64_t metadata_start = static_cast(position); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); uint32_t 
footer_and_crypto_len = - static_cast(sink_->Tell() - metadata_start); + static_cast(position - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); } else { // Encrypted file with plaintext footer @@ -343,7 +345,7 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), + sink_, rg_metadata, (int16_t)(num_row_groups_ - 1), properties_.get(), buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); @@ -396,8 +398,7 @@ class FileSerializer : public ParquetFileWriter::Contents { file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); - } - else { + } else { // plaintext mode footer PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } @@ -452,14 +453,17 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file - if (encrypt_footer) { // Encrypted file with encrypted footer + } else { // Encrypted file + if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); } else { // Encrypted file with plaintext footer - uint32_t metadata_len = static_cast(sink->Tell()); + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); file_metadata.WriteTo(sink, encryptor); - metadata_len = static_cast(sink->Tell()) - metadata_len; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; 
sink->Write(reinterpret_cast(&metadata_len), 4); sink->Write(kParquetMagic, 4); @@ -467,15 +471,21 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin } } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, bool encrypt_footer) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper); + return WriteFileMetaData(file_metadata, &wrapper, encryptor, encrypt_footer); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ArrowOutputStream* sink) { + crypto_metadata.WriteTo(sink); } void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - crypto_metadata.WriteTo(sink); + crypto_metadata.WriteTo(&wrapper); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index c51e354bff1..6c2158185e8 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -118,8 +118,12 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); +void WriteFileMetaData(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor = NULLPTR, + bool encrypt_footer = false); +PARQUET_EXPORT +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ::arrow::io::OutputStream* sink); PARQUET_EXPORT void WriteMetaDataFile(const FileMetaData& file_metadata, diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1ba108f54d6..c3294382a42 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -682,7 +682,7 @@ void 
FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryptor) const { + const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } @@ -735,7 +735,9 @@ FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) { FileCryptoMetaData::~FileCryptoMetaData() {} -void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } +void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + impl_->WriteTo(dst); +} ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) @@ -910,7 +912,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { + const std::shared_ptr& encryptor) { ThriftSerializer serializer; // column is unencrypted @@ -1062,7 +1064,7 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { + const std::shared_ptr& encryptor) { impl_->WriteTo(sink, encryptor); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 6e66c87e5a2..d73c78c3c78 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -300,7 +300,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR); + const std::shared_ptr& encryptor = NULLPTR); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 9307ed193fb..709f17970b0 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -44,12 +44,11 @@ #include "arrow/util/logging.h" #include "parquet/encryption_internal.h" 
#include "parquet/exception.h" -#include "parquet/platform.h" #include "parquet/internal_file_decryptor.h" #include "parquet/internal_file_encryptor.h" +#include "parquet/platform.h" #include "parquet/statistics.h" #include "parquet/types.h" -#include "parquet/util/memory.h" #include "parquet/parquet_types.h" // IYWU pragma: export @@ -307,11 +306,11 @@ class ThriftSerializer { } if (shouldWriteLength) { - PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); + PARQUET_THROW_NOT_OK( + out->Write(reinterpret_cast(&cipher_buffer_len), 4)); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); return static_cast(cipher_buffer_len + 4); - } - else { + } else { PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); return static_cast(cipher_buffer_len); } From 569725588c5be3229bb70f89f6b6fbe601e5d386 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 14:12:15 +0700 Subject: [PATCH 097/125] add unit tests for encryption properties --- cpp/src/parquet/CMakeLists.txt | 4 + cpp/src/parquet/encryption-test.cc | 292 +++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 cpp/src/parquet/encryption-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index f8d60432fa6..afa82be376e 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -320,6 +320,10 @@ add_parquet_test(arrow-test arrow/arrow-schema-test.cc test-util.cc) +if(PARQUET_BUILD_ENCRYPTION) + add_parquet_test(encryption-test) +endif() + # Those tests need to use static linking as they access thrift-generated # symbols which are not exported by parquet.dll on Windows (PARQUET-1420). 
add_parquet_test(file-deserialize-test diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-test.cc new file mode 100644 index 00000000000..e4d4a27ed7f --- /dev/null +++ b/cpp/src/parquet/encryption-test.cc @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "parquet/encryption.h" + +namespace parquet { + +using schema::ColumnPath; + +namespace test { + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; +const std::string kFileName = "tester"; + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithOwnKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(false, column_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, column_props_1->key()); + ASSERT_EQ("kc1", column_props_1->key_metadata()); +} + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithFooterKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(true, column_props_1->is_encrypted_with_footer_key()); +} + +// Encrypt all columns and the footer with the same key. 
+// (uniform encryption) +TEST(TestEncryptionProperties, UniformEncryption) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ("kf", props->footer_key_metadata()); + + std::shared_ptr column_path = + parquet::schema::ColumnPath::FromDotString("a_column"); + std::shared_ptr out_col_props = + props->column_properties(column_path); + + ASSERT_EQ(true, out_col_props->is_encrypted()); + ASSERT_EQ(true, out_col_props->is_encrypted_with_footer_key()); +} + +// Encrypt two columns with their own keys and the same key for +// the footer and other columns +TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr 
out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+// (plaintext footer mode, readable by legacy readers) +TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.set_plaintext_footer(); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(false, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + 
ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + // other columns: encrypted with footer, footer is not encrypted + // so column is not encrypted as well + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Use aad_prefix +TEST(TestEncryptionProperties, UseAadPrefix) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->algorithm().aad.aad_prefix); + ASSERT_EQ(false, props->algorithm().aad.supply_aad_prefix); +} + +// Use aad_prefix and +// disable_aad_prefix_storage. +TEST(TestEncryptionProperties, UseAadPrefixNotStoreInFile) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + builder.disable_store_aad_prefix_storage(); + std::shared_ptr props = builder.build(); + + ASSERT_EQ("", props->algorithm().aad.aad_prefix); + ASSERT_EQ(true, props->algorithm().aad.supply_aad_prefix); +} + +// Use AES_GCM_CTR_V1 algorithm +TEST(TestEncryptionProperties, UseAES_GCM_CTR_V1Algorithm) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.algorithm(ParquetCipher::AES_GCM_CTR_V1); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, props->algorithm().algorithm); +} + +TEST(TestDecryptionProperties, UseKeyRetriever) { + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder builder; + builder.key_retriever(kr1); + std::shared_ptr props = builder.build(); + + auto out_key_retriever = props->key_retriever(); + 
ASSERT_EQ(kFooterEncryptionKey, out_key_retriever->GetKey("kf")); + ASSERT_EQ(kColumnEncryptionKey1, out_key_retriever->GetKey("kc1")); + ASSERT_EQ(kColumnEncryptionKey2, out_key_retriever->GetKey("kc2")); +} + +TEST(TestDecryptionProperties, SupplyAadPrefix) { + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->aad_prefix()); +} + +TEST(ColumnDecryptionProperties, SetKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + col_builder_1.key(kColumnEncryptionKey1); + + auto props = col_builder_1.build(); + ASSERT_EQ(kColumnEncryptionKey1, props->key()); +} + +TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + parquet::ColumnDecryptionProperties::Builder col_builder_2(column_path_2); + + decryption_cols[column_path_1] = col_builder_1.key(kColumnEncryptionKey1)->build(); + decryption_cols[column_path_2] = col_builder_2.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.column_properties(decryption_cols); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, props->column_key(column_path_1)); + ASSERT_EQ(kColumnEncryptionKey2, props->column_key(column_path_2)); +} + +} // namespace test +} // namespace parquet From b7bdc8af34b98d78e80d8da12b46b07e2e77fb1b Mon Sep 
17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 18:12:04 +0700 Subject: [PATCH 098/125] write unit tests for metadata --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/encryption-metadata-test.cc | 85 +++++++++++++++++++++ cpp/src/parquet/file_writer.cc | 10 +-- cpp/src/parquet/metadata.cc | 31 +++++--- cpp/src/parquet/metadata.h | 4 +- 5 files changed, 110 insertions(+), 21 deletions(-) create mode 100644 cpp/src/parquet/encryption-metadata-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index afa82be376e..94717699e84 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -322,6 +322,7 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test) + add_parquet_test(encryption-metadata-test) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc new file mode 100644 index 00000000000..f81493dbed0 --- /dev/null +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/metadata.h" + +#include + +#include "parquet/properties.h" +#include "parquet/schema.h" + +namespace parquet { + +namespace metadata { + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; + +TEST(Metadata, EncryptFooter) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + + WriterProperties::Builder writer_prop_builder; + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto file_metadata = f_builder->Finish(); + ASSERT_EQ(false, file_metadata->is_encryption_algorithm_set()); + + auto file_crypto_metadata = f_builder->GetCryptoMetaData(); + ASSERT_EQ(true, file_crypto_metadata != NULLPTR); +} + +TEST(Metadata, PlaintextFooter) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + encryption_prop_builder.set_plaintext_footer(); + + WriterProperties::Builder 
writer_prop_builder; + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto file_metadata = f_builder->Finish(); + ASSERT_EQ(true, file_metadata->is_encryption_algorithm_set()); + + auto file_crypto_metadata = f_builder->GetCryptoMetaData(); + ASSERT_EQ(NULLPTR, file_crypto_metadata); +} + +} // namespace metadata +} // namespace parquet diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index be8c7e3c4d0..65f671b2167 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -306,15 +306,7 @@ class FileSerializer : public ParquetFileWriter::Contents { sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); } else { // Encrypted file with plaintext footer - EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption_properties->algorithm(); - signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; - signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; - if (!algo.aad.supply_aad_prefix) - signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; - signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption_properties->footer_key_metadata()); + file_metadata_ = metadata_->Finish(); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c3294382a42..9d8a01ae995 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -909,6 +909,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_metadata_.__set_encodings(thrift_encodings); + + // temporary fix: 
setting for columnchunk meta_data in case file is not encrypted + if (properties_->file_encryption_properties() == NULLPTR) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + } } void WriteTo(::arrow::io::OutputStream* sink, @@ -1222,8 +1228,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return current_row_group_builder_.get(); } - std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm, - const std::string& footer_signing_key_metadata) { + std::unique_ptr Finish() { int64_t total_rows = 0; for (auto row_group : row_groups_) { total_rows += row_group.num_rows; @@ -1269,8 +1274,20 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; - if (signing_algorithm != NULLPTR) { - metadata_->__set_encryption_algorithm(ToThrift(*signing_algorithm)); + // if plaintext footer, set footer signing algorithm + auto file_encryption_properties = properties_->file_encryption_properties(); + if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { + EncryptionAlgorithm signing_algorithm; + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); + signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix; + signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + + metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm)); + const std::string& footer_signing_key_metadata = + file_encryption_properties->footer_key_metadata(); if (footer_signing_key_metadata.size() > 0) { metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); } @@ -1340,11 +1357,7 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { return impl_->AppendRowGroup(); } -std::unique_ptr 
FileMetaDataBuilder::Finish( - const EncryptionAlgorithm* signing_algorithm, - const std::string& footer_signing_key_metadata) { - return impl_->Finish(signing_algorithm, footer_signing_key_metadata); -} +std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index d73c78c3c78..b3bee5d28b7 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -352,9 +352,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish( - const EncryptionAlgorithm* signing_algorithm = NULLPTR, - const std::string& footer_signing_key_metadata = ""); + std::unique_ptr Finish(); // crypto metadata std::unique_ptr GetCryptoMetaData(); From fb826f5d3e35c99da06f3e0432596cbade80932d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Sun, 2 Jun 2019 15:58:39 +0300 Subject: [PATCH 099/125] Add encryption samples --- cpp/examples/parquet/CMakeLists.txt | 6 +- ...yption-reader-writer-all-crypto-options.cc | 792 +++++++++++++++++ .../low-level-api/encryption-reader-writer.cc | 809 +++++++++--------- cpp/src/parquet/encryption.cc | 10 +- cpp/src/parquet/encryption.h | 259 ++++-- cpp/src/parquet/file_reader.cc | 52 +- 6 files changed, 1389 insertions(+), 539 deletions(-) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 2a2421c18ab..fb428730360 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -18,12 +18,15 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) add_executable(parquet-encryption-example 
low-level-api/encryption-reader-writer.cc) +add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_include_directories(parquet-encryption-example PRIVATE low-level-api/) +target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) target_link_libraries(parquet-encryption-example parquet_static) +target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -37,4 +40,5 @@ add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 parquet-encryption-example - parquet-arrow-example) + parquet-arrow-example + parquet-encryption-example-all-crypto-options) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc new file mode 100644 index 00000000000..98f0e57a7aa --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -0,0 +1,792 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This file contains samples for writing and reading encrypted Parquet files in different + * encryption and decryption configurations. The samples have the following goals: + * 1) Demonstrate usage of different options for data encryption and decryption. + * 2) Produce encrypted files for interoperability tests with other (eg parquet-mr) + * readers that support encryption. + * 3) Produce encrypted files with plaintext footer, for testing the ability of legacy + * readers to parse the footer and read unencrypted columns. + * 4) Perform interoperability tests with other (eg parquet-mr) writers, by reading + * encrypted files produced by these writers. + * + * The write sample produces number of parquet files, each encrypted with a different + * encryption configuration as described below. + * The name of each file is in the form of: + * tester.parquet.encrypted. + * + * The read sample creates a set of decryption configurations and then uses each of them + * to read all encrypted files in the input directory. + * + * The different encryption and decryption configurations are listed below. 
+ * + * Usage: ./encryption-interop-tests <write/read> <path-to-directory-of-parquet-files> + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates files with eight columns in the following + * encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer. + * - Encryption configuration 3: Encrypt two columns. Don’t encrypt footer (to enable + * legacy readers) - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer. Supply aad_prefix + * for file identity verification. + * - Encryption configuration 5: Encrypt two columns and the footer. Supply aad_prefix, + * and call disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer. Use the + * alternative (AES_GCM_CTR_V1) algorithm. + * + * The read sample uses each of the following decryption configurations to read every + * encrypted file in the input directory: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supply + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback).
// Returns the names of all entries found directly inside the directory `path`.
//
// The names are returned exactly as readdir() reports them: they are relative to
// `path` (not joined with it), are in no particular order, and normally include the
// "." and ".." pseudo-entries. Callers are expected to filter the list themselves.
//
// If the directory cannot be opened, a diagnostic naming the path is written to stderr
// and the process exits with status -1.
std::vector<std::string> GetDirectoryFiles(const std::string& path) {
  DIR* dir = opendir(path.c_str());
  if (dir == nullptr) {
    // Report which path failed before exiting; a silent exit is hard to diagnose.
    std::cerr << "Error: cannot open directory " << path << std::endl;
    exit(-1);
  }

  std::vector<std::string> files;
  // readdir() returns a null pointer once every entry has been consumed.
  while (const struct dirent* entry = readdir(dir)) {
    files.emplace_back(entry->d_name);
  }
  closedir(dir);
  return files;
}
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21(path_ptr1); + encryption_col_builder_20.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols2[path_ptr] = encryption_col_builder_20.build(); + encryption_cols2[path_ptr1] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build()); + + // Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+ // (plaintext footer mode, readable by legacy readers) + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31(path_ptr1); + encryption_col_builder_30.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols3[path_ptr] = encryption_col_builder_30.build(); + encryption_cols3[path_ptr1] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Encryption configuration 4: Encrypt two columns and the footer. Use aad_prefix. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41(path_ptr1); + encryption_col_builder_40.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols4[path_ptr] = encryption_col_builder_40.build(); + encryption_cols4[path_ptr1] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(fileName) + ->build()); + + // Encryption configuration 5: Encrypt two columns and the footer. Use aad_prefix and + // disable_aad_prefix_storage. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51(path_ptr1); + encryption_col_builder_50.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols5[path_ptr] = encryption_col_builder_50.build(); + encryption_cols5[path_ptr1] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(fileName) + ->disable_store_aad_prefix_storage() + ->build()); + + // Encryption configuration 6: Encrypt two columns and the footer. Use AES_GCM_CTR_V1 + // algorithm. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(path_ptr1); + encryption_col_builder_60.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols6[path_ptr] = encryption_col_builder_60.build(); + encryption_cols6[path_ptr1] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + /********************************************************************************** + PARQUET WRITER EXAMPLE + 
**********************************************************************************/ + + // Iterate over the encryption configurations and for each one write a parquet file. + for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); + ++example_id) { + std::stringstream ss; + ss << example_id + 1; + std::string test_number_string = ss.str(); + try { + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + std::string file = + rootPath + fileName + std::string(test_number_string) + ".parquet.encrypted"; + std::cout << "Write " << file << std::endl; + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return; + } + } +} + +void InteropTestReadEncryptedParquetFiles(std::string rootPath) { + std::vector files_in_directory = GetDirectoryFiles(rootPath); + + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations; + + // Decryption configuration 1: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey); + string_kr2->PutKey("kc1", kColumnEncryptionKey1); + string_kr2->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31(path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey) + ->column_properties(decryption_cols) + ->build()); + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Iterate over the decryption configurations and use each one to read every files + // in the input directory. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size(); + ++example_id) { + PrintDecryptionConfiguration(example_id + 1); + for (auto const& file : files_in_directory) { + std::string exception_msg = ""; + if (file.find("parquet.encrypted") == + std::string::npos) // Skip non encrypted files + continue; + try { + std::cout << "--> Read file " << file << std::endl; + + parquet::ReaderProperties reader_properties = + parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + vector_of_decryption_configurations[example_id]->DeepClone()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(rootPath + file, false, + reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. 
The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
// Extracts the encryption configuration number N from a file name of the form
// "tester<N>.parquet.encrypted". The pattern is searched for anywhere in `file`, so the
// name may be preceded by a directory path.
//
// Returns N when the name matches. N is returned even when it lies outside the known
// range 1..6, after an error is reported on stderr. Returns 0, after reporting an error
// on stderr, when the name does not match the pattern or when N has too many digits to
// be represented as an int.
int ExtractEncryptionConfigurationNumber(std::string file) {
  // Both dots are escaped so only the literal ".parquet.encrypted" suffix is accepted.
  // The regex is built once because this function runs for every file/configuration pair.
  static const std::regex kFileNamePattern("tester([0-9]+)\\.parquet\\.encrypted");

  std::smatch match;
  if (!std::regex_search(file, match, kFileNamePattern)) {
    std::cerr
        << "Error: Error parsing filename to extract encryption configuration number. "
        << std::endl;
    // Stop here: there is no number to validate, so a second error would be misleading.
    return 0;
  }

  int encryption_configuration_number = 0;
  try {
    encryption_configuration_number = std::stoi(match.str(1));
  } catch (const std::exception&) {
    // The digit run does not fit in an int. Keep 0 so the range check below reports it.
  }

  if (encryption_configuration_number < 1 || encryption_configuration_number > 6) {
    std::cerr << "Error: Unknown encryption configuration number. " << std::endl;
  }

  return encryption_configuration_number;
}
<< exception_msg; +} + +int main(int argc, char** argv) { + enum Operation { write, read }; + std::string rootPath; + Operation operation = write; + if (argc < 3) { + std::cout << "Usage: encryption-reader-writer-all-crypto-options " + "" + << std::endl; + exit(1); + } + rootPath = argv[1]; + if (rootPath.compare("read") == 0) { + operation = read; + } + + rootPath = argv[2]; + std::cout << "Root path is: " << rootPath << std::endl; + + if (operation == write) { + InteropTestWriteEncryptedParquetFiles(rootPath); + } else + InteropTestReadEncryptedParquetFiles(rootPath); + + return 0; +} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index a75c0352cda..5ce66769c0f 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -22,471 +22,428 @@ #include - - /* - * This example describes writing and reading Parquet Files in C++ with encrypted columns - * and serves as a reference to the Parquet Modular Encryption API. + * This file contains sample for writing and reading encrypted Parquet file with + * basic encryption configuration. * - * A detailed description of the Parquet Modular Encryption specification can be found here: + * A detailed description of the Parquet Modular Encryption specification can be found + * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * The example contains writing and reading eight columns with the following four different - * encryption configurations: - * 1) uniform encryption - footer and all columns are encrypted with footer key. - * 2) non-uniform encryption - footer and ba_field column are encrypted with different keys. - * 3) plaintext footer mode where all columns are encrypted with footer key. - * 4) footer and ba_field column are encrypted with different keys. 
no column key - * is provided upon decryption and thus HiddenColumnException is thrown. - **/ + * The write sample creates a file with eight columns where two of the columns and the + * footer are encrypted. + * + * The read sample decrypts using key retriever that holds the keys of two encrypted + * columns and the footer key. + */ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; -const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes -const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; int main(int argc, char** argv) { - std::vector> vector_of_encryption_configurations; - std::vector> vector_of_decryption_configurations; - - // encryption configuration #1 - uniform encryption - all columns and footer are - // encrypted with footer key. - parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_1; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_1.build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_1 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - // encryption configuration #2 - footer and ba_field column are encrypted with - // different keys. - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; - std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_cols[path_ptr] = encryption_col_builder_0. 
- key(COLUMN_ENCRYPTION_KEY) - ->build(); + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // Encryption configuration: Encrypt two columns and the footer. std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_cols[path_ptr] = decryption_col_builder2. - key(COLUMN_ENCRYPTION_KEY) - ->build(); - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_2; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_2 - .column_properties(encryption_cols) - ->build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_2 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->column_properties(decryption_cols) - ->build()); - - // encryption configuration #3 - plain mode footer - parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_3; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_3 - .set_plaintext_footer() - ->build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_3 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - // encryption configuration #4 - footer and ba_field column are encrypted with different keys. - // no column key is provided upon decryption and thus HiddenColumnException is thrown. 
- parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_4; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_4 - .set_plaintext_footer() - ->column_properties(encryption_cols) - ->build()); - - vector_of_decryption_configurations.push_back(file_decryption_builder_4 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); ++example_id) { - /********************************************************************************** - PARQUET WRITER EXAMPLE - **********************************************************************************/ - - try { - - // Create a local file output stream instance. - using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); - - // Setup the parquet schema - std::shared_ptr schema = SetupSchema(); - - // Add writer properties - parquet::WriterProperties::Builder builder; - builder.compression(parquet::Compression::SNAPPY); - - // Add the current encryption configuration to WriterProperties. - builder.encryption(vector_of_encryption_configurations[example_id]); - - std::shared_ptr props = builder.build(); - - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - // Append a RowGroup with a specific number of rows. - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - bool value = ((i % 2) == 0) ? 
true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder1(path_ptr1); + encryption_col_builder0.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder1.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols[path_ptr] = encryption_col_builder0.build(); + encryption_cols[path_ptr1] = encryption_col_builder1.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder( + kFooterEncryptionKey); + + try { + // Create a local file output stream instance. 
+ using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(file_encryption_builder.footer_key_metadata("kf") + ->column_properties(encryption_cols) + ->build()); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } - flba_writer->WriteBatch(1, nullptr, nullptr, &value); + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); } + } - // Close the ParquetFileWriter - file_writer->Close(); + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); - // Write the bytes to file - DCHECK(out_file->Close().ok()); - } catch (const std::exception& e) { - std::cerr << "Parquet write error: " << e.what() << std::endl; - return -1; + flba_writer->WriteBatch(1, nullptr, nullptr, &value); } - /********************************************************************************** - PARQUET READER EXAMPLE - **********************************************************************************/ - - try { - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - - // Add the current decryption configuration to ReaderProperties. 
- reader_properties.file_decryption_properties(vector_of_decryption_configurations[example_id]); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); - - // Iterate over all the RowGroups in the file - for (int r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - parquet_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? 
true : false; - assert(value == expected_value); - i++; - } + // Close the ParquetFileWriter + file_writer->Close(); - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - assert(value == i); - i++; - } + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); - } - i++; - } + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Decryption configuration: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder; + + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + file_decryption_builder.key_retriever(kr1)->build()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. 
The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); - } - i++; - } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); - i++; + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); } + i++; + } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - double expected_value = i * 1.1111111; - assert(value == expected_value); - i++; + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); } + i++; + } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); - } - i++; - } + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist // There are no NULL values in the rows written assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(value.len == FIXED_LENGTH); assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); } + i++; } - } catch (const parquet::HiddenColumnException& e) { - std::cerr << "Parquet read error: hidden column: " << e.what() << std::endl; - } catch (const std::exception& e) { - std::cerr << "Parquet read error: " << e.what() << std::endl; - } - std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; return 0; } diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 9666e9ae2bb..8a88db00224 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -183,6 +183,8 @@ FileEncryptionProperties::FileEncryptionProperties( : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), + aad_prefix_(aad_prefix), + store_aad_prefix_in_file_(store_aad_prefix_in_file), column_properties_(column_properties) { // file encryption properties object can be used for writing only one file. 
// Upon completion of file writing, the encryption keys in the properties will be wiped @@ -194,11 +196,11 @@ FileEncryptionProperties::FileEncryptionProperties( DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + uint8_t aad_file_unique[kAadFileUniqueLength]; + memset(aad_file_unique, 0, kAadFileUniqueLength); + RAND_bytes(aad_file_unique, sizeof(kAadFileUniqueLength)); std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH); + kAadFileUniqueLength); bool supply_aad_prefix = false; if (aad_prefix.empty()) { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3d4ec2163b0..7d943d15be5 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -32,13 +32,13 @@ namespace parquet { -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; +static constexpr int32_t kMaximalAadMetadataLength = 256; +static constexpr bool kDefaultEncryptedFooter = true; +static constexpr bool kDefaultCheckSignature = true; +static constexpr bool kDefaultAllowPlaintextFiles = false; +static constexpr int32_t kAadFileUniqueLength = 8; class PARQUET_EXPORT DecryptionKeyRetriever { public: @@ -46,7 +46,7 @@ class PARQUET_EXPORT DecryptionKeyRetriever { virtual ~DecryptionKeyRetriever() {} }; -// Simple integer key retriever +/// Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public 
DecryptionKeyRetriever { public: void PutKey(uint32_t key_id, const std::string& key); @@ -88,22 +88,22 @@ class PARQUET_EXPORT ColumnEncryptionProperties { public: class Builder { public: - // Convenience builder for regular (not nested) columns. + /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } - // Convenience builder for encrypted columns. + /// Convenience builder for encrypted columns. explicit Builder(const std::shared_ptr& path) : Builder(path, true) {} - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. - // keyBytes Key length must be either 16, 24 or 32 bytes. - // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file writing. - // Caller is responsible for wiping out the input key array. + /// Set a column-specific key. + /// If key is not set on an encrypted column, the column will + /// be encrypted with the footer key. + /// keyBytes Key length must be either 16, 24 or 32 bytes. + /// The key is cloned, and will be wiped out (array values set to 0) upon completion + /// of file writing. + /// Caller is responsible for wiping out the input key array. Builder* key(std::string column_key) { if (column_key.empty()) return this; @@ -112,8 +112,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return this; } - // Set a key retrieval metadata. - // use either key_metadata() or key_id(), not both + /// Set a key retrieval metadata. + /// use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); @@ -121,9 +121,9 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return this; } - // Set a key retrieval metadata (converted from String). 
- // use either key_metadata() or key_id(), not both - // key_id will be converted to metadata (UTF-8 array). + /// Set a key retrieval metadata (converted from String). + /// use either key_metadata() or key_id(), not both + /// key_id will be converted to metadata (UTF-8 array). Builder* key_id(const std::string& key_id); std::shared_ptr build() { @@ -160,6 +160,12 @@ class PARQUET_EXPORT ColumnEncryptionProperties { void set_utilized() { utilized_ = true; } + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, column_path_, key_copy, key_metadata_)); + } + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; @@ -180,17 +186,17 @@ class PARQUET_EXPORT ColumnDecryptionProperties { public: class Builder { public: - // convenience builder for regular (not nested) columns. + /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) : Builder(schema::ColumnPath::FromDotString(name)) {} explicit Builder(const std::shared_ptr& path) : column_path_(path) {} - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. + /// Set an explicit column key. If applied on a file that contains + /// key metadata for this column the metadata will be ignored, + /// the column will be decrypted with this key. + /// key length must be either 16, 24 or 32 bytes. 
Builder* key(const std::string& key) { if (key.empty()) return this; @@ -225,25 +231,31 @@ class PARQUET_EXPORT ColumnDecryptionProperties { } } + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); + } + private: const std::shared_ptr column_path_; std::string key_; bool utilized_; - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) + /// This class is only required for setting explicit column decryption keys - + /// to override key retriever (or to provide keys when key metadata and/or + /// key retriever are not available) explicit ColumnDecryptionProperties( const std::shared_ptr& column_path, const std::string& key); }; class PARQUET_EXPORT AADPrefixVerifier { public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. + /// Verifies identity (AAD Prefix) of individual file, + /// or of file collection in a data set. + /// Throws exception if an AAD prefix is wrong. + /// In a data set, AAD Prefixes should be collected, + /// and then checked for missing files. virtual void check(const std::string& aad_prefix) = 0; virtual ~AADPrefixVerifier() {} }; @@ -253,22 +265,22 @@ class PARQUET_EXPORT FileDecryptionProperties { class Builder { public: Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + check_plaintext_footer_integrity_ = kDefaultCheckSignature; + plaintext_files_allowed_ = kDefaultAllowPlaintextFiles; } - // Set an explicit footer key. 
If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. - // With explicit keys or AAD prefix, new encryption properties object must be created - // for each encrypted file. - // Explicit encryption keys (footer and column) are cloned. - // Upon completion of file reading, the cloned encryption keys in the properties will - // be wiped out (array values set to 0). - // Caller is responsible for wiping out the input key array. - // param footerKey Key length must be either 16, 24 or 32 bytes. + /// Set an explicit footer key. If applied on a file that contains + /// footer key metadata the metadata will be ignored, the footer + /// will be decrypted/verified with this key. + /// If explicit key is not set, footer key will be fetched from + /// key retriever. + /// With explicit keys or AAD prefix, new encryption properties object must be + /// created for each encrypted file. + /// Explicit encryption keys (footer and column) are cloned. + /// Upon completion of file reading, the cloned encryption keys in the properties + /// will be wiped out (array values set to 0). + /// Caller is responsible for wiping out the input key array. + /// param footerKey Key length must be either 16, 24 or 32 bytes. Builder* footer_key(const std::string footer_key) { if (footer_key.empty()) { return this; @@ -278,12 +290,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. + /// Set explicit column keys (decryption properties). 
+ /// Its also possible to set a key retriever on this property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. Builder* column_properties( const std::map, std::shared_ptr, @@ -306,12 +318,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. + /// Set a key retriever callback. Its also possible to + /// set explicit footer or column keys on this file property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. Builder* key_retriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) return this; @@ -320,21 +332,21 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match + /// Skip integrity verification of plaintext footers. 
+ /// If not called, integrity of plaintext footers will be checked in runtime, + /// and an exception will be thrown in the following situations: + /// - footer signing key is not available + /// (not passed, or not found by key retriever) + /// - footer content and signature don't match Builder* disable_footer_signature_verification() { check_plaintext_footer_integrity_ = false; return this; } - // Explicitly supply the file AAD prefix. - // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. + /// Explicitly supply the file AAD prefix. + /// A must when a prefix is used for file encryption, but not stored in file. + /// If AAD prefix is stored in file, it will be compared to the explicitly + /// supplied value and an exception will be thrown if they differ. Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) { return this; @@ -344,7 +356,7 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set callback for verification of AAD Prefixes stored in file. + /// Set callback for verification of AAD Prefixes stored in file. Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -353,12 +365,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // By default, reading plaintext (unencrypted) files is not - // allowed when using a decryptor - // - in order to detect files that were not encrypted by mistake. - // However, the default behavior can be overriden by calling this method. - // The caller should use then a different method to ensure encryption - // of files with sensitive data. + /// By default, reading plaintext (unencrypted) files is not + /// allowed when using a decryptor + /// - in order to detect files that were not encrypted by mistake. 
+ /// However, the default behavior can be overriden by calling this method. + /// The caller should use then a different method to ensure encryption + /// of files with sensitive data. Builder* plaintext_files_allowed() { plaintext_files_allowed_ = true; return this; @@ -420,6 +432,36 @@ class PARQUET_EXPORT FileDecryptionProperties { void set_utilized() { utilized_ = true; } + /// FileDecryptionProperties object can be used for reading one file only. + /// (unless this object keeps the keyRetrieval callback only, and no explicit + /// keys or aadPrefix). + /// At the end, keys are wiped out in the memory. + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = "") { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, + new_aad_prefix, aad_prefix_verifier_, column_properties_map_copy, + plaintext_files_allowed_)); + } + private: std::string footer_key_; std::string aad_prefix_; @@ -452,32 +494,32 @@ class PARQUET_EXPORT FileEncryptionProperties { class Builder { public: explicit Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + : parquet_cipher_(kDefaultEncryptionAlgorithm), + encrypted_footer_(kDefaultEncryptedFooter) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } - // Create files with plaintext 
footer. - // If not called, the files will be created with encrypted footer (default). + /// Create files with plaintext footer. + /// If not called, the files will be created with encrypted footer (default). Builder* set_plaintext_footer() { encrypted_footer_ = false; return this; } - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). + /// Set encryption algorithm. + /// If not called, files will be encrypted with AES_GCM_V1 (default). Builder* algorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } - // Set a key retrieval metadata (converted from String). - // use either footer_key_metadata or footer_key_id, not both. + /// Set a key retrieval metadata (converted from String). + /// use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_id(const std::string& key_id); - // Set a key retrieval metadata. - // use either footer_key_metadata or footer_key_id, not both. + /// Set a key retrieval metadata. + /// use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_metadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; @@ -486,7 +528,7 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Set the file AAD Prefix. + /// Set the file AAD Prefix. Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; @@ -496,8 +538,8 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. + /// Skip storing AAD Prefix in file. + /// If not called, and if AAD Prefix is set, it will be stored. Builder* disable_store_aad_prefix_storage() { DCHECK(!aad_prefix_.empty()); @@ -505,9 +547,9 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Set the list of encrypted columns and their properties (keys etc). 
- // If not called, all columns will be encrypted with the footer key. - // If called, the file columns not in the list will be left unencrypted. + /// Set the list of encrypted columns and their properties (keys etc). + /// If not called, all columns will be encrypted with the footer key. + /// If called, the file columns not in the list will be left unencrypted. Builder* column_properties( const std::map, std::shared_ptr, @@ -521,7 +563,7 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr> element : column_properties) { if (element.second->is_utilized()) { - ParquetException("Column properties utilized in another file"); + throw ParquetException("Column properties utilized in another file"); } element.second->set_utilized(); } @@ -574,13 +616,42 @@ class PARQUET_EXPORT FileEncryptionProperties { } } + /// FileEncryptionProperties object can be used for writing one file only. + /// (at the end, keys are wiped out in the memory). + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = "") { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); + } + private: EncryptionAlgorithm algorithm_; std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_aad_; + std::string 
aad_prefix_; bool utilized_; + bool store_aad_prefix_in_file_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 94ca6985364..f2b57fc66f4 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -238,7 +238,6 @@ class SerializedFile : public ParquetFileReader::Contents { } // no encryption or encryption with plaintext footer - // TODO: encryption with plaintext footer if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -281,22 +280,35 @@ class SerializedFile : public ParquetFileReader::Contents { std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file" + "and not supplied in decryption properties"); + } + if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and in properties is not the same"); + "AAD Prefix in file and in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, but not stored in file" - "and not supplied in decryption properties"); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but not found in file"); + } + std::shared_ptr aad_prefix_verifier = + 
file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file and not " + "supplied in decryption properties"); + } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; @@ -356,12 +368,18 @@ class SerializedFile : public ParquetFileReader::Contents { EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and in properties " + "AAD Prefix in file and in properties " "is not the same"); } } @@ -369,12 +387,18 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but not found in file"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file and not " + "supplied in decryption properties"); + } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_.reset( From 
d1aa339d5a28759a46a632d3c9b9a0eb53efbcf1 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 3 Jun 2019 21:34:13 +0700 Subject: [PATCH 100/125] fix lint and format issue --- cpp/src/parquet/encryption-metadata-test.cc | 6 +++--- cpp/src/parquet/encryption-test.cc | 14 +++++++------- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/file_writer.cc | 11 ++++++----- cpp/src/parquet/metadata.cc | 10 ++++++---- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc index f81493dbed0..6f29b364b5b 100644 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -26,9 +26,9 @@ namespace parquet { namespace metadata { -const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const std::string kColumnEncryptionKey1 = "1234567890123450"; -const std::string kColumnEncryptionKey2 = "1234567890123451"; +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; TEST(Metadata, EncryptFooter) { parquet::schema::NodeVector fields; diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-test.cc index e4d4a27ed7f..088a2c08bf8 100644 --- a/cpp/src/parquet/encryption-test.cc +++ b/cpp/src/parquet/encryption-test.cc @@ -27,10 +27,10 @@ using schema::ColumnPath; namespace test { -const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const std::string kColumnEncryptionKey1 = "1234567890123450"; -const std::string kColumnEncryptionKey2 = "1234567890123451"; -const std::string kFileName = "tester"; +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; TEST(TestColumnEncryptionProperties, 
ColumnEncryptedWithOwnKey) { std::shared_ptr column_path_1 = @@ -66,7 +66,7 @@ TEST(TestEncryptionProperties, UniformEncryption) { std::shared_ptr props = builder.build(); ASSERT_EQ(true, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); ASSERT_EQ("kf", props->footer_key_metadata()); @@ -106,7 +106,7 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { std::shared_ptr props = builder.build(); ASSERT_EQ(true, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = @@ -163,7 +163,7 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { std::shared_ptr props = builder.build(); ASSERT_EQ(false, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index f2b57fc66f4..ee60a563f9f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -280,7 +280,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException( "AAD prefix used for file encryption, but not stored in file" "and not supplied in decryption properties"); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 
65f671b2167..6f59f3a47c6 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -303,8 +303,9 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Tell(&position)); uint32_t footer_and_crypto_len = static_cast(position - metadata_start); - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(kParquetEMagic, 4); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { // Encrypted file with plaintext footer file_metadata_ = metadata_->Finish(); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); @@ -316,7 +317,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } } - sink_->Close(); + PARQUET_THROW_NOT_OK(sink_->Close()); } } @@ -457,8 +458,8 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; - sink->Write(reinterpret_cast(&metadata_len), 4); - sink->Write(kParquetMagic, 4); + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9d8a01ae995..5115888ad47 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -509,11 +509,13 @@ class FileMetaData::FileMetaDataImpl { encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); // write unencrypted footer - dst->Write(serialized_data, serialized_len); + PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); // Write signature (nonce and tag) - dst->Write(encrypted_data.data() + 4, encryption::kNonceLength); - dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, - encryption::kGcmTagLength); + PARQUET_THROW_NOT_OK( + 
dst->Write(encrypted_data.data() + 4, encryption::kNonceLength)); + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength)); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); From 44f37b05684edf78b53fc633ab5dc0f690de3c43 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 4 Jun 2019 08:10:22 +0700 Subject: [PATCH 101/125] fix metadata set, statistics set issues --- cpp/src/parquet/encryption-metadata-test.cc | 422 +++++++++++++++++++- cpp/src/parquet/metadata.cc | 208 +++++----- cpp/src/parquet/metadata.h | 1 + 3 files changed, 534 insertions(+), 97 deletions(-) diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc index 6f29b364b5b..7ff31f9b212 100644 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -21,6 +21,7 @@ #include "parquet/properties.h" #include "parquet/schema.h" +#include "parquet/statistics.h" namespace parquet { @@ -28,9 +29,9 @@ namespace metadata { const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 const char kColumnEncryptionKey1[] = "1234567890123450"; -const char kColumnEncryptionKey2[] = "1234567890123451"; +// const char kColumnEncryptionKey2[] = "1234567890123451"; -TEST(Metadata, EncryptFooter) { +TEST(Metadata, UniformEncryption) { parquet::schema::NodeVector fields; parquet::schema::NodePtr root; parquet::SchemaDescriptor schema; @@ -40,19 +41,294 @@ TEST(Metadata, EncryptFooter) { root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); schema.Init(root); + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + .set_max(std::string(reinterpret_cast(&int_max), 
4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + + WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); + + auto file_crypto_metadata = f_builder->GetCryptoMetaData(); + ASSERT_EQ(true, 
file_crypto_metadata != NULLPTR); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg1_column1->is_stats_set()); + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column1->statistics()->null_count()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column1->num_values()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column1->encodings().size()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column1->total_compressed_size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + 
ASSERT_EQ(10, rg1_column1->data_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg2_column1->is_stats_set()); + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg2_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg2_column1->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column1->statistics()->null_count()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column1->num_values()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column1->encodings().size()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column1->total_compressed_size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(10, rg2_column1->data_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); +} + +TEST(Metadata, EncryptFooterAndOneColumn) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + +
fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + .set_max(std::string(reinterpret_cast(&int_max), 4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + std::shared_ptr int_col_path = + parquet::schema::ColumnPath::FromDotString("int_col"); + ColumnEncryptionProperties::Builder int_col_builder(int_col_path); + int_col_builder.key(kColumnEncryptionKey1); + int_col_builder.key_id("kc1"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + encryption_col_props; + encryption_col_props[int_col_path] = int_col_builder.build(); + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); encryption_prop_builder.footer_key_metadata("kf"); + encryption_prop_builder.column_properties(encryption_col_props); WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); writer_prop_builder.encryption(encryption_prop_builder.build()); auto props = writer_prop_builder.build(); auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto file_metadata = f_builder->Finish(); - ASSERT_EQ(false, file_metadata->is_encryption_algorithm_set()); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + 
stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); auto file_crypto_metadata = f_builder->GetCryptoMetaData(); ASSERT_EQ(true, file_crypto_metadata != NULLPTR); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(false, rg1_column1->is_metadata_set()); + ASSERT_THROW(rg1_column1->is_stats_set(), ParquetException); + ASSERT_THROW(rg1_column1->statistics(), ParquetException); + ASSERT_THROW(rg1_column1->compression(), ParquetException); + 
ASSERT_THROW(rg1_column1->num_values(), ParquetException); + ASSERT_THROW(rg1_column1->encodings(), ParquetException); + ASSERT_THROW(rg1_column1->total_compressed_size(), ParquetException); + ASSERT_THROW(rg1_column1->total_uncompressed_size(), ParquetException); + ASSERT_THROW(rg1_column1->dictionary_page_offset(), ParquetException); + ASSERT_THROW(rg1_column1->data_page_offset(), ParquetException); + + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(false, rg2_column1->is_metadata_set()); + ASSERT_THROW(rg2_column1->is_stats_set(), ParquetException); + ASSERT_THROW(rg2_column1->statistics(), ParquetException); + ASSERT_THROW(rg2_column1->compression(), ParquetException); + ASSERT_THROW(rg2_column1->num_values(), ParquetException); + ASSERT_THROW(rg2_column1->encodings(), ParquetException); + ASSERT_THROW(rg2_column1->total_compressed_size(), ParquetException); + ASSERT_THROW(rg2_column1->total_uncompressed_size(), ParquetException); + ASSERT_THROW(rg2_column1->dictionary_page_offset(), ParquetException); +
ASSERT_THROW(rg2_column1->data_page_offset(), ParquetException); + + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); } TEST(Metadata, PlaintextFooter) { @@ -65,20 +341,154 @@ TEST(Metadata, PlaintextFooter) { root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); schema.Init(root); + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + .set_max(std::string(reinterpret_cast(&int_max), 4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + std::shared_ptr int_col_path = + parquet::schema::ColumnPath::FromDotString("int_col"); + ColumnEncryptionProperties::Builder int_col_builder(int_col_path); + int_col_builder.key(kColumnEncryptionKey1); + int_col_builder.key_id("kc1"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + encryption_col_props; + encryption_col_props[int_col_path] = int_col_builder.build(); + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); 
encryption_prop_builder.footer_key_metadata("kf"); encryption_prop_builder.set_plaintext_footer(); + encryption_prop_builder.column_properties(encryption_col_props); WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); writer_prop_builder.encryption(encryption_prop_builder.build()); auto props = writer_prop_builder.build(); auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto file_metadata = f_builder->Finish(); - ASSERT_EQ(true, file_metadata->is_encryption_algorithm_set()); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(true, f_accessor->is_encryption_algorithm_set()); auto file_crypto_metadata = f_builder->GetCryptoMetaData(); ASSERT_EQ(NULLPTR, file_crypto_metadata); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, 
f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg1_column1->is_metadata_set()); + ASSERT_EQ(false, rg1_column1->is_stats_set()); + ASSERT_EQ(NULLPTR, rg1_column1->statistics()); + // ASSERT_THROW(rg1_column1->encodings(), ParquetException); + + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); + ASSERT_EQ(nrows / 2, rg1_column1->num_values()); + ASSERT_EQ(3, rg1_column1->encodings().size()); + ASSERT_EQ(512, rg1_column1->total_compressed_size()); + ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); + ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); + ASSERT_EQ(10, rg1_column1->data_page_offset()); + + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, 
rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg2_column1->is_metadata_set()); + ASSERT_EQ(false, rg2_column1->is_stats_set()); + ASSERT_EQ(NULLPTR, rg2_column1->statistics()); + // ASSERT_THROW(rg2_column1->encodings(), ParquetException); + + ASSERT_EQ(nrows / 2, rg2_column1->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); + ASSERT_EQ(3, rg2_column1->encodings().size()); + ASSERT_EQ(512, rg2_column1->total_compressed_size()); + ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); + ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); + ASSERT_EQ(10, rg2_column1->data_page_offset()); + + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); } } // namespace metadata diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5115888ad47..05f9874ea21 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -167,37 +167,36 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { + is_metadata_set_ = column->__isset.meta_data; metadata_ = column->meta_data; - if (column->__isset.crypto_metadata) { + 
if (column->__isset.crypto_metadata && !is_metadata_set_) { format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { - if (file_decryptor->properties() == NULLPTR) { - throw ParquetException( - "Cannot decrypt ColumnMetadata. " - "FileDecryptionProperties must be provided."); + if (file_decryptor != NULLPTR && file_decryptor->properties() != NULLPTR) { + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + + std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, + aad_column_metadata); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, decryptor, false); + is_metadata_set_ = true; } - // should decrypt metadata - std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - - DCHECK(file_decryptor != NULLPTR); - - std::string aad_column_metadata = encryption::CreateModuleAad( - file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, - column_ordinal, (int16_t)-1); - auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, - aad_column_metadata); - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg( - reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, decryptor, false); } } - for (auto encoding : metadata_.encodings) { - encodings_.push_back(FromThrift(encoding)); + if (is_metadata_set_) { + for (auto encoding : metadata_.encodings) { + 
encodings_.push_back(FromThrift(encoding)); + } } possible_stats_ = nullptr; } @@ -206,12 +205,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() const { return FromThrift(metadata_.type); } + inline bool is_metadata_set() const { return is_metadata_set_; } + inline Type::type type() const { return FromThrift(GetMetadataIfSet().type); } - inline int64_t num_values() const { return metadata_.num_values; } + inline int64_t num_values() const { return GetMetadataIfSet().num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(metadata_.path_in_schema); + return std::make_shared(GetMetadataIfSet().path_in_schema); } // Check if statistics are set and are valid @@ -221,11 +221,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { + auto metadata = GetMetadataIfSet(); + if (!metadata.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(metadata_, descr_); + possible_stats_ = MakeColumnStats(metadata, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -236,28 +237,39 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return is_stats_set() ? 
possible_stats_ : nullptr; } - inline Compression::type compression() const { return FromThrift(metadata_.codec); } + inline Compression::type compression() const { + return FromThrift(GetMetadataIfSet().codec); + } - const std::vector& encodings() const { return encodings_; } + const std::vector& encodings() const { + GetMetadataIfSet(); + return encodings_; + } inline bool has_dictionary_page() const { - return metadata_.__isset.dictionary_page_offset; + return GetMetadataIfSet().__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return metadata_.dictionary_page_offset; + return GetMetadataIfSet().dictionary_page_offset; } - inline int64_t data_page_offset() const { return metadata_.data_page_offset; } + inline int64_t data_page_offset() const { return GetMetadataIfSet().data_page_offset; } - inline bool has_index_page() const { return metadata_.__isset.index_page_offset; } + inline bool has_index_page() const { + return GetMetadataIfSet().__isset.index_page_offset; + } - inline int64_t index_page_offset() const { return metadata_.index_page_offset; } + inline int64_t index_page_offset() const { + return GetMetadataIfSet().index_page_offset; + } - inline int64_t total_compressed_size() const { return metadata_.total_compressed_size; } + inline int64_t total_compressed_size() const { + return GetMetadataIfSet().total_compressed_size; + } inline int64_t total_uncompressed_size() const { - return metadata_.total_uncompressed_size; + return GetMetadataIfSet().total_uncompressed_size; } inline std::unique_ptr crypto_metadata() const { @@ -276,6 +288,16 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { format::ColumnMetaData metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; + bool is_metadata_set_; + + inline const format::ColumnMetaData& GetMetadataIfSet() const { + if (!is_metadata_set_) { + throw ParquetException( + "Cannot decrypt ColumnMetadata. 
" + "FileDecryptionProperties must be provided."); + } + return metadata_; + } }; std::unique_ptr ColumnChunkMetaData::Make( @@ -303,6 +325,8 @@ int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); } // column metadata +bool ColumnChunkMetaData::is_metadata_set() const { return impl_->is_metadata_set(); } + Type::type ColumnChunkMetaData::type() const { return impl_->type(); } int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); } @@ -912,27 +936,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_metadata_.__set_encodings(thrift_encodings); - // temporary fix: setting for columnchunk meta_data in case file is not encrypted - if (properties_->file_encryption_properties() == NULLPTR) { + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); - } - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; - - // column is unencrypted - if (encryptor == NULLPTR) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - bool encrypt_metadata = encryptor->encryptColumnMetaData( - properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); + } else { column_chunk_->__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { @@ -948,10 +956,62 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); + bool encrypted_footer = + 
properties_->file_encryption_properties()->encrypted_footer(); + bool encrypt_metadata = + !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); if (!encrypt_metadata) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); - } else { // Serialize and encrypt ColumnMetadata separately + } else if (!encrypted_footer) { + // Keep redacted metadata version for old readers + format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_metadata_.type); + metadata_redacted.__set_encodings(column_metadata_.encodings); + metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); + metadata_redacted.__set_codec(column_metadata_.codec); + metadata_redacted.__set_num_values(column_metadata_.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_metadata_.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_metadata_.total_compressed_size); + if (column_metadata_.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata(column_metadata_.key_value_metadata); + } + metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); + if (column_metadata_.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); + } + if (column_metadata_.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_metadata_.dictionary_page_offset); + } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); + } + } + } + + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor) { + ThriftSerializer serializer; + + const auto& 
encrypt_md = properties_->column_encryption_properties(column_->path()); + // column is unencrypted + if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { + serializer.Serialize(column_chunk_, sink); + } else { // column is encrypted + bool encrypt_metadata = encryptor->encryptColumnMetaData( + properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); + + if (encrypt_metadata) { + // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; @@ -969,40 +1029,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Keep redacted metadata version for old readers - if (!properties_->file_encryption_properties()->encrypted_footer()) { - // metadata_redacted should be stripped of the column_metadata_ statistics. 
- format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_metadata_.type); - metadata_redacted.__set_encodings(column_metadata_.encodings); - metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); - metadata_redacted.__set_codec(column_metadata_.codec); - metadata_redacted.__set_num_values(column_metadata_.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_metadata_.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_metadata_.total_compressed_size); - if (column_metadata_.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata( - column_metadata_.key_value_metadata); - } - metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); - if (column_metadata_.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); - } - if (column_metadata_.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_metadata_.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); - } } serializer.Serialize(column_chunk_, sink); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index b3bee5d28b7..61a1d53e0b9 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -139,6 +139,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::string& file_path() const; // column metadata + bool is_metadata_set() const; Type::type type() const; int64_t num_values() const; std::shared_ptr path_in_schema() const; From 72ca77fe895be5069151b8128f97ab51728cc194 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: 
Wed, 5 Jun 2019 08:44:09 +0300 Subject: [PATCH 102/125] Various changes to encryption-reader-writer-all-crypto-options test and to AAD handling in file_reader.cc --- ...yption-reader-writer-all-crypto-options.cc | 244 ++++-------------- ...ryption-reader-writer-all-crypto-options.h | 54 ++++ cpp/src/parquet/file_reader.cc | 60 +++-- 3 files changed, 133 insertions(+), 225 deletions(-) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 98f0e57a7aa..5241e559cae 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -23,7 +23,7 @@ #include #include -#include +#include /* * This file contains samples for writing and reading encrypted Parquet files in different @@ -52,21 +52,25 @@ * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * The write sample creates files with eight columns in the following + * The write sample creates files with four columns in the following * encryption configurations: * * - Encryption configuration 1: Encrypt all columns and the footer with the same key. * (uniform encryption) - * - Encryption configuration 2: Encrypt two columns and the footer. - * - Encryption configuration 3: Encrypt two columns. Don’t encrypt footer (to enable - * legacy readers) - plaintext footer mode. - * - Encryption configuration 4: Encrypt two columns and the footer. Supply aad_prefix - * for file identity verification. - * - Encryption configuration 5: Encrypt two columns and the footer. Supply aad_prefix, - * and call disable_aad_prefix_storage to prevent file + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. 
+ * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file * identity storage in file metadata. - * - Encryption configuration 6: Encrypt two columns and the footer. Use the - * alternative (AES_GCM_CTR_V1) algorithm. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. * * The read sample uses each of the following decryption configurations to read every * encrypted files in the input directory: @@ -88,8 +92,11 @@ const std::string kColumnEncryptionKey2 = "1234567890123451"; const std::string fileName = "tester"; void PrintDecryptionConfiguration(int configuration); +// Check that the decryption result is as expected. void CheckResult(std::string file, int example_id, std::string exception_msg); -int ExtractEncryptionConfigurationNumber(std::string file); +// Returns true if FileName ends with suffix. Otherwise returns false. +// Used to skip unencrypted parquet files. +bool FileNameEndsWith(std::string file_name, std::string suffix); std::vector GetDirectoryFiles(const std::string& path) { std::vector files; @@ -123,7 +130,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { vector_of_encryption_configurations.push_back( file_encryption_builder_1.footer_key_metadata("kf")->build()); - // Encryption configuration 2: Encrypt two columns and the footer. + // Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -148,7 +155,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->column_properties(encryption_cols2) ->build()); - // Encryption configuration 3: Encrypt two columns, don’t encrypt footer. + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. // (plaintext footer mode, readable by legacy readers) std::map, std::shared_ptr, @@ -170,7 +178,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->set_plaintext_footer() ->build()); - // Encryption configuration 4: Encrypt two columns and the footer. Use aad_prefix. + // Encryption configuration 4: Encrypt two columns and the footer, with different keys. + // Use aad_prefix. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -191,8 +200,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->aad_prefix(fileName) ->build()); - // Encryption configuration 5: Encrypt two columns and the footer. Use aad_prefix and - // disable_aad_prefix_storage. + // Encryption configuration 5: Encrypt two columns and the footer, with different keys. + // Use aad_prefix and disable_aad_prefix_storage. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -214,8 +223,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->disable_store_aad_prefix_storage() ->build()); - // Encryption configuration 6: Encrypt two columns and the footer. Use AES_GCM_CTR_V1 - // algorithm. + // Encryption configuration 6: Encrypt two columns and the footer, with different keys. + // Use AES_GCM_CTR_V1 algorithm. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -290,31 +299,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { int32_writer->WriteBatch(1, nullptr, nullptr, &value); } - // Write the Int64 column. Each row has repeats twice. 
- parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Write the Float column parquet::FloatWriter* float_writer = static_cast(rg_writer->NextColumn()); @@ -330,38 +314,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { double value = i * 1.1111111; double_writer->WriteBatch(1, nullptr, nullptr, &value); } - - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } - - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); - - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } // Close the ParquetFileWriter file_writer->Close(); @@ -387,7 +339,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Decryption configuration 1: Decrypt using key retriever callback that holds the keys // of two encrypted columns and the footer key. 
- std::shared_ptr string_kr1 = std::make_shared(); string_kr1->PutKey("kf", kFooterEncryptionKey); @@ -449,8 +400,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { PrintDecryptionConfiguration(example_id + 1); for (auto const& file : files_in_directory) { std::string exception_msg = ""; - if (file.find("parquet.encrypted") == - std::string::npos) // Skip non encrypted files + if (!FileNameEndsWith(file, "parquet.encrypted")) // Skip non encrypted files continue; try { std::cout << "--> Read file " << file << std::endl; @@ -476,7 +426,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Get the number of Columns int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); + assert(num_columns == 4); // Iterate over all the RowGroups in the file for (int r = 0; r < num_row_groups; ++r) { @@ -486,8 +436,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { int64_t values_read = 0; int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; int i; std::shared_ptr column_reader; @@ -534,63 +482,8 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { i++; } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); - } - i++; - } - - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); - } - i++; - } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); + column_reader = row_group_reader->Column(2); parquet::FloatReader* float_reader = static_cast(column_reader.get()); // Read all the rows in the column @@ -612,7 +505,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { } // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); + column_reader = row_group_reader->Column(3); parquet::DoubleReader* double_reader = static_cast(column_reader.get()); // Read 
all the rows in the column @@ -632,61 +525,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { assert(value == expected_value); i++; } - - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); - } - i++; - } - - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; - } } } catch (const std::exception& e) { exception_msg = e.what(); @@ -716,7 +554,8 @@ void PrintDecryptionConfiguration(int configuration) { std::cout << std::endl; } -int ExtractEncryptionConfigurationNumber(std::string file) { +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg) { int encryption_configuration_number; std::regex r("tester([0-9]+)\\.parquet.encrypted"); std::smatch m; @@ -732,11 +571,6 @@ int ExtractEncryptionConfigurationNumber(std::string file) { std::cerr << "Error: Unknown encryption configuration number. " << std::endl; } - return encryption_configuration_number; -} - -void CheckResult(std::string file, int example_id, std::string exception_msg) { - int encryption_configuration_number = ExtractEncryptionConfigurationNumber(file); int decryption_configuration_number = example_id + 1; // Encryption_configuration number five contains aad_prefix and @@ -765,6 +599,16 @@ void CheckResult(std::string file, int example_id, std::string exception_msg) { std::cout << "Error: Unexpected exception was thrown." 
<< exception_msg; } +bool FileNameEndsWith(std::string file_name, std::string suffix) { + std::string::size_type idx = file_name.find_first_of('.'); + + if (idx != std::string::npos) { + std::string extension = file_name.substr(idx + 1); + if (extension.compare(suffix) == 0) return true; + } + return false; +} + int main(int argc, char** argv) { enum Operation { write, read }; std::string rootPath; diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h new file mode 100644 index 00000000000..db1b692b9f0 --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include + +using parquet::LogicalType; +using parquet::Repetition; +using parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + +constexpr int FIXED_LENGTH = 10; + +static std::shared_ptr SetupSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, LogicalType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + LogicalType::TIME_MILLIS)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, + LogicalType::NONE)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index ee60a563f9f..65a496ab50b 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -277,41 +277,46 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } - std::string aad_prefix = file_decryption_properties->aad_prefix(); - + // Handle AAD prefix EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properites; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file" - "and not supplied in decryption properties"); + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } - if (!algo.aad.aad_prefix.empty()) { - if (!aad_prefix.empty()) { - if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + if (file_has_aad_prefix) { + if (!aad_prefix_in_properites.empty()) { + if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { throw ParquetException( - "AAD Prefix in file and in properties is not the same"); + "AAD Prefix in file and in properties " + "is not the same"); } } - aad_prefix = algo.aad.aad_prefix; + aad_prefix = aad_prefix_in_file; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD Prefix set in decryption properties, but not found in file"); + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); } std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file and not " - "supplied in decryption properties"); + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata())); @@ -367,37 +372,42 @@ class SerializedFile : public 
ParquetFileReader::Contents { FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); - std::string aad_prefix = file_decryption_properties->aad_prefix(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + // Handle AAD prefix + std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properites; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { throw ParquetException( "AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - if (!algo.aad.aad_prefix.empty()) { - if (!aad_prefix.empty()) { - if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + if (file_has_aad_prefix) { + if (!aad_prefix_in_properites.empty()) { + if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { throw ParquetException( "AAD Prefix in file and in properties " "is not the same"); } } - aad_prefix = algo.aad.aad_prefix; + aad_prefix = aad_prefix_in_file; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD Prefix set in decryption properties, but not found in file"); + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); } std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file and not " - "supplied in decryption properties"); + "AAD Prefix 
Verifier is set, but AAD Prefix not found in file"); } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; From 899a6035c0100f662700fa173eb8188344f1f8be Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 5 Jun 2019 15:06:05 +0300 Subject: [PATCH 103/125] Fix logging error --- cpp/src/parquet/encryption.cc | 223 ++++++++++++++++++++++++++++++++++ cpp/src/parquet/encryption.h | 208 +++---------------------------- 2 files changed, 243 insertions(+), 188 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 8a88db00224..096df59b135 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,6 +22,7 @@ #include #include +#include "arrow/util/logging.h" #include "arrow/util/utf8.h" namespace parquet { @@ -47,6 +48,23 @@ const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key( + std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; + return this; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata( + const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; +} + ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( const std::string& key_id) { // key_id is expected to be in UTF8 encoding @@ -61,6 +79,211 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id return this; } +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (std::pair, + 
std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + + column_properties_ = column_properties; + return this; +} + +void FileDecryptionProperties::wipeout_decryption_keys() { + if (!footer_key_.empty()) + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_decryption_key(); + } +} + +bool FileDecryptionProperties::is_utilized() { + if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + return false; + + return utilized_; +} + +std::shared_ptr FileDecryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix, + aad_prefix_verifier_, column_properties_map_copy, plaintext_files_allowed_)); +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key( + const std::string footer_key) { + if (footer_key.empty()) { + return this; + } + DCHECK(footer_key_.empty()); + footer_key_ = footer_key; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever( + const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; +} + 
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier( + std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; +} + +ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key( + const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; +} + +std::shared_ptr ColumnDecryptionProperties::Builder::build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); +} + +void ColumnDecryptionProperties::wipeout_decryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } +} + +std::shared_ptr ColumnDecryptionProperties::DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata( + const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column 
properties utilized in another file"); + } + element.second->set_utilized(); + } + column_properties_ = column_properties; + return this; +} + +void FileEncryptionProperties::wipeout_encryption_keys() { + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_encryption_key(); + } +} + +std::shared_ptr FileEncryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; +} + +FileEncryptionProperties::Builder* +FileEncryptionProperties::Builder::disable_store_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; +} + ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 7d943d15be5..f3994ecf822 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -24,7 +24,6 @@ #include #include -#include 
"arrow/util/logging.h" #include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" @@ -104,22 +103,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { /// The key is cloned, and will be wiped out (array values set to 0) upon completion /// of file writing. /// Caller is responsible for wiping out the input key array. - Builder* key(std::string column_key) { - if (column_key.empty()) return this; - - DCHECK(key_.empty()); - key_ = column_key; - return this; - } + Builder* key(std::string column_key); /// Set a key retrieval metadata. /// use either key_metadata() or key_id(), not both - Builder* key_metadata(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } + Builder* key_metadata(const std::string& key_metadata); /// Set a key retrieval metadata (converted from String). /// use either key_metadata() or key_id(), not both @@ -197,18 +185,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { /// key metadata for this column the metadata will be ignored, /// the column will be decrypted with this key. /// key length must be either 16, 24 or 32 bytes. 
- Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } + Builder* key(const std::string& key); - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } + std::shared_ptr build(); private: const std::shared_ptr column_path_; @@ -225,17 +204,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { void set_utilized() { utilized_ = true; } - void wipeout_decryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } - } + void wipeout_decryption_key(); - std::shared_ptr DeepClone() { - std::string key_copy = key_; - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_copy)); - } + std::shared_ptr DeepClone(); private: const std::shared_ptr column_path_; @@ -281,14 +252,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// will be wiped out (array values set to 0). /// Caller is responsible for wiping out the input key array. /// param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string footer_key) { - if (footer_key.empty()) { - return this; - } - DCHECK(footer_key_.empty()); - footer_key_ = footer_key; - return this; - } + Builder* footer_key(const std::string footer_key); /// Set explicit column keys (decryption properties). /// Its also possible to set a key retriever on this property object. 
@@ -299,24 +263,7 @@ class PARQUET_EXPORT FileDecryptionProperties { Builder* column_properties( const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - for (std::pair, - std::shared_ptr> - element : column_properties) { - if (element.second->is_utilized()) { - throw ParquetException("Column properties utilized in another file"); - } - element.second->set_utilized(); - } - - column_properties_ = column_properties; - return this; - } + schema::ColumnPath::CmpColumnPath>& column_properties); /// Set a key retriever callback. Its also possible to /// set explicit footer or column keys on this file property object. @@ -324,13 +271,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// invocation of the retriever callback. /// If an explicit key is available for a footer or a column, /// its key metadata will be ignored. - Builder* key_retriever(const std::shared_ptr& key_retriever) { - if (key_retriever == NULLPTR) return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } + Builder* key_retriever(const std::shared_ptr& key_retriever); /// Skip integrity verification of plaintext footers. /// If not called, integrity of plaintext footers will be checked in runtime, @@ -347,23 +288,10 @@ class PARQUET_EXPORT FileDecryptionProperties { /// A must when a prefix is used for file encryption, but not stored in file. /// If AAD prefix is stored in file, it will be compared to the explicitly /// supplied value and an exception will be thrown if they differ. 
- Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } + Builder* aad_prefix(const std::string& aad_prefix); /// Set callback for verification of AAD Prefixes stored in file. - Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier); /// By default, reading plaintext (unencrypted) files is not /// allowed when using a decryptor @@ -412,23 +340,9 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } - void wipeout_decryption_keys() { - if (!footer_key_.empty()) - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + void wipeout_decryption_keys(); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_decryption_key(); - } - } - - bool is_utilized() { - if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) - return false; - - return utilized_; - } + bool is_utilized(); void set_utilized() { utilized_ = true; } @@ -439,28 +353,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// This method allows to clone identical properties for another file, /// with an option to update the aadPrefix (if newAadPrefix is null, /// aadPrefix will be cloned too) - std::shared_ptr DeepClone(std::string new_aad_prefix = "") { - std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; - - for (std::pair, - std::shared_ptr> - element : column_properties_) { - column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); - } - - if 
(new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; - return std::shared_ptr(new FileDecryptionProperties( - footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, - new_aad_prefix, aad_prefix_verifier_, column_properties_map_copy, - plaintext_files_allowed_)); - } + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); private: std::string footer_key_; @@ -520,32 +413,14 @@ class PARQUET_EXPORT FileEncryptionProperties { /// Set a key retrieval metadata. /// use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } + Builder* footer_key_metadata(const std::string& footer_key_metadata); /// Set the file AAD Prefix. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } + Builder* aad_prefix(const std::string& aad_prefix); /// Skip storing AAD Prefix in file. /// If not called, and if AAD Prefix is set, it will be stored. - Builder* disable_store_aad_prefix_storage() { - DCHECK(!aad_prefix_.empty()); - - store_aad_prefix_in_file_ = false; - return this; - } + Builder* disable_store_aad_prefix_storage(); /// Set the list of encrypted columns and their properties (keys etc). /// If not called, all columns will be encrypted with the footer key. 
@@ -553,23 +428,7 @@ class PARQUET_EXPORT FileEncryptionProperties { Builder* column_properties( const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - for (std::pair, - std::shared_ptr> - element : column_properties) { - if (element.second->is_utilized()) { - throw ParquetException("Column properties utilized in another file"); - } - element.second->set_utilized(); - } - column_properties_ = column_properties; - return this; - } + schema::ColumnPath::CmpColumnPath>& column_properties); std::shared_ptr build() { return std::shared_ptr(new FileEncryptionProperties( @@ -607,41 +466,14 @@ class PARQUET_EXPORT FileEncryptionProperties { void set_utilized() { utilized_ = true; } - void wipeout_encryption_keys() { - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_encryption_key(); - } - } + void wipeout_encryption_keys(); /// FileEncryptionProperties object can be used for writing one file only. /// (at the end, keys are wiped out in the memory). 
/// This method allows to clone identical properties for another file, /// with an option to update the aadPrefix (if newAadPrefix is null, /// aadPrefix will be cloned too) - std::shared_ptr DeepClone(std::string new_aad_prefix = "") { - std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; - - for (std::pair, - std::shared_ptr> - element : column_properties_) { - column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); - } - - if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; - return std::shared_ptr(new FileEncryptionProperties( - algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, - new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); - } + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); private: EncryptionAlgorithm algorithm_; From b59a3aa56f4de170fe2fc133d8d7ac585380d285 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 7 Jun 2019 22:49:46 +0700 Subject: [PATCH 104/125] post-rebase change --- cpp/src/parquet/file_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 6f59f3a47c6..81e45a1ca42 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -482,7 +482,7 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); return WriteFileMetaData(file_metadata, sink); } From afbb8bf0bc5c85ad682764adf9a15bd20102e982 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sat, 15 Jun 2019 22:08:21 +0700 Subject: [PATCH 105/125] fix isset of column chunk metadata and statistics --- cpp/src/parquet/file_writer.cc | 2 - 
cpp/src/parquet/metadata.cc | 191 ++++++++++++++++++--------------- 2 files changed, 105 insertions(+), 88 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 81e45a1ca42..11af1815577 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -316,8 +316,6 @@ class FileSerializer : public ParquetFileWriter::Contents { file_encryptor_->wipeout_encryption_keys(); } } - - PARQUET_THROW_NOT_OK(sink_->Close()); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 05f9874ea21..0bbb60a5950 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -167,10 +167,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { - is_metadata_set_ = column->__isset.meta_data; - metadata_ = column->meta_data; - - if (column->__isset.crypto_metadata && !is_metadata_set_) { + if (column->__isset.crypto_metadata) { // column metadata is encrypted + is_metadata_set_ = false; format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { @@ -192,9 +190,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { is_metadata_set_ = true; } } + } else { // column metadata is not encrypted + is_metadata_set_ = true; } if (is_metadata_set_) { - for (auto encoding : metadata_.encodings) { + const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + for (auto encoding : meta_data.encodings) { encodings_.push_back(FromThrift(encoding)); } } @@ -221,12 +222,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - auto metadata = GetMetadataIfSet(); - if (!metadata.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { + 
const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + if (!meta_data.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(metadata, descr_); + possible_stats_ = MakeColumnStats(meta_data, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -291,12 +292,17 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (!is_metadata_set_) { - throw ParquetException( - "Cannot decrypt ColumnMetadata. " - "FileDecryptionProperties must be provided."); + if (column_->__isset.crypto_metadata) { + if (!is_metadata_set_) { + throw ParquetException( + "Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); + } else { + return metadata_; + } + } else { + return column_->meta_data; } - return metadata_; } }; @@ -895,7 +901,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // column metadata void SetStatistics(const EncodedStatistics& val) { - column_metadata_.__set_statistics(ToThrift(val)); + column_chunk_->meta_data.__set_statistics(ToThrift(val)); } void Finish(int64_t num_values, int64_t dictionary_page_offset, @@ -903,19 +909,19 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback) { if (dictionary_page_offset > 0) { - column_metadata_.__set_dictionary_page_offset(dictionary_page_offset); + column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); } else { column_chunk_->__set_file_offset(data_page_offset + compressed_size); } - - column_metadata_.__set_num_values(num_values); + column_chunk_->__isset.meta_data = true; + 
column_chunk_->meta_data.__set_num_values(num_values); if (index_page_offset >= 0) { - column_metadata_.__set_index_page_offset(index_page_offset); + column_chunk_->meta_data.__set_index_page_offset(index_page_offset); } - column_metadata_.__set_data_page_offset(data_page_offset); - column_metadata_.__set_total_uncompressed_size(uncompressed_size); - column_metadata_.__set_total_compressed_size(compressed_size); + column_chunk_->meta_data.__set_data_page_offset(data_page_offset); + column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); + column_chunk_->meta_data.__set_total_compressed_size(compressed_size); std::vector thrift_encodings; if (has_dictionary) { @@ -934,14 +940,38 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { if (dictionary_fallback) { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } - column_metadata_.__set_encodings(thrift_encodings); + column_chunk_->meta_data.__set_encodings(thrift_encodings); + } + + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor) { + ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + // column is unencrypted if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - } else { - column_chunk_->__isset.crypto_metadata = true; + serializer.Serialize(column_chunk_, sink); + } else { // column is encrypted + // copy column chunk, except for meta_data + format::ColumnChunk column_chunk; + column_chunk.__set_file_offset(column_chunk_->file_offset); + if (column_chunk_->__isset.file_path) { + column_chunk.__set_file_path(column_chunk_->file_path); + } + if (column_chunk_->__isset.offset_index_offset) { + column_chunk.__set_offset_index_offset(column_chunk_->offset_index_offset); + } + if (column_chunk_->__isset.offset_index_length) { + 
column_chunk.__set_offset_index_length(column_chunk_->offset_index_length); + } + if (column_chunk_->__isset.column_index_offset) { + column_chunk.__set_column_index_offset(column_chunk_->column_index_offset); + } + if (column_chunk_->__isset.column_index_length) { + column_chunk.__set_column_index_length(column_chunk_->column_index_length); + } + + column_chunk.__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { // encrypted with footer key @@ -954,62 +984,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk_->__set_crypto_metadata(ccmd); + column_chunk.__set_crypto_metadata(ccmd); bool encrypted_footer = properties_->file_encryption_properties()->encrypted_footer(); bool encrypt_metadata = !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); - if (!encrypt_metadata) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - } else if (!encrypted_footer) { - // Keep redacted metadata version for old readers - format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_metadata_.type); - metadata_redacted.__set_encodings(column_metadata_.encodings); - metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); - metadata_redacted.__set_codec(column_metadata_.codec); - metadata_redacted.__set_num_values(column_metadata_.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_metadata_.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_metadata_.total_compressed_size); - if (column_metadata_.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata(column_metadata_.key_value_metadata); - } - metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); - if 
(column_metadata_.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); - } - if (column_metadata_.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_metadata_.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); - } - } - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; - - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - // column is unencrypted - if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - bool encrypt_metadata = encryptor->encryptColumnMetaData( - properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); - if (encrypt_metadata) { // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, @@ -1017,7 +997,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, + serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len, &serialized_data); std::vector encrypted_data(encryptor->CiphertextSizeDelta() + @@ -1028,28 +1008,67 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); - column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); + column_chunk.__set_encrypted_column_metadata(encrypted_column_metadata); + + if 
(encrypted_footer) { + column_chunk.__isset.meta_data = false; + } else { + // Keep redacted metadata version for old readers + format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_chunk_->meta_data.type); + metadata_redacted.__set_encodings(column_chunk_->meta_data.encodings); + metadata_redacted.__set_path_in_schema(column_chunk_->meta_data.path_in_schema); + metadata_redacted.__set_codec(column_chunk_->meta_data.codec); + metadata_redacted.__set_num_values(column_chunk_->meta_data.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_chunk_->meta_data.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_chunk_->meta_data.total_compressed_size); + if (column_chunk_->meta_data.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata( + column_chunk_->meta_data.key_value_metadata); + } + metadata_redacted.__set_data_page_offset( + column_chunk_->meta_data.data_page_offset); + if (column_chunk_->meta_data.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset( + column_chunk_->meta_data.index_page_offset); + } + if (column_chunk_->meta_data.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_chunk_->meta_data.dictionary_page_offset); + } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + + column_chunk.__isset.meta_data = true; + column_chunk.__set_meta_data(metadata_redacted); + } } - serializer.Serialize(column_chunk_, sink); + serializer.Serialize(&column_chunk, sink); } } const ColumnDescriptor* descr() const { return column_; } - int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } + int64_t total_compressed_size() const { + return 
column_chunk_->meta_data.total_compressed_size; + } private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; - column_metadata_ = column_chunk_->meta_data; - column_metadata_.__set_type(ToThrift(column_->physical_type())); - column_metadata_.__set_path_in_schema(column_->path()->ToDotVector()); - column_metadata_.__set_codec(ToThrift(properties_->compression(column_->path()))); + column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type())); + column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector()); + column_chunk_->meta_data.__set_codec( + ToThrift(properties_->compression(column_->path()))); } format::ColumnChunk* column_chunk_; std::unique_ptr owned_column_chunk_; - format::ColumnMetaData column_metadata_; const std::shared_ptr properties_; const ColumnDescriptor* column_; }; From 1661ebbfba9db652f8b091f56e60895eeb402b1a Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 16 Jun 2019 16:11:33 +0700 Subject: [PATCH 106/125] temporarily remove encryption-metadata-test --- cpp/src/parquet/CMakeLists.txt | 1 - cpp/src/parquet/encryption-metadata-test.cc | 495 -------------------- 2 files changed, 496 deletions(-) delete mode 100644 cpp/src/parquet/encryption-metadata-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 94717699e84..afa82be376e 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -322,7 +322,6 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test) - add_parquet_test(encryption-metadata-test) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc deleted file mode 100644 index 7ff31f9b212..00000000000 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ /dev/null @@ -1,495 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/metadata.h" - -#include - -#include "parquet/properties.h" -#include "parquet/schema.h" -#include "parquet/statistics.h" - -namespace parquet { - -namespace metadata { - -const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 -const char kColumnEncryptionKey1[] = "1234567890123450"; -// const char kColumnEncryptionKey2[] = "1234567890123451"; - -TEST(Metadata, UniformEncryption) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); - schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) - 
.set_max(std::string(reinterpret_cast(&float_max), 4)); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(true, file_crypto_metadata != NULLPTR); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - 
ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg1_column1->is_stats_set()); - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column1->statistics()->null_count()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column1->num_values()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column1->encodings().size()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column1->total_compressed_size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(10, rg1_column1->data_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - 
ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg2_column1->is_stats_set()); - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column1->statistics()->null_count()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column1->num_values()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column1->encodings().size()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column1->total_compressed_size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(10, rg2_column1->data_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -TEST(Metadata, EncryptFooterAndOneColumn) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); 
- schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) - .set_max(std::string(reinterpret_cast(&float_max), 4)); - - std::shared_ptr int_col_path = - parquet::schema::ColumnPath::FromDotString("int_col"); - ColumnEncryptionProperties::Builder int_col_builder(int_col_path); - int_col_builder.key(kColumnEncryptionKey1); - int_col_builder.key_id("kc1"); - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - encryption_col_props; - encryption_col_props[int_col_path] = int_col_builder.build(); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - encryption_prop_builder.column_properties(encryption_col_props); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - 
rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(true, file_crypto_metadata != NULLPTR); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(false, rg1_column1->is_metadata_set()); - ASSERT_THROW(rg1_column1->is_stats_set(), ParquetException); - ASSERT_THROW(rg1_column1->statistics(), ParquetException); - ASSERT_THROW(rg1_column1->compression(), ParquetException); - ASSERT_THROW(rg1_column1->num_values(), ParquetException); - ASSERT_THROW(rg1_column1->encodings(), ParquetException); - ASSERT_THROW(rg1_column1->total_compressed_size(), ParquetException); - ASSERT_THROW(rg1_column1->total_uncompressed_size(), ParquetException); - ASSERT_THROW(rg1_column1->dictionary_page_offset(), 
ParquetException); - ASSERT_THROW(rg1_column1->data_page_offset(), ParquetException); - - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(false, rg1_column1->is_metadata_set()); - ASSERT_THROW(rg2_column1->is_stats_set(), ParquetException); - ASSERT_THROW(rg2_column1->statistics(), ParquetException); - ASSERT_THROW(rg2_column1->compression(), ParquetException); - ASSERT_THROW(rg2_column1->num_values(), ParquetException); - ASSERT_THROW(rg2_column1->encodings(), ParquetException); - ASSERT_THROW(rg2_column1->total_compressed_size(), ParquetException); - ASSERT_THROW(rg2_column1->total_uncompressed_size(), ParquetException); - ASSERT_THROW(rg2_column1->dictionary_page_offset(), ParquetException); - ASSERT_THROW(rg2_column1->data_page_offset(), ParquetException); - - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - 
ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -TEST(Metadata, PlaintextFooter) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); - schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) - .set_max(std::string(reinterpret_cast(&float_max), 4)); - - std::shared_ptr int_col_path = - parquet::schema::ColumnPath::FromDotString("int_col"); - ColumnEncryptionProperties::Builder int_col_builder(int_col_path); - int_col_builder.key(kColumnEncryptionKey1); - int_col_builder.key_id("kc1"); - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - encryption_col_props; - encryption_col_props[int_col_path] = int_col_builder.build(); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - encryption_prop_builder.set_plaintext_footer(); - 
encryption_prop_builder.column_properties(encryption_col_props); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(true, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(NULLPTR, file_crypto_metadata); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, 
f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg1_column1->is_metadata_set()); - ASSERT_EQ(false, rg1_column1->is_stats_set()); - ASSERT_EQ(NULLPTR, rg1_column1->statistics()); - // ASSERT_THROW(rg1_column1->encodings(), ParquetException); - - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); - ASSERT_EQ(nrows / 2, rg1_column1->num_values()); - ASSERT_EQ(3, rg1_column1->encodings().size()); - ASSERT_EQ(512, rg1_column1->total_compressed_size()); - ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); - ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); - ASSERT_EQ(10, rg1_column1->data_page_offset()); - - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(true, 
rg2_column1->is_metadata_set()); - ASSERT_EQ(false, rg2_column1->is_stats_set()); - ASSERT_EQ(NULLPTR, rg2_column1->statistics()); - // ASSERT_THROW(rg2_column1->encodings(), ParquetException); - - ASSERT_EQ(nrows / 2, rg2_column1->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); - ASSERT_EQ(3, rg2_column1->encodings().size()); - ASSERT_EQ(512, rg2_column1->total_compressed_size()); - ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); - ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); - ASSERT_EQ(10, rg2_column1->data_page_offset()); - - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -} // namespace metadata -} // namespace parquet From d604d108cf966569b8202a87690160b72b07b7fe Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 17 Jun 2019 15:53:08 +0700 Subject: [PATCH 107/125] fix windows compiling issue --- cpp/src/parquet/encryption.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f3994ecf822..2ffe36ec73f 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -85,7 +85,7 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { class PARQUET_EXPORT ColumnEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { 
public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -172,7 +172,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) @@ -233,7 +233,7 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -384,7 +384,7 @@ class PARQUET_EXPORT FileDecryptionProperties { class PARQUET_EXPORT FileEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), From 00fbcf669e8b2847b0d2cb2d18119dab3154331e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 17 Jun 2019 23:23:29 +0700 Subject: [PATCH 108/125] fix issue of parquet-encryption-example --- cpp/src/parquet/column_writer.cc | 10 ++--- cpp/src/parquet/metadata.cc | 75 ++++++++++++-------------------- cpp/src/parquet/metadata.h | 6 +-- 3 files changed, 37 insertions(+), 54 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index af91a4a2f73..d84618cbc04 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -225,17 +225,17 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { - // index_page_offset = -1 since they are not supported - metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, - total_compressed_size_, total_uncompressed_size_, has_dictionary, - fallback); if (meta_encryptor_ != nullptr) { meta_encryptor_->update_aad(encryption::CreateModuleAad( meta_encryptor_->file_aad(), 
encryption::kColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } + // index_page_offset = -1 since they are not supported + metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, + total_compressed_size_, total_uncompressed_size_, has_dictionary, + fallback, meta_encryptor_); // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get(), meta_encryptor_); + metadata_->WriteTo(sink_.get()); } /** diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 0bbb60a5950..cc85721886f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -168,10 +168,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { if (column->__isset.crypto_metadata) { // column metadata is encrypted - is_metadata_set_ = false; format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + is_metadata_set_ = false; if (file_decryptor != NULLPTR && file_decryptor->properties() != NULLPTR) { // should decrypt metadata std::shared_ptr path = std::make_shared( @@ -186,9 +186,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, decryptor, false); + &len, &decrypted_metadata_, decryptor, false); is_metadata_set_ = true; } + } else { + is_metadata_set_ = true; } } else { // column metadata is not encrypted is_metadata_set_ = true; @@ -286,19 +288,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; - format::ColumnMetaData metadata_; + format::ColumnMetaData decrypted_metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; bool 
is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (column_->__isset.crypto_metadata) { + if (column_->__isset.crypto_metadata + && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { throw ParquetException( "Cannot decrypt ColumnMetadata. " "FileDecryptionProperties must be provided."); } else { - return metadata_; + return decrypted_metadata_; } } else { return column_->meta_data; @@ -907,7 +910,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -941,37 +944,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_chunk_->meta_data.__set_encodings(thrift_encodings); - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - // column is unencrypted - if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - // copy column chunk, except for meta_data - format::ColumnChunk column_chunk; - column_chunk.__set_file_offset(column_chunk_->file_offset); - if (column_chunk_->__isset.file_path) { - column_chunk.__set_file_path(column_chunk_->file_path); - } - if (column_chunk_->__isset.offset_index_offset) { - column_chunk.__set_offset_index_offset(column_chunk_->offset_index_offset); - } - if 
(column_chunk_->__isset.offset_index_length) { - column_chunk.__set_offset_index_length(column_chunk_->offset_index_length); - } - if (column_chunk_->__isset.column_index_offset) { - column_chunk.__set_column_index_offset(column_chunk_->column_index_offset); - } - if (column_chunk_->__isset.column_index_length) { - column_chunk.__set_column_index_length(column_chunk_->column_index_length); - } - - column_chunk.__isset.crypto_metadata = true; + // column is encrypted + if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { + column_chunk_->__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { // encrypted with footer key @@ -984,13 +961,14 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk.__set_crypto_metadata(ccmd); + column_chunk_->__set_crypto_metadata(ccmd); bool encrypted_footer = properties_->file_encryption_properties()->encrypted_footer(); bool encrypt_metadata = !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); if (encrypt_metadata) { + ThriftSerializer serializer; // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata @@ -1008,10 +986,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); - column_chunk.__set_encrypted_column_metadata(encrypted_column_metadata); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); if (encrypted_footer) { - column_chunk.__isset.meta_data = false; + column_chunk_->__isset.meta_data = false; } else { // Keep redacted metadata version for old readers format::ColumnMetaData metadata_redacted; @@ -1044,14 +1022,18 @@ class 
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { metadata_redacted.__isset.statistics = false; metadata_redacted.__isset.encoding_stats = false; - column_chunk.__isset.meta_data = true; - column_chunk.__set_meta_data(metadata_redacted); + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); } } - serializer.Serialize(&column_chunk, sink); } } + void WriteTo(::arrow::io::OutputStream* sink) { + ThriftSerializer serializer; + serializer.Serialize(column_chunk_, sink); + } + const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_chunk_->meta_data.total_compressed_size; @@ -1111,14 +1093,15 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, + const std::shared_ptr& encryptor) { impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } -void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - impl_->WriteTo(sink, encryptor); +void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { + impl_->WriteTo(sink); } const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 61a1d53e0b9..3fc6899bfc6 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -294,14 +294,14 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - 
bool dictionary_fallback); + bool dictionary_fallback, + const std::shared_ptr& encryptor = NULLPTR); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; // For writing metadata at end of column chunk - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR); + void WriteTo(::arrow::io::OutputStream* sink); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, From 905067d800b7a25a8c6a5a8741deaa910bcb7fdc Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 18 Jun 2019 21:06:31 +0700 Subject: [PATCH 109/125] rename encryption-test.cc to encryption-properties-test.cc --- cpp/src/parquet/CMakeLists.txt | 4 +++- .../{encryption-test.cc => encryption-properties-test.cc} | 0 2 files changed, 3 insertions(+), 1 deletion(-) rename cpp/src/parquet/{encryption-test.cc => encryption-properties-test.cc} (100%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index afa82be376e..c5ddd63538f 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -321,7 +321,9 @@ add_parquet_test(arrow-test test-util.cc) if(PARQUET_BUILD_ENCRYPTION) - add_parquet_test(encryption-test) + add_parquet_test(encryption-test + SOURCES + encryption-properties-test.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-properties-test.cc similarity index 100% rename from cpp/src/parquet/encryption-test.cc rename to cpp/src/parquet/encryption-properties-test.cc From 479da8f239cf4166c2bfe44ea74db024707f461d Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 18 Jun 2019 23:07:41 +0700 Subject: [PATCH 110/125] use isset instead of creating a copy of column chunk metadata --- cpp/src/parquet/metadata.cc | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index cc85721886f..56127c0d65c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -992,38 +992,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__isset.meta_data = false; } else { // Keep redacted metadata version for old readers - format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_chunk_->meta_data.type); - metadata_redacted.__set_encodings(column_chunk_->meta_data.encodings); - metadata_redacted.__set_path_in_schema(column_chunk_->meta_data.path_in_schema); - metadata_redacted.__set_codec(column_chunk_->meta_data.codec); - metadata_redacted.__set_num_values(column_chunk_->meta_data.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_chunk_->meta_data.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_chunk_->meta_data.total_compressed_size); - if (column_chunk_->meta_data.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata( - column_chunk_->meta_data.key_value_metadata); - } - metadata_redacted.__set_data_page_offset( - column_chunk_->meta_data.data_page_offset); - if (column_chunk_->meta_data.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset( - column_chunk_->meta_data.index_page_offset); - } - if (column_chunk_->meta_data.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_chunk_->meta_data.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); + column_chunk_->meta_data.__isset.statistics = false; + column_chunk_->meta_data.__isset.encoding_stats = false; } } } From 
7052838f9eb2052abbb3e7fe679e03c14135b51d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Fri, 21 Jun 2019 06:26:28 +0300 Subject: [PATCH 111/125] Address review comments --- ...yption-reader-writer-all-crypto-options.cc | 31 +- cpp/src/parquet/CMakeLists.txt | 3 +- cpp/src/parquet/column_reader.cc | 128 +-- cpp/src/parquet/column_reader.h | 12 +- cpp/src/parquet/column_writer.cc | 80 +- .../parquet/encryption-configurations-test.cc | 767 ++++++++++++++++++ cpp/src/parquet/encryption.cc | 84 +- cpp/src/parquet/encryption.h | 126 +-- cpp/src/parquet/encryption_internal.cc | 9 +- cpp/src/parquet/encryption_internal.h | 4 +- cpp/src/parquet/file_reader.cc | 333 ++++---- cpp/src/parquet/file_writer.cc | 132 +-- cpp/src/parquet/file_writer.h | 20 +- cpp/src/parquet/internal_file_decryptor.cc | 112 +-- cpp/src/parquet/internal_file_decryptor.h | 28 +- cpp/src/parquet/internal_file_encryptor.cc | 115 +-- cpp/src/parquet/internal_file_encryptor.h | 30 +- cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/parquet.pc | 30 - cpp/src/parquet/parquet.thrift | 4 +- cpp/src/parquet/parquet_version.h | 24 - cpp/src/parquet/thrift.h | 109 ++- 22 files changed, 1427 insertions(+), 760 deletions(-) create mode 100644 cpp/src/parquet/encryption-configurations-test.cc delete mode 100644 cpp/src/parquet/parquet.pc delete mode 100644 cpp/src/parquet/parquet_version.h diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 5241e559cae..069f997d1ba 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -27,7 +27,10 @@ /* * This file contains samples for writing and reading encrypted Parquet files in different - * encryption and decryption configurations. 
The samples have the following goals: + * encryption and decryption configurations. + * Each sample section is dedicated to an independent configuration and shows its creation + * from beginning to end. + * The samples have the following goals: * 1) Demonstrate usage of different options for data encryption and decryption. * 2) Produce encrypted files for interoperability tests with other (eg parquet-mr) * readers that support encryption. @@ -36,7 +39,7 @@ * 4) Perform interoperability tests with other (eg parquet-mr) writers, by reading * encrypted files produced by these writers. * - * The write sample produces number of parquet files, each encrypted with a different + * Each write sample produces new independent parquet file, encrypted with a different * encryption configuration as described below. * The name of each file is in the form of: * tester.parquet.encrypted. @@ -113,7 +116,7 @@ std::vector GetDirectoryFiles(const std::string& path) { return files; } -void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { +void InteropTestWriteEncryptedParquetFiles(std::string root_path) { /********************************************************************************** Creating a number of Encryption configurations **********************************************************************************/ @@ -260,7 +263,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { using FileClass = ::arrow::io::FileOutputStream; std::shared_ptr out_file; std::string file = - rootPath + fileName + std::string(test_number_string) + ".parquet.encrypted"; + root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; std::cout << "Write " << file << std::endl; PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); @@ -326,8 +329,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { } } -void InteropTestReadEncryptedParquetFiles(std::string rootPath) { - std::vector files_in_directory = GetDirectoryFiles(rootPath); +void 
InteropTestReadEncryptedParquetFiles(std::string root_path) { + std::vector files_in_directory = GetDirectoryFiles(root_path); /********************************************************************************** Creating a number of Decryption configurations @@ -414,7 +417,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Create a ParquetReader instance std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(rootPath + file, false, + parquet::ParquetFileReader::OpenFile(root_path + file, false, reader_properties); // Get the File MetaData @@ -611,7 +614,7 @@ bool FileNameEndsWith(std::string file_name, std::string suffix) { int main(int argc, char** argv) { enum Operation { write, read }; - std::string rootPath; + std::string root_path; Operation operation = write; if (argc < 3) { std::cout << "Usage: encryption-reader-writer-all-crypto-options " @@ -619,18 +622,18 @@ int main(int argc, char** argv) { << std::endl; exit(1); } - rootPath = argv[1]; - if (rootPath.compare("read") == 0) { + root_path = argv[1]; + if (root_path.compare("read") == 0) { operation = read; } - rootPath = argv[2]; - std::cout << "Root path is: " << rootPath << std::endl; + root_path = argv[2]; + std::cout << "Root path is: " << root_path << std::endl; if (operation == write) { - InteropTestWriteEncryptedParquetFiles(rootPath); + InteropTestWriteEncryptedParquetFiles(root_path); } else - InteropTestReadEncryptedParquetFiles(rootPath); + InteropTestReadEncryptedParquetFiles(root_path); return 0; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index c5ddd63538f..5cbc9699067 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -323,7 +323,8 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test SOURCES - encryption-properties-test.cc) + encryption-properties-test.cc + encryption-configurations-test.cc) endif() # Those tests need to use static linking as they 
access thrift-generated diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index bee77118bb1..f96176121e0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -110,36 +110,25 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, ::arrow::MemoryPool* pool, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) + ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), - column_has_dictionary_(column_has_dictionary), - row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_ordinal), page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - decryption_buffer_(AllocateBuffer(pool, 0)), - meta_decryptor_(meta_decryptor), - data_decryptor_(data_decryptor) { + decryption_buffer_(AllocateBuffer(pool, 0)) { + if (ctx != NULLPTR) { + column_has_dictionary_ = ctx->column_has_dictionary; + row_group_ordinal_ = ctx->row_group_ordinal; + column_ordinal_ = ctx->column_ordinal; + meta_decryptor_ = ctx->meta_decryptor; + data_decryptor_ = ctx->data_decryptor; + } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (data_decryptor_ != NULLPTR) { - DCHECK(!data_decryptor_->file_aad().empty()); - // prepare the AAD for quick update later - data_pageAAD_ = - encryption::CreateModuleAad(data_decryptor_->file_aad(), encryption::kDataPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - } - if (meta_decryptor_ != NULLPTR) { - DCHECK(!meta_decryptor_->file_aad().empty()); - data_page_headerAAD_ = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + if (data_decryptor_ != 
NULLPTR || meta_decryptor_ != NULLPTR) { + InitDecryption(); } } @@ -149,6 +138,12 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: + void UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, int8_t module_type, + const std::string& pageAAD); + + void InitDecryption(); + std::shared_ptr stream_; format::PageHeader current_page_header_; @@ -158,11 +153,31 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; - bool first_page_; + // The fields below are used for calculation of AAD (additional authenticated data) + // suffix which is part of the Parquet Modular Encryption. + // The AAD suffix for a parquet module is built internally by Parquet, by direct + // concatenation of the different parts of the module, which include, among others, + // its row group ordinal, column ordinal and page ordinal. + // Please refer to the encryption specification for more details: + // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data + + // To calculate the AAD suffix of an encrypted module, the exact type of the module + // should be known. The following two fields indicate whether the page is a data or + // dictionary page. + + // Indicates whether the column has a dictionary page. bool column_has_dictionary_; + // If the column has a dictionary page and the page currently processed is the first + // one, then it is a dictionary page. + bool first_page_; + // The ordinal fields below are used for AAD suffix calculation. int16_t row_group_ordinal_; int16_t column_ordinal_; int16_t page_ordinal_; + // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page + // header in a single column respectively.
+ // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. std::string data_pageAAD_; std::string data_page_headerAAD_; @@ -181,6 +196,38 @@ class SerializedPageReader : public PageReader { std::shared_ptr data_decryptor_; }; +void SerializedPageReader::InitDecryption() { + // Prepare the AAD for quick update later. + if (data_decryptor_ != NULLPTR) { + DCHECK(!data_decryptor_->file_aad().empty()); + data_pageAAD_ = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_decryptor_ != NULLPTR) { + DCHECK(!meta_decryptor_->file_aad().empty()); + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } +} + +void SerializedPageReader::UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, + int8_t module_type, + const std::string& pageAAD) { + DCHECK(decryptor != NULLPTR); + if (current_page_is_dictionary) { + std::string aad = encryption::CreateModuleAad(decryptor->file_aad(), module_type, + row_group_ordinal_, column_ordinal_, + static_cast(-1)); + decryptor->UpdateAad(aad); + } else { + encryption::QuickUpdatePageAad(pageAAD, page_ordinal_); + decryptor->UpdateAad(pageAAD); + } +} + std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with @@ -214,16 +261,8 @@ std::shared_ptr SerializedPageReader::NextPage() { header_size = static_cast(buffer.size()); try { if (meta_decryptor_ != NULLPTR) { - if (current_page_is_dictionary) { - std::string dictionary_page_header_aad; - dictionary_page_header_aad = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDictionaryPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - 
meta_decryptor_->update_aad(dictionary_page_header_aad); - } else { - encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); - meta_decryptor_->update_aad(data_page_headerAAD_); - } + UpdateDecryption(meta_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPageHeader, data_page_headerAAD_); } DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, &current_page_header_, meta_decryptor_); @@ -245,17 +284,8 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; if (data_decryptor_ != NULLPTR) { - DCHECK(!data_decryptor_->file_aad().empty()); - if (current_page_is_dictionary) { - std::string dictionary_page_aad; - dictionary_page_aad = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_decryptor_->update_aad(dictionary_page_aad); - } else { - encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); - data_decryptor_->update_aad(data_pageAAD_); - } + UpdateDecryption(data_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPage, data_pageAAD_); } // Read the compressed data page.
@@ -346,13 +376,9 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool, bool column_has_dictionary, - int16_t row_group_ordinal, int16_t column_ordinal, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) { - return std::unique_ptr(new SerializedPageReader( - stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, - column_ordinal, pool, meta_decryptor, data_decryptor)); + Compression::type codec, ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) { + return std::unique_ptr( + new SerializedPageReader(stream, total_num_rows, codec, pool, ctx)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index f3363106bc9..61aca8c1380 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -74,6 +74,14 @@ class PARQUET_EXPORT LevelDecoder { std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; }; +struct PageReaderContext { + bool column_has_dictionary; + int16_t row_group_ordinal; + int16_t column_ordinal; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + // Abstract page iterator interface. 
This way, we can feed column pages to the // ColumnReader through whatever mechanism we choose class PARQUET_EXPORT PageReader { @@ -83,9 +91,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool column_has_dictionary = false, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1, std::shared_ptr meta_decryptor = NULLPTR, - std::shared_ptr data_decryptor = NULLPTR); + struct PageReaderContext* ctx = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index d84618cbc04..a6263612975 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -147,16 +147,8 @@ class SerializedPageWriter : public PageWriter { column_ordinal_(column_chunk_ordinal), meta_encryptor_(meta_encryptor), data_encryptor_(data_encryptor) { - if (data_encryptor_ != NULLPTR) { - // prepare the add for quick update later - data_pageAAD_ = - encryption::CreateModuleAad(data_encryptor_->file_aad(), encryption::kDataPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - } - if (meta_encryptor_ != NULLPTR) { - data_page_headerAAD_ = encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { + InitEncryption(); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -184,9 +176,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->update_aad(encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + 
UpdateEncryption(encryption::kDictionaryPage); encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -208,9 +198,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDictionaryPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1)); + UpdateEncryption(encryption::kDictionaryPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); @@ -226,10 +214,9 @@ class SerializedPageWriter : public PageWriter { void Close(bool has_dictionary, bool fallback) override { if (meta_encryptor_ != nullptr) { - meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + UpdateEncryption(encryption::kColumnMetaData); } + // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, @@ -276,8 +263,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { - encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); - data_encryptor_->update_aad(data_pageAAD_); + UpdateEncryption(encryption::kDataPage); PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -299,8 +285,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); - 
meta_encryptor_->update_aad(data_page_headerAAD_); + UpdateEncryption(encryption::kDataPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); @@ -329,6 +314,55 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: + void InitEncryption() { + // Prepare the AAD for quick update later. + if (data_encryptor_ != NULLPTR) { + data_pageAAD_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_encryptor_ != NULLPTR) { + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + } + + void UpdateEncryption(int8_t module_type) { + switch (module_type) { + case encryption::kColumnMetaData: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDataPage: { + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); + data_encryptor_->UpdateAad(data_pageAAD_); + break; + } + case encryption::kDataPageHeader: { + encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); + meta_encryptor_->UpdateAad(data_page_headerAAD_); + break; + } + case encryption::kDictionaryPageHeader: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDictionaryPage: { + data_encryptor_->UpdateAad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + default: + throw ParquetException("Unknown module type in UpdateEncryption"); + } + } + std::shared_ptr sink_; 
ColumnChunkMetaDataBuilder* metadata_; ::arrow::MemoryPool* pool_; diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc new file mode 100644 index 00000000000..2afc20e5223 --- /dev/null +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-tests for writing and reading encrypted Parquet files with + * different encryption and decryption configurations. + * + * Each unit-test produces a single parquet file, encrypted with one of the encryption + * configurations described below, and is read multiple times using a set of decryption + * configurations, also described below.
+ * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * Each unit-test creates a single parquet file with eight columns using one of the + * following encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * + * The written parquet file produced above is read by each of the following decryption + * configurations: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). 
+ */ + +namespace parquet { + +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; +using parquet::LogicalType; +using parquet::Repetition; +using parquet::Type; + +constexpr int kFixedLength = 10; + +namespace test { + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + rows_per_rowgroup_ = 50; + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + createDecryptionConfigurations(); + path_to_double_field_ = parquet::schema::ColumnPath::FromDotString("double_field"); + path_to_float_field_ = parquet::schema::ColumnPath::FromDotString("float_field"); + } + + protected: + std::shared_ptr path_to_double_field_; + std::shared_ptr path_to_float_field_; + int rows_per_rowgroup_; + std::shared_ptr schema_; + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // Decryption configuration 1: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey_); + string_kr2->PutKey("kc1", kColumnEncryptionKey1_); + string_kr2->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( + path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey_) + ->column_properties(decryption_cols) + ->build()); + } + + std::shared_ptr EncryptFile( + std::shared_ptr encryption_configurations) { + auto sink = CreateOutputStream(); + + WriterProperties::Builder prop_builder; + + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + auto file_writer = ParquetFileWriter::Open(sink, schema_, writer_properties); + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(sink->Finish(&buffer)); + return buffer; + } + + void DecryptFile(std::shared_ptr buffer, int example_id, + int encryption_configuration) { + std::string exception_msg; + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[example_id]->DeepClone()); + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + 
int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == kFixedLength); + assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + i++; + } + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(encryption_configuration, example_id, exception_msg); + } + + // Check that the decryption result is as expected. + void CheckResult(int encryption_configuration_number, int example_id, + std::string exception_msg) { + int decryption_configuration_number = example_id + 1; + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected + // to be thrown if the file was not encrypted with the same aad_prefix. 
+ if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } + if (!exception_msg.empty()) { + ASSERT_EQ(1, 0); + } + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, LogicalType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + LogicalType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, LogicalType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, LogicalType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + 
return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration 1: Encrypt all columns and the footer with the same key. +// (uniform encryption) +TEST_F(TestEncryptionConfiguration, UniformEncryption) { + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 1 /* encryption_configuration_number */); + } +} + +// Encryption configuration 2: Encrypt two columns and the footer, with different keys. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + 
->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 2 /* encryption_configuration_number */); + } +} + +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. + // (plaintext footer mode, readable by legacy readers) + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( + path_to_float_field_); + encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); + encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 3 /* encryption_configuration_number */); + } +} + +// Encryption configuration 4: Encrypt two columns and the footer, with different keys. +// Use aad_prefix. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( + path_to_float_field_); + encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); + encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build()); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 4 /* encryption_configuration_number */); + } +} + +// Encryption configuration 5: Encrypt two columns and the footer, with different keys. +// Use aad_prefix and disable_aad_prefix_storage. 
+TEST_F(TestEncryptionConfiguration, + EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( + path_to_float_field_); + encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); + encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build()); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 5 /* encryption_configuration_number */); + } +} + +// Encryption configuration 6: Encrypt two columns and the footer, with different keys. +// Use AES_GCM_CTR_V1 algorithm. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( + path_to_float_field_); + encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); + encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. 
+ for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 6 /* encryption_configuration_number */); + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 096df59b135..69da7eeae56 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -17,11 +17,12 @@ #include "parquet/encryption.h" -#include #include #include #include +#include + #include "arrow/util/logging.h" #include "arrow/util/utf8.h" @@ -29,7 +30,7 @@ namespace parquet { // integer key retriever void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { - key_map_.insert(std::make_pair(key_id, key)); + key_map_.insert({key_id, key}); } const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { @@ -41,7 +42,7 @@ const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata // string key retriever void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { - key_map_.insert(std::make_pair(key_id, key)); + key_map_.insert({key_id, key}); } const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { @@ -80,17 +81,13 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id } FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { + const ColumnPathToDecryptionPropertiesMap& column_properties) { if (column_properties.size() == 0) return this; if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); - for (std::pair, - std::shared_ptr> - element : column_properties) { + for (const auto& element : column_properties) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } @@ -101,14 
+98,11 @@ FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_pro return this; } -void FileDecryptionProperties::wipeout_decryption_keys() { - if (!footer_key_.empty()) - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); +void FileDecryptionProperties::WipeOutDecryptionKeys() { + footer_key_.clear(); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_decryption_key(); + for (const auto& element : column_properties_) { + element.second->WipeOutDecryptionKey(); } } @@ -122,17 +116,11 @@ bool FileDecryptionProperties::is_utilized() { std::shared_ptr FileDecryptionProperties::DeepClone( std::string new_aad_prefix) { std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; + ColumnPathToDecryptionPropertiesMap column_properties_map_copy; - for (std::pair, - std::shared_ptr> - element : column_properties_) { + for (const auto& element : column_properties_) { column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); + {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; @@ -193,11 +181,7 @@ std::shared_ptr ColumnDecryptionProperties::Builder: new ColumnDecryptionProperties(column_path_, key_)); } -void ColumnDecryptionProperties::wipeout_decryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } -} +void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); } std::shared_ptr ColumnDecryptionProperties::DeepClone() { std::string key_copy = key_; @@ -215,17 +199,13 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( - const std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath>& column_properties) { + const ColumnPathToEncryptionPropertiesMap& column_properties) { if (column_properties.size() == 0) return this; if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); - for (std::pair, - std::shared_ptr> - element : column_properties) { + for (const auto& element : column_properties) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } @@ -235,29 +215,21 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_pro return this; } -void FileEncryptionProperties::wipeout_encryption_keys() { - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_encryption_key(); +void FileEncryptionProperties::WipeOutEncryptionKeys() { + footer_key_.clear(); + for (const auto& element : column_properties_) { + element.second->WipeOutEncryptionKey(); } } std::shared_ptr FileEncryptionProperties::DeepClone( std::string new_aad_prefix) { std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; + ColumnPathToEncryptionPropertiesMap column_properties_map_copy; - for (std::pair, - std::shared_ptr> - element : column_properties_) { + for (const auto& element : column_properties_) { column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); + {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; @@ -342,9 +314,7 @@ FileDecryptionProperties::FileDecryptionProperties( const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath>& column_properties, + const ColumnPathToDecryptionPropertiesMap& column_properties, bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -393,16 +363,14 @@ std::shared_ptr FileEncryptionProperties::column_pro return column_properties_[column_path]; } - return NULLPTR; + return nullptr; } FileEncryptionProperties::FileEncryptionProperties( ParquetCipher::type cipher, const std::string& footer_key, const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) + const ColumnPathToEncryptionPropertiesMap& column_properties) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2ffe36ec73f..f12db27dbec 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -24,7 +24,6 @@ #include #include -#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" @@ -39,6 +38,18 @@ static constexpr bool kDefaultCheckSignature = true; static constexpr bool kDefaultAllowPlaintextFiles = false; static constexpr int32_t kAadFileUniqueLength = 8; +class ColumnDecryptionProperties; +using ColumnPathToDecryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + +class ColumnEncryptionProperties; +using ColumnPathToEncryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + class PARQUET_EXPORT DecryptionKeyRetriever { public: virtual const std::string& GetKey(const std::string& key_metadata) = 0; @@ -77,15 +88,16 @@ class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; -class 
PARQUET_EXPORT UnsupportedOperationException : public ParquetException { - public: - explicit UnsupportedOperationException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} -}; +inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return nullptr; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} class PARQUET_EXPORT ColumnEncryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -109,6 +121,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { /// use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata); + /// A convenience function to set key metadata using a string id. /// Set a key retrieval metadata (converted from String). /// use either key_metadata() or key_id(), not both /// key_id will be converted to metadata (UTF-8 array). @@ -134,11 +147,10 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } const std::string& key() const { return key_; } const std::string& key_metadata() const { return key_metadata_; } - void wipeout_encryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } - } + + /// Upon completion of file writing, the encryption key + /// will be wiped out. + void WipeOutEncryptionKey() { key_.clear(); } bool is_utilized() { if (key_.empty()) @@ -146,6 +158,10 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return utilized_; } + /// ColumnEncryptionProperties object can be used for writing one file only. + /// Mark ColumnEncryptionProperties as utilized once it is used in + /// FileEncryptionProperties as the encryption key will be wiped out upon + /// completion of file writing. 
void set_utilized() { utilized_ = true; } std::shared_ptr DeepClone() { @@ -172,7 +188,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) @@ -202,9 +218,15 @@ class PARQUET_EXPORT ColumnDecryptionProperties { const std::string& key() const { return key_; } bool is_utilized() { return utilized_; } + /// ColumnDecryptionProperties object can be used for reading one file only. + /// Mark ColumnDecryptionProperties as utilized once it is used in + /// FileDecryptionProperties as the encryption key will be wiped out upon + /// completion of file reading. void set_utilized() { utilized_ = true; } - void wipeout_decryption_key(); + /// Upon completion of file reading, the encryption key + /// will be wiped out. + void WipeOutDecryptionKey(); std::shared_ptr DeepClone(); @@ -227,13 +249,13 @@ class PARQUET_EXPORT AADPrefixVerifier { /// Throws exception if an AAD prefix is wrong. /// In a data set, AAD Prefixes should be collected, /// and then checked for missing files. - virtual void check(const std::string& aad_prefix) = 0; + virtual void Verify(const std::string& aad_prefix) = 0; virtual ~AADPrefixVerifier() {} }; class PARQUET_EXPORT FileDecryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -261,9 +283,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// If an explicit key is available for a footer or a column, /// its key metadata will be ignored. Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + const ColumnPathToDecryptionPropertiesMap& column_properties); /// Set a key retriever callback. 
Its also possible to /// set explicit footer or column keys on this file property object. @@ -314,11 +334,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToDecryptionPropertiesMap column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -340,10 +356,15 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } - void wipeout_decryption_keys(); + /// Upon completion of file reading, the encryption keys in the properties + /// will be wiped out (array values set to 0). + void WipeOutDecryptionKeys(); bool is_utilized(); + /// FileDecryptionProperties object can be used for reading one file only. + /// Mark FileDecryptionProperties as utilized once it is used to read a file as the + /// encryption keys will be wiped out upon completion of file reading. void set_utilized() { utilized_ = true; } /// FileDecryptionProperties object can be used for reading one file only. 
@@ -361,30 +382,25 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier_; const std::string empty_string_ = ""; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToDecryptionPropertiesMap column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; bool plaintext_files_allowed_; bool utilized_; - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed); + FileDecryptionProperties(const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_properties, + bool plaintext_files_allowed); }; class PARQUET_EXPORT FileEncryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), @@ -426,9 +442,7 @@ class PARQUET_EXPORT FileEncryptionProperties { /// If not called, all columns will be encrypted with the footer key. /// If called, the file columns not in the list will be left unencrypted. 
Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + const ColumnPathToEncryptionPropertiesMap& column_properties); std::shared_ptr build() { return std::shared_ptr(new FileEncryptionProperties( @@ -444,10 +458,7 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToEncryptionPropertiesMap column_properties_; }; bool encrypted_footer() const { return encrypted_footer_; } @@ -464,9 +475,14 @@ class PARQUET_EXPORT FileEncryptionProperties { bool is_utilized() { return utilized_; } + /// FileEncryptionProperties object can be used for writing one file only. + /// Mark FileEncryptionProperties as utilized once it is used to write a file as the + /// encryption keys will be wiped out upon completion of file writing. void set_utilized() { utilized_ = true; } - void wipeout_encryption_keys(); + /// Upon completion of file writing, the encryption keys + /// will be wiped out (array values set to 0). + void WipeOutEncryptionKeys(); /// FileEncryptionProperties object can be used for writing one file only. /// (at the end, keys are wiped out in the memory). 
@@ -484,18 +500,12 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool utilized_; bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap column_properties_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index bf3239d42c4..b73f910b230 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -391,7 +391,7 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int AesEncryptor* AesEncryptor::Make( ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors) { + std::vector *all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,9 +399,8 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) { - all_encryptors->push_back(encryptor); - } + if (all_encryptors != NULLPTR) + all_encryptors->push_back(encryptor); return encryptor; } @@ -411,7 +410,7 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat AesDecryptor* AesDecryptor::Make( 
ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors) { + std::vector *all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index af668dc4136..410d1ed5472 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors); + std::vector *all_encryptors); ~AesEncryptor(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors); + std::vector *all_decryptors); ~AesDecryptor(); void WipeOut(); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 65a496ab50b..8e390aee65e 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -126,17 +126,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.GetStream(source_, col_start, col_length); std::unique_ptr crypto_metadata = col->crypto_metadata(); - bool encrypted = true; - // Column is encrypted only if crypto_metadata exists. 
if (!crypto_metadata) { - encrypted = false; - } - - if (!encrypted) { + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), NULLPTR, NULLPTR}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i /* column_ordinal */); + properties_.memory_pool(), &ctx); } // The column is encrypted @@ -145,11 +140,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (crypto_metadata->encrypted_with_footer_key()) { auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); - + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i, meta_decryptor, - data_decryptor); + properties_.memory_pool(), &ctx); } // The column is encrypted with its own key @@ -162,10 +156,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i, meta_decryptor, - data_decryptor); + properties_.memory_pool(), &ctx); } private: @@ -197,7 +191,7 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { - if (file_decryptor_) file_decryptor_->wipeout_decryption_keys(); + if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); } std::shared_ptr GetRowGroup(int i) override { @@ -237,7 +231,7 @@ class SerializedFile 
: public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Corrupt footer."); } - // no encryption or encryption with plaintext footer + // No encryption or encryption with plaintext footer mode. if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -266,178 +260,171 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); - if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw ParquetException("Applying decryption properties on plaintext file"); } } } else { - if (file_decryption_properties == NULLPTR) { - throw ParquetException("No decryption properties are provided"); - } + // Encrypted file with plaintext footer mode. + ParseMetaDataOfEncryptedFileWithPlaintextFooter( + file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); + } + } else { + // Encrypted file with Encrypted footer. + ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, + file_size); + } + } - // Handle AAD prefix - EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); - std::string aad_prefix = aad_prefix_in_properites; - bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; - std::string aad_prefix_in_file = algo.aad.aad_prefix; - - if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); - } + private: + std::shared_ptr source_; + std::shared_ptr file_metadata_; + ReaderProperties properties_; + std::unique_ptr file_decryptor_; - if (file_has_aad_prefix) { - if (!aad_prefix_in_properites.empty()) { - if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { - throw ParquetException( - "AAD Prefix in file and in properties " - "is not the same"); - } - } - aad_prefix = aad_prefix_in_file; - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD Prefix set in decryption properties, but was not used " - "for file encryption"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) { - throw ParquetException( - "AAD Prefix Verifier is set, but AAD Prefix not found in file"); - } - } - std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor( - file_decryption_properties, file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); - - if (file_decryption_properties->check_plaintext_footer_integrity()) { - if (metadata_len - read_metadata_len != 28) { - throw ParquetException( - "Invalid parquet file. 
Cannot verify plaintext mode footer."); - } + std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, + EncryptionAlgorithm& algo); - auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature( - encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException( - "Invalid parquet file. Could not verify plaintext " - "footer metadata"); - } - } - } - } else { - // encryption with encrypted footer - // both metadata & crypto metadata length - uint32_t footer_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); - int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + footer_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } - std::shared_ptr crypto_metadata_buffer; + void ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len); - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (footer_len + kFooterSize)) { - crypto_metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); - if (crypto_metadata_buffer->size() != footer_len) { - throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); - } - } - auto file_decryption_properties = properties_.file_decryption_properties(); - if (file_decryption_properties == nullptr) { + void ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size); +}; + +void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size) { + // encryption with encrypted footer + // both metadata & crypto metadata length + uint32_t footer_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; + if (kFooterSize + footer_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (footer_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != footer_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == nullptr) { + throw ParquetException( + "No decryption properties are provided. 
Could not read " + "encrypted footer metadata"); + } + uint32_t crypto_metadata_len = footer_len; + std::shared_ptr file_crypto_metadata = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + // Handle AAD prefix + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, + algo.algorithm, + file_crypto_metadata->key_metadata())); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + std::shared_ptr metadata_buffer; + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException( + "Invalid encrypted parquet file. " + "Could not read footer metadata bytes."); + } + + auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); +} + +void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len) { + // Providing decryption properties in plaintext footer mode is not mendatory, for + // example when reading by legacy reader. 
+ if (file_decryption_properties != NULLPTR) { + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + // Handle AAD prefix + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset( + new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + + if (file_decryption_properties->check_plaintext_footer_integrity()) { + if (metadata_len - read_metadata_len != 28) { throw ParquetException( - "No decryption properties are provided. Could not read " - "encrypted footer metadata"); + "Invalid parquet file. Cannot verify plaintext mode footer."); } - uint32_t crypto_metadata_len = footer_len; - std::shared_ptr file_crypto_metadata = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); - - // Handle AAD prefix - std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); - std::string aad_prefix = aad_prefix_in_properites; - bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? false : true; - std::string aad_prefix_in_file = algo.aad.aad_prefix; - - if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { + + auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); + if (!file_metadata_->verify_signature( + encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + "Invalid parquet file. 
Could not verify plaintext " + "footer metadata"); } + } + } +} - if (file_has_aad_prefix) { - if (!aad_prefix_in_properites.empty()) { - if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { - throw ParquetException( - "AAD Prefix in file and in properties " - "is not the same"); - } - } - aad_prefix = aad_prefix_in_file; - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD Prefix set in decryption properties, but was not used " - "for file encryption"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) { - throw ParquetException( - "AAD Prefix Verifier is set, but AAD Prefix not found in file"); - } - } - std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset( - new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, - file_crypto_metadata->key_metadata())); - int64_t metadata_offset = - file_size - kFooterSize - footer_len + crypto_metadata_len; - uint32_t metadata_len = footer_len - crypto_metadata_len; - std::shared_ptr metadata_buffer; - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { +std::string SerializedFile::HandleAadPrefix( + FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) { + std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properties; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + + if (file_has_aad_prefix) { + if (!aad_prefix_in_properties.empty()) { + if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) { throw ParquetException( - "Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + "AAD Prefix in file and in properties " + "is not the same"); } - - auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); - file_metadata_ = - FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); + } + aad_prefix = aad_prefix_in_file; + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->Verify(aad_prefix); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); } } - - private: - std::shared_ptr source_; - std::shared_ptr file_metadata_; - ReaderProperties properties_; - std::unique_ptr file_decryptor_; -}; + return aad_prefix + algo.aad.aad_file_unique; +} // ---------------------------------------------------------------------- // ParquetFileReader public API @@ -521,9 +508,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) - << "The file only has " << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + 
DCHECK(i < metadata()->num_row_groups()) << "The file only has " + << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 11af1815577..1df7d05b3be 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -133,7 +133,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, - row_group_ordinal_, (int16_t)(current_column_index_ - 1), + row_group_ordinal_, static_cast(current_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -237,11 +237,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto data_encryptor = file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) : NULLPTR; - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, (int16_t)row_group_ordinal_, - (int16_t)current_column_index_, properties_->memory_pool(), - buffered_row_group_, meta_encryptor, data_encryptor); + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + static_cast(row_group_ordinal_), + static_cast(current_column_index_), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -283,38 +283,11 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); - if (file_encryption_properties == nullptr) { // plaintext regular file + if (file_encryption_properties == nullptr) { // Non 
encrypted file. file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { // Encrypted file - // Encrypted file with encrypted footer - if (file_encryption_properties->encrypted_footer()) { - // encrypted footer - file_metadata_ = metadata_->Finish(); - - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink_->Tell(&position)); - uint64_t metadata_start = static_cast(position); - auto crypto_metadata = metadata_->GetCryptoMetaData(); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); - PARQUET_THROW_NOT_OK(sink_->Tell(&position)); - uint32_t footer_and_crypto_len = - static_cast(position - metadata_start); - PARQUET_THROW_NOT_OK( - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); - PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); - } else { // Encrypted file with plaintext footer - file_metadata_ = metadata_->Finish(); - auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, - false); - } - if (file_encryptor_) { - file_encryptor_->wipeout_encryption_keys(); - } + CloseEncryptedFile(file_encryption_properties); } } } @@ -336,7 +309,7 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, (int16_t)(num_row_groups_ - 1), properties_.get(), + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); @@ -369,6 +342,36 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } + void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { + // Encrypted file with 
encrypted footer + if (file_encryption_properties->encrypted_footer()) { + // encrypted footer + file_metadata_ = metadata_->Finish(); + + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint64_t metadata_start = static_cast(position); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + + auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true); + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint32_t footer_and_crypto_len = static_cast(position - metadata_start); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); + } else { // Encrypted file with plaintext footer + file_metadata_ = metadata_->Finish(); + auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor, + false); + } + if (file_encryptor_) { + file_encryptor_->WipeOutEncryptionKeys(); + } + } + std::shared_ptr sink_; bool is_open_; const std::shared_ptr properties_; @@ -390,7 +393,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { - // plaintext mode footer + // Encrypted file with plaintext footer mode. 
PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } @@ -429,43 +432,52 @@ std::unique_ptr ParquetFileWriter::Open( key_value_metadata); } -void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { // plaintext regular file - // Write MetaData +void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + // Write MetaData + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); + + file_metadata.WriteTo(sink); + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; + + // Write Footer + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); +} + +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + if (encrypt_footer) { // Encrypted file with encrypted footer + // encrypt and write to sink + file_metadata.WriteTo(sink, encryptor); + } else { // Encrypted file with plaintext footer mode. 
int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); uint32_t metadata_len = static_cast(position); - - file_metadata.WriteTo(sink); + file_metadata.WriteTo(sink, encryptor); PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; - // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file - if (encrypt_footer) { // Encrypted file with encrypted footer - // encrypt and write to sink - file_metadata.WriteTo(sink, encryptor); - } else { // Encrypted file with plaintext footer - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - uint32_t metadata_len = static_cast(position); - file_metadata.WriteTo(sink, encryptor); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - metadata_len = static_cast(position) - metadata_len; - - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } } } void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper, encryptor, encrypt_footer); + return WriteFileMetaData(file_metadata, &wrapper); +} + +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + ParquetOutputWrapper wrapper(sink); + return WriteEncryptedFileMetadata(file_metadata, &wrapper, encryptor, encrypt_footer); } void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 6c2158185e8..5c90273ae4f 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -111,16 +111,24 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with 
arrow::io::OutputStream*") PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR, - bool encrypt_footer = false); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer); + void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR, - bool encrypt_footer = false); +void WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor = NULLPTR, + bool encrypt_footer = false); PARQUET_EXPORT void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ::arrow::io::OutputStream* sink); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d46101a819b..867c81a0113 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -22,12 +22,6 @@ namespace parquet { // FooterSigningEncryptor -static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); -} FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, @@ -78,25 +72,11 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie "Re-using decryption properties with explicit keys for another file"); } properties_->set_utilized(); - - all_decryptors_ = std::shared_ptr>( - new std::vector); - 
column_data_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); } -void InternalFileDecryptor::wipeout_decryption_keys() { - properties_->wipeout_decryption_keys(); - for (auto const& i : *all_decryptors_) { +void InternalFileDecryptor::WipeOutDecryptionKeys() { + properties_->WipeOutDecryptionKeys(); + for (auto const& i : all_decryptors_) { i->WipeOut(); } } @@ -180,16 +160,13 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - std::shared_ptr footer_metadata_decryptor = + footer_metadata_decryptor_ = std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); - std::shared_ptr footer_data_decryptor = + footer_data_decryptor_ = std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); - footer_metadata_decryptor_ = footer_metadata_decryptor; - footer_data_decryptor_ = footer_data_decryptor; - - if (metadata) return footer_metadata_decryptor; - return footer_data_decryptor; + if (metadata) return footer_metadata_decryptor_; + return footer_data_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -210,12 +187,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::string column_key; // first look if we already got the decryptor from before if (metadata) { - if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { - return column_metadata_map_->at(column_path); + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); } } else { - if 
(column_data_map_->find(column_path) != column_data_map_->end()) { - return column_data_map_->at(column_path); + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); } } @@ -242,64 +219,43 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - std::shared_ptr metadata_decryptor = + column_metadata_map_[column_path] = std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); - std::shared_ptr data_decryptor = + column_data_map_[column_path] = std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); - (*column_metadata_map_)[column_path] = metadata_decryptor; - (*column_data_map_)[column_path] = data_decryptor; + if (metadata) return column_metadata_map_[column_path]; + return column_data_map_[column_path]; +} - if (metadata) return metadata_decryptor; - return data_decryptor; +int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("decryption key must be 16, 24 or 32 bytes in length"); } encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); - } - return meta_decryptor_128_.get(); - } else if (key_len == 24) { - if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); - } - return meta_decryptor_196_.get(); - } else if (key_len == 32) { - if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, 
all_decryptors_)); - } - return meta_decryptor_256_.get(); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (meta_decryptor_[index] == NULLPTR) { + meta_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return meta_decryptor_[index].get(); } encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_128_.get(); - } else if (key_len == 24) { - if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_196_.get(); - } else if (key_len == 32) { - if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_256_.get(); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (data_decryptor_[index] == NULLPTR) { + data_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return data_decryptor_[index].get(); } } // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 3013183bf94..842eea7c680 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -56,7 +56,7 @@ class Decryptor { const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } - void update_aad(const std::string& aad) { aad_ = aad; } + void UpdateAad(const 
std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); @@ -85,7 +85,7 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } - void wipeout_decryption_keys(); + void WipeOutDecryptionKeys(); std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); @@ -101,13 +101,11 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_data_map_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_metadata_map_; std::shared_ptr footer_metadata_decryptor_; @@ -115,14 +113,12 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; - std::shared_ptr> all_decryptors_; + std::vector all_decryptors_; - std::unique_ptr meta_decryptor_128_; - std::unique_ptr meta_decryptor_196_; - std::unique_ptr meta_decryptor_256_; - std::unique_ptr data_decryptor_128_; - std::unique_ptr data_decryptor_196_; - std::unique_ptr data_decryptor_256_; + /// Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_decryptors and data_decryptors. 
+ std::unique_ptr meta_decryptor_[3]; + std::unique_ptr data_decryptor_[3]; std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( @@ -132,6 +128,8 @@ class InternalFileDecryptor { encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + + int MapKeyLenToDecryptorArrayIndex(int key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index b558f5c7fb8..6f097a1a0b4 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -21,13 +21,6 @@ namespace parquet { -static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); -} - // Encryptor Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad) @@ -44,26 +37,16 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - all_encryptors_ = std::shared_ptr>( - new std::vector); - - column_data_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + if (properties_->is_utilized()) { + throw ParquetException("Re-using encryption properties for another file"); + } + properties_->set_utilized(); } -void InternalFileEncryptor::wipeout_encryption_keys() { - properties_->wipeout_encryption_keys(); +void 
InternalFileEncryptor::WipeOutEncryptionKeys() { + properties_->WipeOutEncryptionKeys(); - for (auto const& i : *all_encryptors_) { + for (auto const& i : all_encryptors_) { i->WipeOut(); } } @@ -77,10 +60,9 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), footer_aad); - footer_encryptor_ = encryptor; - return encryptor; + footer_encryptor_ = std::make_shared(aes_encryptor, footer_key, + properties_->file_aad(), footer_aad); + return footer_encryptor_; } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { @@ -92,10 +74,9 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); - std::shared_ptr encryptor = std::make_shared( + footer_signing_encryptor_ = std::make_shared( aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); - footer_signing_encryptor_ = encryptor; - return encryptor; + return footer_signing_encryptor_; } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ -113,17 +94,17 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { // first look if we already got the encryptor from before if (metadata) { - if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { - return column_metadata_map_->at(column_path); + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); } } else { - if 
(column_data_map_->find(column_path) != column_data_map_->end()) { - return column_data_map_->at(column_path); + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); } } auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { - return NULLPTR; + return nullptr; } std::string key; @@ -141,61 +122,43 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::shared_ptr encryptor = std::make_shared(aes_encryptor, key, file_aad, ""); if (metadata) - (*column_metadata_map_)[column_path] = encryptor; + column_metadata_map_[column_path] = encryptor; else - (*column_data_map_)[column_path] = encryptor; + column_data_map_[column_path] = encryptor; return encryptor; } +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_128_.get(); - } else if (key_len == 24) { - if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_196_.get(); - } else if (key_len == 32) { - if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_256_.get(); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (meta_encryptor_[index] == NULLPTR) { + meta_encryptor_[index].reset( + 
encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return meta_encryptor_[index].get(); } encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_128_.get(); - } else if (key_len == 24) { - if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_196_.get(); - } else if (key_len == 32) { - if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_256_.get(); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (data_encryptor_[index] == NULLPTR) { + data_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return data_encryptor_[index].get(); } } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 9fc0227e45f..efef532b163 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -40,12 +40,12 @@ class Encryptor { Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } - void update_aad(const std::string& aad) { aad_ = aad; } + void UpdateAad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int 
plaintext_len, uint8_t* ciphertext); - bool encryptColumnMetaData( + bool EncryptColumnMetaData( bool encrypted_footer, const std::shared_ptr& column_encryption_properties) { // if column is not encrypted then do not encrypt the column metadata @@ -74,31 +74,27 @@ class InternalFileEncryptor { const std::shared_ptr& column_path); std::shared_ptr GetColumnDataEncryptor( const std::shared_ptr& column_path); - void wipeout_encryption_keys(); + void WipeOutEncryptionKeys(); private: FileEncryptionProperties* properties_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_data_map_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_metadata_map_; std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; - std::shared_ptr> all_encryptors_; + std::vector all_encryptors_; - std::unique_ptr meta_encryptor_128_; - std::unique_ptr meta_encryptor_196_; - std::unique_ptr meta_encryptor_256_; - std::unique_ptr data_encryptor_128_; - std::unique_ptr data_encryptor_196_; - std::unique_ptr data_encryptor_256_; + // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_encryptors and data_encryptors. 
+ std::unique_ptr meta_encryptor_[3]; + std::unique_ptr data_encryptor_[3]; std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); @@ -107,6 +103,8 @@ class InternalFileEncryptor { size_t key_len); encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); + + int MapKeyLenToEncryptorArrayIndex(int key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 56127c0d65c..5bad8be22e6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -186,7 +186,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &decrypted_metadata_, decryptor, false); + &len, &decrypted_metadata_, decryptor); is_metadata_set_ = true; } } else { @@ -466,7 +466,7 @@ class FileMetaData::FileMetaDataImpl { : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), decryptor, false); + metadata_.get(), decryptor); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -551,7 +551,7 @@ class FileMetaData::FileMetaDataImpl { encryption::kGcmTagLength)); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer - serializer.Serialize(metadata_.get(), dst, encryptor, false); + serializer.Serialize(metadata_.get(), dst, encryptor); } } diff --git a/cpp/src/parquet/parquet.pc b/cpp/src/parquet/parquet.pc deleted file mode 100644 index e46eea65b72..00000000000 --- a/cpp/src/parquet/parquet.pc +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=/usr/local -libdir=${prefix}/lib -includedir=${prefix}/include - -so_version=13 -abi_version=13 -full_so_version=13.0.0 - -Name: Apache Parquet -Description: Apache Parquet is a columnar storage format. -Version: 1.5.1-SNAPSHOT -Libs: -L${libdir} -lparquet -Cflags: -I${includedir} diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index b6795e3d487..288c72a60a6 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -874,7 +874,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -968,5 +968,3 @@ struct FileCryptoMetaData { * and (possibly) columns **/ 2: optional binary key_metadata } - - diff --git a/cpp/src/parquet/parquet_version.h b/cpp/src/parquet/parquet_version.h deleted file mode 100644 index dd83e45028d..00000000000 --- a/cpp/src/parquet/parquet_version.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_VERSION_H -#define PARQUET_VERSION_H - -// define the parquet created by version -#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT" - -#endif // PARQUET_VERSION_H diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 709f17970b0..1e5fdc3d84d 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -164,12 +164,9 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; // aad_file_unique is always set - aesGcmV1.__isset.aad_file_unique = true; aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); - aesGcmV1.__isset.supply_aad_prefix = true; aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); if (!aad.aad_prefix.empty()) { - aesGcmV1.__isset.aad_prefix = true; aesGcmV1.__set_aad_prefix(aad.aad_prefix); } return aesGcmV1; @@ -178,12 +175,9 @@ static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; // aad_file_unique is always set - aesGcmCtrV1.__isset.aad_file_unique = true; aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); - aesGcmCtrV1.__isset.supply_aad_prefix = true; aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); if (!aad.aad_prefix.empty()) { - 
aesGcmCtrV1.__isset.aad_prefix = true; aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); } return aesGcmCtrV1; @@ -192,11 +186,9 @@ static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; if (encryption.algorithm == ParquetCipher::AES_GCM_V1) { - encryption_algorithm.__isset.AES_GCM_V1 = true; - encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); + encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad)); } else { - encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; - encryption_algorithm.AES_GCM_CTR_V1 = ToAesGcmCtrV1Thrift(encryption.aad); + encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad)); } return encryption_algorithm; } @@ -206,50 +198,51 @@ static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryptio using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; +template +inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, + T* deserialized_msg) { + // Deserialize msg bytes into c++ thrift msg using memory transport. + shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; +} + // Deserialize a thrift message from buf/len. buf/len must at least contain // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& decryptor = NULLPTR, - bool shouldReadLength = false) { + const std::shared_ptr& decryptor = NULLPTR) { // thrift message is not encrypted if (decryptor == NULLPTR) { - // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new ThriftBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; - shared_ptr tproto = // - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); } else { // thrift message is encrypted uint32_t clen; - if (shouldReadLength) { - // first 4 bytes for length - uint8_t clenBytes[4]; - memcpy(clenBytes, buf, 4); - clen = *(reinterpret_cast(clenBytes)); - } else { - clen = *len; - } + clen = *len; // decrypt - const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; - std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); + std::shared_ptr decrypted_buffer = + std::static_pointer_cast(AllocateBuffer( + ::arrow::default_memory_pool(), + static_cast(clen - decryptor->CiphertextSizeDelta()))); + const uint8_t* cipher_buf = buf; uint32_t decrypted_buffer_len = - decryptor->Decrypt(cipherBuf, 0, decrypted_buffer.data()); + decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); - DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, + DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); } } @@ -283,8 +276,7 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryptor = NULLPTR, - bool shouldWriteLength = false) { + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -294,26 +286,7 @@ class ThriftSerializer { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); } else { // obj is encrypted - std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); - int cipher_buffer_len = - encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); - - if (cipher_buffer_len > static_cast(cipher_buffer.size())) { - std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " - << cipher_buffer_len << cipher_buffer.size() << "\n"; - throw ParquetException(ss.str()); - } - - if (shouldWriteLength) { - PARQUET_THROW_NOT_OK( - out->Write(reinterpret_cast(&cipher_buffer_len), 4)); - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); - return static_cast(cipher_buffer_len + 4); - } else { - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), 
cipher_buffer_len)); - return static_cast(cipher_buffer_len); - } + return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); } } @@ -330,6 +303,20 @@ class ThriftSerializer { } } + int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, + uint32_t out_length, + const std::shared_ptr& encryptor) { + std::shared_ptr cipher_buffer = + std::static_pointer_cast(AllocateBuffer( + ::arrow::default_memory_pool(), + static_cast(encryptor->CiphertextSizeDelta() + out_length))); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); + + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } + shared_ptr mem_buffer_; shared_ptr protocol_; }; From b7f3ea3424a0f7814cdeb1d923efed50a31ca7b3 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Fri, 21 Jun 2019 16:23:17 +0300 Subject: [PATCH 112/125] Fix SerializedPageReader initialization --- cpp/src/parquet/column_reader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f96176121e0..cbb8dd9e7dc 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -113,11 +113,16 @@ class SerializedPageReader : public PageReader { ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + column_has_dictionary_(false), first_page_(true), + row_group_ordinal_(-1), + column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - decryption_buffer_(AllocateBuffer(pool, 0)) { + decryption_buffer_(AllocateBuffer(pool, 0)), + meta_decryptor_(NULLPTR), + data_decryptor_(NULLPTR) { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; From 80710e76c78af537172807ecdd109229a75cc855 Mon Sep 17 00:00:00 2001 From: Revital1 
Eres Date: Sun, 23 Jun 2019 11:46:35 +0300 Subject: [PATCH 113/125] Fix Format --- cpp/src/parquet/encryption_internal.cc | 13 +++++-------- cpp/src/parquet/encryption_internal.h | 4 ++-- cpp/src/parquet/file_reader.cc | 12 ++++++------ cpp/src/parquet/metadata.cc | 4 ++-- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index b73f910b230..695b9b5db72 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -389,9 +389,8 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -AesEncryptor* AesEncryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_encryptors) { +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,8 +398,7 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) - all_encryptors->push_back(encryptor); + if (all_encryptors != NULLPTR) all_encryptors->push_back(encryptor); return encryptor; } @@ -408,9 +406,8 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata))} {} -AesDecryptor* AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_decryptors) { +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; diff --git 
a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index 410d1ed5472..9fe82bd28da 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_encryptors); + std::vector* all_encryptors); ~AesEncryptor(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_decryptors); + std::vector* all_decryptors); ~AesDecryptor(); void WipeOut(); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8e390aee65e..7b1ac4294cb 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -59,9 +59,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -71,9 +71,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 
5bad8be22e6..351b68a66da 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -294,8 +294,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (column_->__isset.crypto_metadata - && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (column_->__isset.crypto_metadata && + column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { throw ParquetException( "Cannot decrypt ColumnMetadata. " From b3e277e50796d2311ebfd780881a1c0d46137402 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 23 Jun 2019 22:42:35 +0700 Subject: [PATCH 114/125] let parquet encryption be able to be off (when openssl is not found) --- cpp/examples/parquet/CMakeLists.txt | 26 +++-- cpp/src/parquet/CMakeLists.txt | 16 ++- cpp/src/parquet/column_reader.cc | 56 ++++++--- cpp/src/parquet/column_reader.h | 6 + cpp/src/parquet/column_writer.cc | 87 ++++++++++++-- cpp/src/parquet/column_writer.h | 7 ++ cpp/src/parquet/file_reader.cc | 138 ++++++++++++++++------ cpp/src/parquet/file_writer.cc | 72 ++++++++++-- cpp/src/parquet/file_writer.h | 19 ++-- cpp/src/parquet/metadata.cc | 171 ++++++++++++++++++++++++---- cpp/src/parquet/metadata.h | 75 ++++++++++-- cpp/src/parquet/properties.h | 45 ++++++-- cpp/src/parquet/thrift.h | 36 ++++-- 13 files changed, 618 insertions(+), 136 deletions(-) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index fb428730360..813484641fe 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -17,16 +17,20 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) -add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) -add_executable(parquet-encryption-example-all-crypto-options 
low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) -target_include_directories(parquet-encryption-example PRIVATE low-level-api/) -target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -target_link_libraries(parquet-encryption-example parquet_static) -target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) + +if (ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) + add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) + target_include_directories(parquet-encryption-example PRIVATE low-level-api/) + target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) + target_link_libraries(parquet-encryption-example parquet_static) + target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) +endif() add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -39,6 +43,10 @@ endif() add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 - parquet-encryption-example - parquet-arrow-example - parquet-encryption-example-all-crypto-options) + parquet-arrow-example) + +if (ARROW_USE_OPENSSL) + add_dependencies(parquet + parquet-encryption-example + parquet-encryption-example-all-crypto-options) +endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 5cbc9699067..407657bc1b4 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ 
-166,10 +166,6 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc - encryption.cc - encryption_internal.cc - internal_file_decryptor.cc - internal_file_encryptor.cc file_reader.cc file_writer.cc metadata.cc @@ -183,6 +179,16 @@ set(PARQUET_SRCS statistics.cc types.cc) +if (ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + set(PARQUET_SRCS + ${PARQUET_SRCS} + encryption.cc + encryption_internal.cc + internal_file_decryptor.cc + internal_file_encryptor.cc) +endif() + # Ensure that thrift compilation is done before using its generated headers # in parquet code. add_custom_target(parquet-thrift-deps ALL DEPENDS ${THRIFT_OUTPUT_FILES}) @@ -320,7 +326,7 @@ add_parquet_test(arrow-test arrow/arrow-schema-test.cc test-util.cc) -if(PARQUET_BUILD_ENCRYPTION) +if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-properties-test.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index cbb8dd9e7dc..9362fc5a7ed 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -32,12 +32,15 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#endif + using arrow::MemoryPool; namespace parquet { @@ -119,22 +122,28 @@ class SerializedPageReader : public PageReader { column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), - total_num_rows_(total_num_rows), + total_num_rows_(total_num_rows) +#ifdef PARQUET_ENCRYPTION + , decryption_buffer_(AllocateBuffer(pool, 0)), meta_decryptor_(NULLPTR), - data_decryptor_(NULLPTR) { + data_decryptor_(NULLPTR) +#endif + { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; 
column_ordinal_ = ctx->column_ordinal; +#ifdef PARQUET_ENCRYPTION meta_decryptor_ = ctx->meta_decryptor; data_decryptor_ = ctx->data_decryptor; + if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { + InitDecryption(); + } +#endif } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { - InitDecryption(); - } } // Implement the PageReader interface @@ -143,11 +152,13 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: +#ifdef PARQUET_ENCRYPTION void UpdateDecryption(const std::shared_ptr& decryptor, bool current_page_is_dictionary, int8_t module_type, const std::string& pageAAD); void InitDecryption(); +#endif std::shared_ptr stream_; @@ -179,12 +190,6 @@ class SerializedPageReader : public PageReader { int16_t row_group_ordinal_; int16_t column_ordinal_; int16_t page_ordinal_; - // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page - // header in a single column respectively. - // While calculating AAD for different pages in a single column the pages AAD is - // updated by only the page ordinal. - std::string data_pageAAD_; - std::string data_page_headerAAD_; // Maximum allowed page size uint32_t max_page_header_size_; @@ -195,12 +200,21 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; +#ifdef PARQUET_ENCRYPTION + // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page + // header in a single column respectively. + // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. 
+ std::string data_pageAAD_; + std::string data_page_headerAAD_; // Encryption std::shared_ptr decryption_buffer_; std::shared_ptr meta_decryptor_; std::shared_ptr data_decryptor_; +#endif }; +#ifdef PARQUET_ENCRYPTION void SerializedPageReader::InitDecryption() { // Prepare the AAD for quick update later. if (data_decryptor_ != NULLPTR) { @@ -232,14 +246,19 @@ void SerializedPageReader::UpdateDecryption(const std::shared_ptr& de decryptor->UpdateAad(pageAAD); } } +#endif // PARQUET_ENCRYPTION std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with +#ifdef PARQUET_ENCRYPTION bool current_page_is_dictionary = false; +#endif if (column_has_dictionary_) { if (first_page_) { +#ifdef PARQUET_ENCRYPTION current_page_is_dictionary = true; +#endif first_page_ = false; } else { page_ordinal_++; @@ -265,12 +284,17 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { +#ifdef PARQUET_ENCRYPTION if (meta_decryptor_ != NULLPTR) { UpdateDecryption(meta_decryptor_, current_page_is_dictionary, encryption::kDictionaryPageHeader, data_page_headerAAD_); } DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, meta_decryptor_); +#else + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_); +#endif // PARQUET_ENCRYPTION break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -288,11 +312,12 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; +#ifdef PARQUET_ENCRYPTION if (data_decryptor_ != NULLPTR) { UpdateDecryption(data_decryptor_, current_page_is_dictionary, encryption::kDictionaryPage, data_pageAAD_); } - +#endif // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -303,6 +328,7 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } +#ifdef PARQUET_ENCRYPTION // Decrypt it if we need to if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( @@ -312,7 +338,7 @@ std::shared_ptr SerializedPageReader::NextPage() { page_buffer = decryption_buffer_; } - +#endif // PARQUET_ENCRYPTION // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 61aca8c1380..f361a695807 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -45,7 +45,10 @@ namespace parquet { class DictionaryPage; class Page; + +#ifdef PARQUET_ENCRYPTION class Decryptor; +#endif // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -78,8 +81,11 @@ struct PageReaderContext { bool column_has_dictionary; int16_t row_group_ordinal; int16_t column_ordinal; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr meta_decryptor; std::shared_ptr data_decryptor; +#endif }; // Abstract page iterator interface. 
This way, we can feed column pages to the diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a6263612975..087afe6c1c4 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -31,8 +31,6 @@ #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -40,6 +38,11 @@ #include "parquet/thrift.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" +#endif + namespace parquet { using ::arrow::internal::checked_cast; @@ -131,9 +134,13 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool() +#ifdef PARQUET_ENCRYPTION + , std::shared_ptr meta_encryptor = NULLPTR, - std::shared_ptr data_encryptor = NULLPTR) + std::shared_ptr data_encryptor = NULLPTR +#endif + ) : sink_(sink), metadata_(metadata), pool_(pool), @@ -144,12 +151,18 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_(0), page_ordinal_(0), row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_chunk_ordinal), + column_ordinal_(column_chunk_ordinal) +#ifdef PARQUET_ENCRYPTION + , meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor) { + data_encryptor_(data_encryptor) +#endif + { +#ifdef PARQUET_ENCRYPTION if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { InitEncryption(); } +#endif compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -174,6 +187,7 @@ class SerializedPageWriter : public 
PageWriter { const uint8_t* output_data_buffer = compressed_data->data(); int32_t output_data_len = static_cast(compressed_data->size()); +#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { UpdateEncryption(encryption::kDictionaryPage); @@ -183,6 +197,7 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } +#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); @@ -197,11 +212,16 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_ = start_pos; } +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDictionaryPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); +#else + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); +#endif + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -213,6 +233,7 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_ != nullptr) { UpdateEncryption(encryption::kColumnMetaData); } @@ -221,6 +242,12 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback, meta_encryptor_); +#else + // index_page_offset = -1 since they are not supported + metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, + total_compressed_size_, total_uncompressed_size_, has_dictionary, + fallback); +#endif // Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -261,6 +288,7 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = 
compressed_data->data(); int32_t output_data_len = static_cast(compressed_data->size()); +#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { UpdateEncryption(encryption::kDataPage); @@ -270,6 +298,7 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } +#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); @@ -284,11 +313,15 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDataPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); +#else + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); +#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -314,6 +347,7 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: +#ifdef PARQUET_ENCRYPTION void InitEncryption() { // Prepare the AAD for quick update later. if (data_encryptor_ != NULLPTR) { @@ -362,6 +396,7 @@ class SerializedPageWriter : public PageWriter { throw ParquetException("Unknown module type in UpdateEncryption"); } } +#endif std::shared_ptr sink_; ColumnChunkMetaDataBuilder* metadata_; @@ -374,32 +409,49 @@ class SerializedPageWriter : public PageWriter { int16_t page_ordinal_; int16_t row_group_ordinal_; int16_t column_ordinal_; - std::string data_pageAAD_; - std::string data_page_headerAAD_; std::unique_ptr thrift_serializer_; // Compression codec to use. 
std::unique_ptr<::arrow::util::Codec> compressor_; +#ifdef PARQUET_ENCRYPTION + std::string data_pageAAD_; + std::string data_page_headerAAD_; + std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; +#endif }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: +#ifdef PARQUET_ENCRYPTION BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) +#else + BufferedPageWriter(const std::shared_ptr& sink, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t current_column_ordinal, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) +#endif : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); + +#ifdef PARQUET_ENCRYPTION pager_ = std::unique_ptr(new SerializedPageWriter( in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, meta_encryptor, data_encryptor)); +#else + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, metadata, row_group_ordinal, + current_column_ordinal, pool)); +#endif } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -444,9 +496,13 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group, - std::shared_ptr meta_encryptor, - std::shared_ptr data_encryptor) { + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group +#ifdef PARQUET_ENCRYPTION + , + std::shared_ptr meta_encryptor, std::shared_ptr 
data_encryptor +#endif +) { +#ifdef PARQUET_ENCRYPTION if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, @@ -456,6 +512,15 @@ std::unique_ptr PageWriter::Open( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, meta_encryptor, data_encryptor)); } +#else + if (buffered_row_group) { + return std::unique_ptr(new BufferedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); + } else { + return std::unique_ptr(new SerializedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); + } +#endif } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 8d638182d8b..b4b5fa702e6 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -46,7 +46,10 @@ namespace parquet { class ColumnChunkMetaDataBuilder; class WriterProperties; + +#ifdef PARQUET_ENCRYPTION class Encryptor; +#endif class PARQUET_EXPORT LevelEncoder { public: @@ -87,9 +90,13 @@ class PARQUET_EXPORT PageWriter { ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), +#ifdef PARQUET_ENCRYPTION bool buffered_row_group = false, std::shared_ptr header_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR); +#else + bool buffered_row_group = false); +#endif // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 7b1ac4294cb..c29bad388af 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -33,7 +33,6 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include 
"parquet/deprecated_io.h" -#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/file_writer.h" #include "parquet/internal_file_decryptor.h" @@ -43,6 +42,10 @@ #include "parquet/schema.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#endif + namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file @@ -85,12 +88,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, int row_group_number, - const ReaderProperties& props, InternalFileDecryptor* file_decryptor) + const ReaderProperties& props +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif + ) : source_(source), file_metadata_(file_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number), - file_decryptor_(file_decryptor) { + row_group_ordinal_((int16_t)row_group_number) +#ifdef PARQUET_ENCRYPTION + , + file_decryptor_(file_decryptor) +#endif + { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -100,7 +112,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file +#ifdef PARQUET_ENCRYPTION auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); +#else + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_); +#endif + int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -124,6 +141,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); + +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata = col->crypto_metadata(); // Column is encrypted only if crypto_metadata exists. 
@@ -160,6 +179,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); +#else + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool()); +#endif } private: @@ -168,7 +191,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; + +#ifdef PARQUET_ENCRYPTION InternalFileDecryptor* file_decryptor_; +#endif }; // ---------------------------------------------------------------------- @@ -191,12 +217,19 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { +#ifdef PARQUET_ENCRYPTION if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); +#endif } std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); + std::unique_ptr contents( + new SerializedRowGroup(source_, file_metadata_.get(), i, +#ifdef PARQUET_ENCRYPTION + properties_, file_decryptor_.get())); +#else + properties_)); +#endif return std::make_shared(std::move(contents)); } @@ -225,39 +258,32 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(file_size - footer_read_size, footer_read_size, &footer_buffer)); // Check if all bytes are read. Check if last 4 bytes read have the magic bits +#ifdef PARQUET_ENCRYPTION + if (footer_buffer->size() != footer_read_size || + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + throw ParquetException("Invalid parquet file. 
Corrupt footer."); + } +#else if (footer_buffer->size() != footer_read_size || (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); + } else if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == + 0) { + throw ParquetException( + "Encrypted parquet file. " + "Should build with parquet encryption support."); } +#endif +#if PARQUET_ENCRYPTION // No encryption or encryption with plaintext footer mode. if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { - uint32_t metadata_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); - int64_t metadata_start = file_size - kFooterSize - metadata_len; - if (kFooterSize + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } - std::shared_ptr metadata_buffer; - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + kFooterSize)) { - metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); - } - } - - uint32_t read_metadata_len = metadata_len; - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. 
@@ -276,14 +302,29 @@ class SerializedFile : public ParquetFileReader::Contents { ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, file_size); } +#else // not defined PARQUET_ENCRYPTION + std::shared_ptr metadata_buffer; + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); +#endif } private: std::shared_ptr source_; std::shared_ptr file_metadata_; ReaderProperties properties_; + +#ifdef PARQUET_ENCRYPTION std::unique_ptr file_decryptor_; +#endif + + void ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, + int64_t footer_read_size, int64_t file_size, + std::shared_ptr* metadata_buffer, + uint32_t* metadata_len, uint32_t* read_metadata_len); +#if PARQUET_ENCRYPTION std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -295,8 +336,39 @@ class SerializedFile : public ParquetFileReader::Contents { void ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size); +#endif }; +void SerializedFile::ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size, std::shared_ptr* metadata_buffer, uint32_t* metadata_len, + uint32_t* read_metadata_len) { + *metadata_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = file_size - kFooterSize - *metadata_len; + if (kFooterSize + *metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (*metadata_len + kFooterSize)) { + *metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - *metadata_len - kFooterSize, *metadata_len); + } else { + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_start, *metadata_len, metadata_buffer)); + if ((*metadata_buffer)->size() != *metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + + *read_metadata_len = *metadata_len; + file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len); +} + +#ifdef PARQUET_ENCRYPTION void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size) { @@ -374,8 +446,8 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature( - encryptor, metadata_buffer->data() + read_metadata_len)) { + if (!file_metadata_->VerifySignature(encryptor, + metadata_buffer->data() + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); @@ -425,6 +497,7 @@ std::string SerializedFile::HandleAadPrefix( } return aad_prefix + algo.aad.aad_file_unique; } +#endif // PARQUET_ENCRYPTION // ---------------------------------------------------------------------- // ParquetFileReader public API @@ -511,6 +584,7 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) { DCHECK(i < metadata()->num_row_groups()) << "The file only has " << metadata()->num_row_groups() << "row groups, requested reader for: " << i; + return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 1df7d05b3be..b6465d23cc6 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -22,11 +22,14 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/schema.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" +#endif + using arrow::MemoryPool; using parquet::schema::GroupNode; @@ -77,8 +80,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false, - InternalFileEncryptor* file_encryptor = NULLPTR) + const WriterProperties* properties, bool buffered_row_group = false +#ifdef PARQUET_ENCRYPTION + , + InternalFileEncryptor* file_encryptor = NULLPTR +#endif + ) : sink_(sink), metadata_(metadata), properties_(properties), @@ -87,8 +94,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_(row_group_ordinal), current_column_index_(0), num_rows_(0), - buffered_row_group_(buffered_row_group), - file_encryptor_(file_encryptor) { + buffered_row_group_(buffered_row_group) +#ifdef 
PARQUET_ENCRYPTION + , + file_encryptor_(file_encryptor) +#endif + { if (buffered_row_group) { InitColumns(); } else { @@ -124,6 +135,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++current_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); +#ifdef PARQUET_ENCRYPTION auto meta_encryptor = file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) : NULLPTR; @@ -135,6 +147,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { sink_, properties_->compression(column_descr->path()), col_meta, row_group_ordinal_, static_cast(current_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); +#else + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, static_cast(current_column_index_ - 1), + properties_->memory_pool(), false); +#endif column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -203,7 +221,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int current_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; + +#ifdef PARQUET_ENCRYPTION InternalFileEncryptor* file_encryptor_; +#endif void CheckRowsWritten() const { // verify when only one column is written at a time @@ -231,6 +252,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); +#ifdef PARQUET_ENCRYPTION auto meta_encryptor = file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) : NULLPTR; @@ -242,6 +264,13 @@ class RowGroupSerializer : public RowGroupWriter::Contents { static_cast(row_group_ordinal_), static_cast(current_column_index_), properties_->memory_pool(), buffered_row_group_, meta_encryptor, data_encryptor); +#else + std::unique_ptr pager = + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + col_meta, static_cast(row_group_ordinal_), + static_cast(current_column_index_), + properties_->memory_pool(), buffered_row_group_); +#endif column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -280,6 +309,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } row_group_writer_.reset(); +#ifdef PARQUET_ENCRYPTION // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); @@ -289,6 +319,10 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // Encrypted file CloseEncryptedFile(file_encryption_properties); } +#else + file_metadata_ = metadata_->Finish(); + WriteFileMetaData(*file_metadata_, sink_.get()); +#endif } } @@ -310,8 +344,11 @@ class FileSerializer : public ParquetFileWriter::Contents { auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), +#ifdef PARQUET_ENCRYPTION buffered_row_group, file_encryptor_.get())); - +#else + buffered_row_group)); +#endif row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } @@ -342,6 +379,7 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } +#ifdef PARQUET_ENCRYPTION void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { // Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { @@ -371,6 +409,7 @@ class FileSerializer : public ParquetFileWriter::Contents 
{ file_encryptor_->WipeOutEncryptionKeys(); } } +#endif std::shared_ptr sink_; bool is_open_; @@ -381,9 +420,12 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; +#ifdef PARQUET_ENCRYPTION std::unique_ptr file_encryptor_; +#endif void StartFile() { +#ifdef PARQUET_ENCRYPTION auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 @@ -397,6 +439,10 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } +#else + // Unencrypted parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); +#endif } }; @@ -447,6 +493,12 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } +void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); + return WriteFileMetaData(file_metadata, sink); +} + +#ifdef PARQUET_ENCRYPTION void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, @@ -490,11 +542,7 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ParquetOutputWrapper wrapper(sink); crypto_metadata.WriteTo(&wrapper); } - -void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { - PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - return WriteFileMetaData(file_metadata, sink); -} +#endif // PARQUET_ENCRYPTION const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 5c90273ae4f..50f6cac2a2e 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ 
-112,6 +112,16 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); + +PARQUET_EXPORT +void WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +PARQUET_EXPORT +void WriteMetaDataFile(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +#ifdef PARQUET_ENCRYPTION PARQUET_EXPORT void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, @@ -120,10 +130,6 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); - -PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); PARQUET_EXPORT void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, @@ -132,10 +138,7 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, PARQUET_EXPORT void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ::arrow::io::OutputStream* sink); - -PARQUET_EXPORT -void WriteMetaDataFile(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); +#endif class PARQUET_EXPORT ParquetFileWriter { public: diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 351b68a66da..2429fce6854 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -25,7 +25,6 @@ #include #include // IWYU pragma: keep #include "parquet/exception.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/schema-internal.h" @@ -33,6 +32,10 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/internal_file_decryptor.h" +#endif + namespace parquet { const ApplicationVersion& 
ApplicationVersion::PARQUET_251_FIXED_VERSION() { @@ -112,6 +115,8 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d } // MetaData Accessor + +#ifdef PARQUET_ENCRYPTION // ColumnCryptoMetaData class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { public: @@ -157,6 +162,7 @@ bool ColumnCryptoMetaData::encrypted_with_footer_key() const { const std::string& ColumnCryptoMetaData::key_metadata() const { return impl_->key_metadata(); } +#endif // PARQUET_ENCRYPTION // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { @@ -164,9 +170,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor = NULLPTR) + const ApplicationVersion* writer_version +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor = NULLPTR +#endif + ) : column_(column), descr_(descr), writer_version_(writer_version) { +#ifdef PARQUET_ENCRYPTION if (column->__isset.crypto_metadata) { // column metadata is encrypted format::ColumnCryptoMetaData ccmd = column->crypto_metadata; @@ -195,6 +206,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { // column metadata is not encrypted is_metadata_set_ = true; } +#else + is_metadata_set_ = true; +#endif // PARQUET_ENCRYPTION + if (is_metadata_set_) { const format::ColumnMetaData& meta_data = GetMetadataIfSet(); for (auto encoding : meta_data.encodings) { @@ -275,6 +290,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return GetMetadataIfSet().total_uncompressed_size; } +#ifdef PARQUET_ENCRYPTION inline std::unique_ptr crypto_metadata() const { if (column_->__isset.crypto_metadata) { return ColumnCryptoMetaData::Make( @@ -283,6 +299,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return nullptr; } } +#endif private: mutable std::shared_ptr 
possible_stats_; @@ -294,6 +311,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { +#ifdef PARQUET_ENCRYPTION if (column_->__isset.crypto_metadata && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { @@ -306,27 +324,51 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { return column_->meta_data; } +#else + return column_->meta_data; +#endif } }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor, - int16_t row_group_ordinal, int16_t column_ordinal) { + const ApplicationVersion* writer_version, int16_t row_group_ordinal, + int16_t column_ordinal +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif +) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, +#ifdef PARQUET_ENCRYPTION writer_version, file_decryptor)); +#else + writer_version)); +#endif } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor) + const ApplicationVersion* writer_version +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif + ) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} + row_group_ordinal, column_ordinal, +#ifdef PARQUET_ENCRYPTION + writer_version, file_decryptor))} { +} +#else + writer_version))} { +} +#endif + ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -384,9 +426,11 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { 
return impl_->total_compressed_size(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { return impl_->crypto_metadata(); } +#endif // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { @@ -411,7 +455,12 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { + int i, +#ifdef PARQUET_ENCRYPTION + int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { +#else + int16_t row_group_ordinal) { +#endif if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -419,8 +468,12 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryptor, row_group_ordinal, + writer_version_, row_group_ordinal, +#ifdef PARQUET_ENCRYPTION + (int16_t)i, file_decryptor); +#else (int16_t)i); +#endif } private: @@ -451,22 +504,37 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } +#else +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal) const { + return impl_->ColumnChunk(i, row_group_ordinal); +} +#endif // file metadata class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} +#ifdef PARQUET_ENCRYPTION explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = nullptr) +#else + explicit FileMetaDataImpl(const void* metadata, 
uint32_t* metadata_len) +#endif : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, +#ifdef PARQUET_ENCRYPTION metadata_.get(), decryptor); +#else + metadata_.get()); +#endif metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -480,8 +548,9 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify_signature(std::shared_ptr encryptor, - const void* tail) { +#ifdef PARQUET_ENCRYPTION + bool VerifySignature(std::shared_ptr encryptor, + const void* signature) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -489,8 +558,8 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); // encrypt with nonce - uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) + + uint8_t* nonce = const_cast(reinterpret_cast(signature)); + uint8_t* tag = const_cast(reinterpret_cast(signature)) + encryption::kNonceLength; std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + @@ -501,6 +570,7 @@ class FileMetaData::FileMetaDataImpl { memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, tag, encryption::kGcmTagLength); } +#endif inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -513,6 +583,8 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } + +#ifdef PARQUET_ENCRYPTION inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } @@ -522,9 +594,11 @@ class FileMetaData::FileMetaDataImpl { inline const std::string& footer_signing_key_metadata() { return metadata_->footer_signing_key_metadata; } +#endif const ApplicationVersion& writer_version() const { return writer_version_; } +#ifdef 
PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; @@ -554,6 +628,12 @@ class FileMetaData::FileMetaDataImpl { serializer.Serialize(metadata_.get(), dst, encryptor); } } +#else + void WriteTo(::arrow::io::OutputStream* dst) const { + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); + } +#endif // PARQUET_ENCRYPTION std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -636,6 +716,7 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; +#ifdef PARQUET_ENCRYPTION std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { @@ -643,11 +724,24 @@ std::shared_ptr FileMetaData::Make( return std::shared_ptr( new FileMetaData(metadata, metadata_len, decryptor)); } +#else +std::shared_ptr FileMetaData::Make(const void* metadata, + uint32_t* metadata_len) { + // This FileMetaData ctor is private, not compatible with std::make_shared + return std::shared_ptr(new FileMetaData(metadata, metadata_len)); +} +#endif +#ifdef PARQUET_ENCRYPTION FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} +#else +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) + : impl_{std::unique_ptr( + new FileMetaDataImpl(metadata, metadata_len))} {} +#endif FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -658,10 +752,12 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify_signature(std::shared_ptr encryptor, - const void* tail) { - return impl_->verify_signature(encryptor, tail); +#ifdef PARQUET_ENCRYPTION +bool FileMetaData::VerifySignature(std::shared_ptr encryptor, + const void* signature) { + return impl_->VerifySignature(encryptor, 
signature); } +#endif uint32_t FileMetaData::size() const { return impl_->size(); } @@ -671,6 +767,7 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +#ifdef PARQUET_ENCRYPTION bool FileMetaData::is_encryption_algorithm_set() const { return impl_->is_encryption_algorithm_set(); } @@ -682,6 +779,7 @@ EncryptionAlgorithm FileMetaData::encryption_algorithm() const { const std::string& FileMetaData::footer_signing_key_metadata() const { return impl_->footer_signing_key_metadata(); } +#endif // PARQUET_ENCRYPTION ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { @@ -716,11 +814,18 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } +#ifdef PARQUET_ENCRYPTION void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } +#else +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + return impl_->WriteTo(dst); +} +#endif +#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData::FileCryptoMetaDataImpl { public: FileCryptoMetaDataImpl() {} @@ -773,6 +878,7 @@ FileCryptoMetaData::~FileCryptoMetaData() {} void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } +#endif // PARQUET_ENCRYPTION ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) @@ -910,7 +1016,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback, const std::shared_ptr& encryptor) { + bool dictionary_fallback +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& encryptor +#endif + ) { if (dictionary_page_offset > 0) { 
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -945,6 +1056,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->meta_data.__set_encodings(thrift_encodings); +#ifdef PARQUET_ENCRYPTION const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column is encrypted if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { @@ -998,6 +1110,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } } +#endif // PARQUET_ENCRYPTION } void WriteTo(::arrow::io::OutputStream* sink) { @@ -1064,11 +1177,19 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, +#ifdef PARQUET_ENCRYPTION bool dictionary_fallback, const std::shared_ptr& encryptor) { +#else + bool dictionary_fallback) { +#endif impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, - encryptor); + compressed_size, uncompressed_size, has_dictionary, +#ifdef PARQUET_ENCRYPTION + dictionary_fallback, encryptor); +#else + dictionary_fallback); +#endif } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { @@ -1216,10 +1337,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); +#ifdef PARQUET_ENCRYPTION if (props->file_encryption_properties() != nullptr && props->file_encryption_properties()->encrypted_footer()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } +#endif } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -1275,6 +1398,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl 
{ metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; +#ifdef PARQUET_ENCRYPTION // if plaintext footer, set footer signing algorithm auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { @@ -1293,6 +1417,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); } } +#endif // PARQUET_ENCRYPTION parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), @@ -1304,6 +1429,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } +#ifdef PARQUET_ENCRYPTION std::unique_ptr BuildFileCryptoMetaData() { if (crypto_metadata_ == nullptr) { return nullptr; @@ -1325,10 +1451,13 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_crypto_metadata; } +#endif protected: std::unique_ptr metadata_; +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata_; +#endif private: const std::shared_ptr properties_; @@ -1360,8 +1489,10 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); } +#endif } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 3fc6899bfc6..dfff04407ed 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -38,11 +38,14 @@ class ColumnDescriptor; class EncodedStatistics; class Statistics; class SchemaDescriptor; + +#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData; class InternalFileDecryptor; class Decryptor; class Encryptor; class FooterSigningEncryptor; +#endif namespace schema { @@ -105,6 +108,7 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = 
SortOrder::SIGNED) const; }; +#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT ColumnCryptoMetaData { public: static std::unique_ptr Make(const uint8_t* metadata); @@ -120,15 +124,22 @@ class PARQUET_EXPORT ColumnCryptoMetaData { class ColumnCryptoMetaDataImpl; std::unique_ptr impl_; }; +#endif class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor +#ifdef PARQUET_ENCRYPTION + static std::unique_ptr Make( + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); +#else static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR, int16_t row_group_ordinal = -1, + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1); +#endif ~ColumnChunkMetaData(); @@ -154,13 +165,21 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata() const; +#endif private: +#ifdef PARQUET_ENCRYPTION explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); +#else + explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version = NULLPTR); +#endif // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -181,9 +200,15 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object 
is not to be copied const SchemaDescriptor* schema() const; + +#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR) const; +#else + std::unique_ptr ColumnChunk(int i, + int16_t row_group_ordinal = -1) const; +#endif private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -198,31 +223,48 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor + +#ifdef PARQUET_ENCRYPTION static std::shared_ptr Make( const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); +#else + static std::shared_ptr Make(const void* serialized_metadata, + uint32_t* metadata_len); +#endif ~FileMetaData(); +#ifdef PARQUET_ENCRYPTION + /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted + /// (plaintext footer). + /// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata + bool VerifySignature(std::shared_ptr encryptor, + const void* signature); +#endif - bool verify_signature(std::shared_ptr encryptor, - const void* tail); // file metadata uint32_t size() const; int num_columns() const; int64_t num_rows() const; int num_row_groups() const; - bool is_encryption_algorithm_set() const; - EncryptionAlgorithm encryption_algorithm() const; - const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; const std::string& created_by() const; int num_schema_elements() const; std::unique_ptr RowGroup(int i) const; - const ApplicationVersion& writer_version() const; +#ifdef PARQUET_ENCRYPTION + bool is_encryption_algorithm_set() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; +#endif + +#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; +#else 
+ void WriteTo(::arrow::io::OutputStream* dst) const; +#endif // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -237,8 +279,13 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; + +#ifdef PARQUET_ENCRYPTION explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); +#else + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); +#endif // PIMPL Idiom FileMetaData(); @@ -246,6 +293,7 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT FileCryptoMetaData { public: // API convenience to get a MetaData accessor @@ -267,6 +315,7 @@ class PARQUET_EXPORT FileCryptoMetaData { class FileCryptoMetaDataImpl; std::unique_ptr impl_; }; +#endif // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { @@ -291,11 +340,19 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { int64_t total_compressed_size() const; // commit the metadata + +#ifdef PARQUET_ENCRYPTION void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback, const std::shared_ptr& encryptor = NULLPTR); +#else + void Finish(int64_t num_values, int64_t dictonary_page_offset, + int64_t index_page_offset, int64_t data_page_offset, + int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, + bool dictionary_fallback); +#endif // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; @@ -355,8 +412,10 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); +#ifdef PARQUET_ENCRYPTION // crypto metadata std::unique_ptr GetCryptoMetaData(); +#endif private: explicit FileMetaDataBuilder( diff --git 
a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 82e58bfd20b..594de2a9c0b 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -23,7 +23,10 @@ #include #include +#ifdef PARQUET_ENCRYPTION #include "parquet/encryption.h" +#endif + #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -62,6 +65,7 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } +#ifdef PARQUET_ENCRYPTION void file_decryption_properties( const std::shared_ptr& decryption) { file_decryption_properties_ = decryption; @@ -70,12 +74,16 @@ class PARQUET_EXPORT ReaderProperties { FileDecryptionProperties* file_decryption_properties() { return file_decryption_properties_.get(); } +#endif private: ::arrow::MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_decryption_properties_; +#endif }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -280,11 +288,13 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } +#ifdef PARQUET_ENCRYPTION Builder* encryption( const std::shared_ptr& file_encryption_properties) { file_encryption_properties_ = file_encryption_properties; return this; } +#endif Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); @@ -331,10 +341,13 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr(new WriterProperties( - pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, - pagesize_, version_, created_by_, std::move(file_encryption_properties_), - default_column_properties_, column_properties)); + return std::shared_ptr( + new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, + max_row_group_length_, pagesize_, version_, created_by_, +#ifdef 
PARQUET_ENCRYPTION + std::move(file_encryption_properties_), +#endif + default_column_properties_, column_properties)); } private: @@ -345,7 +358,10 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; +#endif // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -369,10 +385,6 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline FileEncryptionProperties* file_encryption_properties() const { - return file_encryption_properties_.get(); - } - inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -416,6 +428,11 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } +#ifdef PARQUET_ENCRYPTION + inline FileEncryptionProperties* file_encryption_properties() const { + return file_encryption_properties_.get(); + } + std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { @@ -424,13 +441,16 @@ class PARQUET_EXPORT WriterProperties { return NULLPTR; } } +#endif private: explicit WriterProperties( ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties, +#endif const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -440,9 +460,12 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), +#ifdef PARQUET_ENCRYPTION file_encryption_properties_(file_encryption_properties), 
+#endif default_column_properties_(default_column_properties), - column_properties_(column_properties) {} + column_properties_(column_properties) { + } ::arrow::MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -451,7 +474,11 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; +#endif + ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 1e5fdc3d84d..10c202a2f2d 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -42,16 +42,19 @@ #include #include "arrow/util/logging.h" -#include "parquet/encryption_internal.h" + #include "parquet/exception.h" -#include "parquet/internal_file_decryptor.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/statistics.h" #include "parquet/types.h" #include "parquet/parquet_types.h" // IYWU pragma: export +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" +#endif namespace parquet { // Check if thrift version < 0.11.0 @@ -222,8 +225,13 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& decryptor = NULLPTR) { +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& decryptor = NULLPTR +#endif +) { +#ifdef PARQUET_ENCRYPTION // thrift message is not encrypted if (decryptor == NULLPTR) { DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); @@ -245,6 +253,9 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); } +#else + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); +#endif // PARQUET_ENCRYPTION } /// Utility class to serialize thrift objects to a binary format. This object @@ -275,12 +286,17 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryptor = NULLPTR) { + int64_t Serialize(const T* obj, ArrowOutputStream* out +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& encryptor = NULLPTR +#endif + ) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); +#ifdef PARQUET_ENCRYPTION // obj is not encrypted if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); @@ -288,6 +304,10 @@ class ThriftSerializer { } else { // obj is encrypted return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); } +#else + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); +#endif } private: @@ -303,6 +323,7 @@ class ThriftSerializer { } } +#ifdef PARQUET_ENCRYPTION int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, uint32_t out_length, const std::shared_ptr& encryptor) { @@ -316,6 +337,7 @@ class ThriftSerializer { PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); return 
static_cast(cipher_buffer_len); } +#endif shared_ptr mem_buffer_; shared_ptr protocol_; From 977ef11b68f02da300f29ac7006e49c0e1cf1196 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 24 Jun 2019 11:42:16 +0300 Subject: [PATCH 115/125] Fix LogicalType --- ...cryption-reader-writer-all-crypto-options.h | 10 +++++----- cpp/src/parquet/CMakeLists.txt | 4 ++-- .../parquet/encryption-configurations-test.cc | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h index db1b692b9f0..2ca3a064768 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h @@ -21,7 +21,7 @@ #include #include -using parquet::LogicalType; +using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; using parquet::schema::GroupNode; @@ -34,18 +34,18 @@ static std::shared_ptr SetupSchema() { // Create a primitive node named 'boolean_field' with type:BOOLEAN, // repetition:REQUIRED fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, LogicalType::NONE)); + Type::BOOLEAN, ConvertedType::NONE)); // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, // logical type:TIME_MILLIS fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - LogicalType::TIME_MILLIS)); + ConvertedType::TIME_MILLIS)); fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, - LogicalType::NONE)); + ConvertedType::NONE)); // Create a GroupNode named 'schema' using the primitive nodes defined above // This GroupNode is the root 
node of the schema tree diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 407657bc1b4..b42704f5711 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -329,8 +329,8 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES - encryption-properties-test.cc - encryption-configurations-test.cc) + encryption-configurations-test.cc + encryption-properties-test.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 2afc20e5223..b45c47f365b 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -73,7 +73,7 @@ namespace parquet { using schema::GroupNode; using schema::NodePtr; using schema::PrimitiveNode; -using parquet::LogicalType; +using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; @@ -540,34 +540,34 @@ class TestEncryptionConfiguration : public ::testing::Test { // Create a primitive node named 'boolean_field' with type:BOOLEAN, // repetition:REQUIRED fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, LogicalType::NONE)); + Type::BOOLEAN, ConvertedType::NONE)); // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, // logical type:TIME_MILLIS fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - LogicalType::TIME_MILLIS)); + ConvertedType::TIME_MILLIS)); // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, - LogicalType::NONE)); + ConvertedType::NONE)); 
fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, - Type::DOUBLE, LogicalType::NONE)); + Type::DOUBLE, ConvertedType::NONE)); // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, - Type::BYTE_ARRAY, LogicalType::NONE)); + Type::BYTE_ARRAY, ConvertedType::NONE)); // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, // repetition:REQUIRED, field_length = kFixedLength fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, kFixedLength)); // Create a GroupNode named 'schema' using the primitive nodes defined above From fb8ca61d73f0ab5079ce5a3d2ba2a70f3a16ad72 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 24 Jun 2019 23:35:20 +0700 Subject: [PATCH 116/125] keep encryption parameters at method declaration --- cpp/src/parquet/column_reader.cc | 10 +-- cpp/src/parquet/column_reader.h | 7 +- cpp/src/parquet/file_reader.cc | 37 ++++------ cpp/src/parquet/file_writer.cc | 27 +++---- cpp/src/parquet/metadata.cc | 116 +++++-------------------------- cpp/src/parquet/metadata.h | 40 ----------- cpp/src/parquet/properties.h | 31 +++------ cpp/src/parquet/thrift.h | 21 +++--- 8 files changed, 65 insertions(+), 224 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 9362fc5a7ed..9c8b63039cb 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -122,14 +122,10 @@ class SerializedPageReader : public PageReader { column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), - total_num_rows_(total_num_rows) -#ifdef PARQUET_ENCRYPTION - , + total_num_rows_(total_num_rows), 
decryption_buffer_(AllocateBuffer(pool, 0)), meta_decryptor_(NULLPTR), - data_decryptor_(NULLPTR) -#endif - { + data_decryptor_(NULLPTR) { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; @@ -200,7 +196,6 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; -#ifdef PARQUET_ENCRYPTION // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page // header in a single column respectively. // While calculating AAD for different pages in a single column the pages AAD is @@ -211,7 +206,6 @@ class SerializedPageReader : public PageReader { std::shared_ptr decryption_buffer_; std::shared_ptr meta_decryptor_; std::shared_ptr data_decryptor_; -#endif }; #ifdef PARQUET_ENCRYPTION diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index f361a695807..ae897f659d0 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -43,13 +43,10 @@ class RleDecoder; namespace parquet { +class Decryptor; class DictionaryPage; class Page; -#ifdef PARQUET_ENCRYPTION -class Decryptor; -#endif - // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -82,10 +79,8 @@ struct PageReaderContext { int16_t row_group_ordinal; int16_t column_ordinal; -#ifdef PARQUET_ENCRYPTION std::shared_ptr meta_decryptor; std::shared_ptr data_decryptor; -#endif }; // Abstract page iterator interface. 
This way, we can feed column pages to the diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index c29bad388af..27e3fa5220c 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -35,7 +35,6 @@ #include "parquet/deprecated_io.h" #include "parquet/exception.h" #include "parquet/file_writer.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -44,6 +43,11 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class InternalFileDecryptor; +} #endif namespace parquet { @@ -88,21 +92,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, int row_group_number, - const ReaderProperties& props -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif - ) + const ReaderProperties& props, + InternalFileDecryptor* file_decryptor = NULLPTR) : source_(source), file_metadata_(file_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number) -#ifdef PARQUET_ENCRYPTION - , - file_decryptor_(file_decryptor) -#endif - { + row_group_ordinal_((int16_t)row_group_number), + file_decryptor_(file_decryptor) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -112,11 +108,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file -#ifdef PARQUET_ENCRYPTION auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); -#else - auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_); -#endif int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && @@ -191,10 +183,7 @@ class SerializedRowGroup : public 
RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; - -#ifdef PARQUET_ENCRYPTION InternalFileDecryptor* file_decryptor_; -#endif }; // ---------------------------------------------------------------------- @@ -223,12 +212,12 @@ class SerializedFile : public ParquetFileReader::Contents { } std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), i, #ifdef PARQUET_ENCRYPTION - properties_, file_decryptor_.get())); + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); #else - properties_)); + std::unique_ptr contents( + new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); #endif return std::make_shared(std::move(contents)); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index b6465d23cc6..5d1d6a6c455 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -28,6 +28,10 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" +#else +namespace parquet { +class InternalFileEncryptor; +} #endif using arrow::MemoryPool; @@ -80,12 +84,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false -#ifdef PARQUET_ENCRYPTION - , - InternalFileEncryptor* file_encryptor = NULLPTR -#endif - ) + const WriterProperties* properties, bool buffered_row_group = false, + InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), properties_(properties), @@ -94,12 +94,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_(row_group_ordinal), current_column_index_(0), num_rows_(0), - 
buffered_row_group_(buffered_row_group) -#ifdef PARQUET_ENCRYPTION - , - file_encryptor_(file_encryptor) -#endif - { + buffered_row_group_(buffered_row_group), + file_encryptor_(file_encryptor) { if (buffered_row_group) { InitColumns(); } else { @@ -221,10 +217,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int current_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; - -#ifdef PARQUET_ENCRYPTION InternalFileEncryptor* file_encryptor_; -#endif void CheckRowsWritten() const { // verify when only one column is written at a time @@ -342,11 +335,13 @@ class FileSerializer : public ParquetFileWriter::Contents { } num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); +#ifdef PARQUET_ENCRYPTION std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), -#ifdef PARQUET_ENCRYPTION buffered_row_group, file_encryptor_.get())); #else + std::unique_ptr contents(new RowGroupSerializer( + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), buffered_row_group)); #endif row_group_writer_.reset(new RowGroupWriter(std::move(contents))); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 2429fce6854..3e085df6111 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -34,6 +34,10 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class Decryptor; +} #endif namespace parquet { @@ -170,12 +174,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor = NULLPTR -#endif - ) + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), 
writer_version_(writer_version) { #ifdef PARQUET_ENCRYPTION if (column->__isset.crypto_metadata) { // column metadata is encrypted @@ -333,41 +333,21 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, int16_t row_group_ordinal, - int16_t column_ordinal -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif -) { + int16_t column_ordinal, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, -#ifdef PARQUET_ENCRYPTION writer_version, file_decryptor)); -#else - writer_version)); -#endif } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif - ) + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - row_group_ordinal, column_ordinal, -#ifdef PARQUET_ENCRYPTION - writer_version, file_decryptor))} { -} -#else - writer_version))} { -} -#endif + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -455,12 +435,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, -#ifdef PARQUET_ENCRYPTION - int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { -#else - int16_t row_group_ordinal) { -#endif + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -468,12 +443,8 @@ 
class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, row_group_ordinal, -#ifdef PARQUET_ENCRYPTION - (int16_t)i, file_decryptor); -#else - (int16_t)i); -#endif + writer_version_, row_group_ordinal, (int16_t)i, + file_decryptor); } private: @@ -504,37 +475,22 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -#ifdef PARQUET_ENCRYPTION std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } -#else -std::unique_ptr RowGroupMetaData::ColumnChunk( - int i, int16_t row_group_ordinal) const { - return impl_->ColumnChunk(i, row_group_ordinal); -} -#endif // file metadata class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} -#ifdef PARQUET_ENCRYPTION explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = nullptr) -#else - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) -#endif : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, -#ifdef PARQUET_ENCRYPTION metadata_.get(), decryptor); -#else - metadata_.get()); -#endif metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -598,10 +554,10 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } -#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; +#ifdef PARQUET_ENCRYPTION // Only in encrypted files with plaintext footers the // encryption_algorithm is set in footer if (is_encryption_algorithm_set()) 
{ @@ -627,13 +583,10 @@ class FileMetaData::FileMetaDataImpl { // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor); } - } #else - void WriteTo(::arrow::io::OutputStream* dst) const { - ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst); - } #endif // PARQUET_ENCRYPTION + } std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -716,7 +669,6 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -#ifdef PARQUET_ENCRYPTION std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { @@ -724,24 +676,11 @@ std::shared_ptr FileMetaData::Make( return std::shared_ptr( new FileMetaData(metadata, metadata_len, decryptor)); } -#else -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len) { - // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); -} -#endif -#ifdef PARQUET_ENCRYPTION FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} -#else -FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) - : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len))} {} -#endif FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -814,16 +753,10 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -#ifdef PARQUET_ENCRYPTION void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } -#else -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { - return impl_->WriteTo(dst); -} -#endif #ifdef PARQUET_ENCRYPTION class FileCryptoMetaData::FileCryptoMetaDataImpl { @@ 
-1016,12 +949,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& encryptor -#endif - ) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -1177,19 +1105,11 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, -#ifdef PARQUET_ENCRYPTION bool dictionary_fallback, const std::shared_ptr& encryptor) { -#else - bool dictionary_fallback) { -#endif impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, -#ifdef PARQUET_ENCRYPTION - dictionary_fallback, encryptor); -#else - dictionary_fallback); -#endif + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index dfff04407ed..aa34a885bec 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -39,13 +39,11 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; -#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData; class InternalFileDecryptor; class Decryptor; class Encryptor; class FooterSigningEncryptor; -#endif namespace schema { @@ -129,17 +127,10 @@ class PARQUET_EXPORT ColumnCryptoMetaData { class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor -#ifdef 
PARQUET_ENCRYPTION static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); -#else - static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1); -#endif ~ColumnChunkMetaData(); @@ -170,16 +161,10 @@ class PARQUET_EXPORT ColumnChunkMetaData { #endif private: -#ifdef PARQUET_ENCRYPTION explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); -#else - explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version = NULLPTR); -#endif // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -201,14 +186,9 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; -#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR) const; -#else - std::unique_ptr ColumnChunk(int i, - int16_t row_group_ordinal = -1) const; -#endif private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -224,14 +204,9 @@ class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor -#ifdef PARQUET_ENCRYPTION static std::shared_ptr Make( const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); -#else - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* 
metadata_len); -#endif ~FileMetaData(); #ifdef PARQUET_ENCRYPTION @@ -259,12 +234,8 @@ class PARQUET_EXPORT FileMetaData { const std::string& footer_signing_key_metadata() const; #endif -#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; -#else - void WriteTo(::arrow::io::OutputStream* dst) const; -#endif // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -280,12 +251,8 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; -#ifdef PARQUET_ENCRYPTION explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); -#else - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); -#endif // PIMPL Idiom FileMetaData(); @@ -341,18 +308,11 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { int64_t total_compressed_size() const; // commit the metadata -#ifdef PARQUET_ENCRYPTION void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback, const std::shared_ptr& encryptor = NULLPTR); -#else - void Finish(int64_t num_values, int64_t dictonary_page_offset, - int64_t index_page_offset, int64_t data_page_offset, - int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback); -#endif // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 594de2a9c0b..406d1b8bb75 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -25,6 +25,12 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption.h" +#else +namespace parquet { +class FileEncryptionProperties; +class FileDecryptionProperties; +class 
ColumnEncryptionProperties; +} // namespace parquet #endif #include "parquet/exception.h" @@ -80,10 +86,7 @@ class PARQUET_EXPORT ReaderProperties { ::arrow::MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; - -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_decryption_properties_; -#endif }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -341,13 +344,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, -#ifdef PARQUET_ENCRYPTION - std::move(file_encryption_properties_), -#endif - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_properties_), + default_column_properties_, column_properties)); } private: @@ -359,9 +359,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; std::string created_by_; -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; -#endif // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -448,9 +446,7 @@ class PARQUET_EXPORT WriterProperties { ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties, -#endif const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -460,12 +456,9 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), 
-#ifdef PARQUET_ENCRYPTION file_encryption_properties_(file_encryption_properties), -#endif default_column_properties_(default_column_properties), - column_properties_(column_properties) { - } + column_properties_(column_properties) {} ::arrow::MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -475,9 +468,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type parquet_version_; std::string parquet_created_by_; -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; -#endif ColumnProperties default_column_properties_; std::unordered_map column_properties_; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 10c202a2f2d..b051df3480c 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -54,6 +54,11 @@ #include "parquet/encryption_internal.h" #include "parquet/internal_file_decryptor.h" #include "parquet/internal_file_encryptor.h" +#else +namespace parquet { +class Encryptor; +class Decryptor; +} // namespace parquet #endif namespace parquet { @@ -225,12 +230,8 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& decryptor = NULLPTR -#endif -) { +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + const std::shared_ptr& decryptor = NULLPTR) { #ifdef PARQUET_ENCRYPTION // thrift message is not encrypted if (decryptor == NULLPTR) { @@ -286,12 +287,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& encryptor = NULLPTR -#endif - ) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); From d7e024ca8670e9bb603c075b9827ed9f2e81fdb3 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Tue, 25 Jun 2019 11:03:20 +0700 Subject: [PATCH 117/125] add PARQUET_EXPORT into Builder class of encryption properties --- cpp/src/parquet/encryption.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f12db27dbec..f4a66ce81fe 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -97,7 +97,7 @@ inline uint8_t* str2bytes(const std::string& str) { class PARQUET_EXPORT ColumnEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -188,7 +188,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// convenience builder for regular (not nested) columns. 
explicit Builder(const std::string& name) @@ -255,7 +255,7 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -400,7 +400,7 @@ class PARQUET_EXPORT FileDecryptionProperties { class PARQUET_EXPORT FileEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), From 0c869dd2ec23e83a49e1d654174b07c227b2ed8e Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 25 Jun 2019 14:45:45 +0300 Subject: [PATCH 118/125] Change assert to ASSERT_EQ in encryption-configurations-test.cc --- .../parquet/encryption-configurations-test.cc | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index b45c47f365b..c913f7917c0 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -294,11 +294,11 @@ class TestEncryptionConfiguration : public ::testing::Test { // Get the number of RowGroups int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); + ASSERT_EQ(num_row_groups, 1); // Get the number of Columns int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); + ASSERT_EQ(num_columns, 8); // Iterate over all the RowGroups in the file for (int r = 0; r < num_row_groups; ++r) { @@ -326,12 +326,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + 
ASSERT_EQ(values_read, 1); // Verify the value written bool expected_value = ((i % 2) == 0) ? true : false; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the Int32 column @@ -346,11 +346,11 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written - assert(value == i); + ASSERT_EQ(value, i); i++; } // Get the Column Reader for the Int64 column @@ -366,17 +366,17 @@ class TestEncryptionConfiguration : public ::testing::Test { rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written int64_t expected_value = i * 1000 * 1000; expected_value *= 1000 * 1000; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); if ((i % 2) == 0) { - assert(repetition_level == 1); + ASSERT_EQ(repetition_level, 1); } else { - assert(repetition_level == 0); + ASSERT_EQ(repetition_level, 0); } i++; } @@ -393,16 +393,16 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written parquet::Int96 expected_value; expected_value.value[0] = i; expected_value.value[1] = i + 1; 
expected_value.value[2] = i + 2; for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); + ASSERT_EQ(value.value[j], expected_value.value[j]); } i++; } @@ -419,12 +419,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the Double column @@ -439,12 +439,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written double expected_value = i * 1.1111111; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the ByteArray column @@ -460,7 +460,7 @@ class TestEncryptionConfiguration : public ::testing::Test { rows_read = ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // Verify the value written char expected_value[kFixedLength] = "parquet"; expected_value[7] = static_cast('0' + i / 100); @@ -468,14 +468,14 @@ class TestEncryptionConfiguration : public ::testing::Test { expected_value[9] = static_cast('0' + i % 10); if (i % 2 == 0) { // only alternate values exist // There are no NULL values in the rows written - assert(values_read == 1); - 
assert(value.len == kFixedLength); - assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); - assert(definition_level == 1); + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); } else { // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); } i++; } @@ -491,13 +491,13 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written char v = static_cast(i); char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); i++; } } From 56b04554d2858596355fbfcb45fe299eabb17ca2 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 25 Jun 2019 22:45:54 +0700 Subject: [PATCH 119/125] fix cmake format --- cpp/src/parquet/CMakeLists.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index b42704f5711..8b07083704a 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -179,14 +179,14 @@ set(PARQUET_SRCS statistics.cc types.cc) -if (ARROW_USE_OPENSSL) +if(ARROW_USE_OPENSSL) add_definitions(-DPARQUET_ENCRYPTION) set(PARQUET_SRCS - ${PARQUET_SRCS} - encryption.cc - encryption_internal.cc - internal_file_decryptor.cc - internal_file_encryptor.cc) + ${PARQUET_SRCS} + encryption.cc + encryption_internal.cc + internal_file_decryptor.cc + 
internal_file_encryptor.cc) endif() # Ensure that thrift compilation is done before using its generated headers @@ -327,10 +327,8 @@ add_parquet_test(arrow-test test-util.cc) if(ARROW_USE_OPENSSL) - add_parquet_test(encryption-test - SOURCES - encryption-configurations-test.cc - encryption-properties-test.cc) + add_parquet_test(encryption-test SOURCES encryption-properties-test.cc + encryption-configurations-test.cc) endif() # Those tests need to use static linking as they access thrift-generated From d09bf9dfc43fde710b8c7f618fedcdaea2c1681f Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 26 Jun 2019 10:37:46 +0300 Subject: [PATCH 120/125] Add MemoryPool field to Decryptors/Encryptors --- .../parquet/encryption-configurations-test.cc | 6 ++-- cpp/src/parquet/file_reader.cc | 30 +++++++++---------- cpp/src/parquet/file_writer.cc | 3 +- cpp/src/parquet/internal_file_decryptor.cc | 27 ++++++++++------- cpp/src/parquet/internal_file_decryptor.h | 10 +++++-- cpp/src/parquet/internal_file_encryptor.cc | 22 +++++++++----- cpp/src/parquet/internal_file_encryptor.h | 9 ++++-- cpp/src/parquet/thrift.h | 4 +-- 8 files changed, 68 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index c913f7917c0..7c79fd14c9d 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -70,12 +70,12 @@ namespace parquet { -using schema::GroupNode; -using schema::NodePtr; -using schema::PrimitiveNode; using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; constexpr int kFixedLength = 10; diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 27e3fa5220c..da859bccd36 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -66,9 +66,9 @@ 
RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -78,9 +78,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } @@ -396,9 +396,9 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( // Handle AAD prefix EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, - algo.algorithm, - file_crypto_metadata->key_metadata())); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata(), properties_.memory_pool())); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; @@ -424,9 +424,9 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); // Handle AAD prefix std::string file_aad = 
HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_.reset( - new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata(), properties_.memory_pool())); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { @@ -570,9 +570,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) << "The file only has " - << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) + << "The file only has " << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 5d1d6a6c455..3fca600c285 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -426,7 +426,8 @@ class FileSerializer : public ParquetFileWriter::Contents { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { - file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, + properties_->memory_pool())); if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 867c81a0113..9af59ae01e3 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -46,8 +46,13 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int 
foote // Decryptor Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_decryptor_(aes_decryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } @@ -62,11 +67,13 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, const std::string& file_aad, ParquetCipher::type algorithm, - const std::string& footer_key_metadata) + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool) : properties_(properties), file_aad_(file_aad), algorithm_(algorithm), - footer_key_metadata_(footer_key_metadata) { + footer_key_metadata_(footer_key_metadata), + pool_(pool) { if (properties_->is_utilized()) { throw ParquetException( "Re-using decryption properties with explicit keys for another file"); @@ -160,10 +167,10 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - footer_metadata_decryptor_ = - std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + footer_metadata_decryptor_ = std::make_shared( + aes_metadata_decryptor, footer_key, file_aad_, aad, pool_); footer_data_decryptor_ = - std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad, pool_); if (metadata) return footer_metadata_decryptor_; return footer_data_decryptor_; @@ -219,10 +226,10 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); auto 
aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - column_metadata_map_[column_path] = - std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); + column_metadata_map_[column_path] = std::make_shared( + aes_metadata_decryptor, column_key, file_aad_, aad, pool_); column_data_map_[column_path] = - std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad, pool_); if (metadata) return column_metadata_map_[column_path]; return column_data_map_[column_path]; diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 842eea7c680..76033700329 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -53,10 +53,12 @@ class FooterSigningEncryptor { class Decryptor { public: Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, - const std::string& file_aad, const std::string& aad); + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); const std::string& file_aad() const { return file_aad_; } void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); @@ -66,6 +68,7 @@ class Decryptor { std::string key_; std::string file_aad_; std::string aad_; + ::arrow::MemoryPool* pool_; }; class InternalFileDecryptor { @@ -73,7 +76,8 @@ class InternalFileDecryptor { explicit InternalFileDecryptor(FileDecryptionProperties* properties, const std::string& file_aad, ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool); std::string& file_aad() { return file_aad_; } @@ -120,6 +124,8 @@ class InternalFileDecryptor { std::unique_ptr meta_decryptor_[3]; std::unique_ptr data_decryptor_[3]; + ::arrow::MemoryPool* pool_; + 
std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 6f097a1a0b4..63a84557c9d 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -23,8 +23,13 @@ namespace parquet { // Encryptor Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_encryptor_(aes_encryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } @@ -35,8 +40,9 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip } // InternalFileEncryptor -InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) - : properties_(properties) { +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties, + ::arrow::MemoryPool* pool) + : properties_(properties), pool_(pool) { if (properties_->is_utilized()) { throw ParquetException("Re-using encryption properties for another file"); } @@ -60,8 +66,8 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - footer_encryptor_ = std::make_shared(aes_encryptor, footer_key, - properties_->file_aad(), footer_aad); + footer_encryptor_ = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_); return footer_encryptor_; } @@ -75,7 +81,7 @@ std::shared_ptr 
InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); footer_signing_encryptor_ = std::make_shared( - aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_); return footer_signing_encryptor_; } @@ -120,7 +126,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::string file_aad = properties_->file_aad(); std::shared_ptr encryptor = - std::make_shared(aes_encryptor, key, file_aad, ""); + std::make_shared(aes_encryptor, key, file_aad, "", pool_); if (metadata) column_metadata_map_[column_path] = encryptor; else diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index efef532b163..e75a582e4a4 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -38,9 +38,11 @@ class ColumnEncryptionProperties; class Encryptor { public: Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad); + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); const std::string& file_aad() { return file_aad_; } void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); @@ -62,11 +64,12 @@ class Encryptor { std::string key_; std::string file_aad_; std::string aad_; + ::arrow::MemoryPool* pool_; }; class InternalFileEncryptor { public: - explicit InternalFileEncryptor(FileEncryptionProperties* propperties); + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, ::arrow::MemoryPool* pool); std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); @@ -96,6 
+99,8 @@ class InternalFileEncryptor { std::unique_ptr meta_encryptor_[3]; std::unique_ptr data_encryptor_[3]; + ::arrow::MemoryPool* pool_; + std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b051df3480c..3d498fc0253 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -242,7 +242,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali // decrypt std::shared_ptr decrypted_buffer = std::static_pointer_cast(AllocateBuffer( - ::arrow::default_memory_pool(), + decryptor->pool(), static_cast(clen - decryptor->CiphertextSizeDelta()))); const uint8_t* cipher_buf = buf; uint32_t decrypted_buffer_len = @@ -326,7 +326,7 @@ class ThriftSerializer { const std::shared_ptr& encryptor) { std::shared_ptr cipher_buffer = std::static_pointer_cast(AllocateBuffer( - ::arrow::default_memory_pool(), + encryptor->pool(), static_cast(encryptor->CiphertextSizeDelta() + out_length))); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); From 4afcccb3d9d0f18d6d670641113acfeaaf8107a2 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 26 Jun 2019 19:11:59 +0700 Subject: [PATCH 121/125] keep encryption parameters at method declaration (column_writer.cc/.h) --- cpp/src/parquet/column_writer.cc | 70 +++++------------------ cpp/src/parquet/column_writer.h | 9 +-- cpp/src/parquet/internal_file_encryptor.h | 3 +- 3 files changed, 16 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 087afe6c1c4..7f525f41948 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -134,13 +134,9 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, - 
::arrow::MemoryPool* pool = ::arrow::default_memory_pool() -#ifdef PARQUET_ENCRYPTION - , + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, - std::shared_ptr data_encryptor = NULLPTR -#endif - ) + std::shared_ptr data_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), pool_(pool), @@ -151,18 +147,14 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_(0), page_ordinal_(0), row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_chunk_ordinal) -#ifdef PARQUET_ENCRYPTION - , + column_ordinal_(column_chunk_ordinal), meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor) -#endif - { -#ifdef PARQUET_ENCRYPTION + data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { +#ifdef PARQUET_ENCRYPTION InitEncryption(); - } #endif + } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -216,11 +208,9 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { UpdateEncryption(encryption::kDictionaryPageHeader); } +#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); -#else - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); -#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); @@ -237,17 +227,11 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_ != nullptr) { UpdateEncryption(encryption::kColumnMetaData); } - +#endif // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback, meta_encryptor_); -#else - // index_page_offset = -1 since they are not supported - metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, - total_compressed_size_, total_uncompressed_size_, has_dictionary, - 
fallback); -#endif // Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -317,11 +301,9 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { UpdateEncryption(encryption::kDataPageHeader); } +#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); -#else - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); -#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -418,40 +400,27 @@ class SerializedPageWriter : public PageWriter { #ifdef PARQUET_ENCRYPTION std::string data_pageAAD_; std::string data_page_headerAAD_; +#endif std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; -#endif }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: -#ifdef PARQUET_ENCRYPTION BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) -#else - BufferedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - int16_t row_group_ordinal, int16_t current_column_ordinal, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) -#endif : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); -#ifdef PARQUET_ENCRYPTION pager_ = std::unique_ptr(new SerializedPageWriter( in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, meta_encryptor, data_encryptor)); -#else - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, row_group_ordinal, - current_column_ordinal, pool)); -#endif } int64_t 
WriteDictionaryPage(const DictionaryPage& page) override { @@ -496,13 +465,9 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group -#ifdef PARQUET_ENCRYPTION - , - std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor -#endif -) { -#ifdef PARQUET_ENCRYPTION + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group, + std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, @@ -512,15 +477,6 @@ std::unique_ptr PageWriter::Open( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, meta_encryptor, data_encryptor)); } -#else - if (buffered_row_group) { - return std::unique_ptr(new BufferedPageWriter( - sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); - } else { - return std::unique_ptr(new SerializedPageWriter( - sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); - } -#endif } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index b4b5fa702e6..81154314400 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -45,11 +45,8 @@ class RleEncoder; namespace parquet { class ColumnChunkMetaDataBuilder; -class WriterProperties; - -#ifdef PARQUET_ENCRYPTION class Encryptor; -#endif +class WriterProperties; class PARQUET_EXPORT LevelEncoder { public: @@ -90,13 +87,9 @@ class PARQUET_EXPORT PageWriter { ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), -#ifdef 
PARQUET_ENCRYPTION bool buffered_row_group = false, std::shared_ptr header_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR); -#else - bool buffered_row_group = false); -#endif // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index e75a582e4a4..7d2ce7f4f12 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -69,7 +69,8 @@ class Encryptor { class InternalFileEncryptor { public: - explicit InternalFileEncryptor(FileEncryptionProperties* propperties, ::arrow::MemoryPool* pool); + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, + ::arrow::MemoryPool* pool); std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); From afc7131c15fc99876b2f8a24f045add48db7044d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 27 Jun 2019 12:14:30 +0300 Subject: [PATCH 122/125] Write to parquet stream to file in encryption test --- ...yption-reader-writer-all-crypto-options.cc | 3 +- cpp/src/parquet/CMakeLists.txt | 3 +- .../parquet/encryption-configurations-test.cc | 138 ++++++++++-------- cpp/src/parquet/encryption.h | 2 +- 4 files changed, 86 insertions(+), 60 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 069f997d1ba..346cd2e5b57 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -94,6 +94,8 @@ const std::string kColumnEncryptionKey1 = "1234567890123450"; const std::string kColumnEncryptionKey2 = "1234567890123451"; const std::string fileName = "tester"; +using FileClass = 
::arrow::io::FileOutputStream; + void PrintDecryptionConfiguration(int configuration); // Check that the decryption result is as expected. void CheckResult(std::string file, int example_id, std::string exception_msg); @@ -260,7 +262,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { std::string test_number_string = ss.str(); try { // Create a local file output stream instance. - using FileClass = ::arrow::io::FileOutputStream; std::shared_ptr out_file; std::string file = root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 8b07083704a..226d8fc9c95 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -328,7 +328,8 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-properties-test.cc - encryption-configurations-test.cc) + encryption-configurations-test.cc + test-util.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 7c79fd14c9d..223738652d9 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -17,6 +17,10 @@ #include +#include + +#include + #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/file_reader.h" @@ -69,6 +73,15 @@ */ namespace parquet { +namespace test { +std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + +using FileClass = ::arrow::io::FileOutputStream; using parquet::ConvertedType; using parquet::Repetition; @@ -79,8 +92,6 @@ using schema::PrimitiveNode; constexpr int kFixedLength = 10; -namespace test { - const char kFooterEncryptionKey[] = "0123456789012345"; // 
128bit/16 const char kColumnEncryptionKey1[] = "1234567890123450"; const char kColumnEncryptionKey2[] = "1234567890123451"; @@ -89,18 +100,25 @@ const char kFileName[] = "tester"; class TestEncryptionConfiguration : public ::testing::Test { public: void SetUp() { - rows_per_rowgroup_ = 50; + createDecryptionConfigurations(); // Setup the parquet schema schema_ = SetupEncryptionSchema(); - createDecryptionConfigurations(); - path_to_double_field_ = parquet::schema::ColumnPath::FromDotString("double_field"); - path_to_float_field_ = parquet::schema::ColumnPath::FromDotString("float_field"); + std::string res = "test.parquet.encrypted"; + file_name_ = data_file(res.c_str()); + } + + void TearDown() { + // delete test file. + ASSERT_EQ(std::remove(file_name_.c_str()), 0); } protected: - std::shared_ptr path_to_double_field_; - std::shared_ptr path_to_float_field_; - int rows_per_rowgroup_; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; std::shared_ptr schema_; // This vector will hold various decryption configurations. 
std::vector> @@ -169,17 +187,21 @@ class TestEncryptionConfiguration : public ::testing::Test { ->build()); } - std::shared_ptr EncryptFile( - std::shared_ptr encryption_configurations) { - auto sink = CreateOutputStream(); + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file) { + std::shared_ptr out_file; WriterProperties::Builder prop_builder; - prop_builder.compression(parquet::Compression::SNAPPY); prop_builder.encryption(encryption_configurations); std::shared_ptr writer_properties = prop_builder.build(); - auto file_writer = ParquetFileWriter::Open(sink, schema_, writer_properties); + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + RowGroupWriter* row_group_writer; row_group_writer = file_writer->AppendRowGroup(); @@ -273,21 +295,23 @@ class TestEncryptionConfiguration : public ::testing::Test { // Close the ParquetFileWriter file_writer->Close(); - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(sink->Finish(&buffer)); - return buffer; + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + out_file->Close().ok(); + return; } - void DecryptFile(std::shared_ptr buffer, int example_id, - int encryption_configuration) { + void DecryptFile(std::string file, int example_id, int encryption_configuration) { std::string exception_msg; try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption_properties( vector_of_decryption_configurations_[example_id]->DeepClone()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source, reader_properties); + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); // Get the File MetaData std::shared_ptr file_metadata = file_reader->metadata(); @@ 
-583,14 +607,14 @@ TEST_F(TestEncryptionConfiguration, UniformEncryption) { parquet::FileEncryptionProperties::Builder file_encryption_builder_1( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build()); + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 1 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); } } @@ -617,23 +641,23 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { parquet::FileEncryptionProperties::Builder file_encryption_builder_2( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) - ->build()); + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 2 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); } } +// Encryption configuration 3: Encrypt two columns, with different keys. +// Don’t encrypt footer. +// (plaintext footer mode, readable by legacy readers) TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { - // Encryption configuration 3: Encrypt two columns, with different keys. - // Don’t encrypt footer. 
- // (plaintext footer mode, readable by legacy readers) std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -650,17 +674,17 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { parquet::FileEncryptionProperties::Builder file_encryption_builder_3( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) - ->set_plaintext_footer() - ->build()); + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 3 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); } } @@ -683,16 +707,16 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { parquet::FileEncryptionProperties::Builder file_encryption_builder_4( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) - ->aad_prefix(kFileName_) - ->build()); + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. 
for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 4 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); } } @@ -716,17 +740,17 @@ TEST_F(TestEncryptionConfiguration, parquet::FileEncryptionProperties::Builder file_encryption_builder_5( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) - ->footer_key_metadata("kf") - ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() - ->build()); + this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 5 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); } } @@ -749,17 +773,17 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { parquet::FileEncryptionProperties::Builder file_encryption_builder_6( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) - ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) - ->build()); + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. 
for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 6 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); } } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f4a66ce81fe..4958faac2ae 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -89,7 +89,7 @@ class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { }; inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return nullptr; + if (str.empty()) return NULLPTR; char* cbytes = const_cast(str.c_str()); return reinterpret_cast(cbytes); From c744cd2b02efa73b7d22123d0f0755da329ee1fd Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 1 Jul 2019 12:48:55 +0300 Subject: [PATCH 123/125] Add file reader and file writer Close to encryption-configurations-test.cc --- .../encryption-reader-writer-all-crypto-options.cc | 3 +-- cpp/src/parquet/CMakeLists.txt | 4 +++- cpp/src/parquet/encryption-configurations-test.cc | 9 ++------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 346cd2e5b57..06d43be8f5a 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -369,8 +369,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { vector_of_decryption_configurations.push_back( file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); - // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply - // aad_prefix. + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
std::shared_ptr path_float_ptr = parquet::schema::ColumnPath::FromDotString("float_field"); std::shared_ptr path_double_ptr = diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 226d8fc9c95..a55343166b3 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -327,7 +327,9 @@ add_parquet_test(arrow-test test-util.cc) if(ARROW_USE_OPENSSL) - add_parquet_test(encryption-test SOURCES encryption-properties-test.cc + add_parquet_test(encryption-test + SOURCES + encryption-properties-test.cc encryption-configurations-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 223738652d9..bcdc7eff446 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -161,8 +161,7 @@ class TestEncryptionConfiguration : public ::testing::Test { vector_of_decryption_configurations_.push_back( file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); - // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply - // aad_prefix. + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
std::shared_ptr path_float_ptr = parquet::schema::ColumnPath::FromDotString("float_field"); std::shared_ptr path_double_ptr = @@ -295,11 +294,6 @@ class TestEncryptionConfiguration : public ::testing::Test { // Close the ParquetFileWriter file_writer->Close(); - // Close the ParquetFileWriter - file_writer->Close(); - - // Write the bytes to file - out_file->Close().ok(); return; } @@ -524,6 +518,7 @@ class TestEncryptionConfiguration : public ::testing::Test { ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); i++; } + file_reader->Close(); } } catch (const std::exception& e) { exception_msg = e.what(); From 76c2b028285dd8f12b40a88eb27f3f9daf5c3747 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 4 Jul 2019 07:28:40 +0300 Subject: [PATCH 124/125] Change encryption-configuration-test --- cpp/src/parquet/CMakeLists.txt | 3 +- ...figuration-encrypted-columns-and-footer.cc | 556 ++++++++++++++++++ ...ion-encrypted-columns-plaintext-footer.cc} | 255 +------- 3 files changed, 570 insertions(+), 244 deletions(-) create mode 100644 cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc rename cpp/src/parquet/{encryption-configurations-test.cc => encryption-configuration-encrypted-columns-plaintext-footer.cc} (63%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a55343166b3..3c8e417bab1 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -329,8 +329,9 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES + encryption-configuration-encrypted-columns-and-footer.cc + encryption-configuration-encrypted-columns-plaintext-footer.cc encryption-properties-test.cc - encryption-configurations-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc new file mode 100644 index 00000000000..fd0e98ad82c --- 
/dev/null +++ b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc @@ -0,0 +1,556 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-test for writing and reading encrypted Parquet file with + * different encryption and decryption configuration. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The unit-test creates a single parquet file with eight columns using the + * following encryption configuration: + * + * - Encryption configuration : Encrypt two columns and the footer, with different + * keys. + * + * The written parquet file produced above is read by the following decryption + * configurations: + * + * - Decryption configuration : Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. 
+ */ + +namespace parquet { +namespace test { + +using FileClass = ::arrow::io::FileOutputStream; + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + createDecryptionConfigurations(); + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + std::string res = "test.parquet.encrypted"; + file_name_ = data_file(res.c_str()); + } + + void TearDown() { + // delete test file. + ASSERT_EQ(std::remove(file_name_.c_str()), 0); + } + + protected: + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; + std::shared_ptr schema_; + // This vector will hold various decryption configurations. 
+ std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); + } + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating Decryption configuration + **********************************************************************************/ + + // Decryption configuration : Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + } + + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file) { + std::shared_ptr out_file; + + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // 
Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + return; + } + + void DecryptFile(std::string file, int example_id, int encryption_configuration) { + std::string exception_msg; + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[example_id]->DeepClone()); + + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + ASSERT_EQ(num_row_groups, 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // Iterate over all the RowGroups in the file + for (int 
r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + ASSERT_EQ(value, i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + ASSERT_EQ(value, expected_value); + if ((i % 2) == 0) { + ASSERT_EQ(repetition_level, 1); + } else { + ASSERT_EQ(repetition_level, 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + ASSERT_EQ(value.value[j], expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); + } else { + // There are NULL values in the rows written + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + i++; + } + file_reader->Close(); + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(encryption_configuration, example_id, exception_msg); + } + + // Check that the decryption result is as expected. + void CheckResult(int encryption_configuration_number, int example_id, + std::string exception_msg) { + if (!exception_msg.empty()) { + ASSERT_EQ(1, 0); + } + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 
'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration : Encrypt two columns and the footer, with different keys. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut 
file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc similarity index 63% rename from cpp/src/parquet/encryption-configurations-test.cc rename to cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc index bcdc7eff446..5b10a013e7b 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc @@ -29,47 +29,25 @@ #include "parquet/test-util.h" /* - * This file contains unit-tests for writing and reading encrypted Parquet files with - * different encryption and decryption configurations. - * - * Each unit-test produces a single parquet file, encrypted with one of the encryption - * configuration described below; and is read multiple times using a set of decryption - * configurations, also described below. + * This file contains unit-test for writing and reading encrypted Parquet file with + * different encryption and decryption configuration. * * A detailed description of the Parquet Modular Encryption specification can be found * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * Each unit-test creates a single parquet file with eight columns using one of the - * following encryption configurations: + * The unit-test creates a single parquet file with eight columns using the + * following encryption configuration: * - * - Encryption configuration 1: Encrypt all columns and the footer with the same key. - * (uniform encryption) - * - Encryption configuration 2: Encrypt two columns and the footer, with different - * keys. 
- * - Encryption configuration 3: Encrypt two columns, with different keys. - * Don’t encrypt footer (to enable legacy readers) - * - plaintext footer mode. - * - Encryption configuration 4: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix for file identity - * verification. - * - Encryption configuration 5: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix, and call - * disable_aad_prefix_storage to prevent file - * identity storage in file metadata. - * - Encryption configuration 6: Encrypt two columns and the footer, with different - * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * - Encryption configuration : Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. * * The written parquet file produced above is read by each of the following decryption * configurations: * - * - Decryption configuration 1: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. - * - Decryption configuration 2: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. Supplies - * aad_prefix to verify file identity. - * - Decryption configuration 3: Decrypt using explicit column and footer keys - * (instead of key retrieval callback). + * - Decryption configuration : Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. 
*/ namespace parquet { @@ -130,10 +108,10 @@ class TestEncryptionConfiguration : public ::testing::Test { void createDecryptionConfigurations() { /********************************************************************************** - Creating a number of Decryption configurations + Creating Decryption configuration **********************************************************************************/ - // Decryption configuration 1: Decrypt using key retriever callback that holds the + // Decryption configuration: Decrypt using key retriever callback that holds the // keys of two encrypted columns and the footer key. std::shared_ptr string_kr1 = std::make_shared(); @@ -147,43 +125,6 @@ class TestEncryptionConfiguration : public ::testing::Test { vector_of_decryption_configurations_.push_back( file_decryption_builder_1.key_retriever(kr1)->build()); - // Decryption configuration 2: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. Supply aad_prefix. - std::shared_ptr string_kr2 = - std::make_shared(); - string_kr2->PutKey("kf", kFooterEncryptionKey_); - string_kr2->PutKey("kc1", kColumnEncryptionKey1_); - string_kr2->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr2 = - std::static_pointer_cast(string_kr2); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_2; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); - - // Decryption configuration 3: Decrypt using explicit column and footer keys. 
- std::shared_ptr path_float_ptr = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::shared_ptr path_double_ptr = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( - path_double_ptr); - parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); - - decryption_cols[path_double_ptr] = - decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); - decryption_cols[path_float_ptr] = - decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_3; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_3.footer_key(kFooterEncryptionKey_) - ->column_properties(decryption_cols) - ->build()); } void EncryptFile( @@ -529,26 +470,6 @@ class TestEncryptionConfiguration : public ::testing::Test { // Check that the decryption result is as expected. void CheckResult(int encryption_configuration_number, int example_id, std::string exception_msg) { - int decryption_configuration_number = example_id + 1; - // Encryption_configuration number five contains aad_prefix and - // disable_aad_prefix_storage. - // An exception is expected to be thrown if the file is not decrypted with aad_prefix. - if (encryption_configuration_number == 5) { - if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } - // Decryption configuration number two contains aad_prefix. An exception is expected - // to be thrown if the file was not encrypted with the same aad_prefix. 
- if (decryption_configuration_number == 2) { - if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } if (!exception_msg.empty()) { ASSERT_EQ(1, 0); } @@ -596,60 +517,7 @@ class TestEncryptionConfiguration : public ::testing::Test { } }; -// Encryption configuration 1: Encrypt all columns and the footer with the same key. -// (uniform encryption) -TEST_F(TestEncryptionConfiguration, UniformEncryption) { - parquet::FileEncryptionProperties::Builder file_encryption_builder_1( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); - } -} - -// Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols2; - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( - path_to_float_field_); - encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); - encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); - } -} - -// Encryption configuration 3: Encrypt two columns, with different keys. +// Encryption configuration: Encrypt two columns, with different keys. // Don’t encrypt footer. // (plaintext footer mode, readable by legacy readers) TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { @@ -683,104 +551,5 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { } } -// Encryption configuration 4: Encrypt two columns and the footer, with different keys. -// Use aad_prefix. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols4; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( - path_to_float_field_); - encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); - encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_4( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) - ->aad_prefix(kFileName_) - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); - } -} - -// Encryption configuration 5: Encrypt two columns and the footer, with different keys. -// Use aad_prefix and disable_aad_prefix_storage. 
-TEST_F(TestEncryptionConfiguration, - EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols5; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( - path_to_float_field_); - encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); - encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_5( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) - ->footer_key_metadata("kf") - ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); - } -} - -// Encryption configuration 6: Encrypt two columns and the footer, with different keys. -// Use AES_GCM_CTR_V1 algorithm. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols6; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( - path_to_float_field_); - encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); - encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_6( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) - ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. 
- for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); - } -} - } // namespace test } // namespace parquet From 90114116c7e1021c4a355c6e63fb3c3a33348ee7 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 4 Jul 2019 08:31:13 +0300 Subject: [PATCH 125/125] Delete encryption-configuration-encrypted-columns-plaintext-footer.cc test --- cpp/src/parquet/CMakeLists.txt | 1 - ...tion-encrypted-columns-plaintext-footer.cc | 555 ------------------ 2 files changed, 556 deletions(-) delete mode 100644 cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 3c8e417bab1..c2fb9af4792 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -330,7 +330,6 @@ if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-configuration-encrypted-columns-and-footer.cc - encryption-configuration-encrypted-columns-plaintext-footer.cc encryption-properties-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc deleted file mode 100644 index 5b10a013e7b..00000000000 --- a/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc +++ /dev/null @@ -1,555 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include - -#include "parquet/column_reader.h" -#include "parquet/column_writer.h" -#include "parquet/file_reader.h" -#include "parquet/file_writer.h" -#include "parquet/platform.h" -#include "parquet/test-util.h" - -/* - * This file contains unit-test for writing and reading encrypted Parquet file with - * different encryption and decryption configuration. - * - * A detailed description of the Parquet Modular Encryption specification can be found - * here: - * https://github.com/apache/parquet-format/blob/encryption/Encryption.md - * - * The unit-test creates a single parquet file with eight columns using the - * following encryption configuration: - * - * - Encryption configuration : Encrypt two columns, with different keys. - * Don’t encrypt footer (to enable legacy readers) - * - plaintext footer mode. - * - * The written parquet file produced above is read by each of the following decryption - * configurations: - * - * - Decryption configuration : Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. 
- */ - -namespace parquet { -namespace test { -std::string data_file(const char* file) { - std::string dir_string(test::get_data_dir()); - std::stringstream ss; - ss << dir_string << "/" << file; - return ss.str(); -} - -using FileClass = ::arrow::io::FileOutputStream; - -using parquet::ConvertedType; -using parquet::Repetition; -using parquet::Type; -using schema::GroupNode; -using schema::NodePtr; -using schema::PrimitiveNode; - -constexpr int kFixedLength = 10; - -const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 -const char kColumnEncryptionKey1[] = "1234567890123450"; -const char kColumnEncryptionKey2[] = "1234567890123451"; -const char kFileName[] = "tester"; - -class TestEncryptionConfiguration : public ::testing::Test { - public: - void SetUp() { - createDecryptionConfigurations(); - // Setup the parquet schema - schema_ = SetupEncryptionSchema(); - std::string res = "test.parquet.encrypted"; - file_name_ = data_file(res.c_str()); - } - - void TearDown() { - // delete test file. - ASSERT_EQ(std::remove(file_name_.c_str()), 0); - } - - protected: - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::string file_name_; - int rows_per_rowgroup_ = 50; - std::shared_ptr schema_; - // This vector will hold various decryption configurations. 
- std::vector> - vector_of_decryption_configurations_; - std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); - std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); - std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); - std::string kFileName_ = std::string(kFileName); - - void createDecryptionConfigurations() { - /********************************************************************************** - Creating Decryption configuration - **********************************************************************************/ - - // Decryption configuration: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. - std::shared_ptr string_kr1 = - std::make_shared(); - string_kr1->PutKey("kf", kFooterEncryptionKey_); - string_kr1->PutKey("kc1", kColumnEncryptionKey1_); - string_kr1->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr1 = - std::static_pointer_cast(string_kr1); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_1; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_1.key_retriever(kr1)->build()); - - } - - void EncryptFile( - std::shared_ptr encryption_configurations, - std::string file) { - std::shared_ptr out_file; - - WriterProperties::Builder prop_builder; - prop_builder.compression(parquet::Compression::SNAPPY); - prop_builder.encryption(encryption_configurations); - std::shared_ptr writer_properties = prop_builder.build(); - - PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); - - RowGroupWriter* row_group_writer; - row_group_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - bool value = ((i % 2) 
== 0) ? true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::ByteArray value; - char hello[kFixedLength] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = kFixedLength; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); - - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Close the ParquetFileWriter - file_writer->Close(); - - return; - } - - void DecryptFile(std::string file, int example_id, int encryption_configuration) { - std::string exception_msg; - try { - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties( - vector_of_decryption_configurations_[example_id]->DeepClone()); - - auto file_reader = - parquet::ParquetFileReader::OpenFile(file, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = file_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - ASSERT_EQ(num_row_groups, 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - ASSERT_EQ(num_columns, 8); - - // Iterate over all the RowGroups in the file - for (int 
r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - file_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? true : false; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - ASSERT_EQ(value, i); - i++; - } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - ASSERT_EQ(value, expected_value); - if ((i % 2) == 0) { - ASSERT_EQ(repetition_level, 1); - } else { - ASSERT_EQ(repetition_level, 0); - } - i++; - } - - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - ASSERT_EQ(value.value[j], expected_value.value[j]); - } - i++; - } - - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - double expected_value = i * 1.1111111; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // Verify the value written - char expected_value[kFixedLength] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - ASSERT_EQ(value.len, kFixedLength); - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - ASSERT_EQ(definition_level, 1); - } else { - // There are NULL values in the rows written - ASSERT_EQ(values_read, 0); - ASSERT_EQ(definition_level, 0); - } - i++; - } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - char v = static_cast(i); - char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - i++; - } - file_reader->Close(); - } - } catch (const std::exception& e) { - exception_msg = e.what(); - } - CheckResult(encryption_configuration, example_id, exception_msg); - } - - // Check that the decryption result is as expected. - void CheckResult(int encryption_configuration_number, int example_id, - std::string exception_msg) { - if (!exception_msg.empty()) { - ASSERT_EQ(1, 0); - } - } - - std::shared_ptr SetupEncryptionSchema() { - parquet::schema::NodeVector fields; - // Create a primitive node named 'boolean_field' with type:BOOLEAN, - // repetition:REQUIRED - fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, ConvertedType::NONE)); - - // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, - // logical type:TIME_MILLIS - fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - ConvertedType::TIME_MILLIS)); - - // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED - fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, - Type::DOUBLE, ConvertedType::NONE)); - - // Create a primitive node named 
'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL - fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, - Type::BYTE_ARRAY, ConvertedType::NONE)); - - // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, - // repetition:REQUIRED, field_length = kFixedLength - fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, - kFixedLength)); - - // Create a GroupNode named 'schema' using the primitive nodes defined above - // This GroupNode is the root node of the schema tree - return std::static_pointer_cast( - GroupNode::Make("schema", Repetition::REQUIRED, fields)); - } -}; - -// Encryption configuration: Encrypt two columns, with different keys. -// Don’t encrypt footer. -// (plaintext footer mode, readable by legacy readers) -TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols3; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( - path_to_float_field_); - encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); - encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_3( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) - ->set_plaintext_footer() - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. 
- for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); - } -} - -} // namespace test -} // namespace parquet