From 1cd69ada657f0f0f8f37ab1a3f41ea24cc79d7d4 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 11 Jul 2018 18:51:01 +0700 Subject: [PATCH 01/18] change properties to support encryption --- src/parquet/properties.h | 332 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 330 insertions(+), 2 deletions(-) diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 83dc2057..f93bd23a 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -38,6 +38,121 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 0; static bool DEFAULT_USE_BUFFERED_STREAM = false; +// should find a better name??? +class PARQUET_EXPORT EncryptionProperties { + private: + static inline uint8_t* str2bytes(std::string str) + { + if (str.empty()) return nullptr; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); + } + + public: + EncryptionProperties() = default; + EncryptionProperties(Encryption::type algorithm, std::string key, + std::string key_metadata, std::string aad) + : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} + + int key_length() { return static_cast(key_.length()); } + uint8_t* key_bytes() { return str2bytes(key_); } + + int aad_length() { return static_cast(aad_.length()); } + uint8_t* aad_bytes() { return str2bytes(aad_); } + + Encryption::type algorithm() { return algorithm_; } + + std::string key_metadata() { return key_metadata_; } + + std::string key() { return key_; } + + private: + std::string key_; + std::string key_metadata_; + Encryption::type algorithm_; + std::string aad_; +}; + +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(bool encrypt, std::string path) + : encrypt_(encrypt), path_(path), encrypted_with_footer_key_(encrypt) {} + + bool encrypted() { return encrypt_; } + bool encrypted_with_footer_key() { return encrypted_with_footer_key_; } + std::string key() { return key_; } + 
std::string key_metadata() { return key_metadata_; } + + void set_encryption_key(std::string key, uint32_t key_id) + { + std::string key_metadata = key_id == 0 + ? "" : std::string(reinterpret_cast(&key_id), 4); + set_encryption_key(key, key_metadata); + } + + void set_encryption_key(std::string key, std::string key_metadata) + { + if (!encrypt_) throw ParquetException("Setting key on unencrypted column: " + path_); + if (key.empty()) throw ParquetException("Null key for " + path_); + + encrypted_with_footer_key_ = false; + key_ = key; + key_metadata_ = key_metadata; + } + + std::string path() { return path_; } + + private: + bool encrypt_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + std::string path_; +}; + + +class PARQUET_EXPORT FileDecryptionProperties { + public: + FileDecryptionProperties(std::string footer_key) : footer_key_(footer_key) + { + if (footer_key_.empty()) throw ParquetException("Decryption: null footer key"); + if (!(footer_key_.length() == 16 || footer_key_.length() == 24 + || footer_key_.length() == 32)) { + throw ParquetException("Wrong key length " + footer_key_.length()); + } + } + + // TODO + // FileDecryptionProperties(std::shared_ptr keyRetriever) {} + + void aad(std::string aad) { aad_ = aad; } + + void column_key(std::string name, std::string key) + { + column_key(std::vector({name}), key); + } + + void column_key(std::vector paths, std::string key) + { + if (key.empty()) throw ParquetException("Decryption: null column key"); + if (key.length() != 16 && key.length() != 24 && key.length() != 32) + throw ParquetException("Wrong key length " + key.length()); + + // TODO add to columns_ + } + + std::string footer_key() { return footer_key_; } + std::string aad() { return aad_; } + + private: + std::string footer_key_; + std::string aad_; + + std::vector columns_; +}; + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(::arrow::MemoryPool* pool = 
::arrow::default_memory_pool()) @@ -70,10 +185,17 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } + void set_file_decryption(std::shared_ptr decryption) { + file_decryption_ = decryption; + } + + FileDecryptionProperties* file_decryption() { return file_decryption_.get(); } + private: ::arrow::MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + std::shared_ptr file_decryption_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -90,6 +212,7 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; +static const EncryptionProperties DEFAULT_ENCRYPTION = EncryptionProperties(); class PARQUET_EXPORT ColumnProperties { public: @@ -97,12 +220,14 @@ class PARQUET_EXPORT ColumnProperties { Compression::type codec = DEFAULT_COMPRESSION_TYPE, bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, - size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE) + size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, + EncryptionProperties encryption = DEFAULT_ENCRYPTION) : encoding_(encoding), codec_(codec), dictionary_enabled_(dictionary_enabled), statistics_enabled_(statistics_enabled), - max_stats_size_(max_stats_size) {} + max_stats_size_(max_stats_size), + encryption_(encryption) {} void set_encoding(Encoding::type encoding) { encoding_ = encoding; } @@ -120,6 +245,8 @@ class PARQUET_EXPORT ColumnProperties { max_stats_size_ = max_stats_size; } + void set_encryption(EncryptionProperties encryption) { encryption_ = encryption; } + Encoding::type encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -130,12 +257,156 @@ class PARQUET_EXPORT ColumnProperties { size_t max_statistics_size() const { return 
max_stats_size_; } + EncryptionProperties encryption() const { return encryption_; } + private: Encoding::type encoding_; Compression::type codec_; bool dictionary_enabled_; bool statistics_enabled_; size_t max_stats_size_; + EncryptionProperties encryption_; +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + FileEncryptionProperties() = default; + FileEncryptionProperties(const FileEncryptionProperties&) = default; + + FileEncryptionProperties(Encryption::type algorithm, std::string key, + std::string key_metadata) + { + if (key.length() != 16 && key.length() != 24 && key.length() != 32) { + throw ParquetException("Wrong key length " + key.length()); // TODO io exception + } + if (!key_metadata.empty() && key_metadata.length() > 256) { + throw ParquetException("Footer key meta data is too long: " + key_metadata.length()); + } + + uniform_encryption_ = true; + footer_key_ = key; + footer_key_metadata_ = key_metadata; + single_key_encryption_ = !footer_key_.empty(); + algorithm_ = algorithm; + } + + FileEncryptionProperties(Encryption::type algorithm, std::string key, int key_id) + : FileEncryptionProperties(algorithm, key, + key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4)) {} + + void setup_columns(std::vector columns, bool encrypt_the_rest) + { + encrypt_the_rest_ = encrypt_the_rest; + columns_ = columns; + + if (!footer_key_metadata_.empty()) { + single_key_encryption_ = true; + + for (auto col = columns.begin(); col != columns.end(); col++) { + if (col->key().compare(footer_key_) != 0) { + single_key_encryption_ = false; + break; + } + } + } + else { + if (encrypt_the_rest) throw std::invalid_argument("Encrypt the rest with null footer key"); + bool all_are_unencrypted = true; + for (auto col = columns.begin(); col != columns.end(); col++) { + if (col->encrypted()) { + if (col->key().empty()) { + throw ParquetException("Encrypt column with null footer key"); + } + all_are_unencrypted = false; + } + } + + if (all_are_unencrypted) + throw ParquetException("Footer and all columns unencrypted"); + } + } + + std::shared_ptr footer_encryption() + { + if (footer_key_.empty()) { + return nullptr; + } + else { + return std::make_shared(algorithm_, footer_key_, + footer_key_metadata_, aad_); + } + } + + std::shared_ptr encryption_metadata( + const std::shared_ptr& path) { + // uniform encryption + if (uniform_encryption_) { + return nullptr; + } + + // non-uniform encryption + std::string pathStr = path->ToDotString(); + for(auto col = columns_.begin(); col != columns_.end(); col++) { // TODO + if (col->path() == pathStr) { + return std::shared_ptr(&(*col)); + } + } + // encrypted with footer key + if (encrypt_the_rest_) { + std::shared_ptr col( + new ColumnEncryptionProperties(true, path->ToDotString())); + col->set_encryption_key(footer_key_, footer_key_metadata_); + return col; + } + + // unencrypted + return std::shared_ptr( + new ColumnEncryptionProperties(false, path->ToDotString())); + + } + + std::shared_ptr encryption_properties( + const std::shared_ptr& path) { + // uniform encryption + if (uniform_encryption_) { + return footer_encryption(); + } + + // non-uniform encryption + 
std::string pathStr = path->ToDotString(); + for(auto col = columns_.begin(); col != columns_.end(); col++) { // TODO + if (col->path() == pathStr) { + return std::shared_ptr(new EncryptionProperties( + algorithm_, + col->key(), + col->key_metadata(), + aad_ + )); + } + } + + if (encrypt_the_rest_) { + return footer_encryption(); + } + + return std::shared_ptr(nullptr); + } + + void aad(std::string aad) { aad_ = aad; } + + bool encrypted_footer() { return footer_key_.length() != 0; } + + private: + std::string footer_key_; + std::string footer_key_metadata_; + Encryption::type algorithm_; + bool single_key_encryption_; + std::string aad_; + + bool uniform_encryption_; + + std::vector columns_; + bool encrypt_the_rest_; }; class PARQUET_EXPORT WriterProperties { @@ -278,6 +549,26 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } + Builder* encryption(std::string key) + { + return encryption(Encryption::AES_GCM_V1, key, 0); + } + + Builder* encryption(Encryption::type algorithm, std::string key, uint32_t key_id) + { + file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); + return this; + } + + Builder* column_encryption(std::vector columns, + bool encrypt_the_rest) { + if (file_encryption_.get() == nullptr) + throw ParquetException("null file encryption"); + + file_encryption_->setup_columns(columns, encrypt_the_rest); + return this; + } + Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); return this; @@ -326,6 +617,7 @@ class PARQUET_EXPORT WriterProperties { return std::shared_ptr( new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, pagesize_, version_, created_by_, + std::move(file_encryption_), default_column_properties_, column_properties)); } @@ -337,6 +629,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; + std::unique_ptr 
file_encryption_; // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -360,6 +653,18 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } + inline FileEncryptionProperties* file_encryption() const { + return parquet_file_encryption_.get(); } + + inline std::shared_ptr footer_encryption() const { + if (parquet_file_encryption_.get() == nullptr) { + return std::shared_ptr(nullptr); + } + else { + return parquet_file_encryption_->footer_encryption(); + } + } + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -403,11 +708,32 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } + std::shared_ptr encryption_metadata( + const std::shared_ptr& path) const { + if (parquet_file_encryption_) { + return parquet_file_encryption_->encryption_metadata(path); + } + else { + return nullptr; + } + } + + std::shared_ptr encryption( + const std::shared_ptr& path) const { + if (parquet_file_encryption_) { + return parquet_file_encryption_->encryption_properties(path); + } + else { + return nullptr; + } + } + private: explicit WriterProperties( ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, + std::shared_ptr file_encryption, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -417,6 +743,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), + parquet_file_encryption_(file_encryption), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -427,6 +754,7 @@ class PARQUET_EXPORT 
WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + std::shared_ptr parquet_file_encryption_; ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; From 322aed78010271ebc2a73e6efc0441327c529c3e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 11 Jul 2018 18:51:25 +0700 Subject: [PATCH 02/18] change metadata to support encryption --- src/parquet/metadata.cc | 409 ++++++++++++++++++++++++++++++---------- src/parquet/metadata.h | 57 +++++- 2 files changed, 364 insertions(+), 102 deletions(-) diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 1cab51f0..635e7e18 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -52,16 +52,16 @@ static std::shared_ptr MakeTypedColumnStats( // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { return std::make_shared>( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, true); + descr, metadata.statistics.min_value, metadata.statistics.max_value, + metadata.num_values - metadata.statistics.null_count, + metadata.statistics.null_count, metadata.statistics.distinct_count, true); } // Default behavior return std::make_shared>( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max || metadata.statistics.__isset.min); + descr, metadata.statistics.min, metadata.statistics.max, + metadata.num_values - metadata.statistics.null_count, + metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.statistics.__isset.max || metadata.statistics.__isset.min); } std::shared_ptr MakeColumnStats( @@ -88,13 +88,53 @@ 
std::shared_ptr MakeColumnStats( } // MetaData Accessor +// ColumnCryptoMetaData +class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { +public: + explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata) + : crypto_metadata_(crypto_metadata) {} + + ~ColumnCryptoMetaDataImpl() {} + + inline std::vector path_in_schema() const { + return crypto_metadata_->path_in_schema; } + inline bool encrypted() const { return crypto_metadata_->encrypted; } + inline bool encrypted_with_footer_key() const { + return crypto_metadata_->encrypted_with_footer_key; } + inline std::string column_key_metadata() const { + return crypto_metadata_->column_key_metadata; } + +private: + const format::ColumnCryptoMetaData* crypto_metadata_; +}; + +std::unique_ptr ColumnCryptoMetaData::Make( + const uint8_t* metadata) { + return std::unique_ptr(new ColumnCryptoMetaData(metadata)); +} + +ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) + : impl_(new ColumnCryptoMetaDataImpl( + reinterpret_cast(metadata))) {} + +ColumnCryptoMetaData::~ColumnCryptoMetaData() {} + +std::vector ColumnCryptoMetaData::path_in_schema() const { + return impl_->path_in_schema(); } +bool ColumnCryptoMetaData::encrypted() const { return impl_->encrypted(); } +bool ColumnCryptoMetaData::encrypted_with_footer_key() const { + return impl_->encrypted_with_footer_key(); } +std::string ColumnCryptoMetaData::column_key_metadata() const { + return impl_->column_key_metadata(); } + + // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { - public: +public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) - : column_(column), descr_(descr), writer_version_(writer_version) { + : column_(column), descr_(descr), writer_version_(writer_version) { const format::ColumnMetaData& meta_data = column->meta_data; for (auto encoding : meta_data.encodings) { 
encodings_.push_back(FromThrift(encoding)); @@ -125,7 +165,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline bool is_stats_set() const { DCHECK(writer_version_ != nullptr); return column_->meta_data.__isset.statistics && - writer_version_->HasCorrectStatistics(type(), descr_->sort_order()); + writer_version_->HasCorrectStatistics(type(), descr_->sort_order()); } inline std::shared_ptr statistics() const { @@ -167,7 +207,17 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return column_->meta_data.total_uncompressed_size; } - private: + inline std::unique_ptr crypto_meta_data() const { + if (column_->__isset.crypto_meta_data) { + return ColumnCryptoMetaData::Make( + reinterpret_cast(&column_->crypto_meta_data)); + } + else { + return nullptr; + } + } + +private: mutable std::shared_ptr stats_; std::vector encodings_; const format::ColumnChunk* column_; @@ -179,15 +229,15 @@ std::unique_ptr ColumnChunkMetaData::Make( const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, writer_version)); } ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) - : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), descr, - writer_version))} {} + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), descr, + writer_version))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -244,13 +294,17 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } +std::unique_ptr ColumnChunkMetaData::crypto_meta_data() const { + return impl_->crypto_meta_data(); +} + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { - public: +public: explicit RowGroupMetaDataImpl(const 
format::RowGroup* row_group, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) - : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} + : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} ~RowGroupMetaDataImpl() {} inline int num_columns() const { return static_cast(row_group_->columns.size()); } @@ -269,11 +323,11 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make( - reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), - writer_version_); + reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), + writer_version_); } - private: +private: const format::RowGroup* row_group_; const SchemaDescriptor* schema_; const ApplicationVersion* writer_version_; @@ -283,14 +337,14 @@ std::unique_ptr RowGroupMetaData::Make( const uint8_t* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { return std::unique_ptr( - new RowGroupMetaData(metadata, schema, writer_version)); + new RowGroupMetaData(metadata, schema, writer_version)); } RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) - : impl_{std::unique_ptr(new RowGroupMetaDataImpl( - reinterpret_cast(metadata), schema, writer_version))} { + : impl_{std::unique_ptr(new RowGroupMetaDataImpl( + reinterpret_cast(metadata), schema, writer_version))} { } RowGroupMetaData::~RowGroupMetaData() {} @@ -308,13 +362,14 @@ std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const // file metadata class FileMetaData::FileMetaDataImpl { - public: +public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) - : metadata_len_(0) { + explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len, + std::shared_ptr encryption = nullptr) + : metadata_len_(0) { metadata_.reset(new format::FileMetaData); 
- DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get(), true, encryption.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -343,9 +398,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(OutputStream* dst) const { - SerializeThriftMsg(metadata_.get(), 1024, dst); - } + void WriteTo(OutputStream* dst, EncryptionProperties* encryption) const { + SerializeThriftMsg(metadata_.get(), 1024, dst, true, encryption); } std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -355,8 +409,8 @@ class FileMetaData::FileMetaDataImpl { throw ParquetException(ss.str()); } return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i]), &schema_, - &writer_version_); + reinterpret_cast(&metadata_->row_groups[i]), &schema_, + &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -365,13 +419,13 @@ class FileMetaData::FileMetaDataImpl { return key_value_metadata_; } - private: +private: friend FileMetaDataBuilder; uint32_t metadata_len_; std::unique_ptr metadata_; void InitSchema() { schema::FlatSchemaConverter converter(&metadata_->schema[0], - static_cast(metadata_->schema.size())); + static_cast(metadata_->schema.size())); schema_.Init(converter.Convert()); } void InitColumnOrders() { @@ -409,17 +463,20 @@ class FileMetaData::FileMetaDataImpl { }; std::shared_ptr FileMetaData::Make(const uint8_t* metadata, - uint32_t* metadata_len) { + uint32_t* metadata_len, + std::shared_ptr encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); + return std::shared_ptr(new FileMetaData(metadata, metadata_len, + encryption)); } -FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len) - : impl_{std::unique_ptr( - new 
FileMetaDataImpl(metadata, metadata_len))} {} +FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len, + std::shared_ptr encryption) + : impl_{std::unique_ptr( + new FileMetaDataImpl(metadata, metadata_len, encryption))} {} FileMetaData::FileMetaData() - : impl_{std::unique_ptr(new FileMetaDataImpl())} {} + : impl_{std::unique_ptr(new FileMetaDataImpl())} {} FileMetaData::~FileMetaData() {} @@ -437,13 +494,13 @@ int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { - case 1: - return ParquetVersion::PARQUET_1_0; - case 2: - return ParquetVersion::PARQUET_2_0; - default: - // Improperly set version, assuming Parquet 1.0 - break; + case 1: + return ParquetVersion::PARQUET_1_0; + case 2: + return ParquetVersion::PARQUET_2_0; + default: + // Improperly set version, assuming Parquet 1.0 + break; } return ParquetVersion::PARQUET_1_0; } @@ -462,11 +519,86 @@ std::shared_ptr FileMetaData::key_value_metadata() const return impl_->key_value_metadata(); } -void FileMetaData::WriteTo(OutputStream* dst) const { return impl_->WriteTo(dst); } +void FileMetaData::WriteTo(OutputStream* dst, EncryptionProperties* encryption) const { + return impl_->WriteTo(dst, encryption); +} + +class FileCryptoMetaData::FileCryptoMetaDataImpl { +public: + FileCryptoMetaDataImpl() {} + + explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) + { + metadata_.reset(new format::FileCryptoMetaData); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + metadata_len_ = *metadata_len; + } + + ~FileCryptoMetaDataImpl() {} + + Encryption::type encryption_algorithm() + { + return FromThrift(metadata_->encryption_algorithm); + } + + bool encrypted_footer() { return metadata_->encrypted_footer; } + + std::string footer_key_metadata() + { + return metadata_->__isset.footer_key_metadata ? 
metadata_->footer_key_metadata : ""; + } + + uint64_t footer_offset() { return metadata_->footer_offset; } + + std::string iv_prefix() + { + return metadata_->__isset.iv_prefix ? metadata_->iv_prefix : ""; + } + + void WriteTo(OutputStream* dst) + { + SerializeThriftMsg(metadata_.get(), 1024, dst, true); + } + + +private: + friend FileMetaDataBuilder; + std::unique_ptr metadata_; + uint32_t metadata_len_; +}; + +Encryption::type FileCryptoMetaData::encryption_algorithm() +{ + return impl_->encryption_algorithm(); +} +bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } +std::string FileCryptoMetaData::footer_key_metadata() +{ + return impl_->footer_key_metadata(); +} +uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } +std::string FileCryptoMetaData::iv_prefix() { return impl_->iv_prefix(); } + +std::shared_ptr FileCryptoMetaData::Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len) +{ + return std::shared_ptr( + new FileCryptoMetaData(serialized_metadata, metadata_len)); +} + +FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len) + : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} + +FileCryptoMetaData::FileCryptoMetaData() + : impl_(new FileCryptoMetaDataImpl()) {} + +FileCryptoMetaData::~FileCryptoMetaData() {} + +void FileCryptoMetaData::WriteTo(OutputStream *dst) { impl_->WriteTo(dst); } ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) - : application_(application), version{major, minor, patch, "", "", ""} {} + : application_(application), version{major, minor, patch, "", "", ""} {} ApplicationVersion::ApplicationVersion(const std::string& created_by) { boost::regex app_regex{ApplicationVersion::APPLICATION_FORMAT}; @@ -520,9 +652,9 @@ bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) cons bool ApplicationVersion::VersionEq(const 
ApplicationVersion& other_version) const { return application_ == other_version.application_ && - version.major == other_version.version.major && - version.minor == other_version.version.minor && - version.patch == other_version.version.patch; + version.major == other_version.version.major && + version.minor == other_version.version.minor && + version.patch == other_version.version.patch; } // Reference: @@ -564,16 +696,17 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, // MetaData Builders // row-group metadata class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { - public: +public: explicit ColumnChunkMetaDataBuilderImpl(const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) - : properties_(props), column_(column) { + : properties_(props), column_(column) { column_chunk_ = reinterpret_cast(contents); - column_chunk_->meta_data.__set_type(ToThrift(column->physical_type())); - column_chunk_->meta_data.__set_path_in_schema(column->path()->ToDotVector()); - column_chunk_->meta_data.__set_codec( - ToThrift(properties_->compression(column->path()))); + meta_data_ = column_chunk_->meta_data; + meta_data_.__set_type(ToThrift(column->physical_type())); + meta_data_.__set_path_in_schema(column->path()->ToDotVector()); + meta_data_.__set_codec( + ToThrift(properties_->compression(column->path()))); } ~ColumnChunkMetaDataBuilderImpl() {} @@ -600,7 +733,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { stats.__isset.max = val.has_max; } - column_chunk_->meta_data.__set_statistics(stats); + meta_data_.__set_statistics(stats); } void Finish(int64_t num_values, int64_t dictionary_page_offset, @@ -608,19 +741,19 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback) { if (dictionary_page_offset > 0) { - column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); + 
meta_data_.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); } else { column_chunk_->__set_file_offset(data_page_offset + compressed_size); } - column_chunk_->__isset.meta_data = true; - column_chunk_->meta_data.__set_num_values(num_values); + + meta_data_.__set_num_values(num_values); if (index_page_offset >= 0) { - column_chunk_->meta_data.__set_index_page_offset(index_page_offset); + meta_data_.__set_index_page_offset(index_page_offset); } - column_chunk_->meta_data.__set_data_page_offset(data_page_offset); - column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); - column_chunk_->meta_data.__set_total_compressed_size(compressed_size); + meta_data_.__set_data_page_offset(data_page_offset); + meta_data_.__set_total_uncompressed_size(uncompressed_size); + meta_data_.__set_total_compressed_size(compressed_size); std::vector thrift_encodings; if (has_dictionary) { thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); @@ -638,17 +771,59 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { if (dictionary_fallback) { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } - column_chunk_->meta_data.__set_encodings(thrift_encodings); + meta_data_.__set_encodings(thrift_encodings); } void WriteTo(OutputStream* sink) { - SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); + std::cout << "ColumnChunkMetaDataBuilderImpl::WriteTo" << std::endl; + auto encrypt_md = properties_->encryption_metadata(column_->path()); + + // file is not encrypted or uniform encrypted + if (encrypt_md == nullptr) { + SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); + } + else { // file is non-uniform encrypted + column_chunk_->__isset.crypto_meta_data = true; + column_chunk_->crypto_meta_data.__set_path_in_schema(column_->path()->ToDotVector()); + 
column_chunk_->crypto_meta_data.__set_encrypted(encrypt_md->encrypted()); + column_chunk_->crypto_meta_data.__set_encrypted(encrypt_md->encrypted_with_footer_key()); + column_chunk_->crypto_meta_data.__set_column_key_metadata(encrypt_md->key_metadata()); + + auto footer_encryption = properties_->footer_encryption(); + + // non-uniform: footer is unencrypted, or column is encrypted with a column-specific key + if ((footer_encryption == nullptr && encrypt_md->encrypted()) + || (footer_encryption != nullptr + && footer_encryption->key() == encrypt_md->key())) { + // don't set meta_data, + column_chunk_->__isset.meta_data = false; + + // Thrift-serialize the ColumnMetaData structure, + // encrypt it with the column key, and write the result to the output stream + // (first length, then buffer) + auto encrypt_props = properties_->encryption(column_->path()); + uint64_t metadata_start = sink->Tell(); + + SerializeThriftMsg(&meta_data_, sizeof(format::ColumnMetaData), sink, true, + encrypt_props.get()); + + // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. 
+ column_chunk_->__set_file_offset(metadata_start); + } + else { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(meta_data_); + } + + SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); + } } const ColumnDescriptor* descr() const { return column_; } - private: +private: format::ColumnChunk* column_chunk_; + format::ColumnMetaData meta_data_; const std::shared_ptr properties_; const ColumnDescriptor* column_; }; @@ -657,14 +832,14 @@ std::unique_ptr ColumnChunkMetaDataBuilder::Make( const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) { return std::unique_ptr( - new ColumnChunkMetaDataBuilder(props, column, contents)); + new ColumnChunkMetaDataBuilder(props, column, contents)); } ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) - : impl_{std::unique_ptr( - new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {} + : impl_{std::unique_ptr( + new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {} ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} @@ -694,10 +869,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, } class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { - public: +public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, const SchemaDescriptor* schema, uint8_t* contents) - : properties_(props), schema_(schema), current_column_(0) { + : properties_(props), schema_(schema), current_column_(0) { row_group_ = reinterpret_cast(contents); InitializeColumns(schema->num_columns()); } @@ -712,8 +887,8 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { } auto column = schema_->Column(current_column_); auto column_builder = ColumnChunkMetaDataBuilder::Make( - properties_, column, - reinterpret_cast(&row_group_->columns[current_column_++])); + properties_, column, + reinterpret_cast(&row_group_->columns[current_column_++])); 
auto column_builder_ptr = column_builder.get(); column_builders_.push_back(std::move(column_builder)); return column_builder_ptr; @@ -750,7 +925,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int64_t num_rows() { return row_group_->num_rows; } - private: +private: void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); } format::RowGroup* row_group_; @@ -764,14 +939,14 @@ std::unique_ptr RowGroupMetaDataBuilder::Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, uint8_t* contents) { return std::unique_ptr( - new RowGroupMetaDataBuilder(props, schema_, contents)); + new RowGroupMetaDataBuilder(props, schema_, contents)); } RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( const std::shared_ptr& props, const SchemaDescriptor* schema_, uint8_t* contents) - : impl_{std::unique_ptr( - new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} + : impl_{std::unique_ptr( + new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() {} @@ -796,19 +971,22 @@ void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { // file metadata // TODO(PARQUET-595) Support key_value_metadata class FileMetaDataBuilder::FileMetaDataBuilderImpl { - public: +public: explicit FileMetaDataBuilderImpl( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) - : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { + : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); + if (props->footer_encryption().get() != nullptr) { + crypto_metadata_.reset(new format::FileCryptoMetaData()); + } } ~FileMetaDataBuilderImpl() {} RowGroupMetaDataBuilder* AppendRowGroup() { auto row_group = std::unique_ptr(new format::RowGroup()); auto row_group_builder = RowGroupMetaDataBuilder::Make( - properties_, schema_, reinterpret_cast(row_group.get())); + 
properties_, schema_, reinterpret_cast(row_group.get())); RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); row_group_builders_.push_back(std::move(row_group_builder)); row_groups_.push_back(std::move(row_group)); @@ -841,14 +1019,14 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { int32_t file_version = 0; switch (properties_->version()) { - case ParquetVersion::PARQUET_1_0: - file_version = 1; - break; - case ParquetVersion::PARQUET_2_0: - file_version = 2; - break; - default: - break; + case ParquetVersion::PARQUET_1_0: + file_version = 1; + break; + case ParquetVersion::PARQUET_2_0: + file_version = 2; + break; + default: + break; } metadata_->__set_version(file_version); metadata_->__set_created_by(properties_->created_by()); @@ -866,8 +1044,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->__isset.column_orders = true; parquet::schema::SchemaFlattener flattener( - static_cast(schema_->schema_root().get()), - &metadata_->schema); + static_cast(schema_->schema_root().get()), + &metadata_->schema); flattener.Flatten(); auto file_meta_data = std::unique_ptr(new FileMetaData()); file_meta_data->impl_->metadata_ = std::move(metadata_); @@ -875,10 +1053,40 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } - protected: + std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) + { + if (crypto_metadata_.get() == nullptr) { + return nullptr; + } + + auto file_encryption = properties_->file_encryption(); + + auto footer_encryption = properties_->footer_encryption(); + + // build format::FileCryptoMetaData + crypto_metadata_->__set_encryption_algorithm(ToThrift(footer_encryption->algorithm())); + crypto_metadata_->__set_encrypted_footer(file_encryption->encrypted_footer()); + + std::string footer_key_metadata = footer_encryption->key_metadata(); + if (!footer_key_metadata.empty()) { + crypto_metadata_->__set_footer_key_metadata(footer_key_metadata); + } + 
crypto_metadata_->__set_footer_offset(footerOffset); + + // TODO set iv_prefix??? + + // return as FileCryptoMetaData + std::unique_ptr file_crypto_meta_data = + std::unique_ptr(new FileCryptoMetaData()); + file_crypto_meta_data->impl_->metadata_ = std::move(crypto_metadata_); + return file_crypto_meta_data; + } + +protected: std::unique_ptr metadata_; + std::unique_ptr crypto_metadata_; - private: +private: const std::shared_ptr properties_; std::vector> row_groups_; std::vector> row_group_builders_; @@ -890,14 +1098,14 @@ std::unique_ptr FileMetaDataBuilder::Make( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) { return std::unique_ptr( - new FileMetaDataBuilder(schema, props, key_value_metadata)); + new FileMetaDataBuilder(schema, props, key_value_metadata)); } FileMetaDataBuilder::FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) - : impl_{std::unique_ptr( - new FileMetaDataBuilderImpl(schema, props, key_value_metadata))} {} + : impl_{std::unique_ptr( + new FileMetaDataBuilderImpl(schema, props, key_value_metadata))} {} FileMetaDataBuilder::~FileMetaDataBuilder() {} @@ -907,4 +1115,9 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData(uint64_t footerOffset) +{ + return impl_->BuildFileCryptoMetaData(footerOffset); +} + } // namespace parquet diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index 5d51e3d2..e0c40631 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -87,6 +87,24 @@ class ApplicationVersion { SortOrder::type sort_order = SortOrder::SIGNED) const; }; +class PARQUET_EXPORT ColumnCryptoMetaData { + public: + static std::unique_ptr Make( + const uint8_t* metadata); + ~ColumnCryptoMetaData(); + + std::vector path_in_schema() const; + bool encrypted() 
const; + bool encrypted_with_footer_key() const; + std::string column_key_metadata() const; + + private: + explicit ColumnCryptoMetaData(const uint8_t* metadata); + + class ColumnCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor @@ -115,7 +133,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; - + std::unique_ptr crypto_meta_data() const; private: explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = nullptr); @@ -155,7 +173,8 @@ class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor static std::shared_ptr Make(const uint8_t* serialized_metadata, - uint32_t* metadata_len); + uint32_t* metadata_len, + std::shared_ptr encryption = nullptr); ~FileMetaData(); @@ -171,7 +190,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(OutputStream* dst) const; + void WriteTo(OutputStream* dst, EncryptionProperties* encryption = nullptr) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -180,7 +199,8 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len, + std::shared_ptr encryption = nullptr); // PIMPL Idiom FileMetaData(); @@ -188,6 +208,32 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +class PARQUET_EXPORT FileCryptoMetaData { +public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + ~FileCryptoMetaData(); + + Encryption::type 
encryption_algorithm(); + bool encrypted_footer(); + std::string footer_key_metadata(); + uint64_t footer_offset(); + std::string iv_prefix(); + + void WriteTo(OutputStream* dst); + +private: + friend FileMetaDataBuilder; + explicit FileCryptoMetaData(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + + // PIMPL Idiom + FileCryptoMetaData(); + class FileCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { public: @@ -263,6 +309,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { // commit the metadata std::unique_ptr Finish(); + // crypto metadata + std::unique_ptr GetCryptoMetaData(uint64_t footerOffset); + private: explicit FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, From ee0671ab2caa163737664f32f0b0ffe7e2667fdc Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 11 Jul 2018 18:51:59 +0700 Subject: [PATCH 03/18] change thrift serialize/deserialize method to support encryption --- src/parquet/thrift.h | 117 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 21 deletions(-) diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index ec7ac906..37f655af 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -42,6 +42,8 @@ #include "parquet/parquet_types.h" #include "parquet/util/logging.h" #include "parquet/util/memory.h" +#include "parquet/util/crypto.h" +#include "parquet/properties.h" namespace parquet { @@ -77,6 +79,10 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } +static inline Encryption::type FromThrift(format::EncryptionAlgorithm::type type) { + return static_cast(type); +} + static inline format::Type::type ToThrift(Type::type type) { return static_cast(type); } @@ -99,6 +105,10 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { return static_cast(type); } +static inline format::EncryptionAlgorithm::type 
ToThrift(Encryption::type type) { + return static_cast(type); +} + // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities @@ -106,31 +116,81 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { - // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + bool isMetadata = true, + EncryptionProperties* encryption = nullptr + ) { + if (encryption == nullptr) { + shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; + } + else { + // first 4 bytes for length + uint8_t 
clenBytes[4]; + clenBytes[3] = buf[3]; + clenBytes[2] = buf[2]; + clenBytes[1] = buf[1]; + clenBytes[0] = buf[0]; + + uint32_t clen = *(reinterpret_cast(clenBytes)); + + // decrypt + std::vector decrypted_buffer(clen); // TODO + std::vector key_bytes(encryption->key_length()); + for (int i = 0; i< encryption->key_length(); i++) { + key_bytes.push_back(encryption->key_bytes()[i]); + } + + int decrypted_buffer_len = parquet::decrypt( + encryption->algorithm(), isMetadata, &buf[4], clen, + key_bytes.data(), encryption->key_length(), nullptr, 0, + decrypted_buffer.data()); + + if (decrypted_buffer_len <= 0) { + throw ParquetException("Couldn't decrypt buffer\n"); + } + // Deserialize msg bytes into c++ thrift msg using memory transport. + shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer( + decrypted_buffer.data(), decrypted_buffer_len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + + *len = 4 + clen; + } } // Serialize obj into a buffer. The result is returned as a string. 
// The arguments are the object to be serialized and // the expected size of the serialized object template -inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { +inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, + bool isMetadata = true, + EncryptionProperties* encryption = nullptr) { shared_ptr mem_buffer( new apache::thrift::transport::TMemoryBuffer(len)); apache::thrift::protocol::TCompactProtocolFactoryT< @@ -150,8 +210,23 @@ inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { uint8_t* out_buffer; uint32_t out_length; mem_buffer->getBuffer(&out_buffer, &out_length); - out->Write(out_buffer, out_length); - return out_length; + if (encryption == nullptr) { + out->Write(out_buffer, out_length); + + return out_length; + } + else { + std::vector cipher_buffer(out_length + 28 + 10); // TODO + int cipher_buffer_len = parquet::encrypt( + encryption->algorithm(), isMetadata, out_buffer, out_length, + encryption->key_bytes(), encryption->key_length(), nullptr, 0, + cipher_buffer.data()); + + out->Write(reinterpret_cast(&cipher_buffer_len), 4); + out->Write(cipher_buffer.data(), cipher_buffer_len); + + return cipher_buffer_len + 4; + } } } // namespace parquet From 41b1426daf85d28ebdb4dbe6264e9ce3d6ee1d1c Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 11 Jul 2018 18:52:34 +0700 Subject: [PATCH 04/18] apply encryption into file/column writer --- src/parquet/column_writer.cc | 59 ++++++++++++++++++++++++++++++------ src/parquet/column_writer.h | 4 ++- src/parquet/file_writer.cc | 57 ++++++++++++++++++++++++++++++++-- 3 files changed, 107 insertions(+), 13 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index a65bda85..dc96a335 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -30,6 +30,7 @@ #include "parquet/thrift.h" #include "parquet/util/logging.h" #include "parquet/util/memory.h" +#include "parquet/util/crypto.h" namespace 
parquet { @@ -129,6 +130,7 @@ static format::Statistics ToThrift(const EncodedStatistics& row_group_statistics class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(OutputStream* sink, Compression::type codec, + std::shared_ptr encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : sink_(sink), @@ -138,7 +140,8 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_(0), data_page_offset_(0), total_uncompressed_size_(0), - total_compressed_size_(0) { + total_compressed_size_(0), + encryption_(encryption) { compressor_ = GetCodecFromArrow(codec); } @@ -159,10 +162,26 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); + const uint8_t* data = compressed_data->data(); + int data_len = static_cast(compressed_data->size()); + + std::vector cdata; + if (encryption_.get()) { + int plen = data_len; + cdata.resize(plen + 28 + 10); + int clen = parquet::encrypt(encryption_->algorithm(), false, + compressed_data->data(), plen, + encryption_->key_bytes(), encryption_->key_length(), + nullptr, 0, + cdata.data()); + data = cdata.data(); + data_len = clen; + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -171,11 +190,13 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_ = start_pos; } int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); - sink_->Write(compressed_data->data(), compressed_data->size()); + SerializeThriftMsg(&page_header, 
sizeof(format::PageHeader), + sink_, true, encryption_.get()); + + sink_->Write(data, data_len); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += data_len + header_size; return sink_->Tell() - start_pos; } @@ -224,10 +245,25 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); + const uint8_t* data = compressed_data->data(); + int data_len = static_cast(compressed_data->size()); + + std::vector cdata; + if (encryption_.get()) { + int plen = data_len; + cdata.resize(plen + 28 + 10); + int clen = parquet::encrypt(encryption_->algorithm(), false, + compressed_data->data(), plen, + encryption_->key_bytes(), encryption_->key_length(), + nullptr, 0, cdata.data()); + data = cdata.data(); + data_len = clen; + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(data_len)); page_header.__set_data_page_header(data_page_header); // TODO(PARQUET-594) crc checksum @@ -237,11 +273,13 @@ class SerializedPageWriter : public PageWriter { } int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); - sink_->Write(compressed_data->data(), compressed_data->size()); + SerializeThriftMsg(&page_header, sizeof(format::PageHeader), + sink_, true, encryption_.get()); + + sink_->Write(data, data_len); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += data_len + header_size; num_values_ += page.num_values(); return sink_->Tell() - start_pos; @@ -271,6 +309,8 @@ class SerializedPageWriter : public 
PageWriter { // Compression codec to use. std::unique_ptr<::arrow::Codec> compressor_; + + std::shared_ptr encryption_; }; // This implementation of the PageWriter writes to the final sink on Close . @@ -321,6 +361,7 @@ class BufferedPageWriter : public PageWriter { }; std::unique_ptr PageWriter::Open(OutputStream* sink, Compression::type codec, + std::shared_ptr encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, bool buffered_row_group) { diff --git a/src/parquet/column_writer.h b/src/parquet/column_writer.h index 1ba428a9..7c60c279 100644 --- a/src/parquet/column_writer.h +++ b/src/parquet/column_writer.h @@ -74,7 +74,9 @@ class PageWriter { virtual ~PageWriter() {} static std::unique_ptr Open( - OutputStream* sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + OutputStream* sink, Compression::type codec, + std::shared_ptr encryption, + ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 30673c59..6f096302 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -34,6 +34,7 @@ namespace parquet { // FIXME: copied from reader-internal.cc static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', '2'}; // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -123,7 +124,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), col_meta, + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), col_meta, // TODO properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), 
properties_); return column_writers_[0].get(); @@ -323,8 +325,57 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; void StartFile() { - // Parquet files always start with PAR1 - sink_->Write(PARQUET_MAGIC, 4); + if (properties_->file_encryption() == nullptr) { + // Parquet files always start with PAR1 + sink_->Write(PARQUET_MAGIC, 4); + } + else { + sink_->Write(PARQUET_EMAGIC, 4); + } + } + + void WriteMetaData() { + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + // Write MetaData + uint32_t metadata_len = static_cast(sink_->Tell()); + + // Get a FileMetaData + auto metadata = metadata_->Finish(); + metadata->WriteTo(sink_.get()); + metadata_len = static_cast(sink_->Tell()) - metadata_len; + + // Write Footer + sink_->Write(reinterpret_cast(&metadata_len), 4); + sink_->Write(PARQUET_MAGIC, 4); + } + else { + // Write MetaData with encryption + uint64_t metadata_start = static_cast(sink_->Tell()); + + auto metadata = metadata_->Finish(); + if (file_encryption->encrypted_footer()) { + auto footer_encryption = file_encryption->footer_encryption(); + metadata->WriteTo(sink_.get(), footer_encryption.get()); + } + else { + metadata->WriteTo(sink_.get()); + } + + WriteFileEncryptMetaData(metadata_start); + sink_->Write(PARQUET_EMAGIC, 4); + } + } + + void WriteFileEncryptMetaData(int64_t footerOffset) { + uint64_t crypto_offset = static_cast(sink_->Tell()); + + // Get a FileCryptoMetaData + auto crypto_metadata = metadata_->GetCryptoMetaData(footerOffset); + crypto_metadata->WriteTo(sink_.get()); + + auto crypto_len = static_cast(sink_->Tell()) - crypto_offset; + sink_->Write(reinterpret_cast(&crypto_len), 4); } }; From 5daeb18632eaf2d26ceed42e53850efa44b62431 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 17 Jul 2018 09:44:07 +0700 Subject: [PATCH 05/18] improve code after reviewing: * remove isMetadata param in thrift * add calculate_cipher_size(), calculate_plain_size() 
method in EncryptionProperties * manually zero out secret key_ in EncryptionProperties destructor --- src/parquet/column_writer.cc | 8 ++++---- src/parquet/file_writer.cc | 4 ++-- src/parquet/metadata.cc | 8 ++++---- src/parquet/properties.h | 18 ++++++++++++++++++ src/parquet/thrift.h | 15 +++++---------- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index dc96a335..23d8ffc1 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -168,7 +168,7 @@ class SerializedPageWriter : public PageWriter { std::vector cdata; if (encryption_.get()) { int plen = data_len; - cdata.resize(plen + 28 + 10); + cdata.resize(encryption_->calculate_cipher_size(plen)); int clen = parquet::encrypt(encryption_->algorithm(), false, compressed_data->data(), plen, encryption_->key_bytes(), encryption_->key_length(), @@ -191,7 +191,7 @@ class SerializedPageWriter : public PageWriter { } int64_t header_size = SerializeThriftMsg(&page_header, sizeof(format::PageHeader), - sink_, true, encryption_.get()); + sink_, encryption_.get()); sink_->Write(data, data_len); @@ -251,7 +251,7 @@ class SerializedPageWriter : public PageWriter { std::vector cdata; if (encryption_.get()) { int plen = data_len; - cdata.resize(plen + 28 + 10); + cdata.resize(encryption_->calculate_cipher_size(plen)); int clen = parquet::encrypt(encryption_->algorithm(), false, compressed_data->data(), plen, encryption_->key_bytes(), encryption_->key_length(), @@ -274,7 +274,7 @@ class SerializedPageWriter : public PageWriter { int64_t header_size = SerializeThriftMsg(&page_header, sizeof(format::PageHeader), - sink_, true, encryption_.get()); + sink_, encryption_.get()); sink_->Write(data, data_len); diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 6f096302..8cc869ac 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -362,12 +362,12 @@ class FileSerializer : public 
ParquetFileWriter::Contents { metadata->WriteTo(sink_.get()); } - WriteFileEncryptMetaData(metadata_start); + WriteFileCryptoMetaData(metadata_start); sink_->Write(PARQUET_EMAGIC, 4); } } - void WriteFileEncryptMetaData(int64_t footerOffset) { + void WriteFileCryptoMetaData(int64_t footerOffset) { uint64_t crypto_offset = static_cast(sink_->Tell()); // Get a FileCryptoMetaData diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 635e7e18..ce3cc120 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -369,7 +369,7 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); - DeserializeThriftMsg(metadata, metadata_len, metadata_.get(), true, encryption.get()); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get(), encryption.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -399,7 +399,7 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(OutputStream* dst, EncryptionProperties* encryption) const { - SerializeThriftMsg(metadata_.get(), 1024, dst, true, encryption); } + SerializeThriftMsg(metadata_.get(), 1024, dst, encryption); } std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -557,7 +557,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { void WriteTo(OutputStream* dst) { - SerializeThriftMsg(metadata_.get(), 1024, dst, true); + SerializeThriftMsg(metadata_.get(), 1024, dst); } @@ -804,7 +804,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - SerializeThriftMsg(&meta_data_, sizeof(format::ColumnMetaData), sink, true, + SerializeThriftMsg(&meta_data_, sizeof(format::ColumnMetaData), sink, encrypt_props.get()); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. 
diff --git a/src/parquet/properties.h b/src/parquet/properties.h index f93bd23a..8e54f6b6 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -55,6 +55,12 @@ class PARQUET_EXPORT EncryptionProperties { std::string key_metadata, std::string aad) : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} + ~EncryptionProperties() { + for (int i = 0; i < key_.length(); i++) { + key_[i] = '\0'; + } + } + int key_length() { return static_cast(key_.length()); } uint8_t* key_bytes() { return str2bytes(key_); } @@ -67,6 +73,18 @@ class PARQUET_EXPORT EncryptionProperties { std::string key() { return key_; } + uint32_t calculate_cipher_size(uint32_t plain_len) { + if (algorithm_ == Encryption::AES_GCM_V1) return plain_len + 28; + else if (algorithm_ == Encryption::AES_GCM_CTR_V1) return plain_len + 16; + return plain_len; + } + + uint32_t calculate_plain_size(uint32_t cipher_len) { + if (algorithm_ == Encryption::AES_GCM_V1) return cipher_len - 28; + else if (algorithm_ == Encryption::AES_GCM_CTR_V1) return cipher_len - 16; + return cipher_len; + } + private: std::string key_; std::string key_metadata_; diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 37f655af..0e417b83 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -117,7 +117,6 @@ static inline format::EncryptionAlgorithm::type ToThrift(Encryption::type type) // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - bool isMetadata = true, EncryptionProperties* encryption = nullptr ) { if (encryption == nullptr) { @@ -141,22 +140,19 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali else { // first 4 bytes for length uint8_t clenBytes[4]; - clenBytes[3] = buf[3]; - clenBytes[2] = buf[2]; - clenBytes[1] = buf[1]; - clenBytes[0] = buf[0]; + memcpy(clenBytes, buf, 4); uint32_t clen = *(reinterpret_cast(clenBytes)); // decrypt - std::vector decrypted_buffer(clen); // TODO + std::vector decrypted_buffer(encryption->calculate_plain_size(clen)); std::vector key_bytes(encryption->key_length()); for (int i = 0; i< encryption->key_length(); i++) { key_bytes.push_back(encryption->key_bytes()[i]); } int decrypted_buffer_len = parquet::decrypt( - encryption->algorithm(), isMetadata, &buf[4], clen, + encryption->algorithm(), true, &buf[4], clen, key_bytes.data(), encryption->key_length(), nullptr, 0, decrypted_buffer.data()); @@ -189,7 +185,6 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali // the expected size of the serialized object template inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, - bool isMetadata = true, EncryptionProperties* encryption = nullptr) { shared_ptr mem_buffer( new apache::thrift::transport::TMemoryBuffer(len)); @@ -216,9 +211,9 @@ inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, return out_length; } else { - std::vector cipher_buffer(out_length + 28 + 10); // TODO + std::vector cipher_buffer(encryption->calculate_cipher_size(len)); int cipher_buffer_len = parquet::encrypt( - encryption->algorithm(), isMetadata, out_buffer, out_length, + encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), encryption->key_length(), nullptr, 0, cipher_buffer.data()); From b64c020728cf2f8ca19bdd97d3194538c4bb9e31 Mon Sep 17 00:00:00 2001 
From: thamht4190 Date: Thu, 19 Jul 2018 15:32:25 +0700 Subject: [PATCH 06/18] fix issues on encrypted writing --- src/parquet/metadata.cc | 2 ++ src/parquet/properties.h | 18 ++++++++++++------ src/parquet/thrift.h | 7 ++----- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index ce3cc120..b14626b0 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -780,6 +780,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // file is not encrypted or uniform encrypted if (encrypt_md == nullptr) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(meta_data_); SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); } else { // file is non-uniform encrypted diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 8e54f6b6..f0d8ae08 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -41,7 +41,7 @@ static bool DEFAULT_USE_BUFFERED_STREAM = false; // should find a better name??? class PARQUET_EXPORT EncryptionProperties { private: - static inline uint8_t* str2bytes(std::string str) + static inline uint8_t* str2bytes(std::string& str) { if (str.empty()) return nullptr; @@ -103,7 +103,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string key() { return key_; } std::string key_metadata() { return key_metadata_; } - void set_encryption_key(std::string key, uint32_t key_id) + void set_encryption_key(std::string key, uint32_t key_id = 0) { std::string key_metadata = key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4); @@ -158,7 +158,13 @@ class PARQUET_EXPORT FileDecryptionProperties { if (key.length() != 16 && key.length() != 24 && key.length() != 32) throw ParquetException("Wrong key length " + key.length()); - // TODO add to columns_ + for (auto path = paths.begin(); path != paths.end(); path++) { + column_keys_[*path] = key; + } + } + + std::string column_key(std::string path) { + return column_keys_[path]; } std::string footer_key() { return footer_key_; } @@ -168,7 +174,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string footer_key_; std::string aad_; - std::vector columns_; + std::unordered_map column_keys_; }; class PARQUET_EXPORT ReaderProperties { @@ -317,7 +323,7 @@ class PARQUET_EXPORT FileEncryptionProperties { encrypt_the_rest_ = encrypt_the_rest; columns_ = columns; - if (!footer_key_metadata_.empty()) { + if (!footer_key_.empty()) { single_key_encryption_ = true; for (auto col = columns.begin(); col != columns.end(); col++) { @@ -328,7 +334,7 @@ class PARQUET_EXPORT FileEncryptionProperties { } } else { - if (encrypt_the_rest) throw std::invalid_argument("Encrypt the rest with null footer key"); + if (encrypt_the_rest) throw ParquetException("Encrypt the rest with null footer key"); bool all_are_unencrypted = true; for (auto col = columns.begin(); col != columns.end(); col++) { if (col->encrypted()) { diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 0e417b83..0ea89271 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -146,14 +146,11 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali // decrypt std::vector decrypted_buffer(encryption->calculate_plain_size(clen)); - std::vector key_bytes(encryption->key_length()); - for (int i = 0; i< encryption->key_length(); i++) { - key_bytes.push_back(encryption->key_bytes()[i]); - } int decrypted_buffer_len = parquet::decrypt( encryption->algorithm(), true, &buf[4], clen, - key_bytes.data(), 
encryption->key_length(), nullptr, 0, + encryption->key_bytes(), encryption->key_length(), + encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { From 3a1c144fe637b046d6810ab3256757535fc69e21 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 19 Jul 2018 15:53:31 +0700 Subject: [PATCH 07/18] change file/column readers to support encryption --- src/parquet/column_reader.cc | 30 ++++- src/parquet/column_reader.h | 3 +- src/parquet/file_reader.cc | 219 ++++++++++++++++++++++++++++++----- 3 files changed, 217 insertions(+), 35 deletions(-) diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index 28d0dcb6..f8b05820 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -33,6 +33,8 @@ #include "parquet/properties.h" #include "parquet/thrift.h" +#include "parquet/util/crypto.h" + using arrow::MemoryPool; namespace parquet { @@ -102,11 +104,14 @@ ReaderProperties default_reader_properties() { class SerializedPageReader : public PageReader { public: SerializedPageReader(std::unique_ptr stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool) + Compression::type codec, + std::shared_ptr encryption, + ::arrow::MemoryPool* pool) : stream_(std::move(stream)), decompression_buffer_(AllocateBuffer(pool, 0)), seen_num_rows_(0), - total_num_rows_(total_num_rows) { + total_num_rows_(total_num_rows), + encryption_(encryption) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); } @@ -134,6 +139,9 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; + + // Encryption + std::shared_ptr encryption_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -158,7 +166,7 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(bytes_available); try { - 
DeserializeThriftMsg(buffer, &header_size, &current_page_header_); + DeserializeThriftMsg(buffer, &header_size, &current_page_header_, encryption_.get()); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -186,6 +194,19 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } + std::vector ptext; + if (encryption_.get()) { + int clen = compressed_len; + ptext.resize(encryption_->calculate_plain_size(clen)); + int plen = parquet::decrypt(encryption_->algorithm(), false, buffer, clen, + encryption_->key_bytes(), encryption_->key_length(), + encryption_->aad_bytes(), encryption_->aad_length(), + ptext.data()); + + buffer = ptext.data(); + compressed_len = plen; + } + // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. @@ -257,9 +278,10 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open(std::unique_ptr stream, int64_t total_num_rows, Compression::type codec, + std::shared_ptr encryption, ::arrow::MemoryPool* pool) { return std::unique_ptr( - new SerializedPageReader(std::move(stream), total_num_rows, codec, pool)); + new SerializedPageReader(std::move(stream), total_num_rows, codec, encryption, pool)); } // ---------------------------------------------------------------------- diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h index 71346320..ef0cfe9a 100644 --- a/src/parquet/column_reader.h +++ b/src/parquet/column_reader.h @@ -35,6 +35,7 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/exception.h" +#include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/memory.h" @@ -86,7 +87,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( std::unique_ptr stream, int64_t total_num_rows, - Compression::type codec, + Compression::type codec, std::shared_ptr encryption,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index c5a0f342..9d831b31 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -53,6 +53,7 @@ namespace parquet { static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024; static constexpr uint32_t FOOTER_SIZE = 8; static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', '2'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; @@ -89,9 +90,13 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(RandomAccessSource* source, FileMetaData* file_metadata, + FileCryptoMetaData* file_crypto_metadata, int row_group_number, const ReaderProperties& props) - : source_(source), file_metadata_(file_metadata), properties_(props) { + : source_(source), file_metadata_(file_metadata), + file_crypto_metadata_(file_crypto_metadata), properties_(props) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); + +// if (row_group_number == 0) // TODO } const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); } @@ -122,14 +127,95 @@ class SerializedRowGroup : public RowGroupReader::Contents { } stream = properties_.GetStream(source_, col_start, col_length); + std::unique_ptr crypto_meta_data = col->crypto_meta_data(); + + bool encrypted = true; + + // file is unencrypted + if (!file_crypto_metadata_) { + encrypted = false; + } + // file is encrypted but column is unencrypted + if (crypto_meta_data && !crypto_meta_data->encrypted()) { + encrypted = false; + } + + if (!encrypted) { + return PageReader::Open(std::move(stream), col->num_values(), col->compression(), + nullptr, properties_.memory_pool()); + } + + bool 
encrypted_with_footer_key = false; + + // file is uniform encrypted + if (crypto_meta_data == nullptr) { + encrypted_with_footer_key = true; + } + + // file is non-uniform encrypted + else if (crypto_meta_data->encrypted() + && crypto_meta_data->encrypted_with_footer_key()) { + encrypted_with_footer_key = true; + } + + auto file_decryption = properties_.file_decryption(); + + if (encrypted_with_footer_key) { + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + std::string footer_key; + + if (footer_key_metadata.empty()) { + footer_key = file_decryption->footer_key(); + } + else { + // TODO: get key retriver to get key + } + + if (footer_key.empty()) { + throw ParquetException("column is encrypted with null footer key"); + } + + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm(), + footer_key, + footer_key_metadata, + file_decryption->aad() + ); + + return PageReader::Open(std::move(stream), col->num_values(), col->compression(), + footer_encryption, properties_.memory_pool()); + } + + // encrypted with column key + std::string column_key_metadata = crypto_meta_data->column_key_metadata(); + std::string column_key; + if (column_key_metadata.empty()) { + column_key = file_decryption->column_key(col->path_in_schema()->ToDotString()); + } + else { + // TODO: get from key retriever + } + + if (column_key.empty()) { + throw ParquetException("column is encrypted with null key, path=" + + col->path_in_schema()->ToDotString()); + } + auto column_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm(), + column_key, + column_key_metadata, + file_decryption->aad() + ); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), - properties_.memory_pool()); + column_encryption, properties_.memory_pool()); } private: RandomAccessSource* source_; FileMetaData* file_metadata_; + FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; 
ReaderProperties properties_; }; @@ -157,7 +243,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents( - new SerializedRowGroup(source_.get(), file_metadata_.get(), i, properties_)); + new SerializedRowGroup(source_.get(), file_metadata_.get(), file_crypto_metadata_.get(), i, properties_)); return std::make_shared(std::move(contents)); } @@ -180,42 +266,115 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer); // Check if all bytes are read. Check if last 4 bytes read have the magic bits - if (bytes_read != footer_read_size || - memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) { - throw ParquetException("Invalid parquet file. Corrupt footer."); - } + // no encryption + if (bytes_read == footer_read_size && memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) == 0) + { + uint32_t metadata_len = + *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); + int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; + if (FOOTER_SIZE + metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } - uint32_t metadata_len = - *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); - int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; - if (FOOTER_SIZE + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. 
File is less than " - "file metadata size."); - } + std::shared_ptr metadata_buffer = + AllocateBuffer(properties_.memory_pool(), metadata_len); - std::shared_ptr metadata_buffer = - AllocateBuffer(properties_.memory_pool(), metadata_len); - - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { - memcpy(metadata_buffer->mutable_data(), - footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), - metadata_len); - } else { - bytes_read = - source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data()); - if (bytes_read != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); - } + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { + memcpy(metadata_buffer->mutable_data(), + footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), + metadata_len); + } else { + bytes_read = + source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data()); + if (bytes_read != metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } + // encryption + else if (bytes_read == footer_read_size && memcmp(footer_buffer + footer_read_size - 4, PARQUET_EMAGIC, 4) == 0) + { + // read crypto metadata + uint32_t crypto_metadata_len = + *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); + int64_t crypto_metadata_start = file_size - FOOTER_SIZE - crypto_metadata_len; + + if (FOOTER_SIZE + crypto_metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + std::shared_ptr crypto_metadata_buffer = + AllocateBuffer(properties_.memory_pool(), crypto_metadata_len); - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (crypto_metadata_len + FOOTER_SIZE)) { + memcpy(crypto_metadata_buffer->mutable_data(), + footer_buffer + (footer_read_size - crypto_metadata_len - FOOTER_SIZE), + crypto_metadata_len); + } else { + bytes_read = + source_->ReadAt(crypto_metadata_start, crypto_metadata_len, + crypto_metadata_buffer->mutable_data()); + if (bytes_read != crypto_metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + + file_crypto_metadata_ = FileCryptoMetaData::Make( + crypto_metadata_buffer->data(), + &crypto_metadata_len); + + int64_t footer_offset = file_crypto_metadata_->footer_offset(); + uint32_t footer_read_size = (uint32_t) (crypto_metadata_start - footer_offset); + + std::shared_ptr footer_buffer = + AllocateBuffer(properties_.memory_pool(), footer_read_size); + bytes_read = + source_->ReadAt(footer_offset, footer_read_size, footer_buffer->mutable_data()); + + if (file_crypto_metadata_->encrypted_footer()) { + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + std::string footer_key; + auto file_decryption = properties_.file_decryption(); + if (footer_key_metadata.empty()) { + footer_key = file_decryption->footer_key(); + } + else { + // TODO: get key retriver to get key + } + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm(), + footer_key, + footer_key_metadata, + file_decryption->aad() + ); + + file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size, + footer_encryption); + } + else { + file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size); + } + } + // 
error + else { + throw ParquetException("Invalid parquet file. Corrupt footer."); + } } private: std::unique_ptr source_; std::shared_ptr file_metadata_; + std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; }; From 2de348f293e95df153fffadc5f5d23bbfb81dc71 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 19 Jul 2018 17:58:09 +0700 Subject: [PATCH 08/18] pass aad into parquet::encrypt() method --- src/parquet/column_writer.cc | 5 +++-- src/parquet/thrift.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 23d8ffc1..fd90e7a0 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -172,7 +172,7 @@ class SerializedPageWriter : public PageWriter { int clen = parquet::encrypt(encryption_->algorithm(), false, compressed_data->data(), plen, encryption_->key_bytes(), encryption_->key_length(), - nullptr, 0, + encryption_->aad_bytes(), encryption_->aad_length(), cdata.data()); data = cdata.data(); data_len = clen; @@ -255,7 +255,8 @@ class SerializedPageWriter : public PageWriter { int clen = parquet::encrypt(encryption_->algorithm(), false, compressed_data->data(), plen, encryption_->key_bytes(), encryption_->key_length(), - nullptr, 0, cdata.data()); + encryption_->aad_bytes(), encryption_->aad_length(), + cdata.data()); data = cdata.data(); data_len = clen; } diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 0ea89271..1575b07d 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -211,7 +211,8 @@ inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, std::vector cipher_buffer(encryption->calculate_cipher_size(len)); int cipher_buffer_len = parquet::encrypt( encryption->algorithm(), true, out_buffer, out_length, - encryption->key_bytes(), encryption->key_length(), nullptr, 0, + encryption->key_bytes(), encryption->key_length(), + encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); 
out->Write(reinterpret_cast(&cipher_buffer_len), 4); From a52a86ffe2b8f7e0ddffa1f2cce7ce9af4fe3ae6 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 23 Jul 2018 17:39:12 +0700 Subject: [PATCH 09/18] add integer/string key retriever --- src/parquet/decryption_key_retriever.cc | 29 ++++++++++++++ src/parquet/decryption_key_retriever.h | 53 +++++++++++++++++++++++++ src/parquet/file_reader.cc | 32 ++++----------- src/parquet/properties.h | 51 +++++++++++++++++++----- 4 files changed, 131 insertions(+), 34 deletions(-) create mode 100644 src/parquet/decryption_key_retriever.cc create mode 100644 src/parquet/decryption_key_retriever.h diff --git a/src/parquet/decryption_key_retriever.cc b/src/parquet/decryption_key_retriever.cc new file mode 100644 index 00000000..5afd3013 --- /dev/null +++ b/src/parquet/decryption_key_retriever.cc @@ -0,0 +1,29 @@ +#include "decryption_key_retriever.h" + +namespace parquet { + +// integer key retriever +void IntegerKeyIdRetriever::put_key(uint32_t key_id, const std::string& key) { + key_map_[key_id] = key; +} + +std::string IntegerKeyIdRetriever::get_key(const std::string& key_metadata) +{ + uint32_t key_id; + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + + return key_map_[key_id]; +} + +// string key retriever +void StringKeyIdRetriever::put_key(const std::string& key_id, const std::string& key) +{ + key_map_[key_id] = key; +} + +std::string StringKeyIdRetriever::get_key(const std::string& key_id) +{ + return key_map_[key_id]; +} + +} // namespace parquet diff --git a/src/parquet/decryption_key_retriever.h b/src/parquet/decryption_key_retriever.h new file mode 100644 index 00000000..8d0770cc --- /dev/null +++ b/src/parquet/decryption_key_retriever.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_DECRYPTION_RETRIEVER_H +#define PARQUET_DECRYPTION_RETRIEVER_H + +#include +#include +#include + +namespace parquet { + +class PARQUET_EXPORT DecryptionKeyRetriever { +public: + virtual std::string get_key(const std::string& key_metadata) = 0; + virtual ~DecryptionKeyRetriever() {} +}; + +// Simple integer key retriever +class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { +public: + void put_key(uint32_t key_id, const std::string& key); + std::string get_key(const std::string& key_metadata); +private: + std::map key_map_; +}; + +// Simple string key retriever +class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { +public: + void put_key(const std::string& key_id, const std::string& key); + std::string get_key(const std::string& key_metadata); +private: + std::map key_map_; +}; + +} // namespace parquet + +#endif // PARQUET_DECRYPTION_RETRIEVER_H diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index 9d831b31..46a8a646 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -95,8 +95,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { : source_(source), file_metadata_(file_metadata), file_crypto_metadata_(file_crypto_metadata), properties_(props) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); - -// if (row_group_number == 0) // TODO } const 
RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); } @@ -162,14 +160,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (encrypted_with_footer_key) { std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - std::string footer_key; - - if (footer_key_metadata.empty()) { - footer_key = file_decryption->footer_key(); - } - else { - // TODO: get key retriver to get key - } + std::string footer_key = file_decryption->footer_key(footer_key_metadata); if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); @@ -187,15 +178,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { footer_encryption, properties_.memory_pool()); } - // encrypted with column key std::string column_key_metadata = crypto_meta_data->column_key_metadata(); - std::string column_key; - if (column_key_metadata.empty()) { - column_key = file_decryption->column_key(col->path_in_schema()->ToDotString()); - } - else { - // TODO: get from key retriever - } + // encrypted with column key + std::string column_key = file_decryption->column_key( + col->path_in_schema()->ToDotString(), + crypto_meta_data->column_key_metadata()); if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" @@ -342,14 +329,9 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_crypto_metadata_->encrypted_footer()) { // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - std::string footer_key; + auto file_decryption = properties_.file_decryption(); - if (footer_key_metadata.empty()) { - footer_key = file_decryption->footer_key(); - } - else { - // TODO: get key retriver to get key - } + std::string footer_key = file_decryption->footer_key(footer_key_metadata); auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm(), diff --git a/src/parquet/properties.h 
b/src/parquet/properties.h index f0d8ae08..62142a6a 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -28,6 +28,7 @@ #include "parquet/types.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" +#include "parquet/decryption_key_retriever.h" namespace parquet { @@ -142,17 +143,17 @@ class PARQUET_EXPORT FileDecryptionProperties { } } - // TODO - // FileDecryptionProperties(std::shared_ptr keyRetriever) {} + FileDecryptionProperties(std::shared_ptr key_retriever) + : key_retriever_(key_retriever) {} - void aad(std::string aad) { aad_ = aad; } + void set_aad(std::string aad) { aad_ = aad; } - void column_key(std::string name, std::string key) + void set_column_key(std::string name, std::string key) { - column_key(std::vector({name}), key); + set_column_key(std::vector({name}), key); } - void column_key(std::vector paths, std::string key) + void set_column_key(std::vector paths, std::string key) { if (key.empty()) throw ParquetException("Decryption: null column key"); if (key.length() != 16 && key.length() != 24 && key.length() != 32) @@ -163,11 +164,25 @@ class PARQUET_EXPORT FileDecryptionProperties { } } - std::string column_key(std::string path) { - return column_keys_[path]; + std::string column_key(std::string path, std::string key_metadata = "") { + if (key_metadata.empty()) { + return column_keys_[path]; + } + if (key_retriever_ == nullptr) { + throw ParquetException("no key retriever is provided for column key metadata"); + } + return key_retriever_->get_key(key_metadata); } - std::string footer_key() { return footer_key_; } + std::string footer_key(std::string footer_key_metadata = "") { + if (footer_key_metadata.empty()) { + return footer_key_; + } + if (key_retriever_ == nullptr) { + throw ParquetException("no key retriever is provided for footer key metadata"); + } + return key_retriever_->get_key(footer_key_metadata); + } std::string aad() { return aad_; } private: @@ -175,6 +190,8 @@ class PARQUET_EXPORT 
FileDecryptionProperties { std::string aad_; std::unordered_map column_keys_; + + std::shared_ptr key_retriever_; }; class PARQUET_EXPORT ReaderProperties { @@ -578,12 +595,28 @@ class PARQUET_EXPORT WriterProperties { return encryption(Encryption::AES_GCM_V1, key, 0); } + Builder* encryption(std::string key, uint32_t key_id) + { + return encryption(Encryption::AES_GCM_V1, key, key_id); + } + + Builder* encryption(std::string key, std::string key_id) + { + return encryption(Encryption::AES_GCM_V1, key, key_id); + } + Builder* encryption(Encryption::type algorithm, std::string key, uint32_t key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } + Builder* encryption(Encryption::type algorithm, std::string key, std::string key_id) + { + file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); + return this; + } + Builder* column_encryption(std::vector columns, bool encrypt_the_rest) { if (file_encryption_.get() == nullptr) From f5807f5eb17ad175223be88bdd2f8f72a205fca5 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 31 Jul 2018 16:51:50 +0700 Subject: [PATCH 10/18] improve code --- src/parquet/column_reader.cc | 33 ++- src/parquet/column_reader.h | 3 +- src/parquet/column_writer.cc | 66 +++-- src/parquet/decryption_key_retriever.cc | 34 ++- src/parquet/decryption_key_retriever.h | 14 +- src/parquet/file_reader.cc | 212 ++++++++-------- src/parquet/file_writer.cc | 11 +- src/parquet/metadata.cc | 316 ++++++++++++------------ src/parquet/metadata.h | 35 ++- src/parquet/properties.h | 206 +++++---------- src/parquet/thrift.h | 125 +++++----- src/parquet/types.h | 55 +++++ 12 files changed, 533 insertions(+), 577 deletions(-) diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index f8b05820..a2a78387 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -166,7 +166,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by 
DeserializeThriftMsg header_size = static_cast(bytes_available); try { - DeserializeThriftMsg(buffer, &header_size, &current_page_header_, encryption_.get()); + DeserializeThriftMsg(buffer, &header_size, &current_page_header_, + encryption_.get()); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -194,17 +195,15 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } - std::vector ptext; + std::vector decrypt_buffer; if (encryption_.get()) { - int clen = compressed_len; - ptext.resize(encryption_->calculate_plain_size(clen)); - int plen = parquet::decrypt(encryption_->algorithm(), false, buffer, clen, - encryption_->key_bytes(), encryption_->key_length(), - encryption_->aad_bytes(), encryption_->aad_length(), - ptext.data()); - - buffer = ptext.data(); - compressed_len = plen; + decrypt_buffer.resize(encryption_->calculate_plain_size(compressed_len)); + compressed_len = parquet::decrypt( + encryption_->algorithm(), false, buffer, compressed_len, + encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), + encryption_->aad_length(), decrypt_buffer.data()); + + buffer = decrypt_buffer.data(); } // Uncompress it if we need to @@ -275,13 +274,11 @@ std::shared_ptr SerializedPageReader::NextPage() { return std::shared_ptr(nullptr); } -std::unique_ptr PageReader::Open(std::unique_ptr stream, - int64_t total_num_rows, - Compression::type codec, - std::shared_ptr encryption, - ::arrow::MemoryPool* pool) { - return std::unique_ptr( - new SerializedPageReader(std::move(stream), total_num_rows, codec, encryption, pool)); +std::unique_ptr PageReader::Open( + std::unique_ptr stream, int64_t total_num_rows, Compression::type codec, + std::shared_ptr encryption, ::arrow::MemoryPool* pool) { + return std::unique_ptr(new SerializedPageReader( + std::move(stream), total_num_rows, codec, encryption, pool)); } //
---------------------------------------------------------------------- diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h index ef0cfe9a..089e4214 100644 --- a/src/parquet/column_reader.h +++ b/src/parquet/column_reader.h @@ -35,7 +35,6 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/exception.h" -#include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/memory.h" @@ -87,7 +86,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( std::unique_ptr stream, int64_t total_num_rows, - Compression::type codec, std::shared_ptr encryption, + Compression::type codec, std::shared_ptr encryption = nullptr, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index fd90e7a0..342ac5a3 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -28,9 +28,9 @@ #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" +#include "parquet/util/crypto.h" #include "parquet/util/logging.h" #include "parquet/util/memory.h" -#include "parquet/util/crypto.h" namespace parquet { @@ -162,26 +162,23 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); - const uint8_t* data = compressed_data->data(); - int data_len = static_cast(compressed_data->size()); + const uint8_t* output_data_buffer = compressed_data->data(); + int output_data_len = static_cast(compressed_data->size()); - std::vector cdata; + std::vector encrypted_data_buffer; if (encryption_.get()) { - int plen = data_len; - cdata.resize(encryption_->calculate_cipher_size(plen)); - int clen = parquet::encrypt(encryption_->algorithm(), false, - compressed_data->data(), plen, - encryption_->key_bytes(), 
encryption_->key_length(), - encryption_->aad_bytes(), encryption_->aad_length(), - cdata.data()); - data = cdata.data(); - data_len = clen; + encrypted_data_buffer.resize(encryption_->calculate_cipher_size(output_data_len)); + output_data_len = parquet::encrypt( + encryption_->algorithm(), false, compressed_data->data(), output_data_len, + encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), + encryption_->aad_length(), encrypted_data_buffer.data()); + output_data_buffer = encrypted_data_buffer.data(); } format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(data_len)); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -189,14 +186,13 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), - sink_, encryption_.get()); + int64_t header_size = SerializeThriftMsg(&page_header, sizeof(format::PageHeader), + sink_, encryption_.get()); - sink_->Write(data, data_len); + sink_->Write(output_data_buffer, output_data_len); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += data_len + header_size; + total_compressed_size_ += output_data_len + header_size; return sink_->Tell() - start_pos; } @@ -245,26 +241,23 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); - const uint8_t* data = compressed_data->data(); - int data_len = static_cast(compressed_data->size()); + const uint8_t* output_data_buffer = compressed_data->data(); + int output_data_len = 
static_cast(compressed_data->size()); - std::vector cdata; + std::vector encrypted_data_buffer; if (encryption_.get()) { - int plen = data_len; - cdata.resize(encryption_->calculate_cipher_size(plen)); - int clen = parquet::encrypt(encryption_->algorithm(), false, - compressed_data->data(), plen, - encryption_->key_bytes(), encryption_->key_length(), - encryption_->aad_bytes(), encryption_->aad_length(), - cdata.data()); - data = cdata.data(); - data_len = clen; + encrypted_data_buffer.resize(encryption_->calculate_cipher_size(output_data_len)); + output_data_len = parquet::encrypt( + encryption_->algorithm(), false, compressed_data->data(), output_data_len, + encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), + encryption_->aad_length(), encrypted_data_buffer.data()); + output_data_buffer = encrypted_data_buffer.data(); } format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(data_len)); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_data_page_header(data_page_header); // TODO(PARQUET-594) crc checksum @@ -273,14 +266,13 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), - sink_, encryption_.get()); + int64_t header_size = SerializeThriftMsg(&page_header, sizeof(format::PageHeader), + sink_, encryption_.get()); - sink_->Write(data, data_len); + sink_->Write(output_data_buffer, output_data_len); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += data_len + header_size; + total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); return sink_->Tell() - start_pos; diff --git a/src/parquet/decryption_key_retriever.cc 
b/src/parquet/decryption_key_retriever.cc index 5afd3013..9de1bd71 100644 --- a/src/parquet/decryption_key_retriever.cc +++ b/src/parquet/decryption_key_retriever.cc @@ -1,28 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include "decryption_key_retriever.h" +#include + namespace parquet { // integer key retriever void IntegerKeyIdRetriever::put_key(uint32_t key_id, const std::string& key) { - key_map_[key_id] = key; + key_map_.insert(std::make_pair(key_id, key)); } -std::string IntegerKeyIdRetriever::get_key(const std::string& key_metadata) -{ +std::string IntegerKeyIdRetriever::get_key(const std::string& key_metadata) { uint32_t key_id; - memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + std::memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); return key_map_[key_id]; } // string key retriever -void StringKeyIdRetriever::put_key(const std::string& key_id, const std::string& key) -{ - key_map_[key_id] = key; +void StringKeyIdRetriever::put_key(const std::string& key_id, const std::string& key) { + key_map_.insert(std::make_pair(key_id, key)); } -std::string StringKeyIdRetriever::get_key(const std::string& key_id) -{ +std::string StringKeyIdRetriever::get_key(const std::string& 
key_id) { return key_map_[key_id]; } diff --git a/src/parquet/decryption_key_retriever.h b/src/parquet/decryption_key_retriever.h index 8d0770cc..cedfbd49 100644 --- a/src/parquet/decryption_key_retriever.h +++ b/src/parquet/decryption_key_retriever.h @@ -18,33 +18,35 @@ #ifndef PARQUET_DECRYPTION_RETRIEVER_H #define PARQUET_DECRYPTION_RETRIEVER_H +#include #include #include -#include namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { -public: + public: virtual std::string get_key(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { -public: + public: void put_key(uint32_t key_id, const std::string& key); std::string get_key(const std::string& key_metadata); -private: + + private: std::map key_map_; }; // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { -public: + public: void put_key(const std::string& key_id, const std::string& key); std::string get_key(const std::string& key_metadata); -private: + + private: std::map key_map_; }; diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index 46a8a646..75fd1317 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -90,10 +90,12 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(RandomAccessSource* source, FileMetaData* file_metadata, - FileCryptoMetaData* file_crypto_metadata, - int row_group_number, const ReaderProperties& props) - : source_(source), file_metadata_(file_metadata), - file_crypto_metadata_(file_crypto_metadata), properties_(props) { + FileCryptoMetaData* file_crypto_metadata, int row_group_number, + const ReaderProperties& props) + : source_(source), + file_metadata_(file_metadata), + file_crypto_metadata_(file_crypto_metadata), + 
properties_(props) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -130,11 +132,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { bool encrypted = true; // file is unencrypted - if (!file_crypto_metadata_) { - encrypted = false; - } - // file is encrypted but column is unencrypted - if (crypto_meta_data && !crypto_meta_data->encrypted()) { + // or file is encrypted but column is unencrypted + if (!file_crypto_metadata_ || (crypto_meta_data && !crypto_meta_data->encrypted())) { encrypted = false; } @@ -146,15 +145,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { bool encrypted_with_footer_key = false; // file is uniform encrypted - if (crypto_meta_data == nullptr) { + // or file is non-uniform encrypted and the column is encrypted with footer key + if (crypto_meta_data == nullptr || (crypto_meta_data->encrypted() && + crypto_meta_data->encrypted_with_footer_key())) { encrypted_with_footer_key = true; } - // file is non-uniform encrypted - else if (crypto_meta_data->encrypted() - && crypto_meta_data->encrypted_with_footer_key()) { - encrypted_with_footer_key = true; - } + // file is non-uniform encrypted and the column is encrypted with its own key auto file_decryption = properties_.file_decryption(); @@ -166,13 +163,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { throw ParquetException("column is encrypted with null footer key"); } - auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), - footer_key, - footer_key_metadata, - file_decryption->aad() - ); + file_crypto_metadata_->encryption_algorithm(), footer_key, footer_key_metadata, + file_decryption->aad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -181,19 +174,15 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_meta_data->column_key_metadata(); // encrypted 
with column key std::string column_key = file_decryption->column_key( - col->path_in_schema()->ToDotString(), - crypto_meta_data->column_key_metadata()); + col->path_in_schema()->ToDotString(), crypto_meta_data->column_key_metadata()); if (column_key.empty()) { - throw ParquetException("column is encrypted with null key, path=" - + col->path_in_schema()->ToDotString()); + throw ParquetException("column is encrypted with null key, path=" + + col->path_in_schema()->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), - column_key, - column_key_metadata, - file_decryption->aad() - ); + file_crypto_metadata_->encryption_algorithm(), column_key, column_key_metadata, + file_decryption->aad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), column_encryption, properties_.memory_pool()); @@ -230,7 +219,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents( - new SerializedRowGroup(source_.get(), file_metadata_.get(), file_crypto_metadata_.get(), i, properties_)); + new SerializedRowGroup(source_.get(), file_metadata_.get(), + file_crypto_metadata_.get(), i, properties_)); return std::make_shared(std::move(contents)); } @@ -254,98 +244,92 @@ class SerializedFile : public ParquetFileReader::Contents { // Check if all bytes are read. Check if last 4 bytes read have the magic bits // no encryption - if (bytes_read == footer_read_size && memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) == 0) - { - uint32_t metadata_len = - *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); - int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; - if (FOOTER_SIZE + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. 
File is less than " - "file metadata size."); - } - - std::shared_ptr metadata_buffer = - AllocateBuffer(properties_.memory_pool(), metadata_len); + if (bytes_read == footer_read_size && + memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) == 0) { + uint32_t metadata_len = + *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); + int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; + if (FOOTER_SIZE + metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { - memcpy(metadata_buffer->mutable_data(), - footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), - metadata_len); - } else { - bytes_read = - source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data()); - if (bytes_read != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); - } + std::shared_ptr metadata_buffer = + AllocateBuffer(properties_.memory_pool(), metadata_len); + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { + memcpy(metadata_buffer->mutable_data(), + footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), + metadata_len); + } else { + bytes_read = source_->ReadAt(metadata_start, metadata_len, + metadata_buffer->mutable_data()); + if (bytes_read != metadata_len) { + throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } + } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } // encryption - else if (bytes_read == footer_read_size && memcmp(footer_buffer + footer_read_size - 4, PARQUET_EMAGIC, 4) == 0) - { - // read crypto metadata - uint32_t crypto_metadata_len = - *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); - int64_t crypto_metadata_start = file_size - FOOTER_SIZE - crypto_metadata_len; - - if (FOOTER_SIZE + crypto_metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); + else if (bytes_read == footer_read_size && + memcmp(footer_buffer + footer_read_size - 4, PARQUET_EMAGIC, 4) == 0) { + // read crypto metadata + uint32_t crypto_metadata_len = + *reinterpret_cast(footer_buffer + footer_read_size - FOOTER_SIZE); + int64_t crypto_metadata_start = file_size - FOOTER_SIZE - crypto_metadata_len; + + if (FOOTER_SIZE + crypto_metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + + std::shared_ptr crypto_metadata_buffer = + AllocateBuffer(properties_.memory_pool(), crypto_metadata_len); + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (crypto_metadata_len + FOOTER_SIZE)) { + memcpy(crypto_metadata_buffer->mutable_data(), + footer_buffer + (footer_read_size - crypto_metadata_len - FOOTER_SIZE), + crypto_metadata_len); + } else { + bytes_read = source_->ReadAt(crypto_metadata_start, crypto_metadata_len, + crypto_metadata_buffer->mutable_data()); + if (bytes_read != crypto_metadata_len) { + throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } + } - std::shared_ptr crypto_metadata_buffer = - AllocateBuffer(properties_.memory_pool(), crypto_metadata_len); + file_crypto_metadata_ = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (crypto_metadata_len + FOOTER_SIZE)) { - memcpy(crypto_metadata_buffer->mutable_data(), - footer_buffer + (footer_read_size - crypto_metadata_len - FOOTER_SIZE), - crypto_metadata_len); - } else { - bytes_read = - source_->ReadAt(crypto_metadata_start, crypto_metadata_len, - crypto_metadata_buffer->mutable_data()); - if (bytes_read != crypto_metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); - } - } + int64_t footer_offset = file_crypto_metadata_->footer_offset(); + uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); - file_crypto_metadata_ = FileCryptoMetaData::Make( - crypto_metadata_buffer->data(), - &crypto_metadata_len); - - int64_t footer_offset = file_crypto_metadata_->footer_offset(); - uint32_t footer_read_size = (uint32_t) (crypto_metadata_start - footer_offset); - - std::shared_ptr footer_buffer = - AllocateBuffer(properties_.memory_pool(), footer_read_size); - bytes_read = - source_->ReadAt(footer_offset, footer_read_size, footer_buffer->mutable_data()); - - if (file_crypto_metadata_->encrypted_footer()) { - // get footer key metadata - std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - - auto file_decryption = properties_.file_decryption(); - std::string footer_key = file_decryption->footer_key(footer_key_metadata); - - auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), - footer_key, - footer_key_metadata, - file_decryption->aad() - ); - - file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size, - footer_encryption); - } - else { - file_metadata_ = 
FileMetaData::Make(footer_buffer->data(), &footer_read_size); - } + std::shared_ptr footer_buffer = + AllocateBuffer(properties_.memory_pool(), footer_read_size); + bytes_read = + source_->ReadAt(footer_offset, footer_read_size, footer_buffer->mutable_data()); + + if (file_crypto_metadata_->encrypted_footer()) { + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + + auto file_decryption = properties_.file_decryption(); + std::string footer_key = file_decryption->footer_key(footer_key_metadata); + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm(), footer_key, + footer_key_metadata, file_decryption->aad()); + + file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size, + footer_encryption); + } else { + file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size); + } } // error else { diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 8cc869ac..1fd56d40 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -125,7 +125,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, // TODO + properties_->encryption(column_descr->path()), col_meta, // TODO properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -328,8 +328,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (properties_->file_encryption() == nullptr) { // Parquet files always start with PAR1 sink_->Write(PARQUET_MAGIC, 4); - } - else { + } else { sink_->Write(PARQUET_EMAGIC, 4); } } @@ -348,8 +347,7 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write Footer 
sink_->Write(reinterpret_cast(&metadata_len), 4); sink_->Write(PARQUET_MAGIC, 4); - } - else { + } else { // Write MetaData with encryption uint64_t metadata_start = static_cast(sink_->Tell()); @@ -357,8 +355,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (file_encryption->encrypted_footer()) { auto footer_encryption = file_encryption->footer_encryption(); metadata->WriteTo(sink_.get(), footer_encryption.get()); - } - else { + } else { metadata->WriteTo(sink_.get()); } diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index b14626b0..9720a0f2 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -52,16 +52,16 @@ static std::shared_ptr MakeTypedColumnStats( // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { return std::make_shared>( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, true); + descr, metadata.statistics.min_value, metadata.statistics.max_value, + metadata.num_values - metadata.statistics.null_count, + metadata.statistics.null_count, metadata.statistics.distinct_count, true); } // Default behavior return std::make_shared>( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max || metadata.statistics.__isset.min); + descr, metadata.statistics.min, metadata.statistics.max, + metadata.num_values - metadata.statistics.null_count, + metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.statistics.__isset.max || metadata.statistics.__isset.min); } std::shared_ptr MakeColumnStats( @@ -90,21 +90,24 @@ std::shared_ptr MakeColumnStats( // MetaData Accessor // ColumnCryptoMetaData class 
ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { -public: + public: explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata) - : crypto_metadata_(crypto_metadata) {} + : crypto_metadata_(crypto_metadata) {} ~ColumnCryptoMetaDataImpl() {} inline std::vector path_in_schema() const { - return crypto_metadata_->path_in_schema; } + return crypto_metadata_->path_in_schema; + } inline bool encrypted() const { return crypto_metadata_->encrypted; } inline bool encrypted_with_footer_key() const { - return crypto_metadata_->encrypted_with_footer_key; } + return crypto_metadata_->encrypted_with_footer_key; + } inline std::string column_key_metadata() const { - return crypto_metadata_->column_key_metadata; } + return crypto_metadata_->column_key_metadata; + } -private: + private: const format::ColumnCryptoMetaData* crypto_metadata_; }; @@ -113,28 +116,30 @@ std::unique_ptr ColumnCryptoMetaData::Make( return std::unique_ptr(new ColumnCryptoMetaData(metadata)); } -ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) - : impl_(new ColumnCryptoMetaDataImpl( - reinterpret_cast(metadata))) {} +ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) + : impl_(new ColumnCryptoMetaDataImpl( + reinterpret_cast(metadata))) {} ColumnCryptoMetaData::~ColumnCryptoMetaData() {} -std::vector ColumnCryptoMetaData::path_in_schema() const { - return impl_->path_in_schema(); } +std::vector ColumnCryptoMetaData::path_in_schema() const { + return impl_->path_in_schema(); +} bool ColumnCryptoMetaData::encrypted() const { return impl_->encrypted(); } -bool ColumnCryptoMetaData::encrypted_with_footer_key() const { - return impl_->encrypted_with_footer_key(); } -std::string ColumnCryptoMetaData::column_key_metadata() const { - return impl_->column_key_metadata(); } - +bool ColumnCryptoMetaData::encrypted_with_footer_key() const { + return impl_->encrypted_with_footer_key(); +} +std::string ColumnCryptoMetaData::column_key_metadata() const { + 
return impl_->column_key_metadata(); +} // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { -public: + public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) - : column_(column), descr_(descr), writer_version_(writer_version) { + : column_(column), descr_(descr), writer_version_(writer_version) { const format::ColumnMetaData& meta_data = column->meta_data; for (auto encoding : meta_data.encodings) { encodings_.push_back(FromThrift(encoding)); @@ -165,7 +170,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline bool is_stats_set() const { DCHECK(writer_version_ != nullptr); return column_->meta_data.__isset.statistics && - writer_version_->HasCorrectStatistics(type(), descr_->sort_order()); + writer_version_->HasCorrectStatistics(type(), descr_->sort_order()); } inline std::shared_ptr statistics() const { @@ -208,16 +213,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } inline std::unique_ptr crypto_meta_data() const { - if (column_->__isset.crypto_meta_data) { - return ColumnCryptoMetaData::Make( - reinterpret_cast(&column_->crypto_meta_data)); - } - else { - return nullptr; - } + if (column_->__isset.crypto_meta_data) { + return ColumnCryptoMetaData::Make( + reinterpret_cast(&column_->crypto_meta_data)); + } else { + return nullptr; + } } -private: + private: mutable std::shared_ptr stats_; std::vector encodings_; const format::ColumnChunk* column_; @@ -229,15 +233,15 @@ std::unique_ptr ColumnChunkMetaData::Make( const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, writer_version)); } ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) - : impl_{std::unique_ptr(new 
ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), descr, - writer_version))} {} + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), descr, + writer_version))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -300,11 +304,11 @@ std::unique_ptr ColumnChunkMetaData::crypto_meta_data() co // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { -public: + public: explicit RowGroupMetaDataImpl(const format::RowGroup* row_group, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) - : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} + : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} ~RowGroupMetaDataImpl() {} inline int num_columns() const { return static_cast(row_group_->columns.size()); } @@ -323,11 +327,11 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make( - reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), - writer_version_); + reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), + writer_version_); } -private: + private: const format::RowGroup* row_group_; const SchemaDescriptor* schema_; const ApplicationVersion* writer_version_; @@ -337,14 +341,14 @@ std::unique_ptr RowGroupMetaData::Make( const uint8_t* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { return std::unique_ptr( - new RowGroupMetaData(metadata, schema, writer_version)); + new RowGroupMetaData(metadata, schema, writer_version)); } RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) - : impl_{std::unique_ptr(new RowGroupMetaDataImpl( - reinterpret_cast(metadata), schema, writer_version))} { + : impl_{std::unique_ptr(new RowGroupMetaDataImpl( + reinterpret_cast(metadata), schema, writer_version))} { } RowGroupMetaData::~RowGroupMetaData() {} @@ -362,12 
+366,12 @@ std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const // file metadata class FileMetaData::FileMetaDataImpl { -public: + public: FileMetaDataImpl() : metadata_len_(0) {} explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len, std::shared_ptr encryption = nullptr) - : metadata_len_(0) { + : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(metadata, metadata_len, metadata_.get(), encryption.get()); metadata_len_ = *metadata_len; @@ -399,7 +403,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(OutputStream* dst, EncryptionProperties* encryption) const { - SerializeThriftMsg(metadata_.get(), 1024, dst, encryption); } + SerializeThriftMsg(metadata_.get(), 1024, dst, encryption); + } std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -409,8 +414,8 @@ class FileMetaData::FileMetaDataImpl { throw ParquetException(ss.str()); } return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i]), &schema_, - &writer_version_); + reinterpret_cast(&metadata_->row_groups[i]), &schema_, + &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -419,13 +424,13 @@ class FileMetaData::FileMetaDataImpl { return key_value_metadata_; } -private: + private: friend FileMetaDataBuilder; uint32_t metadata_len_; std::unique_ptr metadata_; void InitSchema() { schema::FlatSchemaConverter converter(&metadata_->schema[0], - static_cast(metadata_->schema.size())); + static_cast(metadata_->schema.size())); schema_.Init(converter.Convert()); } void InitColumnOrders() { @@ -462,21 +467,21 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const uint8_t* metadata, - uint32_t* metadata_len, - std::shared_ptr encryption) { +std::shared_ptr FileMetaData::Make( + const uint8_t* metadata, uint32_t* metadata_len, + std::shared_ptr 
encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len, - encryption)); + return std::shared_ptr( + new FileMetaData(metadata, metadata_len, encryption)); } FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len, std::shared_ptr encryption) - : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len, encryption))} {} + : impl_{std::unique_ptr( + new FileMetaDataImpl(metadata, metadata_len, encryption))} {} FileMetaData::FileMetaData() - : impl_{std::unique_ptr(new FileMetaDataImpl())} {} + : impl_{std::unique_ptr(new FileMetaDataImpl())} {} FileMetaData::~FileMetaData() {} @@ -494,13 +499,13 @@ int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { - case 1: - return ParquetVersion::PARQUET_1_0; - case 2: - return ParquetVersion::PARQUET_2_0; - default: - // Improperly set version, assuming Parquet 1.0 - break; + case 1: + return ParquetVersion::PARQUET_1_0; + case 2: + return ParquetVersion::PARQUET_2_0; + default: + // Improperly set version, assuming Parquet 1.0 + break; } return ParquetVersion::PARQUET_1_0; } @@ -524,11 +529,10 @@ void FileMetaData::WriteTo(OutputStream* dst, EncryptionProperties* encryption) } class FileCryptoMetaData::FileCryptoMetaDataImpl { -public: + public: FileCryptoMetaDataImpl() {} - explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) - { + explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) { metadata_.reset(new format::FileCryptoMetaData); DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); metadata_len_ = *metadata_len; @@ -536,69 +540,59 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { ~FileCryptoMetaDataImpl() {} - Encryption::type encryption_algorithm() - { + Encryption::type encryption_algorithm() { return 
FromThrift(metadata_->encryption_algorithm); } bool encrypted_footer() { return metadata_->encrypted_footer; } - std::string footer_key_metadata() - { + std::string footer_key_metadata() { return metadata_->__isset.footer_key_metadata ? metadata_->footer_key_metadata : ""; } uint64_t footer_offset() { return metadata_->footer_offset; } - std::string iv_prefix() - { + std::string iv_prefix() { return metadata_->__isset.iv_prefix ? metadata_->iv_prefix : ""; } - void WriteTo(OutputStream* dst) - { - SerializeThriftMsg(metadata_.get(), 1024, dst); - } + void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); } - -private: + private: friend FileMetaDataBuilder; std::unique_ptr metadata_; uint32_t metadata_len_; }; -Encryption::type FileCryptoMetaData::encryption_algorithm() -{ +Encryption::type FileCryptoMetaData::encryption_algorithm() { return impl_->encryption_algorithm(); } bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } -std::string FileCryptoMetaData::footer_key_metadata() -{ +std::string FileCryptoMetaData::footer_key_metadata() { return impl_->footer_key_metadata(); } uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } std::string FileCryptoMetaData::iv_prefix() { return impl_->iv_prefix(); } -std::shared_ptr FileCryptoMetaData::Make(const uint8_t* serialized_metadata, - uint32_t* metadata_len) -{ +std::shared_ptr FileCryptoMetaData::Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len) { return std::shared_ptr( - new FileCryptoMetaData(serialized_metadata, metadata_len)); + new FileCryptoMetaData(serialized_metadata, metadata_len)); } -FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len) - : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} +FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, + uint32_t* metadata_len) + : impl_(new 
FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} -FileCryptoMetaData::FileCryptoMetaData() - : impl_(new FileCryptoMetaDataImpl()) {} +FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {} FileCryptoMetaData::~FileCryptoMetaData() {} -void FileCryptoMetaData::WriteTo(OutputStream *dst) { impl_->WriteTo(dst); } +void FileCryptoMetaData::WriteTo(OutputStream* dst) { impl_->WriteTo(dst); } ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) - : application_(application), version{major, minor, patch, "", "", ""} {} + : application_(application), version{major, minor, patch, "", "", ""} {} ApplicationVersion::ApplicationVersion(const std::string& created_by) { boost::regex app_regex{ApplicationVersion::APPLICATION_FORMAT}; @@ -652,9 +646,9 @@ bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) cons bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const { return application_ == other_version.application_ && - version.major == other_version.version.major && - version.minor == other_version.version.minor && - version.patch == other_version.version.patch; + version.major == other_version.version.major && + version.minor == other_version.version.minor && + version.patch == other_version.version.patch; } // Reference: @@ -696,17 +690,16 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, // MetaData Builders // row-group metadata class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { -public: + public: explicit ColumnChunkMetaDataBuilderImpl(const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) - : properties_(props), column_(column) { + : properties_(props), column_(column) { column_chunk_ = reinterpret_cast(contents); - meta_data_ = column_chunk_->meta_data; - meta_data_.__set_type(ToThrift(column->physical_type())); - 
meta_data_.__set_path_in_schema(column->path()->ToDotVector()); - meta_data_.__set_codec( - ToThrift(properties_->compression(column->path()))); + column_metadata_ = column_chunk_->meta_data; + column_metadata_.__set_type(ToThrift(column->physical_type())); + column_metadata_.__set_path_in_schema(column->path()->ToDotVector()); + column_metadata_.__set_codec(ToThrift(properties_->compression(column->path()))); } ~ColumnChunkMetaDataBuilderImpl() {} @@ -733,7 +726,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { stats.__isset.max = val.has_max; } - meta_data_.__set_statistics(stats); + column_metadata_.__set_statistics(stats); } void Finish(int64_t num_values, int64_t dictionary_page_offset, @@ -741,7 +734,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback) { if (dictionary_page_offset > 0) { - meta_data_.__set_dictionary_page_offset(dictionary_page_offset); + column_metadata_.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); } else { column_chunk_->__set_file_offset(data_page_offset + compressed_size); @@ -754,6 +747,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { meta_data_.__set_data_page_offset(data_page_offset); meta_data_.__set_total_uncompressed_size(uncompressed_size); meta_data_.__set_total_compressed_size(compressed_size); + std::vector thrift_encodings; if (has_dictionary) { thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); @@ -771,7 +765,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { if (dictionary_fallback) { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } - meta_data_.__set_encodings(thrift_encodings); + column_metadata_.__set_encodings(thrift_encodings); } void WriteTo(OutputStream* sink) { @@ -781,22 +775,25 @@ class 
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // file is not encrypted or uniform encrypted if (encrypt_md == nullptr) { column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(meta_data_); + column_chunk_->__set_meta_data(column_metadata_); SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); - } - else { // file is non-uniform encrypted + } else { // file is non-uniform encrypted column_chunk_->__isset.crypto_meta_data = true; - column_chunk_->crypto_meta_data.__set_path_in_schema(column_->path()->ToDotVector()); + column_chunk_->crypto_meta_data.__set_path_in_schema( + column_->path()->ToDotVector()); column_chunk_->crypto_meta_data.__set_encrypted(encrypt_md->encrypted()); - column_chunk_->crypto_meta_data.__set_encrypted(encrypt_md->encrypted_with_footer_key()); - column_chunk_->crypto_meta_data.__set_column_key_metadata(encrypt_md->key_metadata()); + column_chunk_->crypto_meta_data.__set_encrypted( + encrypt_md->encrypted_with_footer_key()); + column_chunk_->crypto_meta_data.__set_column_key_metadata( + encrypt_md->key_metadata()); auto footer_encryption = properties_->footer_encryption(); - // non-uniform: footer is unencrypted, or column is encrypted with a column-specific key - if ((footer_encryption == nullptr && encrypt_md->encrypted()) - || (footer_encryption != nullptr - && footer_encryption->key() == encrypt_md->key())) { + // non-uniform: footer is unencrypted, or column is encrypted with a column-specific + // key + if ((footer_encryption == nullptr && encrypt_md->encrypted()) || + (footer_encryption != nullptr && + footer_encryption->key() == encrypt_md->key())) { // don't set meta_data, column_chunk_->__isset.meta_data = false; @@ -806,15 +803,14 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - SerializeThriftMsg(&meta_data_, sizeof(format::ColumnMetaData), sink, + 
SerializeThriftMsg(&column_metadata_, sizeof(format::ColumnMetaData), sink, encrypt_props.get()); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. column_chunk_->__set_file_offset(metadata_start); - } - else { + } else { column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(meta_data_); + column_chunk_->__set_meta_data(column_metadata_); } SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); @@ -823,9 +819,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const ColumnDescriptor* descr() const { return column_; } -private: + private: format::ColumnChunk* column_chunk_; - format::ColumnMetaData meta_data_; + format::ColumnMetaData column_metadata_; const std::shared_ptr properties_; const ColumnDescriptor* column_; }; @@ -834,14 +830,14 @@ std::unique_ptr ColumnChunkMetaDataBuilder::Make( const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) { return std::unique_ptr( - new ColumnChunkMetaDataBuilder(props, column, contents)); + new ColumnChunkMetaDataBuilder(props, column, contents)); } ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( const std::shared_ptr& props, const ColumnDescriptor* column, uint8_t* contents) - : impl_{std::unique_ptr( - new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {} + : impl_{std::unique_ptr( + new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {} ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} @@ -871,10 +867,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, } class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { -public: + public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, const SchemaDescriptor* schema, uint8_t* contents) - : properties_(props), schema_(schema), current_column_(0) { + : properties_(props), schema_(schema), current_column_(0) { row_group_ = reinterpret_cast(contents); InitializeColumns(schema->num_columns()); } @@ 
-889,8 +885,8 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { } auto column = schema_->Column(current_column_); auto column_builder = ColumnChunkMetaDataBuilder::Make( - properties_, column, - reinterpret_cast(&row_group_->columns[current_column_++])); + properties_, column, + reinterpret_cast(&row_group_->columns[current_column_++])); auto column_builder_ptr = column_builder.get(); column_builders_.push_back(std::move(column_builder)); return column_builder_ptr; @@ -927,7 +923,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int64_t num_rows() { return row_group_->num_rows; } -private: + private: void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); } format::RowGroup* row_group_; @@ -941,14 +937,14 @@ std::unique_ptr RowGroupMetaDataBuilder::Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, uint8_t* contents) { return std::unique_ptr( - new RowGroupMetaDataBuilder(props, schema_, contents)); + new RowGroupMetaDataBuilder(props, schema_, contents)); } RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( const std::shared_ptr& props, const SchemaDescriptor* schema_, uint8_t* contents) - : impl_{std::unique_ptr( - new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} + : impl_{std::unique_ptr( + new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() {} @@ -973,11 +969,11 @@ void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { // file metadata // TODO(PARQUET-595) Support key_value_metadata class FileMetaDataBuilder::FileMetaDataBuilderImpl { -public: + public: explicit FileMetaDataBuilderImpl( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) - : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { + : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if 
(props->footer_encryption().get() != nullptr) { crypto_metadata_.reset(new format::FileCryptoMetaData()); @@ -988,7 +984,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { RowGroupMetaDataBuilder* AppendRowGroup() { auto row_group = std::unique_ptr(new format::RowGroup()); auto row_group_builder = RowGroupMetaDataBuilder::Make( - properties_, schema_, reinterpret_cast(row_group.get())); + properties_, schema_, reinterpret_cast(row_group.get())); RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); row_group_builders_.push_back(std::move(row_group_builder)); row_groups_.push_back(std::move(row_group)); @@ -1021,14 +1017,14 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { int32_t file_version = 0; switch (properties_->version()) { - case ParquetVersion::PARQUET_1_0: - file_version = 1; - break; - case ParquetVersion::PARQUET_2_0: - file_version = 2; - break; - default: - break; + case ParquetVersion::PARQUET_1_0: + file_version = 1; + break; + case ParquetVersion::PARQUET_2_0: + file_version = 2; + break; + default: + break; } metadata_->__set_version(file_version); metadata_->__set_created_by(properties_->created_by()); @@ -1046,8 +1042,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->__isset.column_orders = true; parquet::schema::SchemaFlattener flattener( - static_cast(schema_->schema_root().get()), - &metadata_->schema); + static_cast(schema_->schema_root().get()), + &metadata_->schema); flattener.Flatten(); auto file_meta_data = std::unique_ptr(new FileMetaData()); file_meta_data->impl_->metadata_ = std::move(metadata_); @@ -1055,8 +1051,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } - std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) - { + std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) { if (crypto_metadata_.get() == nullptr) { return nullptr; } @@ -1066,7 +1061,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto footer_encryption = 
properties_->footer_encryption(); // build format::FileCryptoMetaData - crypto_metadata_->__set_encryption_algorithm(ToThrift(footer_encryption->algorithm())); + crypto_metadata_->__set_encryption_algorithm( + ToThrift(footer_encryption->algorithm())); crypto_metadata_->__set_encrypted_footer(file_encryption->encrypted_footer()); std::string footer_key_metadata = footer_encryption->key_metadata(); @@ -1084,11 +1080,11 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_crypto_meta_data; } -protected: + protected: std::unique_ptr metadata_; std::unique_ptr crypto_metadata_; -private: + private: const std::shared_ptr properties_; std::vector> row_groups_; std::vector> row_group_builders_; @@ -1100,14 +1096,14 @@ std::unique_ptr FileMetaDataBuilder::Make( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) { return std::unique_ptr( - new FileMetaDataBuilder(schema, props, key_value_metadata)); + new FileMetaDataBuilder(schema, props, key_value_metadata)); } FileMetaDataBuilder::FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, const std::shared_ptr& key_value_metadata) - : impl_{std::unique_ptr( - new FileMetaDataBuilderImpl(schema, props, key_value_metadata))} {} + : impl_{std::unique_ptr( + new FileMetaDataBuilderImpl(schema, props, key_value_metadata))} {} FileMetaDataBuilder::~FileMetaDataBuilder() {} @@ -1117,8 +1113,8 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } -std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData(uint64_t footerOffset) -{ +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData( + uint64_t footerOffset) { return impl_->BuildFileCryptoMetaData(footerOffset); } diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index e0c40631..2bfa4893 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -89,8 +89,7 @@ class 
ApplicationVersion { class PARQUET_EXPORT ColumnCryptoMetaData { public: - static std::unique_ptr Make( - const uint8_t* metadata); + static std::unique_ptr Make(const uint8_t* metadata); ~ColumnCryptoMetaData(); std::vector path_in_schema() const; @@ -134,6 +133,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; std::unique_ptr crypto_meta_data() const; + private: explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = nullptr); @@ -172,9 +172,9 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const uint8_t* serialized_metadata, - uint32_t* metadata_len, - std::shared_ptr encryption = nullptr); + static std::shared_ptr Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len, + std::shared_ptr encryption = nullptr); ~FileMetaData(); @@ -209,11 +209,11 @@ class PARQUET_EXPORT FileMetaData { }; class PARQUET_EXPORT FileCryptoMetaData { -public: - // API convenience to get a MetaData accessor - static std::shared_ptr Make(const uint8_t* serialized_metadata, - uint32_t* metadata_len); - ~FileCryptoMetaData(); + public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + ~FileCryptoMetaData(); Encryption::type encryption_algorithm(); bool encrypted_footer(); @@ -223,15 +223,14 @@ class PARQUET_EXPORT FileCryptoMetaData { void WriteTo(OutputStream* dst); -private: - friend FileMetaDataBuilder; - explicit FileCryptoMetaData(const uint8_t* serialized_metadata, - uint32_t* metadata_len); + private: + friend FileMetaDataBuilder; + FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); - // PIMPL Idiom - FileCryptoMetaData(); - class FileCryptoMetaDataImpl; - std::unique_ptr impl_; + // PIMPL Idiom + 
FileCryptoMetaData(); + class FileCryptoMetaDataImpl; + std::unique_ptr impl_; }; // Builder API diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 62142a6a..31f2624c 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -22,13 +22,13 @@ #include #include +#include "parquet/decryption_key_retriever.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" -#include "parquet/decryption_key_retriever.h" namespace parquet { @@ -39,80 +39,24 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 0; static bool DEFAULT_USE_BUFFERED_STREAM = false; -// should find a better name??? -class PARQUET_EXPORT EncryptionProperties { - private: - static inline uint8_t* str2bytes(std::string& str) - { - if (str.empty()) return nullptr; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); - } - - public: - EncryptionProperties() = default; - EncryptionProperties(Encryption::type algorithm, std::string key, - std::string key_metadata, std::string aad) - : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} - - ~EncryptionProperties() { - for (int i = 0; i < key_.length(); i++) { - key_[i] = '\0'; - } - } - - int key_length() { return static_cast(key_.length()); } - uint8_t* key_bytes() { return str2bytes(key_); } - - int aad_length() { return static_cast(aad_.length()); } - uint8_t* aad_bytes() { return str2bytes(aad_); } - - Encryption::type algorithm() { return algorithm_; } - - std::string key_metadata() { return key_metadata_; } - - std::string key() { return key_; } - - uint32_t calculate_cipher_size(uint32_t plain_len) { - if (algorithm_ == Encryption::AES_GCM_V1) return plain_len + 28; - else if (algorithm_ == Encryption::AES_GCM_CTR_V1) return plain_len + 16; - return plain_len; - } - - uint32_t calculate_plain_size(uint32_t cipher_len) { - if 
(algorithm_ == Encryption::AES_GCM_V1) return cipher_len - 28; - else if (algorithm_ == Encryption::AES_GCM_CTR_V1) return cipher_len - 16; - return cipher_len; - } - - private: - std::string key_; - std::string key_metadata_; - Encryption::type algorithm_; - std::string aad_; -}; - class PARQUET_EXPORT ColumnEncryptionProperties { public: ColumnEncryptionProperties() = default; ColumnEncryptionProperties(bool encrypt, std::string path) - : encrypt_(encrypt), path_(path), encrypted_with_footer_key_(encrypt) {} + : encrypt_(encrypt), path_(path), encrypted_with_footer_key_(encrypt) {} bool encrypted() { return encrypt_; } bool encrypted_with_footer_key() { return encrypted_with_footer_key_; } std::string key() { return key_; } std::string key_metadata() { return key_metadata_; } - void set_encryption_key(std::string key, uint32_t key_id = 0) - { - std::string key_metadata = key_id == 0 - ? "" : std::string(reinterpret_cast(&key_id), 4); + void set_encryption_key(std::string key, uint32_t key_id = 0) { + std::string key_metadata = + key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4); set_encryption_key(key, key_metadata); } - void set_encryption_key(std::string key, std::string key_metadata) - { + void set_encryption_key(std::string key, std::string key_metadata) { if (!encrypt_) throw ParquetException("Setting key on unencrypted column: " + path_); if (key.empty()) throw ParquetException("Null key for " + path_); @@ -131,32 +75,28 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string path_; }; - class PARQUET_EXPORT FileDecryptionProperties { public: - FileDecryptionProperties(std::string footer_key) : footer_key_(footer_key) - { + FileDecryptionProperties(std::string footer_key) : footer_key_(footer_key) { if (footer_key_.empty()) throw ParquetException("Decryption: null footer key"); - if (!(footer_key_.length() == 16 || footer_key_.length() == 24 - || footer_key_.length() == 32)) { + if (!(footer_key_.length() == 16 || footer_key_.length() == 24 || + footer_key_.length() == 32)) { throw ParquetException("Wrong key length " + footer_key_.length()); } } - FileDecryptionProperties(std::shared_ptr key_retriever) - : key_retriever_(key_retriever) {} + FileDecryptionProperties(std::shared_ptr key_retriever) + : key_retriever_(key_retriever) {} void set_aad(std::string aad) { aad_ = aad; } - - void set_column_key(std::string name, std::string key) - { + + void set_column_key(std::string name, std::string key) { set_column_key(std::vector({name}), key); } - void set_column_key(std::vector paths, std::string key) - { + void set_column_key(std::vector paths, std::string key) { if (key.empty()) throw ParquetException("Decryption: null column key"); - if (key.length() != 16 && key.length() != 24 && key.length() != 32) + if (key.length() != 16 && key.length() != 24 && key.length() != 32) throw ParquetException("Wrong key length " + key.length()); for (auto path = paths.begin(); path != paths.end(); path++) { @@ -227,7 +167,7 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const 
{ return buffer_size_; } void set_file_decryption(std::shared_ptr decryption) { - file_decryption_ = decryption; + file_decryption_ = decryption; } FileDecryptionProperties* file_decryption() { return file_decryption_.get(); } @@ -313,45 +253,45 @@ class PARQUET_EXPORT FileEncryptionProperties { public: FileEncryptionProperties() = default; FileEncryptionProperties(const FileEncryptionProperties&) = default; - - FileEncryptionProperties(Encryption::type algorithm, std::string key, - std::string key_metadata) - { + + FileEncryptionProperties(Encryption::type algorithm, std::string key, + std::string key_metadata) { if (key.length() != 16 && key.length() != 24 && key.length() != 32) { - throw ParquetException("Wrong key length " + key.length()); // TODO io exception + throw ParquetException("Wrong key length " + key.length()); // TODO io exception } if (!key_metadata.empty() && key_metadata.length() > 256) { - throw ParquetException("Footer key meta data is too long: " + key_metadata.length()); + throw ParquetException("Footer key meta data is too long: " + + key_metadata.length()); } - uniform_encryption_ = true; footer_key_ = key; footer_key_metadata_ = key_metadata; - single_key_encryption_ = !footer_key_.empty(); + uniform_encryption_ = !footer_key_.empty(); algorithm_ = algorithm; } - + FileEncryptionProperties(Encryption::type algorithm, std::string key, int key_id) - : FileEncryptionProperties(algorithm, key, - key_id == 0 ? "" : std::string(reinterpret_cast(&key_id), 4)) {} + : FileEncryptionProperties( + algorithm, key, + key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4)) {} - void setup_columns(std::vector columns, bool encrypt_the_rest) - { + void setup_columns(std::vector columns, + bool encrypt_the_rest) { encrypt_the_rest_ = encrypt_the_rest; columns_ = columns; if (!footer_key_.empty()) { - single_key_encryption_ = true; + uniform_encryption_ = true; for (auto col = columns.begin(); col != columns.end(); col++) { if (col->key().compare(footer_key_) != 0) { - single_key_encryption_ = false; + uniform_encryption_ = false; break; } } - } - else { - if (encrypt_the_rest) throw ParquetException("Encrypt the rest with null footer key"); + } else { + if (encrypt_the_rest) + throw ParquetException("Encrypt the rest with null footer key"); bool all_are_unencrypted = true; for (auto col = columns.begin(); col != columns.end(); col++) { if (col->encrypted()) { @@ -362,24 +302,22 @@ class PARQUET_EXPORT FileEncryptionProperties { } } - if (all_are_unencrypted) + if (all_are_unencrypted) throw ParquetException("Footer and all columns unencrypted"); } } - std::shared_ptr footer_encryption() - { + std::shared_ptr footer_encryption() { if (footer_key_.empty()) { return nullptr; - } - else { - return std::make_shared(algorithm_, footer_key_, + } else { + return std::make_shared(algorithm_, footer_key_, footer_key_metadata_, aad_); } } std::shared_ptr encryption_metadata( - const std::shared_ptr& path) { + const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { return nullptr; @@ -387,7 +325,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // non-uniform encryption std::string pathStr = path->ToDotString(); - for(auto col = columns_.begin(); col != columns_.end(); col++) { // TODO + for (auto col = columns_.begin(); col != columns_.end(); col++) { // TODO if (col->path() == pathStr) { return std::shared_ptr(&(*col)); } @@ -395,19 +333,18 @@ class PARQUET_EXPORT FileEncryptionProperties { // encrypted with footer key if (encrypt_the_rest_) { std::shared_ptr col( - new 
ColumnEncryptionProperties(true, path->ToDotString())); + new ColumnEncryptionProperties(true, path->ToDotString())); col->set_encryption_key(footer_key_, footer_key_metadata_); return col; } // unencrypted return std::shared_ptr( - new ColumnEncryptionProperties(false, path->ToDotString())); - + new ColumnEncryptionProperties(false, path->ToDotString())); } std::shared_ptr encryption_properties( - const std::shared_ptr& path) { + const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { return footer_encryption(); @@ -415,14 +352,10 @@ class PARQUET_EXPORT FileEncryptionProperties { // non-uniform encryption std::string pathStr = path->ToDotString(); - for(auto col = columns_.begin(); col != columns_.end(); col++) { // TODO + for (auto col = columns_.begin(); col != columns_.end(); col++) { // TODO if (col->path() == pathStr) { - return std::shared_ptr(new EncryptionProperties( - algorithm_, - col->key(), - col->key_metadata(), - aad_ - )); + return std::shared_ptr( + new EncryptionProperties(algorithm_, col->key(), col->key_metadata(), aad_)); } } @@ -441,7 +374,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key_; std::string footer_key_metadata_; Encryption::type algorithm_; - bool single_key_encryption_; std::string aad_; bool uniform_encryption_; @@ -590,36 +522,31 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } - Builder* encryption(std::string key) - { + Builder* encryption(std::string key) { return encryption(Encryption::AES_GCM_V1, key, 0); } - Builder* encryption(std::string key, uint32_t key_id) - { + Builder* encryption(std::string key, uint32_t key_id) { return encryption(Encryption::AES_GCM_V1, key, key_id); } - Builder* encryption(std::string key, std::string key_id) - { + Builder* encryption(std::string key, std::string key_id) { return encryption(Encryption::AES_GCM_V1, key, key_id); } - Builder* encryption(Encryption::type algorithm, std::string key, 
uint32_t key_id) - { + Builder* encryption(Encryption::type algorithm, std::string key, uint32_t key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } - Builder* encryption(Encryption::type algorithm, std::string key, std::string key_id) - { + Builder* encryption(Encryption::type algorithm, std::string key, std::string key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } Builder* column_encryption(std::vector columns, - bool encrypt_the_rest) { - if (file_encryption_.get() == nullptr) + bool encrypt_the_rest) { + if (file_encryption_.get() == nullptr) throw ParquetException("null file encryption"); file_encryption_->setup_columns(columns, encrypt_the_rest); @@ -671,11 +598,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, - std::move(file_encryption_), - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_), + default_column_properties_, column_properties)); } private: @@ -711,13 +637,13 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } inline FileEncryptionProperties* file_encryption() const { - return parquet_file_encryption_.get(); } + return parquet_file_encryption_.get(); + } inline std::shared_ptr footer_encryption() const { if (parquet_file_encryption_.get() == nullptr) { - return std::shared_ptr(nullptr); - } - else { + return std::shared_ptr(nullptr); + } else { return parquet_file_encryption_->footer_encryption(); } } @@ -766,21 +692,19 @@ class PARQUET_EXPORT 
WriterProperties { } std::shared_ptr encryption_metadata( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { return parquet_file_encryption_->encryption_metadata(path); - } - else { + } else { return nullptr; } } std::shared_ptr encryption( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { return parquet_file_encryption_->encryption_properties(path); - } - else { + } else { return nullptr; } } diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 1575b07d..19e2d56a 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -40,10 +40,10 @@ #include "parquet/exception.h" #include "parquet/parquet_types.h" +#include "parquet/types.h" +#include "parquet/util/crypto.h" #include "parquet/util/logging.h" #include "parquet/util/memory.h" -#include "parquet/util/crypto.h" -#include "parquet/properties.h" namespace parquet { @@ -117,64 +117,61 @@ static inline format::EncryptionAlgorithm::type ToThrift(Encryption::type type) // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - EncryptionProperties* encryption = nullptr - ) { - if (encryption == nullptr) { - shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; + const EncryptionProperties* encryption = nullptr) { + if (encryption == nullptr) { + shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); } - else { - // first 4 bytes for length - uint8_t clenBytes[4]; - memcpy(clenBytes, buf, 4); - - uint32_t clen = *(reinterpret_cast(clenBytes)); - - // decrypt - std::vector decrypted_buffer(encryption->calculate_plain_size(clen)); - - int decrypted_buffer_len = parquet::decrypt( - encryption->algorithm(), true, &buf[4], clen, - encryption->key_bytes(), encryption->key_length(), - encryption->aad_bytes(), encryption->aad_length(), - decrypted_buffer.data()); - - if (decrypted_buffer_len <= 0) { - throw ParquetException("Couldn't decrypt buffer\n"); - } - // Deserialize msg bytes into c++ thrift msg using memory transport. 
- shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer( - decrypted_buffer.data(), decrypted_buffer_len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - - *len = 4 + clen; + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; + } else { + // first 4 bytes for length + uint8_t clenBytes[4]; + memcpy(clenBytes, buf, 4); + + uint32_t clen = *(reinterpret_cast(clenBytes)); + + // decrypt + std::vector decrypted_buffer(encryption->calculate_plain_size(clen)); + + int decrypted_buffer_len = parquet::decrypt( + encryption->algorithm(), true, &buf[4], clen, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + decrypted_buffer.data()); + + if (decrypted_buffer_len <= 0) { + throw ParquetException("Couldn't decrypt buffer\n"); + } + // Deserialize msg bytes into c++ thrift msg using memory transport. + shared_ptr tmem_transport( + new apache::thrift::transport::TMemoryBuffer(decrypted_buffer.data(), + decrypted_buffer_len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + shared_ptr tproto = + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); } + + *len = 4 + clen; + } } // Serialize obj into a buffer. The result is returned as a string. 
@@ -182,7 +179,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali // the expected size of the serialized object template inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, - EncryptionProperties* encryption = nullptr) { + const EncryptionProperties* encryption = nullptr) { shared_ptr mem_buffer( new apache::thrift::transport::TMemoryBuffer(len)); apache::thrift::protocol::TCompactProtocolFactoryT< @@ -206,14 +203,12 @@ inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, out->Write(out_buffer, out_length); return out_length; - } - else { + } else { std::vector cipher_buffer(encryption->calculate_cipher_size(len)); int cipher_buffer_len = parquet::encrypt( - encryption->algorithm(), true, out_buffer, out_length, - encryption->key_bytes(), encryption->key_length(), - encryption->aad_bytes(), encryption->aad_length(), - cipher_buffer.data()); + encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + cipher_buffer.data()); out->Write(reinterpret_cast(&cipher_buffer_len), 4); out->Write(cipher_buffer.data(), cipher_buffer_len); diff --git a/src/parquet/types.h b/src/parquet/types.h index 10789cbf..6f1f1b34 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -117,6 +117,61 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +// should find a better name??? 
+class PARQUET_EXPORT EncryptionProperties { + private: + static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return nullptr; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); + } + + public: + EncryptionProperties() = default; + EncryptionProperties(Encryption::type algorithm, const std::string& key, + const std::string& key_metadata, const std::string& aad) + : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} + + ~EncryptionProperties() { key_.replace(0, key_.length(), '\0'); } + + int key_length() const { return static_cast(key_.length()); } + uint8_t* key_bytes() const { return str2bytes(key_); } + + int aad_length() const { return static_cast(aad_.length()); } + uint8_t* aad_bytes() const { return str2bytes(aad_); } + + Encryption::type algorithm() const { return algorithm_; } + + std::string key_metadata() const { return key_metadata_; } + + std::string key() const { return key_; } + + uint32_t calculate_cipher_size(uint32_t plain_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return plain_len + 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return plain_len + 16; + } + return plain_len; + } + + uint32_t calculate_plain_size(uint32_t cipher_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return cipher_len - 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return cipher_len - 16; + } + return cipher_len; + } + + private: + std::string key_; // encryption key, should have 16, 24, 32-byte length + std::string key_metadata_; // key metadata, used for retrieving key + Encryption::type algorithm_; // encryption algorithm + std::string aad_; // encryption additional authenticated data +}; + // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; From fa88fb5c22d93ad5598b6c478eb3f7af15448bb8 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 6 Aug 2018 13:54:07 +0700 Subject: 
[PATCH 11/18] post-review change --- src/parquet/column_reader.cc | 4 +- src/parquet/column_writer.cc | 22 +-- ...ryption_key_retriever.cc => encryption.cc} | 12 +- ...ecryption_key_retriever.h => encryption.h} | 16 +- src/parquet/file_reader.cc | 16 +- src/parquet/metadata.cc | 21 ++- src/parquet/metadata.h | 8 +- src/parquet/properties.h | 143 +++++++++--------- src/parquet/thrift.h | 46 +++--- src/parquet/types.h | 10 +- 10 files changed, 148 insertions(+), 150 deletions(-) rename src/parquet/{decryption_key_retriever.cc => encryption.cc} (75%) rename src/parquet/{decryption_key_retriever.h => encryption.h} (76%) diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index a2a78387..7e6d4acd 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -197,8 +197,8 @@ std::shared_ptr SerializedPageReader::NextPage() { std::vector decrypt_buffer; if (encryption_.get()) { - decrypt_buffer.resize(encryption_->calculate_plain_size(compressed_len)); - compressed_len = parquet::decrypt( + decrypt_buffer.resize(encryption_->CalculatePlainSize(compressed_len)); + compressed_len = parquet_encryption::Decrypt( encryption_->algorithm(), false, buffer, compressed_len, encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), encryption_->aad_length(), decrypt_buffer.data()); diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 342ac5a3..fcf3767a 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -165,14 +165,14 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = compressed_data->data(); int output_data_len = static_cast(compressed_data->size()); - std::vector encrypted_data_buffer; + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (encryption_.get()) { - encrypted_data_buffer.resize(encryption_->calculate_cipher_size(output_data_len)); - output_data_len = parquet::encrypt( + 
encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); + output_data_len = parquet_encryption::Encrypt( encryption_->algorithm(), false, compressed_data->data(), output_data_len, encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), encrypted_data_buffer.data()); - output_data_buffer = encrypted_data_buffer.data(); + encryption_->aad_length(), encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); } format::PageHeader page_header; @@ -242,16 +242,16 @@ class SerializedPageWriter : public PageWriter { data_page_header.__set_statistics(ToThrift(page.statistics())); const uint8_t* output_data_buffer = compressed_data->data(); - int output_data_len = static_cast(compressed_data->size()); + int32_t output_data_len = static_cast(compressed_data->size()); - std::vector encrypted_data_buffer; + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (encryption_.get()) { - encrypted_data_buffer.resize(encryption_->calculate_cipher_size(output_data_len)); - output_data_len = parquet::encrypt( + encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); + output_data_len = parquet_encryption::Encrypt( encryption_->algorithm(), false, compressed_data->data(), output_data_len, encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), encrypted_data_buffer.data()); - output_data_buffer = encrypted_data_buffer.data(); + encryption_->aad_length(), encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); } format::PageHeader page_header; diff --git a/src/parquet/decryption_key_retriever.cc b/src/parquet/encryption.cc similarity index 75% rename from src/parquet/decryption_key_retriever.cc rename to src/parquet/encryption.cc index 9de1bd71..fc4147f5 100644 --- a/src/parquet/decryption_key_retriever.cc +++ b/src/parquet/encryption.cc @@ -15,18 
+15,18 @@ // specific language governing permissions and limitations // under the License. -#include "decryption_key_retriever.h" +#include "encryption.h" -#include +#include namespace parquet { // integer key retriever -void IntegerKeyIdRetriever::put_key(uint32_t key_id, const std::string& key) { +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -std::string IntegerKeyIdRetriever::get_key(const std::string& key_metadata) { +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { uint32_t key_id; std::memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); @@ -34,11 +34,11 @@ std::string IntegerKeyIdRetriever::get_key(const std::string& key_metadata) { } // string key retriever -void StringKeyIdRetriever::put_key(const std::string& key_id, const std::string& key) { +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -std::string StringKeyIdRetriever::get_key(const std::string& key_id) { +const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } diff --git a/src/parquet/decryption_key_retriever.h b/src/parquet/encryption.h similarity index 76% rename from src/parquet/decryption_key_retriever.h rename to src/parquet/encryption.h index cedfbd49..1dbf0d20 100644 --- a/src/parquet/decryption_key_retriever.h +++ b/src/parquet/encryption.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-#ifndef PARQUET_DECRYPTION_RETRIEVER_H -#define PARQUET_DECRYPTION_RETRIEVER_H +#ifndef PARQUET_ENCRYPTION_H +#define PARQUET_ENCRYPTION_H #include #include @@ -26,15 +26,15 @@ namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual std::string get_key(const std::string& key_metadata) = 0; + virtual const std::string& GetKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void put_key(uint32_t key_id, const std::string& key); - std::string get_key(const std::string& key_metadata); + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; @@ -43,8 +43,8 @@ class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { public: - void put_key(const std::string& key_id, const std::string& key); - std::string get_key(const std::string& key_metadata); + void PutKey(const std::string& key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; @@ -52,4 +52,4 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { } // namespace parquet -#endif // PARQUET_DECRYPTION_RETRIEVER_H +#endif // PARQUET_ENCRYPTION_H diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index 75fd1317..9c8f3e03 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -157,7 +157,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (encrypted_with_footer_key) { std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - std::string footer_key = file_decryption->footer_key(footer_key_metadata); + std::string footer_key = 
file_decryption->GetFooterKey(footer_key_metadata); if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); @@ -165,7 +165,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm(), footer_key, footer_key_metadata, - file_decryption->aad()); + file_decryption->GetAad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -173,7 +173,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_meta_data->column_key_metadata(); // encrypted with column key - std::string column_key = file_decryption->column_key( + std::string column_key = file_decryption->GetColumnKey( col->path_in_schema()->ToDotString(), crypto_meta_data->column_key_metadata()); if (column_key.empty()) { @@ -182,7 +182,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { } auto column_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm(), column_key, column_key_metadata, - file_decryption->aad()); + file_decryption->GetAad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), column_encryption, properties_.memory_pool()); @@ -287,7 +287,7 @@ class SerializedFile : public ParquetFileReader::Contents { "file metadata size."); } - std::shared_ptr crypto_metadata_buffer = + std::shared_ptr crypto_metadata_buffer = AllocateBuffer(properties_.memory_pool(), crypto_metadata_len); // Check if the footer_buffer contains the entire metadata @@ -309,7 +309,7 @@ class SerializedFile : public ParquetFileReader::Contents { int64_t footer_offset = file_crypto_metadata_->footer_offset(); uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); - std::shared_ptr footer_buffer = + std::shared_ptr footer_buffer = AllocateBuffer(properties_.memory_pool(), footer_read_size); 
bytes_read = source_->ReadAt(footer_offset, footer_read_size, footer_buffer->mutable_data()); @@ -319,11 +319,11 @@ class SerializedFile : public ParquetFileReader::Contents { std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); auto file_decryption = properties_.file_decryption(); - std::string footer_key = file_decryption->footer_key(footer_key_metadata); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm(), footer_key, - footer_key_metadata, file_decryption->aad()); + footer_key_metadata, file_decryption->GetAad()); file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size, footer_encryption); diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 9720a0f2..8bbf46ab 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -96,14 +96,14 @@ class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { ~ColumnCryptoMetaDataImpl() {} - inline std::vector path_in_schema() const { + const std::vector& path_in_schema() const { return crypto_metadata_->path_in_schema; } - inline bool encrypted() const { return crypto_metadata_->encrypted; } - inline bool encrypted_with_footer_key() const { + bool encrypted() const { return crypto_metadata_->encrypted; } + bool encrypted_with_footer_key() const { return crypto_metadata_->encrypted_with_footer_key; } - inline std::string column_key_metadata() const { + const std::string& column_key_metadata() const { return crypto_metadata_->column_key_metadata; } @@ -122,14 +122,14 @@ ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) ColumnCryptoMetaData::~ColumnCryptoMetaData() {} -std::vector ColumnCryptoMetaData::path_in_schema() const { +const std::vector& ColumnCryptoMetaData::path_in_schema() const { return impl_->path_in_schema(); } bool ColumnCryptoMetaData::encrypted() const { return impl_->encrypted(); } bool 
ColumnCryptoMetaData::encrypted_with_footer_key() const { return impl_->encrypted_with_footer_key(); } -std::string ColumnCryptoMetaData::column_key_metadata() const { +const std::string& ColumnCryptoMetaData::column_key_metadata() const { return impl_->column_key_metadata(); } @@ -546,13 +546,13 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { bool encrypted_footer() { return metadata_->encrypted_footer; } - std::string footer_key_metadata() { + const std::string& footer_key_metadata() { return metadata_->__isset.footer_key_metadata ? metadata_->footer_key_metadata : ""; } uint64_t footer_offset() { return metadata_->footer_offset; } - std::string iv_prefix() { + const std::string& iv_prefix() { return metadata_->__isset.iv_prefix ? metadata_->iv_prefix : ""; } @@ -769,8 +769,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(OutputStream* sink) { - std::cout << "ColumnChunkMetaDataBuilderImpl::WriteTo" << std::endl; - auto encrypt_md = properties_->encryption_metadata(column_->path()); + const auto& encrypt_md = properties_->encryption_metadata(column_->path()); // file is not encrypted or uniform encrypted if (encrypt_md == nullptr) { @@ -975,7 +974,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->footer_encryption().get() != nullptr) { + if (props->footer_encryption() != nullptr) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index 2bfa4893..3803d249 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -92,10 +92,10 @@ class PARQUET_EXPORT ColumnCryptoMetaData { static std::unique_ptr Make(const uint8_t* metadata); ~ColumnCryptoMetaData(); - std::vector path_in_schema() const; + const std::vector& path_in_schema() const; bool encrypted() 
const; bool encrypted_with_footer_key() const; - std::string column_key_metadata() const; + const std::string& column_key_metadata() const; private: explicit ColumnCryptoMetaData(const uint8_t* metadata); @@ -217,9 +217,9 @@ class PARQUET_EXPORT FileCryptoMetaData { Encryption::type encryption_algorithm(); bool encrypted_footer(); - std::string footer_key_metadata(); + const std::string& footer_key_metadata(); uint64_t footer_offset(); - std::string iv_prefix(); + const std::string& iv_prefix(); void WriteTo(OutputStream* dst); diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 31f2624c..76c98039 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -22,7 +22,8 @@ #include #include -#include "parquet/decryption_key_retriever.h" +#include "parquet/util/logging.h" +#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/schema.h" @@ -45,18 +46,18 @@ class PARQUET_EXPORT ColumnEncryptionProperties { ColumnEncryptionProperties(bool encrypt, std::string path) : encrypt_(encrypt), path_(path), encrypted_with_footer_key_(encrypt) {} - bool encrypted() { return encrypt_; } - bool encrypted_with_footer_key() { return encrypted_with_footer_key_; } - std::string key() { return key_; } - std::string key_metadata() { return key_metadata_; } + bool encrypted() const { return encrypt_; } + bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } - void set_encryption_key(std::string key, uint32_t key_id = 0) { + void SetEncryptionKey(const std::string& key, uint32_t key_id = 0) { std::string key_metadata = key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4); - set_encryption_key(key, key_metadata); + SetEncryptionKey(key, key_metadata); } - void set_encryption_key(std::string key, std::string key_metadata) { + void SetEncryptionKey(const std::string& key, const std::string& key_metadata) { if (!encrypt_) throw ParquetException("Setting key on unencrypted column: " + path_); if (key.empty()) throw ParquetException("Null key for " + path_); @@ -65,65 +66,60 @@ class PARQUET_EXPORT ColumnEncryptionProperties { key_metadata_ = key_metadata; } - std::string path() { return path_; } + const std::string& path() const { return path_; } private: bool encrypt_; + std::string path_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; - std::string path_; }; class PARQUET_EXPORT FileDecryptionProperties { public: - FileDecryptionProperties(std::string footer_key) : footer_key_(footer_key) { - if (footer_key_.empty()) throw ParquetException("Decryption: null footer key"); - if (!(footer_key_.length() == 16 || footer_key_.length() == 24 || - footer_key_.length() == 32)) { - throw ParquetException("Wrong key length " + footer_key_.length()); - } + FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) { + DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 || + footer_key_.length() == 32); } - FileDecryptionProperties(std::shared_ptr key_retriever) + FileDecryptionProperties(const std::shared_ptr& key_retriever) : key_retriever_(key_retriever) {} - void set_aad(std::string aad) { aad_ = aad; } + void SetAad(const std::string& aad) { aad_ = aad; } - void set_column_key(std::string name, std::string key) { - set_column_key(std::vector({name}), key); + void SetColumnKey(const std::string& name, const std::string& key) { + SetColumnKey(std::vector({name}), key); } - void set_column_key(std::vector paths, std::string key) { - if (key.empty()) throw ParquetException("Decryption: null column key"); - if (key.length() != 16 && 
key.length() != 24 && key.length() != 32) - throw ParquetException("Wrong key length " + key.length()); + void SetColumnKey(const std::vector& paths, const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - for (auto path = paths.begin(); path != paths.end(); path++) { - column_keys_[*path] = key; + for (const auto& path : paths) { + column_keys_[path] = key; } } - std::string column_key(std::string path, std::string key_metadata = "") { + const std::string& GetColumnKey(const std::string& path, const std::string& key_metadata = "") { if (key_metadata.empty()) { return column_keys_[path]; } if (key_retriever_ == nullptr) { throw ParquetException("no key retriever is provided for column key metadata"); } - return key_retriever_->get_key(key_metadata); + return key_retriever_->GetKey(key_metadata); } - std::string footer_key(std::string footer_key_metadata = "") { + const std::string& GetFooterKey(const std::string& footer_key_metadata = "") { if (footer_key_metadata.empty()) { return footer_key_; } if (key_retriever_ == nullptr) { throw ParquetException("no key retriever is provided for footer key metadata"); } - return key_retriever_->get_key(footer_key_metadata); + return key_retriever_->GetKey(footer_key_metadata); } - std::string aad() { return aad_; } + const std::string& GetAad() { return aad_; } private: std::string footer_key_; @@ -166,7 +162,7 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - void set_file_decryption(std::shared_ptr decryption) { + void SetFileDecryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -254,14 +250,11 @@ class PARQUET_EXPORT FileEncryptionProperties { FileEncryptionProperties() = default; FileEncryptionProperties(const FileEncryptionProperties&) = default; - FileEncryptionProperties(Encryption::type algorithm, std::string key, - std::string key_metadata) { - if (key.length() != 16 && key.length() != 24 && 
key.length() != 32) { - throw ParquetException("Wrong key length " + key.length()); // TODO io exception - } - if (!key_metadata.empty() && key_metadata.length() > 256) { - throw ParquetException("Footer key meta data is too long: " + - key_metadata.length()); + FileEncryptionProperties(Encryption::type algorithm, const std::string& key, + const std::string& key_metadata) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + if (!key_metadata.empty()) { + DCHECK(key_metadata.length() <= 256); } footer_key_ = key; @@ -270,12 +263,18 @@ class PARQUET_EXPORT FileEncryptionProperties { algorithm_ = algorithm; } - FileEncryptionProperties(Encryption::type algorithm, std::string key, int key_id) + FileEncryptionProperties(Encryption::type algorithm, const std::string& key, int key_id) : FileEncryptionProperties( algorithm, key, key_id == 0 ? "" : std::string(reinterpret_cast(&key_id), 4)) {} - void setup_columns(std::vector columns, + /** + * encrypt_the_rest will define if other columns (not defined in columns argument) + * will be encrypted or not + * if encrypt_the_rest = true, other columns will be encrypted with footer key + * else, other columns will be unencrypted + */ + void SetupColumns(const std::vector& columns, bool encrypt_the_rest) { encrypt_the_rest_ = encrypt_the_rest; columns_ = columns; @@ -293,17 +292,18 @@ class PARQUET_EXPORT FileEncryptionProperties { if (encrypt_the_rest) throw ParquetException("Encrypt the rest with null footer key"); bool all_are_unencrypted = true; - for (auto col = columns.begin(); col != columns.end(); col++) { - if (col->encrypted()) { - if (col->key().empty()) { + for (const auto& col : columns) { + if (col.encrypted()) { + if (col.key().empty()) { throw ParquetException("Encrypt column with null footer key"); } all_are_unencrypted = false; } } - if (all_are_unencrypted) + if (all_are_unencrypted) { throw ParquetException("Footer and all columns unencrypted"); + } } } @@ -316,7 +316,7 @@ class 
PARQUET_EXPORT FileEncryptionProperties { } } - std::shared_ptr encryption_metadata( + std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { @@ -324,17 +324,17 @@ class PARQUET_EXPORT FileEncryptionProperties { } // non-uniform encryption - std::string pathStr = path->ToDotString(); - for (auto col = columns_.begin(); col != columns_.end(); col++) { // TODO - if (col->path() == pathStr) { - return std::shared_ptr(&(*col)); + std::string path_str = path->ToDotString(); + for (const auto& col : columns_) { + if (col.path() == path_str) { + return std::shared_ptr(const_cast(&col)); } } // encrypted with footer key if (encrypt_the_rest_) { std::shared_ptr col( new ColumnEncryptionProperties(true, path->ToDotString())); - col->set_encryption_key(footer_key_, footer_key_metadata_); + col->SetEncryptionKey(footer_key_, footer_key_metadata_); return col; } @@ -343,7 +343,7 @@ class PARQUET_EXPORT FileEncryptionProperties { new ColumnEncryptionProperties(false, path->ToDotString())); } - std::shared_ptr encryption_properties( + std::shared_ptr GetColumnEncryptionProperties( const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { @@ -351,11 +351,11 @@ class PARQUET_EXPORT FileEncryptionProperties { } // non-uniform encryption - std::string pathStr = path->ToDotString(); - for (auto col = columns_.begin(); col != columns_.end(); col++) { // TODO - if (col->path() == pathStr) { - return std::shared_ptr( - new EncryptionProperties(algorithm_, col->key(), col->key_metadata(), aad_)); + std::string path_str = path->ToDotString(); + for (const auto& col : columns_) { + if (col.path() == path_str) { + return std::make_shared(algorithm_, col.key(), + col.key_metadata(), aad_); } } @@ -363,7 +363,7 @@ class PARQUET_EXPORT FileEncryptionProperties { return footer_encryption(); } - return std::shared_ptr(nullptr); + return nullptr; } void aad(std::string aad) { aad_ = aad; } @@ -522,34 +522,35 @@ 
class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } - Builder* encryption(std::string key) { + Builder* encryption(const std::string& key) { return encryption(Encryption::AES_GCM_V1, key, 0); } - Builder* encryption(std::string key, uint32_t key_id) { + Builder* encryption(const std::string& key, uint32_t key_id) { return encryption(Encryption::AES_GCM_V1, key, key_id); } - Builder* encryption(std::string key, std::string key_id) { + Builder* encryption(const std::string& key, std::string key_id) { return encryption(Encryption::AES_GCM_V1, key, key_id); } - Builder* encryption(Encryption::type algorithm, std::string key, uint32_t key_id) { + Builder* encryption(Encryption::type algorithm, const std::string& key, uint32_t key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } - Builder* encryption(Encryption::type algorithm, std::string key, std::string key_id) { + Builder* encryption(Encryption::type algorithm, const std::string& key, const std::string& key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } - Builder* column_encryption(std::vector columns, + Builder* column_encryption(const std::vector& columns, bool encrypt_the_rest) { - if (file_encryption_.get() == nullptr) + if (file_encryption_ == nullptr) { throw ParquetException("null file encryption"); + } - file_encryption_->setup_columns(columns, encrypt_the_rest); + file_encryption_->SetupColumns(columns, encrypt_the_rest); return this; } @@ -641,8 +642,8 @@ class PARQUET_EXPORT WriterProperties { } inline std::shared_ptr footer_encryption() const { - if (parquet_file_encryption_.get() == nullptr) { - return std::shared_ptr(nullptr); + if (parquet_file_encryption_ == nullptr) { + return nullptr; } else { return parquet_file_encryption_->footer_encryption(); } @@ -694,7 +695,7 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr encryption_metadata( const 
std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->encryption_metadata(path); + return parquet_file_encryption_->GetColumnCryptoMetaData(path); } else { return nullptr; } @@ -703,7 +704,7 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr encryption( const std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->encryption_properties(path); + return parquet_file_encryption_->GetColumnEncryptionProperties(path); } else { return nullptr; } diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 19e2d56a..2af69ee4 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -79,8 +79,13 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } -static inline Encryption::type FromThrift(format::EncryptionAlgorithm::type type) { - return static_cast(type); +static inline Encryption::type FromThrift(format::EncryptionAlgorithm type) { + if (type.__isset.AES_GCM_V1) { + return Encryption::AES_GCM_V1; + } + else { + return Encryption::AES_GCM_CTR_V1; + } } static inline format::Type::type ToThrift(Type::type type) { @@ -105,8 +110,15 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { return static_cast(type); } -static inline format::EncryptionAlgorithm::type ToThrift(Encryption::type type) { - return static_cast(type); +static inline format::EncryptionAlgorithm ToThrift(Encryption::type type) { + format::EncryptionAlgorithm encryption_algorithm; + if (type == Encryption::AES_GCM_V1) { + encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); + } + else { + encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); + } + return encryption_algorithm; } // ---------------------------------------------------------------------- @@ -143,9 +155,9 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali uint32_t clen = *(reinterpret_cast(clenBytes)); // decrypt - 
std::vector decrypted_buffer(encryption->calculate_plain_size(clen)); + std::vector decrypted_buffer(encryption->CalculatePlainSize(clen)); - int decrypted_buffer_len = parquet::decrypt( + uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( encryption->algorithm(), true, &buf[4], clen, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); @@ -153,22 +165,8 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } - // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer(decrypted_buffer.data(), - decrypted_buffer_len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } + + DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); *len = 4 + clen; } @@ -204,8 +202,8 @@ inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out, return out_length; } else { - std::vector cipher_buffer(encryption->calculate_cipher_size(len)); - int cipher_buffer_len = parquet::encrypt( + std::vector cipher_buffer(encryption->CalculateCipherSize(len)); + int cipher_buffer_len = parquet_encryption::Encrypt( encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); diff --git a/src/parquet/types.h b/src/parquet/types.h index 6f1f1b34..cec0c099 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -143,11 
+143,11 @@ class PARQUET_EXPORT EncryptionProperties { Encryption::type algorithm() const { return algorithm_; } - std::string key_metadata() const { return key_metadata_; } + const std::string& key_metadata() const { return key_metadata_; } - std::string key() const { return key_; } + const std::string& key() const { return key_; } - uint32_t calculate_cipher_size(uint32_t plain_len) const { + uint32_t CalculateCipherSize(uint32_t plain_len) const { if (algorithm_ == Encryption::AES_GCM_V1) { return plain_len + 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { @@ -156,7 +156,7 @@ class PARQUET_EXPORT EncryptionProperties { return plain_len; } - uint32_t calculate_plain_size(uint32_t cipher_len) const { + uint32_t CalculatePlainSize(uint32_t cipher_len) const { if (algorithm_ == Encryption::AES_GCM_V1) { return cipher_len - 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { @@ -166,9 +166,9 @@ class PARQUET_EXPORT EncryptionProperties { } private: + Encryption::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length std::string key_metadata_; // key metadata, used for retrieving key - Encryption::type algorithm_; // encryption algorithm std::string aad_; // encryption additional authenticated data }; From 70a3b07983163ae94dbf5c2309507b9b83a7cb74 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 Aug 2018 16:59:50 +0700 Subject: [PATCH 12/18] apply change in parquet thrift structure (using union) --- src/parquet/encryption.cc | 4 +- src/parquet/file_reader.cc | 22 ++++------ src/parquet/file_writer.cc | 8 +--- src/parquet/metadata.cc | 74 +++++++++++++++++----------------- src/parquet/metadata.h | 1 - src/parquet/properties.h | 82 +++++++++++++++++--------------------- src/parquet/thrift.h | 9 ++--- src/parquet/types.h | 6 +-- 8 files changed, 92 insertions(+), 114 deletions(-) diff --git a/src/parquet/encryption.cc b/src/parquet/encryption.cc index fc4147f5..0a2d9ef9 100644 --- 
a/src/parquet/encryption.cc +++ b/src/parquet/encryption.cc @@ -17,7 +17,7 @@ #include "encryption.h" -#include +#include namespace parquet { @@ -28,7 +28,7 @@ void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { uint32_t key_id; - std::memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); return key_map_[key_id]; } diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index 9c8f3e03..5eb838e6 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -133,7 +133,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if (!file_crypto_metadata_ || (crypto_meta_data && !crypto_meta_data->encrypted())) { + if (!file_crypto_metadata_ || !crypto_meta_data) { encrypted = false; } @@ -142,20 +142,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { nullptr, properties_.memory_pool()); } - bool encrypted_with_footer_key = false; - - // file is uniform encrypted - // or file is non-uniform encrypted and the column is encrypted with footer key - if (crypto_meta_data == nullptr || (crypto_meta_data->encrypted() && - crypto_meta_data->encrypted_with_footer_key())) { - encrypted_with_footer_key = true; - } - - // file is non-uniform encrypted and the column is encrypted with its own key + // the column is encrypted auto file_decryption = properties_.file_decryption(); - if (encrypted_with_footer_key) { + // the column is encrypted with footer key + if (crypto_meta_data->encrypted_with_footer_key()) { std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); @@ -171,10 +163,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { footer_encryption, 
properties_.memory_pool()); } + // file is non-uniform encrypted and the column is encrypted with its own key + std::string column_key_metadata = crypto_meta_data->column_key_metadata(); // encrypted with column key - std::string column_key = file_decryption->GetColumnKey( - col->path_in_schema()->ToDotString(), crypto_meta_data->column_key_metadata()); + std::string column_key = + file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" + diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 1fd56d40..3af4debd 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -352,12 +352,8 @@ class FileSerializer : public ParquetFileWriter::Contents { uint64_t metadata_start = static_cast(sink_->Tell()); auto metadata = metadata_->Finish(); - if (file_encryption->encrypted_footer()) { - auto footer_encryption = file_encryption->footer_encryption(); - metadata->WriteTo(sink_.get(), footer_encryption.get()); - } else { - metadata->WriteTo(sink_.get()); - } + auto footer_encryption = file_encryption->GetFooterEncryptionProperties(); + metadata->WriteTo(sink_.get(), footer_encryption.get()); WriteFileCryptoMetaData(metadata_start); sink_->Write(PARQUET_EMAGIC, 4); diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 8bbf46ab..2878316a 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -96,15 +96,17 @@ class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { ~ColumnCryptoMetaDataImpl() {} - const std::vector& path_in_schema() const { - return crypto_metadata_->path_in_schema; - } - bool encrypted() const { return crypto_metadata_->encrypted; } bool encrypted_with_footer_key() const { - return crypto_metadata_->encrypted_with_footer_key; + return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY; + } + bool encrypted_with_column_key() const { + return 
crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY; + } + const std::vector& path_in_schema() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema; } const std::string& column_key_metadata() const { - return crypto_metadata_->column_key_metadata; + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.column_key_metadata; } private: @@ -125,7 +127,6 @@ ColumnCryptoMetaData::~ColumnCryptoMetaData() {} const std::vector& ColumnCryptoMetaData::path_in_schema() const { return impl_->path_in_schema(); } -bool ColumnCryptoMetaData::encrypted() const { return impl_->encrypted(); } bool ColumnCryptoMetaData::encrypted_with_footer_key() const { return impl_->encrypted_with_footer_key(); } @@ -546,15 +547,11 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { bool encrypted_footer() { return metadata_->encrypted_footer; } - const std::string& footer_key_metadata() { - return metadata_->__isset.footer_key_metadata ? metadata_->footer_key_metadata : ""; - } + const std::string& footer_key_metadata() { return metadata_->footer_key_metadata; } uint64_t footer_offset() { return metadata_->footer_offset; } - const std::string& iv_prefix() { - return metadata_->__isset.iv_prefix ? 
metadata_->iv_prefix : ""; - } + const std::string& iv_prefix() { return metadata_->iv_prefix; } void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); } @@ -568,11 +565,11 @@ Encryption::type FileCryptoMetaData::encryption_algorithm() { return impl_->encryption_algorithm(); } bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } -std::string FileCryptoMetaData::footer_key_metadata() { +const std::string& FileCryptoMetaData::footer_key_metadata() { return impl_->footer_key_metadata(); } uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } -std::string FileCryptoMetaData::iv_prefix() { return impl_->iv_prefix(); } +const std::string& FileCryptoMetaData::iv_prefix() { return impl_->iv_prefix(); } std::shared_ptr FileCryptoMetaData::Make( const uint8_t* serialized_metadata, uint32_t* metadata_len) { @@ -740,13 +737,13 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__set_file_offset(data_page_offset + compressed_size); } - meta_data_.__set_num_values(num_values); + column_metadata_.__set_num_values(num_values); if (index_page_offset >= 0) { - meta_data_.__set_index_page_offset(index_page_offset); + column_metadata_.__set_index_page_offset(index_page_offset); } - meta_data_.__set_data_page_offset(data_page_offset); - meta_data_.__set_total_uncompressed_size(uncompressed_size); - meta_data_.__set_total_compressed_size(compressed_size); + column_metadata_.__set_data_page_offset(data_page_offset); + column_metadata_.__set_total_uncompressed_size(uncompressed_size); + column_metadata_.__set_total_compressed_size(compressed_size); std::vector thrift_encodings; if (has_dictionary) { @@ -769,22 +766,27 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(OutputStream* sink) { - const auto& encrypt_md = properties_->encryption_metadata(column_->path()); + const auto& encrypt_md = 
properties_->column_encryption_props(column_->path()); - // file is not encrypted or uniform encrypted - if (encrypt_md == nullptr) { + // column is unencrypted + if (!encrypt_md->encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); - } else { // file is non-uniform encrypted + } else { // column is encrypted column_chunk_->__isset.crypto_meta_data = true; - column_chunk_->crypto_meta_data.__set_path_in_schema( - column_->path()->ToDotVector()); - column_chunk_->crypto_meta_data.__set_encrypted(encrypt_md->encrypted()); - column_chunk_->crypto_meta_data.__set_encrypted( - encrypt_md->encrypted_with_footer_key()); - column_chunk_->crypto_meta_data.__set_column_key_metadata( - encrypt_md->key_metadata()); + + // encrypted with footer key + format::ColumnCryptoMetaData ccmd; + if (encrypt_md->encrypted_with_footer_key()) { + ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); + } else { // encrypted with column key + format::EncryptionWithColumnKey eck; + eck.__set_column_key_metadata(encrypt_md->key_metadata()); + eck.__set_path_in_schema(column_->path()->ToDotVector()); + ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); + } + column_chunk_->__set_crypto_meta_data(ccmd); auto footer_encryption = properties_->footer_encryption(); @@ -792,8 +794,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // key if ((footer_encryption == nullptr && encrypt_md->encrypted()) || (footer_encryption != nullptr && - footer_encryption->key() == encrypt_md->key())) { - // don't set meta_data, + footer_encryption->key().compare(encrypt_md->key()) != 0)) { + // don't set meta_data column_chunk_->__isset.meta_data = false; // Thrift-serialize the ColumnMetaData structure, @@ -1051,18 +1053,16 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) { - if 
(crypto_metadata_.get() == nullptr) { + if (crypto_metadata_ == nullptr) { return nullptr; } - auto file_encryption = properties_->file_encryption(); - auto footer_encryption = properties_->footer_encryption(); // build format::FileCryptoMetaData crypto_metadata_->__set_encryption_algorithm( ToThrift(footer_encryption->algorithm())); - crypto_metadata_->__set_encrypted_footer(file_encryption->encrypted_footer()); + crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty()); std::string footer_key_metadata = footer_encryption->key_metadata(); if (!footer_key_metadata.empty()) { diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index 3803d249..8abd0f26 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -93,7 +93,6 @@ class PARQUET_EXPORT ColumnCryptoMetaData { ~ColumnCryptoMetaData(); const std::vector& path_in_schema() const; - bool encrypted() const; bool encrypted_with_footer_key() const; const std::string& column_key_metadata() const; diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 76c98039..80b52b25 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -22,12 +22,12 @@ #include #include -#include "parquet/util/logging.h" #include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/logging.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -80,7 +80,7 @@ class PARQUET_EXPORT FileDecryptionProperties { public: FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) { DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 || - footer_key_.length() == 32); + footer_key_.length() == 32); } FileDecryptionProperties(const std::shared_ptr& key_retriever) @@ -95,14 +95,15 @@ class PARQUET_EXPORT FileDecryptionProperties { void SetColumnKey(const std::vector& paths, const std::string& key) { DCHECK(key.length() 
== 16 || key.length() == 24 || key.length() == 32); - for (const auto& path : paths) { - column_keys_[path] = key; - } + schema::ColumnPath columnPath(paths); + + column_keys_[columnPath.ToDotString()] = key; } - const std::string& GetColumnKey(const std::string& path, const std::string& key_metadata = "") { + const std::string& GetColumnKey(const std::shared_ptr& columnPath, + const std::string& key_metadata = "") { if (key_metadata.empty()) { - return column_keys_[path]; + return column_keys_.at(columnPath->ToDotString()); } if (key_retriever_ == nullptr) { throw ParquetException("no key retriever is provided for column key metadata"); @@ -125,7 +126,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string footer_key_; std::string aad_; - std::unordered_map column_keys_; + std::map column_keys_; std::shared_ptr key_retriever_; }; @@ -257,10 +258,8 @@ class PARQUET_EXPORT FileEncryptionProperties { DCHECK(key_metadata.length() <= 256); } - footer_key_ = key; - footer_key_metadata_ = key_metadata; - uniform_encryption_ = !footer_key_.empty(); - algorithm_ = algorithm; + footer_encryption_.reset(new EncryptionProperties(algorithm, key, key_metadata)); + uniform_encryption_ = !key.empty(); } FileEncryptionProperties(Encryption::type algorithm, const std::string& key, int key_id) @@ -268,22 +267,22 @@ class PARQUET_EXPORT FileEncryptionProperties { algorithm, key, key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4)) {} - /** + /** * encrypt_the_rest will define if other columns (not defined in columns argument) * will be encrypted or not * if encrypt_the_rest = true, other columns will be encrypted with footer key * else, other columns will be unencrypted - */ + */ void SetupColumns(const std::vector& columns, - bool encrypt_the_rest) { + bool encrypt_the_rest) { encrypt_the_rest_ = encrypt_the_rest; columns_ = columns; - if (!footer_key_.empty()) { + if (!footer_encryption_->key().empty()) { uniform_encryption_ = true; - for (auto col = columns.begin(); col != columns.end(); col++) { - if (col->key().compare(footer_key_) != 0) { + for (const auto& col : columns) { + if (col.key().compare(footer_encryption_->key()) != 0) { uniform_encryption_ = false; break; } @@ -307,35 +306,28 @@ class PARQUET_EXPORT FileEncryptionProperties { } } - std::shared_ptr footer_encryption() { - if (footer_key_.empty()) { - return nullptr; - } else { - return std::make_shared(algorithm_, footer_key_, - footer_key_metadata_, aad_); - } + std::shared_ptr GetFooterEncryptionProperties() { + return footer_encryption_; } std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { - return nullptr; + return std::make_shared(true, path->ToDotString()); } // non-uniform encryption std::string path_str = path->ToDotString(); for (const auto& col : columns_) { if (col.path() == path_str) { - return std::shared_ptr(const_cast(&col)); + return std::shared_ptr( + const_cast(&col)); } } // encrypted with footer key if (encrypt_the_rest_) { - std::shared_ptr col( - new ColumnEncryptionProperties(true, path->ToDotString())); - col->SetEncryptionKey(footer_key_, footer_key_metadata_); - return col; + return std::make_shared(true, path->ToDotString()); } // unencrypted @@ -347,34 +339,30 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::shared_ptr& path) { // uniform encryption if 
(uniform_encryption_) { - return footer_encryption(); + return footer_encryption_; } // non-uniform encryption std::string path_str = path->ToDotString(); for (const auto& col : columns_) { if (col.path() == path_str) { - return std::make_shared(algorithm_, col.key(), - col.key_metadata(), aad_); + return std::make_shared(footer_encryption_->algorithm(), + col.key(), col.key_metadata(), + footer_encryption_->aad()); } } if (encrypt_the_rest_) { - return footer_encryption(); + return footer_encryption_; } return nullptr; } - void aad(std::string aad) { aad_ = aad; } - - bool encrypted_footer() { return footer_key_.length() != 0; } + void SetupAad(const std::string& aad) { footer_encryption_->aad(aad); } private: - std::string footer_key_; - std::string footer_key_metadata_; - Encryption::type algorithm_; - std::string aad_; + std::shared_ptr footer_encryption_; bool uniform_encryption_; @@ -534,12 +522,14 @@ class PARQUET_EXPORT WriterProperties { return encryption(Encryption::AES_GCM_V1, key, key_id); } - Builder* encryption(Encryption::type algorithm, const std::string& key, uint32_t key_id) { + Builder* encryption(Encryption::type algorithm, const std::string& key, + uint32_t key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } - Builder* encryption(Encryption::type algorithm, const std::string& key, const std::string& key_id) { + Builder* encryption(Encryption::type algorithm, const std::string& key, + const std::string& key_id) { file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); return this; } @@ -645,7 +635,7 @@ class PARQUET_EXPORT WriterProperties { if (parquet_file_encryption_ == nullptr) { return nullptr; } else { - return parquet_file_encryption_->footer_encryption(); + return parquet_file_encryption_->GetFooterEncryptionProperties(); } } @@ -692,7 +682,7 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr 
encryption_metadata( + std::shared_ptr column_encryption_props( const std::shared_ptr& path) const { if (parquet_file_encryption_) { return parquet_file_encryption_->GetColumnCryptoMetaData(path); diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 2af69ee4..81737af4 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -82,8 +82,7 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) static inline Encryption::type FromThrift(format::EncryptionAlgorithm type) { if (type.__isset.AES_GCM_V1) { return Encryption::AES_GCM_V1; - } - else { + } else { return Encryption::AES_GCM_CTR_V1; } } @@ -114,8 +113,7 @@ static inline format::EncryptionAlgorithm ToThrift(Encryption::type type) { format::EncryptionAlgorithm encryption_algorithm; if (type == Encryption::AES_GCM_V1) { encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); - } - else { + } else { encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); } return encryption_algorithm; @@ -166,7 +164,8 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali throw ParquetException("Couldn't decrypt buffer\n"); } - DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); + DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, + deserialized_msg); *len = 4 + clen; } diff --git a/src/parquet/types.h b/src/parquet/types.h index cec0c099..423648c5 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -117,7 +117,6 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; -// should find a better name??? 
class PARQUET_EXPORT EncryptionProperties { private: static inline uint8_t* str2bytes(const std::string& str) { @@ -130,7 +129,7 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; EncryptionProperties(Encryption::type algorithm, const std::string& key, - const std::string& key_metadata, const std::string& aad) + const std::string& key_metadata, const std::string& aad = "") : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} ~EncryptionProperties() { key_.replace(0, key_.length(), '\0'); } @@ -138,14 +137,15 @@ class PARQUET_EXPORT EncryptionProperties { int key_length() const { return static_cast(key_.length()); } uint8_t* key_bytes() const { return str2bytes(key_); } + void aad(const std::string& aad) { aad_ = aad; } int aad_length() const { return static_cast(aad_.length()); } uint8_t* aad_bytes() const { return str2bytes(aad_); } Encryption::type algorithm() const { return algorithm_; } const std::string& key_metadata() const { return key_metadata_; } - const std::string& key() const { return key_; } + const std::string& aad() const { return aad_; } uint32_t CalculateCipherSize(uint32_t plain_len) const { if (algorithm_ == Encryption::AES_GCM_V1) { From 217ad340e3b98a3557f8010f78120ffafa1d35b1 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 Aug 2018 18:08:11 +0700 Subject: [PATCH 13/18] change PAR2 into PARE, some post-review changes --- src/parquet/column_reader.cc | 14 ++++++++------ src/parquet/column_writer.cc | 2 +- src/parquet/file_reader.cc | 2 +- src/parquet/file_writer.cc | 2 +- src/parquet/properties.h | 3 ++- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index 7e6d4acd..69cd451e 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -111,7 +111,8 @@ class SerializedPageReader : public PageReader { decompression_buffer_(AllocateBuffer(pool, 0)), seen_num_rows_(0), 
total_num_rows_(total_num_rows), - encryption_(encryption) { + encryption_(encryption), + decryption_buffer_(AllocateBuffer(pool, 0)) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); } @@ -142,6 +143,7 @@ class SerializedPageReader : public PageReader { // Encryption std::shared_ptr encryption_; + std::shared_ptr decryption_buffer_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -195,15 +197,15 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } - std::vector decrypt_buffer; - if (encryption_.get()) { - decrypt_buffer.resize(encryption_->CalculatePlainSize(compressed_len)); + // Decrypt it if we need to + if (encryption_ != nullptr) { + decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); compressed_len = parquet_encryption::Decrypt( encryption_->algorithm(), false, buffer, compressed_len, encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), decrypt_buffer.data()); + encryption_->aad_length(), decryption_buffer_->mutable_data()); - buffer = decrypt_buffer.data(); + buffer = decryption_buffer_->data(); } // Uncompress it if we need to diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index fcf3767a..bb9e3ec4 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -163,7 +163,7 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_is_sorted(page.is_sorted()); const uint8_t* output_data_buffer = compressed_data->data(); - int output_data_len = static_cast(compressed_data->size()); + int32_t output_data_len = static_cast(compressed_data->size()); std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (encryption_.get()) { diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index 5eb838e6..dbe247ed 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -53,7 +53,7 @@ 
namespace parquet { static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024; static constexpr uint32_t FOOTER_SIZE = 8; static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', '2'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 3af4debd..31a40d95 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -34,7 +34,7 @@ namespace parquet { // FIXME: copied from reader-internal.cc static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', '2'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; // ---------------------------------------------------------------------- // RowGroupWriter public API diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 80b52b25..b86ac8bb 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -288,8 +288,9 @@ class PARQUET_EXPORT FileEncryptionProperties { } } } else { - if (encrypt_the_rest) + if (encrypt_the_rest) { throw ParquetException("Encrypt the rest with null footer key"); + } bool all_are_unencrypted = true; for (const auto& col : columns) { if (col.encrypted()) { From 97ba7408948e5a64510fb4b3ffed004142e0c541 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 28 Aug 2018 18:32:37 +0700 Subject: [PATCH 14/18] fix build issue after rebase --- src/parquet/column_writer.cc | 11 ++++++----- src/parquet/column_writer.h | 2 +- src/parquet/file_writer.cc | 1 + src/parquet/properties.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index bb9e3ec4..22a1658a 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -130,7 +130,7 @@ static format::Statistics 
ToThrift(const EncodedStatistics& row_group_statistics class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(OutputStream* sink, Compression::type codec, - std::shared_ptr encryption, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : sink_(sink), @@ -310,12 +310,13 @@ class SerializedPageWriter : public PageWriter { class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(OutputStream* sink, Compression::type codec, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata), in_memory_sink_(new InMemoryOutputStream(pool)), - pager_(new SerializedPageWriter(in_memory_sink_.get(), codec, metadata, pool)) {} + pager_(new SerializedPageWriter(in_memory_sink_.get(), codec, encryption, metadata, pool)) {} // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -354,16 +355,16 @@ class BufferedPageWriter : public PageWriter { }; std::unique_ptr PageWriter::Open(OutputStream* sink, Compression::type codec, - std::shared_ptr encryption, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, pool)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, metadata, pool)); + new SerializedPageWriter(sink, codec, encryption, metadata, pool)); } } diff --git a/src/parquet/column_writer.h b/src/parquet/column_writer.h index 7c60c279..6fe18e07 100644 --- a/src/parquet/column_writer.h +++ b/src/parquet/column_writer.h @@ -75,7 +75,7 @@ class PageWriter { static std::unique_ptr Open( 
OutputStream* sink, Compression::type codec, - std::shared_ptr encryption, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 31a40d95..6174878c 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -223,6 +223,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), col_meta, properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); diff --git a/src/parquet/properties.h b/src/parquet/properties.h index b86ac8bb..0b50a574 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -163,7 +163,7 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - void SetFileDecryption(const std::shared_ptr& decryption) { + void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } From 0836a46b7e92173ac78782e3dcca4dbbabc643d2 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 29 Aug 2018 18:23:34 +0700 Subject: [PATCH 15/18] encryption-reader-writer example --- CMakeLists.txt | 15 + examples/low-level-api/CMakeLists.txt | 4 + .../low-level-api/encryption-reader-writer.cc | 422 ++++++++++++++++++ src/parquet/metadata.cc | 7 +- src/parquet/properties.h | 12 +- 5 files changed, 446 insertions(+), 14 deletions(-) create mode 100644 examples/low-level-api/encryption-reader-writer.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 698f6d76..44e4eb75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -537,6 +537,15 @@ else() endif() endif() 
+############################################################# +# OpenSSL linkage +find_package(OpenSSL REQUIRED) +message(STATUS "OPENSSL_FOUND ${OPENSSL_FOUND}") +if(OPENSSL_FOUND) + set(OPENSSL_USE_STATIC_LIBS TRUE) + include_directories(${OPENSSL_INCLUDE_DIR}) +endif() + ############################################################# # Apache Arrow linkage @@ -691,6 +700,8 @@ set(LIBPARQUET_SRCS src/parquet/types.cc src/parquet/util/comparison.cc src/parquet/util/memory.cc + src/parquet/util/crypto.cc + src/parquet/encryption.cc ) # # Ensure that thrift compilation is done before using its generated headers @@ -712,6 +723,10 @@ if (NOT PARQUET_MINIMAL_DEPENDENCY) # Although we don't link parquet_objlib against anything, we need it to depend # on these libs as we may generate their headers via ExternalProject_Add set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} ${LIBPARQUET_INTERFACE_LINK_LIBS}) + set(LIBPARQUET_INTERFACE_LINK_LIBS + ${OPENSSL_CRYPTO_LIBRARY} + ${LIBPARQUET_INTERFACE_LINK_LIBS} + ) endif() if(NOT APPLE AND NOT MSVC) diff --git a/examples/low-level-api/CMakeLists.txt b/examples/low-level-api/CMakeLists.txt index 64ba110e..10082ad2 100644 --- a/examples/low-level-api/CMakeLists.txt +++ b/examples/low-level-api/CMakeLists.txt @@ -18,8 +18,12 @@ if (PARQUET_BUILD_EXECUTABLES) add_executable(reader-writer reader-writer.cc) add_executable(reader-writer2 reader-writer2.cc) + add_executable(encryption-reader-writer encryption-reader-writer.cc) target_include_directories(reader-writer PRIVATE .) target_include_directories(reader-writer2 PRIVATE .) + target_include_directories(encryption-reader-writer PRIVATE .) 
target_link_libraries(reader-writer parquet_static) target_link_libraries(reader-writer2 parquet_static) + + target_link_libraries(encryption-reader-writer parquet_static) endif() diff --git a/examples/low-level-api/encryption-reader-writer.cc b/examples/low-level-api/encryption-reader-writer.cc new file mode 100644 index 00000000..c5a45061 --- /dev/null +++ b/examples/low-level-api/encryption-reader-writer.cc @@ -0,0 +1,422 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +/* + * This example describes writing and reading Parquet Files in C++ and serves as a + * reference to the API. + * The file contains all the physical data types supported by Parquet. + * This example uses the RowGroupWriter API that supports writing RowGroups optimized for + *memory consumption + **/ + +/* Parquet is a structured columnar file format + * Parquet File = "Parquet data" + "Parquet Metadata" + * "Parquet data" is simply a vector of RowGroups. 
Each RowGroup is a batch of rows in a + * columnar layout + * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their + * Columns + * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a + * complex (nested) type (internal nodes) + * For specific details, please refer the format here: + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + **/ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; +const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; +const std::string encryptionKey = "0123456789012345"; // 16 bytes + +int main(int argc, char** argv) { + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // parquet::REQUIRED fields do not need definition and repetition level values + // parquet::OPTIONAL fields require only definition level values + // parquet::REPEATED fields require both definition and repetition level values + try { std::cout << "1" << std::endl; + // Create a local file output stream instance. 
+ using FileClass = ::arrow::io::FileOutputStream; std::cout << "2" << std::endl; + std::shared_ptr out_file; std::cout << "3" << std::endl; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); std::cout << "4" << std::endl; + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); std::cout << "5" << std::endl; + + // Add writer properties + parquet::WriterProperties::Builder builder; std::cout << "6" << std::endl; + builder.compression(parquet::Compression::SNAPPY); std::cout << "7" << std::endl; + // uniform encryption + + std::cout << "builder.encryption(encryptionKey);" << std::endl; + builder.encryption(encryptionKey); + std::cout << "builder.encryption(encryptionKey); -- end" << std::endl; + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + try { + // decryption properties + std::shared_ptr decryption_properties = + std::make_shared(encryptionKey); + + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption(decryption_properties); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the 
File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; + return -1; + } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; + + return 0; +} diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 2878316a..0f2f0366 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -769,7 +769,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const auto& encrypt_md = properties_->column_encryption_props(column_->path()); // column is unencrypted - if (!encrypt_md->encrypted()) { + if (!encrypt_md || !encrypt_md->encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); @@ -792,9 +792,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->encrypted()) || - (footer_encryption != nullptr && - footer_encryption->key().compare(encrypt_md->key()) != 0)) { + if ((footer_encryption == nullptr && encrypt_md->encrypted()) + || !encrypt_md->encrypted_with_footer_key()) { // don't set meta_data column_chunk_->__isset.meta_data = false; diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 0b50a574..8fa33877 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -190,7 +190,6 @@ static constexpr 
ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static const EncryptionProperties DEFAULT_ENCRYPTION = EncryptionProperties(); class PARQUET_EXPORT ColumnProperties { public: @@ -198,14 +197,12 @@ class PARQUET_EXPORT ColumnProperties { Compression::type codec = DEFAULT_COMPRESSION_TYPE, bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, - size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, - EncryptionProperties encryption = DEFAULT_ENCRYPTION) + size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE) : encoding_(encoding), codec_(codec), dictionary_enabled_(dictionary_enabled), statistics_enabled_(statistics_enabled), - max_stats_size_(max_stats_size), - encryption_(encryption) {} + max_stats_size_(max_stats_size) {} void set_encoding(Encoding::type encoding) { encoding_ = encoding; } @@ -223,8 +220,6 @@ class PARQUET_EXPORT ColumnProperties { max_stats_size_ = max_stats_size; } - void set_encryption(EncryptionProperties encryption) { encryption_ = encryption; } - Encoding::type encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -235,15 +230,12 @@ class PARQUET_EXPORT ColumnProperties { size_t max_statistics_size() const { return max_stats_size_; } - EncryptionProperties encryption() const { return encryption_; } - private: Encoding::type encoding_; Compression::type codec_; bool dictionary_enabled_; bool statistics_enabled_; size_t max_stats_size_; - EncryptionProperties encryption_; }; class PARQUET_EXPORT FileEncryptionProperties { From f136ffb1247b7604641dd54bb73d76b5b41d42bc Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 31 Aug 2018 18:49:18 +0700 Subject: [PATCH 16/18] public WriteFileMetaData() method should have footer_encryption param. 
Add public WriteFileCryptoMetaData() method --- src/parquet/column_reader.cc | 5 +- src/parquet/column_writer.cc | 10 ++-- src/parquet/file_writer.cc | 90 +++++++++++++++++------------------- src/parquet/file_writer.h | 5 +- src/parquet/metadata.cc | 4 +- src/parquet/metadata.h | 2 +- 6 files changed, 56 insertions(+), 60 deletions(-) diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index 69cd451e..dab3dc43 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -201,9 +201,8 @@ std::shared_ptr SerializedPageReader::NextPage() { if (encryption_ != nullptr) { decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); compressed_len = parquet_encryption::Decrypt( - encryption_->algorithm(), false, buffer, compressed_len, - encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), decryption_buffer_->mutable_data()); + encryption_, false, buffer, compressed_len, + decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 22a1658a..88fd94f0 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -169,9 +169,8 @@ class SerializedPageWriter : public PageWriter { if (encryption_.get()) { encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); output_data_len = parquet_encryption::Encrypt( - encryption_->algorithm(), false, compressed_data->data(), output_data_len, - encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), encrypted_data_buffer->mutable_data()); + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } @@ -248,9 +247,8 @@ class SerializedPageWriter : public PageWriter { if (encryption_.get()) { 
encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); output_data_len = parquet_encryption::Encrypt( - encryption_->algorithm(), false, compressed_data->data(), output_data_len, - encryption_->key_bytes(), encryption_->key_length(), encryption_->aad_bytes(), - encryption_->aad_length(), encrypted_data_buffer->mutable_data()); + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc index 6174878c..d288709b 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -261,7 +261,21 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto metadata = metadata_->Finish(); - WriteFileMetaData(*metadata, sink_.get()); + + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + WriteFileMetaData(*metadata, sink_.get()); + } + else { + uint64_t metadata_start = static_cast(sink_->Tell()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*metadata, sink_.get(), footer_encryption.get()); + + auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + } sink_->Close(); is_open_ = false; @@ -333,44 +347,6 @@ class FileSerializer : public ParquetFileWriter::Contents { sink_->Write(PARQUET_EMAGIC, 4); } } - - void WriteMetaData() { - auto file_encryption = properties_->file_encryption(); - if (file_encryption == nullptr) { - // Write MetaData - uint32_t metadata_len = static_cast(sink_->Tell()); - - // Get a FileMetaData - auto metadata = metadata_->Finish(); - metadata->WriteTo(sink_.get()); - metadata_len = static_cast(sink_->Tell()) - metadata_len; - - // Write Footer - sink_->Write(reinterpret_cast(&metadata_len), 4); - sink_->Write(PARQUET_MAGIC, 4); - 
} else { - // Write MetaData with encryption - uint64_t metadata_start = static_cast(sink_->Tell()); - - auto metadata = metadata_->Finish(); - auto footer_encryption = file_encryption->GetFooterEncryptionProperties(); - metadata->WriteTo(sink_.get(), footer_encryption.get()); - - WriteFileCryptoMetaData(metadata_start); - sink_->Write(PARQUET_EMAGIC, 4); - } - } - - void WriteFileCryptoMetaData(int64_t footerOffset) { - uint64_t crypto_offset = static_cast(sink_->Tell()); - - // Get a FileCryptoMetaData - auto crypto_metadata = metadata_->GetCryptoMetaData(footerOffset); - crypto_metadata->WriteTo(sink_.get()); - - auto crypto_len = static_cast(sink_->Tell()) - crypto_offset; - sink_->Write(reinterpret_cast(&crypto_len), 4); - } }; // ---------------------------------------------------------------------- @@ -405,16 +381,36 @@ std::unique_ptr ParquetFileWriter::Open( return result; } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { - // Write MetaData - uint32_t metadata_len = static_cast(sink->Tell()); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + EncryptionProperties* footer_encryption) { + if (footer_encryption == nullptr) { + // Write MetaData + uint32_t metadata_len = static_cast(sink->Tell()); + + file_metadata.WriteTo(sink); + metadata_len = static_cast(sink->Tell()) - metadata_len; + + // Write Footer + sink->Write(reinterpret_cast(&metadata_len), 4); + sink->Write(PARQUET_MAGIC, 4); + } + else { + // encrypt and write to sink + file_metadata.WriteTo(sink, footer_encryption); + } +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink) { + uint64_t crypto_offset = static_cast(sink->Tell()); + + // Get a FileCryptoMetaData + crypto_metadata.WriteTo(sink); - file_metadata.WriteTo(sink); - metadata_len = static_cast(sink->Tell()) - metadata_len; + auto crypto_len = static_cast(sink->Tell()) - crypto_offset; + sink->Write(reinterpret_cast(&crypto_len), 
4); - // Write Footer - sink->Write(reinterpret_cast(&metadata_len), 4); - sink->Write(PARQUET_MAGIC, 4); + sink->Write(PARQUET_EMAGIC, 4); } const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); } diff --git a/src/parquet/file_writer.h b/src/parquet/file_writer.h index cdfe06cd..e3f9e9e7 100644 --- a/src/parquet/file_writer.h +++ b/src/parquet/file_writer.h @@ -101,7 +101,10 @@ class PARQUET_EXPORT RowGroupWriter { }; PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + EncryptionProperties* encryption_properties = nullptr); +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink); class PARQUET_EXPORT ParquetFileWriter { public: diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index 0f2f0366..a4a21726 100644 --- a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -553,7 +553,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { const std::string& iv_prefix() { return metadata_->iv_prefix; } - void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); } + void WriteTo(OutputStream* dst) const { SerializeThriftMsg(metadata_.get(), 1024, dst); } private: friend FileMetaDataBuilder; @@ -585,7 +585,7 @@ FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) { FileCryptoMetaData::~FileCryptoMetaData() {} -void FileCryptoMetaData::WriteTo(OutputStream* dst) { impl_->WriteTo(dst); } +void FileCryptoMetaData::WriteTo(OutputStream* dst) const { impl_->WriteTo(dst); } ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index 8abd0f26..d2303651 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -220,7 +220,7 @@ class PARQUET_EXPORT FileCryptoMetaData { uint64_t footer_offset(); const 
std::string& iv_prefix(); - void WriteTo(OutputStream* dst); + void WriteTo(OutputStream* dst) const; private: friend FileMetaDataBuilder; From ee46f524d1865bd31eb7e8bd7b7575fc4c3084a8 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 4 Sep 2018 13:36:23 +0700 Subject: [PATCH 17/18] add EncryptionAlgorithm in types.h for new union format::EncryptionAlgorithm. Add aad_metadata in FileEncryptionProperties --- .../low-level-api/encryption-reader-writer.cc | 31 ++++++----- src/parquet/column_reader.cc | 3 +- src/parquet/column_writer-test.cc | 4 +- src/parquet/column_writer.cc | 14 ++--- src/parquet/file_reader.cc | 8 +-- src/parquet/file_writer.cc | 14 +++-- src/parquet/metadata.cc | 53 +++++++++++-------- src/parquet/metadata.h | 2 +- src/parquet/properties.h | 19 +++++-- src/parquet/thrift.h | 18 ++++--- src/parquet/types.h | 13 +++-- 11 files changed, 106 insertions(+), 73 deletions(-) diff --git a/examples/low-level-api/encryption-reader-writer.cc b/examples/low-level-api/encryption-reader-writer.cc index c5a45061..67a74156 100644 --- a/examples/low-level-api/encryption-reader-writer.cc +++ b/examples/low-level-api/encryption-reader-writer.cc @@ -44,7 +44,8 @@ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; -const std::string encryptionKey = "0123456789012345"; // 16 bytes +const std::string encryptionKey = "0123456789012345"; // 16 bytes +const std::string encryptionKeyCol = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { /********************************************************************************** @@ -53,23 +54,28 @@ int main(int argc, char** argv) { // parquet::REQUIRED fields do not need definition and repetition level values // parquet::OPTIONAL fields require only definition level values // parquet::REPEATED fields require both definition and repetition level values - try { std::cout << "1" << std::endl; + try { // Create a local file output stream instance. 
- using FileClass = ::arrow::io::FileOutputStream; std::cout << "2" << std::endl; - std::shared_ptr out_file; std::cout << "3" << std::endl; - PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); std::cout << "4" << std::endl; + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); // Setup the parquet schema - std::shared_ptr schema = SetupSchema(); std::cout << "5" << std::endl; + std::shared_ptr schema = SetupSchema(); // Add writer properties - parquet::WriterProperties::Builder builder; std::cout << "6" << std::endl; - builder.compression(parquet::Compression::SNAPPY); std::cout << "7" << std::endl; + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); // uniform encryption - - std::cout << "builder.encryption(encryptionKey);" << std::endl; + builder.encryption(encryptionKey); - std::cout << "builder.encryption(encryptionKey); -- end" << std::endl; + + std::vector encryptionCols; + parquet::ColumnEncryptionProperties col0(true, "column_0"); + col0.SetEncryptionKey(encryptionKeyCol); + + builder.column_encryption(encryptionCols, true); + std::shared_ptr props = builder.build(); // Create a ParquetFileWriter instance @@ -184,8 +190,9 @@ int main(int argc, char** argv) { try { // decryption properties - std::shared_ptr decryption_properties = + std::shared_ptr decryption_properties = std::make_shared(encryptionKey); + decryption_properties->SetColumnKey("column_0", encryptionKeyCol); parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption(decryption_properties); diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index dab3dc43..f1da9515 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -201,8 +201,7 @@ std::shared_ptr SerializedPageReader::NextPage() { if (encryption_ != nullptr) { 
decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); compressed_len = parquet_encryption::Decrypt( - encryption_, false, buffer, compressed_len, - decryption_buffer_->mutable_data()); + encryption_, false, buffer, compressed_len, decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index e87d549b..97475eec 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -91,8 +91,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make( writer_properties_, this->descr_, reinterpret_cast(&thrift_metadata_)); - std::unique_ptr pager = - PageWriter::Open(sink_.get(), column_properties.compression(), metadata_.get()); + std::unique_ptr pager = PageWriter::Open( + sink_.get(), column_properties.compression(), nullptr, metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 88fd94f0..4d7a708d 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -314,7 +314,9 @@ class BufferedPageWriter : public PageWriter { : final_sink_(sink), metadata_(metadata), in_memory_sink_(new InMemoryOutputStream(pool)), - pager_(new SerializedPageWriter(in_memory_sink_.get(), codec, encryption, metadata, pool)) {} // TODO: nullptr for EncryptionProperties + pager_(new SerializedPageWriter(in_memory_sink_.get(), codec, encryption, + metadata, pool)) { + } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -352,11 +354,11 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr pager_; }; -std::unique_ptr PageWriter::Open(OutputStream* sink, Compression::type codec, - 
const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, - ::arrow::MemoryPool* pool, - bool buffered_row_group) { +std::unique_ptr PageWriter::Open( + OutputStream* sink, Compression::type codec, + const std::shared_ptr& encryption, + ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, + bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( new BufferedPageWriter(sink, codec, encryption, metadata, pool)); diff --git a/src/parquet/file_reader.cc b/src/parquet/file_reader.cc index dbe247ed..596180c1 100644 --- a/src/parquet/file_reader.cc +++ b/src/parquet/file_reader.cc @@ -156,7 +156,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { } auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), footer_key, footer_key_metadata, + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, file_decryption->GetAad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), @@ -175,7 +175,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { col->path_in_schema()->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), column_key, column_key_metadata, + file_crypto_metadata_->encryption_algorithm().algorithm, column_key, file_decryption->GetAad()); return PageReader::Open(std::move(stream), col->num_values(), col->compression(), @@ -316,8 +316,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm(), footer_key, - footer_key_metadata, file_decryption->GetAad()); + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); file_metadata_ = FileMetaData::Make(footer_buffer->data(), &footer_read_size, footer_encryption); diff --git a/src/parquet/file_writer.cc 
b/src/parquet/file_writer.cc index d288709b..c6447938 100644 --- a/src/parquet/file_writer.cc +++ b/src/parquet/file_writer.cc @@ -223,8 +223,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), - col_meta, properties_->memory_pool(), buffered_row_group_); + properties_->encryption(column_descr->path()), col_meta, + properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -265,12 +265,11 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { WriteFileMetaData(*metadata, sink_.get()); - } - else { + } else { uint64_t metadata_start = static_cast(sink_->Tell()); std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + file_encryption->GetFooterEncryptionProperties(); WriteFileMetaData(*metadata, sink_.get(), footer_encryption.get()); auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); @@ -382,7 +381,7 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - EncryptionProperties* footer_encryption) { + EncryptionProperties* footer_encryption) { if (footer_encryption == nullptr) { // Write MetaData uint32_t metadata_len = static_cast(sink->Tell()); @@ -393,8 +392,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, // Write Footer sink->Write(reinterpret_cast(&metadata_len), 4); sink->Write(PARQUET_MAGIC, 4); - } - else { + } else { // encrypt and write to sink file_metadata.WriteTo(sink, footer_encryption); } diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc index a4a21726..9a894daf 100644 --- 
a/src/parquet/metadata.cc +++ b/src/parquet/metadata.cc @@ -541,7 +541,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { ~FileCryptoMetaDataImpl() {} - Encryption::type encryption_algorithm() { + EncryptionAlgorithm encryption_algorithm() { return FromThrift(metadata_->encryption_algorithm); } @@ -553,7 +553,9 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { const std::string& iv_prefix() { return metadata_->iv_prefix; } - void WriteTo(OutputStream* dst) const { SerializeThriftMsg(metadata_.get(), 1024, dst); } + void WriteTo(OutputStream* dst) const { + SerializeThriftMsg(metadata_.get(), 1024, dst); + } private: friend FileMetaDataBuilder; @@ -561,7 +563,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { uint32_t metadata_len_; }; -Encryption::type FileCryptoMetaData::encryption_algorithm() { +EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() { return impl_->encryption_algorithm(); } bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } @@ -779,11 +781,13 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypted with footer key format::ColumnCryptoMetaData ccmd; if (encrypt_md->encrypted_with_footer_key()) { + ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key format::EncryptionWithColumnKey eck; eck.__set_column_key_metadata(encrypt_md->key_metadata()); eck.__set_path_in_schema(column_->path()->ToDotVector()); + ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } column_chunk_->__set_crypto_meta_data(ccmd); @@ -792,8 +796,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->encrypted()) - || !encrypt_md->encrypted_with_footer_key()) { + if ((footer_encryption == nullptr && 
encrypt_md->encrypted()) || + !encrypt_md->encrypted_with_footer_key()) { // don't set meta_data column_chunk_->__isset.meta_data = false; @@ -901,20 +905,22 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { << " columns are initialized"; throw ParquetException(ss.str()); } - int64_t total_byte_size = 0; - - for (int i = 0; i < schema_->num_columns(); i++) { - if (!(row_group_->columns[i].file_offset >= 0)) { - std::stringstream ss; - ss << "Column " << i << " is not complete."; - throw ParquetException(ss.str()); - } - total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; - } - DCHECK(total_bytes_written == total_byte_size) - << "Total bytes in this RowGroup does not match with compressed sizes of columns"; - - row_group_->__set_total_byte_size(total_byte_size); + // int64_t total_byte_size = 0; + + // for (int i = 0; i < schema_->num_columns(); i++) { + // if (!(row_group_->columns[i].file_offset >= 0)) { + // std::stringstream ss; + // ss << "Column " << i << " is not complete."; + // throw ParquetException(ss.str()); + // } + // total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + // } + // DCHECK(total_bytes_written == total_byte_size) + // << "Total bytes in this RowGroup does not match with compressed sizes of + // columns"; + + // row_group_->__set_total_byte_size(total_byte_size); + row_group_->__set_total_byte_size(total_bytes_written); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -1056,14 +1062,17 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return nullptr; } + auto file_encryption = properties_->file_encryption(); auto footer_encryption = properties_->footer_encryption(); // build format::FileCryptoMetaData - crypto_metadata_->__set_encryption_algorithm( - ToThrift(footer_encryption->algorithm())); + EncryptionAlgorithm encryption_algorithm; + encryption_algorithm.algorithm = footer_encryption->algorithm(); + encryption_algorithm.aad_metadata = 
file_encryption->aad_metadata(); + crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty()); - std::string footer_key_metadata = footer_encryption->key_metadata(); + std::string footer_key_metadata = file_encryption->footer_key_metadata(); if (!footer_key_metadata.empty()) { crypto_metadata_->__set_footer_key_metadata(footer_key_metadata); } diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h index d2303651..2ed4f2e2 100644 --- a/src/parquet/metadata.h +++ b/src/parquet/metadata.h @@ -214,7 +214,7 @@ class PARQUET_EXPORT FileCryptoMetaData { uint32_t* metadata_len); ~FileCryptoMetaData(); - Encryption::type encryption_algorithm(); + EncryptionAlgorithm encryption_algorithm(); bool encrypted_footer(); const std::string& footer_key_metadata(); uint64_t footer_offset(); diff --git a/src/parquet/properties.h b/src/parquet/properties.h index 8fa33877..f5531a35 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -250,7 +250,8 @@ class PARQUET_EXPORT FileEncryptionProperties { DCHECK(key_metadata.length() <= 256); } - footer_encryption_.reset(new EncryptionProperties(algorithm, key, key_metadata)); + footer_encryption_.reset(new EncryptionProperties(algorithm, key)); + footer_key_metadata_ = key_metadata; uniform_encryption_ = !key.empty(); } @@ -303,6 +304,10 @@ class PARQUET_EXPORT FileEncryptionProperties { return footer_encryption_; } + const std::string& footer_key_metadata() { return footer_key_metadata_; } + + const std::string& aad_metadata() { return aad_metadata_; } + std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { // uniform encryption @@ -339,9 +344,8 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string path_str = path->ToDotString(); for (const auto& col : columns_) { if (col.path() == path_str) { - return std::make_shared(footer_encryption_->algorithm(), - col.key(), col.key_metadata(), - 
footer_encryption_->aad()); + return std::make_shared( + footer_encryption_->algorithm(), col.key(), footer_encryption_->aad()); } } @@ -352,10 +356,15 @@ class PARQUET_EXPORT FileEncryptionProperties { return nullptr; } - void SetupAad(const std::string& aad) { footer_encryption_->aad(aad); } + void SetupAad(const std::string& aad, const std::string& aad_metadata = "") { + footer_encryption_->aad(aad); + aad_metadata_ = aad_metadata; + } private: std::shared_ptr footer_encryption_; + std::string footer_key_metadata_; + std::string aad_metadata_; bool uniform_encryption_; diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h index 81737af4..d759836b 100644 --- a/src/parquet/thrift.h +++ b/src/parquet/thrift.h @@ -79,11 +79,13 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } -static inline Encryption::type FromThrift(format::EncryptionAlgorithm type) { - if (type.__isset.AES_GCM_V1) { - return Encryption::AES_GCM_V1; +static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + if (encryption.__isset.AES_GCM_V1) { + return EncryptionAlgorithm{Encryption::AES_GCM_V1, + encryption.AES_GCM_V1.aad_metadata}; } else { - return Encryption::AES_GCM_CTR_V1; + return EncryptionAlgorithm{Encryption::AES_GCM_CTR_V1, + encryption.AES_GCM_CTR_V1.aad_metadata}; } } @@ -109,12 +111,16 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { return static_cast(type); } -static inline format::EncryptionAlgorithm ToThrift(Encryption::type type) { +static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; - if (type == Encryption::AES_GCM_V1) { + if (encryption.algorithm == Encryption::AES_GCM_V1) { + encryption_algorithm.__isset.AES_GCM_V1 = true; encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); + encryption_algorithm.AES_GCM_V1.aad_metadata = encryption.aad_metadata; } else { + 
encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); + encryption_algorithm.AES_GCM_CTR_V1.aad_metadata = encryption.aad_metadata; } return encryption_algorithm; } diff --git a/src/parquet/types.h b/src/parquet/types.h index 423648c5..36c01de3 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -117,6 +117,11 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct EncryptionAlgorithm { + Encryption::type algorithm; + std::string aad_metadata; +}; + class PARQUET_EXPORT EncryptionProperties { private: static inline uint8_t* str2bytes(const std::string& str) { @@ -129,10 +134,10 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; EncryptionProperties(Encryption::type algorithm, const std::string& key, - const std::string& key_metadata, const std::string& aad = "") - : algorithm_(algorithm), key_(key), key_metadata_(key_metadata), aad_(aad) {} + const std::string& aad = "") + : algorithm_(algorithm), key_(key), aad_(aad) {} - ~EncryptionProperties() { key_.replace(0, key_.length(), '\0'); } + ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } int key_length() const { return static_cast(key_.length()); } uint8_t* key_bytes() const { return str2bytes(key_); } @@ -143,7 +148,6 @@ class PARQUET_EXPORT EncryptionProperties { Encryption::type algorithm() const { return algorithm_; } - const std::string& key_metadata() const { return key_metadata_; } const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } @@ -168,7 +172,6 @@ class PARQUET_EXPORT EncryptionProperties { private: Encryption::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length - std::string key_metadata_; // key metadata, used for retrieving key std::string aad_; // encryption additional authenticated data }; From 
d252c8f73d2aa4fd246aaaa618b0fa699bc5d7b5 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 12 Sep 2018 17:49:59 +0700 Subject: [PATCH 18/18] use Builder for ColumnEncryptionProperties and FileEncryptionProperties --- .../low-level-api/encryption-reader-writer.cc | 25 +- src/parquet/properties.h | 260 +++++++++++------- 2 files changed, 170 insertions(+), 115 deletions(-) diff --git a/examples/low-level-api/encryption-reader-writer.cc b/examples/low-level-api/encryption-reader-writer.cc index 67a74156..27dd9ab9 100644 --- a/examples/low-level-api/encryption-reader-writer.cc +++ b/examples/low-level-api/encryption-reader-writer.cc @@ -44,8 +44,8 @@ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; -const std::string encryptionKey = "0123456789012345"; // 16 bytes -const std::string encryptionKeyCol = "1234567890123450"; // 16 bytes +const std::string encryptionKey = "0123456789012345"; // 16 bytes +const std::string encryptionKeyCol = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { /********************************************************************************** @@ -66,16 +66,23 @@ int main(int argc, char** argv) { // Add writer properties parquet::WriterProperties::Builder builder; builder.compression(parquet::Compression::SNAPPY); + // uniform encryption + parquet::FileEncryptionProperties::Builder file_encryption_builder; + file_encryption_builder.footer_key(encryptionKey); - builder.encryption(encryptionKey); + // non-uniform with column keys + std::map> encryption_cols; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("column_0", true); + encryption_col_builder0.key(encryptionKeyCol); + auto encryption_col0 = encryption_col_builder0.build(); + + encryption_cols[encryption_col0->path()] = encryption_col0; - std::vector encryptionCols; - parquet::ColumnEncryptionProperties col0(true, "column_0"); - col0.SetEncryptionKey(encryptionKeyCol); - - 
builder.column_encryption(encryptionCols, true); + file_encryption_builder.column_properties(encryption_cols, true); + builder.encryption(file_encryption_builder.build()); + std::shared_ptr props = builder.build(); // Create a ParquetFileWriter instance @@ -190,7 +197,7 @@ int main(int argc, char** argv) { try { // decryption properties - std::shared_ptr decryption_properties = + std::shared_ptr decryption_properties = std::make_shared(encryptionKey); decryption_properties->SetColumnKey("column_0", encryptionKeyCol); diff --git a/src/parquet/properties.h b/src/parquet/properties.h index f5531a35..1ac61b1e 100644 --- a/src/parquet/properties.h +++ b/src/parquet/properties.h @@ -42,35 +42,61 @@ static bool DEFAULT_USE_BUFFERED_STREAM = false; class PARQUET_EXPORT ColumnEncryptionProperties { public: - ColumnEncryptionProperties() = default; - ColumnEncryptionProperties(bool encrypt, std::string path) - : encrypt_(encrypt), path_(path), encrypted_with_footer_key_(encrypt) {} + class Builder { + public: + Builder(const std::string& path, bool encrypt) + : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypt) {} - bool encrypted() const { return encrypt_; } - bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } - const std::string& key() const { return key_; } - const std::string& key_metadata() const { return key_metadata_; } + Builder* key(const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + DCHECK(encrypt_); - void SetEncryptionKey(const std::string& key, uint32_t key_id = 0) { - std::string key_metadata = - key_id == 0 ? 
"" : std::string(reinterpret_cast(&key_id), 4); - SetEncryptionKey(key, key_metadata); - } + key_ = key; + return this; + } + Builder* key_metadata(const std::string& key_id) { + DCHECK(!key_id.empty()); + key_metadata_ = key_id; + return this; + } - void SetEncryptionKey(const std::string& key, const std::string& key_metadata) { - if (!encrypt_) throw ParquetException("Setting key on unencrypted column: " + path_); - if (key.empty()) throw ParquetException("Null key for " + path_); + Builder* key_id(uint32_t key_id) { + std::string key_metadata = std::string(reinterpret_cast(&key_id), 4); + this->key_metadata(key_metadata); + return this; + } - encrypted_with_footer_key_ = false; - key_ = key; - key_metadata_ = key_metadata; - } + std::shared_ptr build() { + return std::make_shared(path_, encrypt_, encrypted_with_footer_key_, + key_, key_metadata_); + } + + private: + std::string path_; + bool encrypt_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + }; + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + ColumnEncryptionProperties(const std::string& path, bool encrypt, bool encrypted_with_footer_key, + const std::string& key, const std::string& key_metadata) + : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypted_with_footer_key), + key_(key), key_metadata_(key_metadata) {} const std::string& path() const { return path_; } + bool encrypted() const { return encrypt_; } + bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } private: - bool encrypt_; std::string path_; + bool encrypt_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; @@ -190,6 +216,10 @@ static constexpr ParquetVersion::type 
DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; +static constexpr Encryption::type DEFAULT_ENCRYPTION_ALGORITHM = Encryption::AES_GCM_V1; +static constexpr int32_t MAXIMAL_KEY_METADATA_LENGTH = 256; +static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; +static constexpr bool DEFAULT_ENCRYPT_THE_REST = true; class PARQUET_EXPORT ColumnProperties { public: @@ -240,42 +270,63 @@ class PARQUET_EXPORT ColumnProperties { class PARQUET_EXPORT FileEncryptionProperties { public: - FileEncryptionProperties() = default; - FileEncryptionProperties(const FileEncryptionProperties&) = default; + class Builder { + public: + Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) {} - FileEncryptionProperties(Encryption::type algorithm, const std::string& key, - const std::string& key_metadata) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - if (!key_metadata.empty()) { - DCHECK(key_metadata.length() <= 256); + Builder(const std::string& key) + : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + footer_key_ = key; } - footer_encryption_.reset(new EncryptionProperties(algorithm, key)); - footer_key_metadata_ = key_metadata; - uniform_encryption_ = !key.empty(); - } + Builder* algorithm(Encryption::type algorithm) { + algorithm_ = algorithm; + return this; + } + + Builder* footer_key(const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + footer_key_ = key; + return this; + } + + Builder* footer_key_metadata(const std::string& key_metadata) { + DCHECK(!footer_key_.empty()); + DCHECK(!key_metadata.empty() && key_metadata.length() < MAXIMAL_KEY_METADATA_LENGTH); + footer_key_metadata_ = key_metadata; + return this; + } + + Builder* 
aad(const std::string& aad) { + DCHECK(!aad.empty()); + aad_ = aad; + return this; + } + + Builder* aad_metadata(const std::string& aad_metadata) { + DCHECK(!aad_.empty()); + DCHECK(!aad_metadata.empty() && aad_metadata.length() < MAXIMAL_AAD_METADATA_LENGTH); + aad_metadata_ = aad_metadata; + return this; + } - FileEncryptionProperties(Encryption::type algorithm, const std::string& key, int key_id) - : FileEncryptionProperties( - algorithm, key, - key_id == 0 ? "" : std::string(reinterpret_cast(&key_id), 4)) {} - - /** - * encrypt_the_rest will define if other columns (not defined in columns argument) - * will be encrypted or not - * if encrypt_the_rest = true, other columns will be encrypted with footer key - * else, other columns will be unencrypted - */ - void SetupColumns(const std::vector& columns, - bool encrypt_the_rest) { + /** + * encrypt_the_rest will define if other columns (not defined in columns argument) + * will be encrypted or not + * if encrypt_the_rest = true, other columns will be encrypted with footer key + * else, other columns will be unencrypted + */ + Builder* column_properties(const std::map>& column_properties, + bool encrypt_the_rest = DEFAULT_ENCRYPT_THE_REST) { encrypt_the_rest_ = encrypt_the_rest; - columns_ = columns; + column_properties_ = column_properties; - if (!footer_encryption_->key().empty()) { + if (!footer_key_.empty()) { uniform_encryption_ = true; - for (const auto& col : columns) { - if (col.key().compare(footer_encryption_->key()) != 0) { + for (const auto& col : column_properties) { + if (col.second->key().compare(footer_key_) != 0) { uniform_encryption_ = false; break; } @@ -285,9 +336,9 @@ class PARQUET_EXPORT FileEncryptionProperties { throw ParquetException("Encrypt the rest with null footer key"); } bool all_are_unencrypted = true; - for (const auto& col : columns) { - if (col.encrypted()) { - if (col.key().empty()) { + for (const auto& col : column_properties) { + if (col.second->encrypted()) { + if 
(col.second->key().empty()) { throw ParquetException("Encrypt column with null footer key"); } all_are_unencrypted = false; @@ -298,39 +349,72 @@ class PARQUET_EXPORT FileEncryptionProperties { throw ParquetException("Footer and all columns unencrypted"); } } - } + return this; + } + + std::shared_ptr build() { + std::shared_ptr footer_encryption; + if (!footer_key_.empty()) { + footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); + } + return std::make_shared(footer_encryption, footer_key_metadata_, + aad_metadata_, uniform_encryption_, column_properties_, encrypt_the_rest_); + } + + private: + Encryption::type algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_; + std::string aad_metadata_; + + bool uniform_encryption_; + + std::map> column_properties_; + bool encrypt_the_rest_; + }; + + FileEncryptionProperties(const std::shared_ptr& footer_encryption, + const std::string& footer_key_metadata, const std::string& aad_metadata, + bool uniform_encryption, + const std::map>& column_properties, + bool encrypt_the_rest) + : footer_encryption_(footer_encryption) + , footer_key_metadata_(footer_key_metadata) + , aad_metadata_(aad_metadata) + , uniform_encryption_(uniform_encryption) + , column_properties_(column_properties) + , encrypt_the_rest_(encrypt_the_rest) {} std::shared_ptr GetFooterEncryptionProperties() { return footer_encryption_; } - const std::string& footer_key_metadata() { return footer_key_metadata_; } + const std::string& footer_key_metadata() const { return footer_key_metadata_; } - const std::string& aad_metadata() { return aad_metadata_; } + const std::string& aad_metadata() const { return aad_metadata_; } std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { // uniform encryption if (uniform_encryption_) { - return std::make_shared(true, path->ToDotString()); + return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); } // non-uniform 
encryption std::string path_str = path->ToDotString(); - for (const auto& col : columns_) { - if (col.path() == path_str) { - return std::shared_ptr( - const_cast(&col)); - } + if (column_properties_.find(path_str) != column_properties_.end()) { + return column_properties_[path_str]; } + // encrypted with footer key if (encrypt_the_rest_) { - return std::make_shared(true, path->ToDotString()); + return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); } // unencrypted - return std::shared_ptr( - new ColumnEncryptionProperties(false, path->ToDotString())); + return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); } std::shared_ptr GetColumnEncryptionProperties( @@ -342,11 +426,9 @@ class PARQUET_EXPORT FileEncryptionProperties { // non-uniform encryption std::string path_str = path->ToDotString(); - for (const auto& col : columns_) { - if (col.path() == path_str) { + if (column_properties_.find(path_str) != column_properties_.end()) { return std::make_shared( - footer_encryption_->algorithm(), col.key(), footer_encryption_->aad()); - } + footer_encryption_->algorithm(), column_properties_[path_str]->key(), footer_encryption_->aad()); } if (encrypt_the_rest_) { @@ -356,11 +438,6 @@ class PARQUET_EXPORT FileEncryptionProperties { return nullptr; } - void SetupAad(const std::string& aad, const std::string& aad_metadata = "") { - footer_encryption_->aad(aad); - aad_metadata_ = aad_metadata; - } - private: std::shared_ptr footer_encryption_; std::string footer_key_metadata_; @@ -368,7 +445,7 @@ class PARQUET_EXPORT FileEncryptionProperties { bool uniform_encryption_; - std::vector columns_; + std::map> column_properties_; bool encrypt_the_rest_; }; @@ -512,37 +589,8 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } - Builder* encryption(const std::string& key) { - return encryption(Encryption::AES_GCM_V1, key, 0); - } - - Builder* encryption(const std::string& key, uint32_t 
key_id) { - return encryption(Encryption::AES_GCM_V1, key, key_id); - } - - Builder* encryption(const std::string& key, std::string key_id) { - return encryption(Encryption::AES_GCM_V1, key, key_id); - } - - Builder* encryption(Encryption::type algorithm, const std::string& key, - uint32_t key_id) { - file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); - return this; - } - - Builder* encryption(Encryption::type algorithm, const std::string& key, - const std::string& key_id) { - file_encryption_.reset(new FileEncryptionProperties(algorithm, key, key_id)); - return this; - } - - Builder* column_encryption(const std::vector& columns, - bool encrypt_the_rest) { - if (file_encryption_ == nullptr) { - throw ParquetException("null file encryption"); - } - - file_encryption_->SetupColumns(columns, encrypt_the_rest); + Builder* encryption(const std::shared_ptr& file_encryption) { + file_encryption_ = file_encryption; return this; } @@ -605,7 +653,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; - std::unique_ptr file_encryption_; + std::shared_ptr file_encryption_; // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -737,4 +785,4 @@ std::shared_ptr PARQUET_EXPORT default_writer_properties(); } // namespace parquet -#endif // PARQUET_COLUMN_PROPERTIES_H +#endif // PARQUET_COLUMN_PROPERTIES_H \ No newline at end of file