diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index f2722b1cbf8..813484641fe 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -22,6 +22,16 @@ target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) +if (ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) + add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) + target_include_directories(parquet-encryption-example PRIVATE low-level-api/) + target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) + target_link_libraries(parquet-encryption-example parquet_static) + target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) +endif() + add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated if (ARROW_BUILD_SHARED) @@ -34,3 +44,9 @@ add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 parquet-arrow-example) + +if (ARROW_USE_OPENSSL) + add_dependencies(parquet + parquet-encryption-example + parquet-encryption-example-all-crypto-options) +endif() diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc new file mode 100644 index 00000000000..06d43be8f5a --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -0,0 +1,639 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This file contains samples for writing and reading encrypted Parquet files in different + * encryption and decryption configurations. + * Each sample section is dedicated to an independent configuration and shows its creation + * from beginning to end. + * The samples have the following goals: + * 1) Demonstrate usage of different options for data encryption and decryption. + * 2) Produce encrypted files for interoperability tests with other (eg parquet-mr) + * readers that support encryption. + * 3) Produce encrypted files with plaintext footer, for testing the ability of legacy + * readers to parse the footer and read unencrypted columns. + * 4) Perform interoperability tests with other (eg parquet-mr) writers, by reading + * encrypted files produced by these writers. + * + * Each write sample produces new independent parquet file, encrypted with a different + * encryption configuration as described below. + * The name of each file is in the form of: + * tester.parquet.encrypted. + * + * The read sample creates a set of decryption configurations and then uses each of them + * to read all encrypted files in the input directory. 
+ * + * The different encryption and decryption configurations are listed below. + * + * Usage: ./encryption-interop-tests <read|write> <path-to-parquet-files-directory/> + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates files with four columns in the following + * encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don't encrypt the footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * + * The read sample uses each of the following decryption configurations to read every + * encrypted file in the input directory: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). 
+ */ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; +const std::string fileName = "tester"; + +using FileClass = ::arrow::io::FileOutputStream; + +void PrintDecryptionConfiguration(int configuration); +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg); +// Returns true if FileName ends with suffix. Otherwise returns false. +// Used to skip unencrypted parquet files. +bool FileNameEndsWith(std::string file_name, std::string suffix); + +std::vector GetDirectoryFiles(const std::string& path) { + std::vector files; + struct dirent* entry; + DIR* dir = opendir(path.c_str()); + + if (dir == NULL) { + exit(-1); + } + while ((entry = readdir(dir)) != NULL) { + files.push_back(std::string(entry->d_name)); + } + closedir(dir); + return files; +} + +void InteropTestWriteEncryptedParquetFiles(std::string root_path) { + /********************************************************************************** + Creating a number of Encryption configurations + **********************************************************************************/ + + // This vector will hold various encryption configuraions. + std::vector> + vector_of_encryption_configurations; + + // Encryption configuration 1: Encrypt all columns and the footer with the same key. + // (uniform encryption) + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey); + // Add to list of encryption configurations. + vector_of_encryption_configurations.push_back( + file_encryption_builder_1.footer_key_metadata("kf")->build()); + + // Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21(path_ptr1); + encryption_col_builder_20.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols2[path_ptr] = encryption_col_builder_20.build(); + encryption_cols2[path_ptr1] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build()); + + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. 
+ // (plaintext footer mode, readable by legacy readers) + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31(path_ptr1); + encryption_col_builder_30.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols3[path_ptr] = encryption_col_builder_30.build(); + encryption_cols3[path_ptr1] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Encryption configuration 4: Encrypt two columns and the footer, with different keys. + // Use aad_prefix. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41(path_ptr1); + encryption_col_builder_40.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols4[path_ptr] = encryption_col_builder_40.build(); + encryption_cols4[path_ptr1] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(fileName) + ->build()); + + // Encryption configuration 5: Encrypt two columns and the footer, with different keys. + // Use aad_prefix and disable_aad_prefix_storage. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51(path_ptr1); + encryption_col_builder_50.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols5[path_ptr] = encryption_col_builder_50.build(); + encryption_cols5[path_ptr1] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(fileName) + ->disable_store_aad_prefix_storage() + ->build()); + + // Encryption configuration 6: Encrypt two columns and the footer, with different keys. + // Use AES_GCM_CTR_V1 algorithm. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(path_ptr1); + encryption_col_builder_60.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols6[path_ptr] = encryption_col_builder_60.build(); + encryption_cols6[path_ptr1] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + /********************************************************************************** + PARQUET WRITER EXAMPLE + 
**********************************************************************************/ + + // Iterate over the encryption configurations and for each one write a parquet file. + for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); + ++example_id) { + std::stringstream ss; + ss << example_id + 1; + std::string test_number_string = ss.str(); + try { + // Create a local file output stream instance. + std::shared_ptr out_file; + std::string file = + root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; + std::cout << "Write " << file << std::endl; + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? 
true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return; + } + } +} + +void InteropTestReadEncryptedParquetFiles(std::string root_path) { + std::vector files_in_directory = GetDirectoryFiles(root_path); + + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations; + + // Decryption configuration 1: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey); + string_kr2->PutKey("kc1", kColumnEncryptionKey1); + string_kr2->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31(path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey) + ->column_properties(decryption_cols) + ->build()); + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Iterate over the decryption configurations and use each one to read every files + // in the input directory. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size(); + ++example_id) { + PrintDecryptionConfiguration(example_id + 1); + for (auto const& file : files_in_directory) { + std::string exception_msg = ""; + if (!FileNameEndsWith(file, "parquet.encrypted")) // Skip non encrypted files + continue; + try { + std::cout << "--> Read file " << file << std::endl; + + parquet::ReaderProperties reader_properties = + parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + vector_of_decryption_configurations[example_id]->DeepClone()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(root_path + file, false, + reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 4); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(2); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(3); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(file, example_id, exception_msg); + std::cout << "file [" << file << "] Parquet Reading Complete" << std::endl; + } + } +} + +void PrintDecryptionConfiguration(int configuration) { + std::cout << "\n\nDecryption configuration "; + if (configuration == 1) + std::cout << "1: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key." + << std::endl; + else if (configuration == 2) + std::cout << "2: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key. Pass aad_prefix." + << std::endl; + else if (configuration == 3) + std::cout << "3: \n\nDecrypt using explicit column and footer keys." << std::endl; + else { + std::cout << "Unknown configuraion" << std::endl; + exit(-1); + } + std::cout << std::endl; +} + +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg) { + int encryption_configuration_number; + std::regex r("tester([0-9]+)\\.parquet.encrypted"); + std::smatch m; + std::regex_search(file, m, r); + if (m.size() == 0) { + std::cerr + << "Error: Error parsing filename to extract encryption configuration number. " + << std::endl; + } + std::string encryption_configuration_number_str = m.str(1); + encryption_configuration_number = atoi(encryption_configuration_number_str.c_str()); + if (encryption_configuration_number < 1 || encryption_configuration_number > 6) { + std::cerr << "Error: Unknown encryption configuration number. 
" << std::endl; + } + + int decryption_configuration_number = example_id + 1; + + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) + std::cout << "Error: Expecting AAD related exception."; + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected to + // be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) { + std::cout << "Error: Expecting AAD related exception." << std::endl; + } + return; + } + } + if (!exception_msg.empty()) + std::cout << "Error: Unexpected exception was thrown." 
<< exception_msg; +} + +bool FileNameEndsWith(std::string file_name, std::string suffix) { + std::string::size_type idx = file_name.find_first_of('.'); + + if (idx != std::string::npos) { + std::string extension = file_name.substr(idx + 1); + if (extension.compare(suffix) == 0) return true; + } + return false; +} + +int main(int argc, char** argv) { + enum Operation { write, read }; + std::string root_path; + Operation operation = write; + if (argc < 3) { + std::cout << "Usage: encryption-reader-writer-all-crypto-options " + "" + << std::endl; + exit(1); + } + root_path = argv[1]; + if (root_path.compare("read") == 0) { + operation = read; + } + + root_path = argv[2]; + std::cout << "Root path is: " << root_path << std::endl; + + if (operation == write) { + InteropTestWriteEncryptedParquetFiles(root_path); + } else + InteropTestReadEncryptedParquetFiles(root_path); + + return 0; +} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h new file mode 100644 index 00000000000..2ca3a064768 --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include +#include + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + +constexpr int FIXED_LENGTH = 10; + +static std::shared_ptr SetupSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, + ConvertedType::NONE)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc new file mode 100644 index 00000000000..5ce66769c0f --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -0,0 +1,449 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +/* + * This file contains sample for writing and reading encrypted Parquet file with + * basic encryption configuration. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates a file with eight columns where two of the columns and the + * footer are encrypted. + * + * The read sample decrypts using key retriever that holds the keys of two encrypted + * columns and the footer key. + */ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; +const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; + +int main(int argc, char** argv) { + + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + + // Encryption configuration: Encrypt two columns and the footer. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder1(path_ptr1); + encryption_col_builder0.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder1.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols[path_ptr] = encryption_col_builder0.build(); + encryption_cols[path_ptr1] = encryption_col_builder1.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder( + kFooterEncryptionKey); + + try { + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(file_encryption_builder.footer_key_metadata("kf") + ->column_properties(encryption_cols) + ->build()); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? 
true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Decryption configuration: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder; + + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. + reader_properties.file_decryption_properties( + file_decryption_builder.key_retriever(kr1)->build()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; + } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; + return 0; +} diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index cb8de1657d6..c2fb9af4792 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -179,6 +179,16 @@ set(PARQUET_SRCS statistics.cc types.cc) +if(ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + set(PARQUET_SRCS + ${PARQUET_SRCS} + encryption.cc + encryption_internal.cc + internal_file_decryptor.cc + internal_file_encryptor.cc) +endif() + # Ensure that thrift compilation is done before using its generated headers # in parquet code. add_custom_target(parquet-thrift-deps ALL DEPENDS ${THRIFT_OUTPUT_FILES}) @@ -316,6 +326,14 @@ add_parquet_test(arrow-test arrow/arrow-schema-test.cc test-util.cc) +if(ARROW_USE_OPENSSL) + add_parquet_test(encryption-test + SOURCES + encryption-configuration-encrypted-columns-and-footer.cc + encryption-properties-test.cc + test-util.cc) +endif() + # Those tests need to use static linking as they access thrift-generated # symbols which are not exported by parquet.dll on Windows (PARQUET-1420). 
add_parquet_test(file-deserialize-test diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 130b75a5210..9c8b63039cb 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/buffer.h" #include "arrow/util/bit-stream-utils.h" @@ -35,6 +36,11 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#endif + using arrow::MemoryPool; namespace parquet { @@ -92,6 +98,7 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; + return default_reader_properties; } @@ -106,11 +113,31 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - ::arrow::MemoryPool* pool) + ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + column_has_dictionary_(false), + first_page_(true), + row_group_ordinal_(-1), + column_ordinal_(-1), + page_ordinal_(-1), seen_num_rows_(0), - total_num_rows_(total_num_rows) { + total_num_rows_(total_num_rows), + decryption_buffer_(AllocateBuffer(pool, 0)), + meta_decryptor_(NULLPTR), + data_decryptor_(NULLPTR) { + if (ctx != NULLPTR) { + column_has_dictionary_ = ctx->column_has_dictionary; + row_group_ordinal_ = ctx->row_group_ordinal; + column_ordinal_ = ctx->column_ordinal; +#ifdef PARQUET_ENCRYPTION + meta_decryptor_ = ctx->meta_decryptor; + data_decryptor_ = ctx->data_decryptor; + if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { + InitDecryption(); + } +#endif + } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); } @@ -121,6 +148,14 @@ class SerializedPageReader : 
public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: +#ifdef PARQUET_ENCRYPTION + void UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, int8_t module_type, + const std::string& pageAAD); + + void InitDecryption(); +#endif + std::shared_ptr stream_; format::PageHeader current_page_header_; @@ -130,6 +165,28 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; + // The fields below are used for calculation of the AAD (additional authenticated data) + // suffix which is part of the Parquet Modular Encryption. + // The AAD suffix for a parquet module is built internally by Parquet, by direct + // concatenation of the different parts of the module, which include, among others, + // its row group ordinal, column ordinal and page ordinal. + // Please refer to the encryption specification for more details: + // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data + + // To calculate the AAD suffix of an encrypted module, the exact type of the module + // should be known. The following two fields indicate whether the page is a data or + // dictionary page. + + // Indicates whether the column has a dictionary page. + bool column_has_dictionary_; + // If the column has a dictionary page and the page currently processed is the first + // one then it is a dictionary page. + bool first_page_; + // The ordinal fields below are used for AAD suffix calculation.
+ int16_t row_group_ordinal_; + int16_t column_ordinal_; + int16_t page_ordinal_; + // Maximum allowed page size uint32_t max_page_header_size_; @@ -138,11 +195,72 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; + + // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page + // header in a single column respectively. + // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. + std::string data_pageAAD_; + std::string data_page_headerAAD_; + // Encryption + std::shared_ptr decryption_buffer_; + std::shared_ptr meta_decryptor_; + std::shared_ptr data_decryptor_; }; +#ifdef PARQUET_ENCRYPTION +void SerializedPageReader::InitDecryption() { + // Prepare the AAD for quick update later. + if (data_decryptor_ != NULLPTR) { + DCHECK(!data_decryptor_->file_aad().empty()); + data_pageAAD_ = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_decryptor_ != NULLPTR) { + DCHECK(!meta_decryptor_->file_aad().empty()); + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } +} + +void SerializedPageReader::UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, + int8_t module_type, + const std::string& pageAAD) { + DCHECK(decryptor != NULLPTR); + if (current_page_is_dictionary) { + std::string aad = encryption::CreateModuleAad(decryptor->file_aad(), module_type, + row_group_ordinal_, column_ordinal_, + static_cast(-1)); + decryptor->UpdateAad(aad); + } else { + encryption::QuickUpdatePageAad(pageAAD, page_ordinal_); + decryptor->UpdateAad(pageAAD); + } +} +#endif // PARQUET_ENCRYPTION + std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be 
unhandled page types that we skip until // finding a page that we do know what to do with +#ifdef PARQUET_ENCRYPTION + bool current_page_is_dictionary = false; +#endif + if (column_has_dictionary_) { + if (first_page_) { +#ifdef PARQUET_ENCRYPTION + current_page_is_dictionary = true; +#endif + first_page_ = false; + } else { + page_ordinal_++; + } + } else { + page_ordinal_++; + } + while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; @@ -160,8 +278,17 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { +#ifdef PARQUET_ENCRYPTION + if (meta_decryptor_ != NULLPTR) { + UpdateDecryption(meta_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPageHeader, data_page_headerAAD_); + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, &current_page_header_, meta_decryptor_); +#else DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, &current_page_header_); +#endif // PARQUET_ENCRYPTION break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -179,7 +306,12 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - +#ifdef PARQUET_ENCRYPTION + if (data_decryptor_ != NULLPTR) { + UpdateDecryption(data_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPage, data_pageAAD_); + } +#endif // Read the compressed data page.
std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -190,6 +322,17 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } +#ifdef PARQUET_ENCRYPTION + // Decrypt it if we need to + if (data_decryptor_ != nullptr) { + PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( + compressed_len - data_decryptor_->CiphertextSizeDelta())); + compressed_len = data_decryptor_->Decrypt(page_buffer->data(), compressed_len, + decryption_buffer_->mutable_data()); + + page_buffer = decryption_buffer_; + } +#endif // PARQUET_ENCRYPTION // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. @@ -258,9 +401,9 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool) { + Compression::type codec, ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool)); + new SerializedPageReader(stream, total_num_rows, codec, pool, ctx)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index e7d6afbb467..ae897f659d0 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -43,6 +43,7 @@ class RleDecoder; namespace parquet { +class Decryptor; class DictionaryPage; class Page; @@ -73,6 +74,15 @@ class PARQUET_EXPORT LevelDecoder { std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; }; +struct PageReaderContext { + bool column_has_dictionary; + int16_t row_group_ordinal; + int16_t column_ordinal; + + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + // Abstract page iterator interface. 
This way, we can feed column pages to the // ColumnReader through whatever mechanism we choose class PARQUET_EXPORT PageReader { @@ -81,8 +91,8 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + struct PageReaderContext* ctx = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f2783d00964..7f525f41948 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "arrow/buffer-builder.h" @@ -37,6 +38,11 @@ #include "parquet/thrift.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" +#endif + namespace parquet { using ::arrow::internal::checked_cast; @@ -127,7 +133,10 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + int16_t row_group_ordinal, int16_t column_chunk_ordinal, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), pool_(pool), @@ -135,7 +144,17 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_(0), data_page_offset_(0), total_uncompressed_size_(0), - total_compressed_size_(0) { + total_compressed_size_(0), + page_ordinal_(0), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_chunk_ordinal), + meta_encryptor_(meta_encryptor), + 
data_encryptor_(data_encryptor) { + if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { +#ifdef PARQUET_ENCRYPTION + InitEncryption(); +#endif + } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -157,10 +176,25 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + +#ifdef PARQUET_ENCRYPTION + std::shared_ptr encrypted_data_buffer = nullptr; + if (data_encryptor_.get()) { + UpdateEncryption(encryption::kDictionaryPage); + encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( + pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } +#endif + format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -169,11 +203,19 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); + +#ifdef PARQUET_ENCRYPTION + if (meta_encryptor_) { + UpdateEncryption(encryption::kDictionaryPageHeader); + } +#endif + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), 
meta_encryptor_); + + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; int64_t final_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(&final_pos)); @@ -181,11 +223,15 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { +#ifdef PARQUET_ENCRYPTION + if (meta_encryptor_ != nullptr) { + UpdateEncryption(encryption::kColumnMetaData); + } +#endif // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, - fallback); - + fallback, meta_encryptor_); // Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -214,7 +260,6 @@ class SerializedPageWriter : public PageWriter { int64_t WriteDataPage(const CompressedDataPage& page) override { int64_t uncompressed_size = page.uncompressed_size(); std::shared_ptr compressed_data = page.buffer(); - format::DataPageHeader data_page_header; data_page_header.__set_num_values(page.num_values()); data_page_header.__set_encoding(ToThrift(page.encoding())); @@ -224,10 +269,25 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + +#ifdef PARQUET_ENCRYPTION + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); + if (data_encryptor_.get()) { + UpdateEncryption(encryption::kDataPage); + PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( + data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), 
output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } +#endif + format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_data_page_header(data_page_header); // TODO(PARQUET-594) crc checksum @@ -237,13 +297,20 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); +#ifdef PARQUET_ENCRYPTION + if (meta_encryptor_) { + UpdateEncryption(encryption::kDataPageHeader); + } +#endif + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); + page_ordinal_++; int64_t current_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(&current_pos)); return current_pos - start_pos; @@ -262,6 +329,57 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: +#ifdef PARQUET_ENCRYPTION + void InitEncryption() { + // Prepare the AAD for quick update later.
+ if (data_encryptor_ != NULLPTR) { + data_pageAAD_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_encryptor_ != NULLPTR) { + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + } + + void UpdateEncryption(int8_t module_type) { + switch (module_type) { + case encryption::kColumnMetaData: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDataPage: { + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); + data_encryptor_->UpdateAad(data_pageAAD_); + break; + } + case encryption::kDataPageHeader: { + encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); + meta_encryptor_->UpdateAad(data_page_headerAAD_); + break; + } + case encryption::kDictionaryPageHeader: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDictionaryPage: { + data_encryptor_->UpdateAad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + default: + throw ParquetException("Unknown module type in UpdateEncryption"); + } + } +#endif + std::shared_ptr sink_; ColumnChunkMetaDataBuilder* metadata_; ::arrow::MemoryPool* pool_; @@ -270,11 +388,22 @@ class SerializedPageWriter : public PageWriter { int64_t data_page_offset_; int64_t total_uncompressed_size_; int64_t total_compressed_size_; + int16_t page_ordinal_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; std::unique_ptr thrift_serializer_; // Compression codec to use. 
std::unique_ptr<::arrow::util::Codec> compressor_; + +#ifdef PARQUET_ENCRYPTION + std::string data_pageAAD_; + std::string data_page_headerAAD_; +#endif + + std::shared_ptr meta_encryptor_; + std::shared_ptr data_encryptor_; }; // This implementation of the PageWriter writes to the final sink on Close . @@ -282,11 +411,16 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + int16_t row_group_ordinal, int16_t current_column_ordinal, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, pool)); + + pager_ = std::unique_ptr(new SerializedPageWriter( + in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, + meta_encryptor, data_encryptor)); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -330,14 +464,18 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool, - bool buffered_row_group) { + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t column_chunk_ordinal, ::arrow::MemoryPool* pool, bool buffered_row_group, + std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { - return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, pool)); + return std::unique_ptr(new BufferedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } else { - return std::unique_ptr( - new 
SerializedPageWriter(sink, codec, metadata, pool)); + return std::unique_ptr(new SerializedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 023b96585eb..81154314400 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -45,6 +45,7 @@ class RleEncoder; namespace parquet { class ColumnChunkMetaDataBuilder; +class Encryptor; class WriterProperties; class PARQUET_EXPORT LevelEncoder { @@ -83,9 +84,12 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, + int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool buffered_row_group = false); + bool buffered_row_group = false, + std::shared_ptr header_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR); // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc new file mode 100644 index 00000000000..fd0e98ad82c --- /dev/null +++ b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc @@ -0,0 +1,556 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-test for writing and reading encrypted Parquet file with + * different encryption and decryption configuration. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The unit-test creates a single parquet file with eight columns using the + * following encryption configuration: + * + * - Encryption configuration : Encrypt two columns and the footer, with different + * keys. + * + * The written parquet file produced above is read by the following decryption + * configurations: + * + * - Decryption configuration : Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. 
+ */ + +namespace parquet { +namespace test { + +using FileClass = ::arrow::io::FileOutputStream; + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + createDecryptionConfigurations(); + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + std::string res = "test.parquet.encrypted"; + file_name_ = data_file(res.c_str()); + } + + void TearDown() { + // delete test file. + ASSERT_EQ(std::remove(file_name_.c_str()), 0); + } + + protected: + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; + std::shared_ptr schema_; + // This vector will hold various decryption configurations. 
+ std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); + } + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating Decryption configuration + **********************************************************************************/ + + // Decryption configuration : Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + } + + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file) { + std::shared_ptr out_file; + + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // 
Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + return; + } + + void DecryptFile(std::string file, int example_id, int encryption_configuration) { + std::string exception_msg; + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[example_id]->DeepClone()); + + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + ASSERT_EQ(num_row_groups, 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // Iterate over all the RowGroups in the file + for (int 
r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + ASSERT_EQ(value, i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + ASSERT_EQ(value, expected_value); + if ((i % 2) == 0) { + ASSERT_EQ(repetition_level, 1); + } else { + ASSERT_EQ(repetition_level, 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + ASSERT_EQ(value.value[j], expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); + } else { + // There are NULL values in the rows written + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + i++; + } + file_reader->Close(); + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(encryption_configuration, example_id, exception_msg); + } + + // Check that the decryption result is as expected. + void CheckResult(int encryption_configuration_number, int example_id, + std::string exception_msg) { + if (!exception_msg.empty()) { + ASSERT_EQ(1, 0); + } + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 
'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration : Encrypt two columns and the footer, with different keys. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut 
file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption-properties-test.cc b/cpp/src/parquet/encryption-properties-test.cc new file mode 100644 index 00000000000..088a2c08bf8 --- /dev/null +++ b/cpp/src/parquet/encryption-properties-test.cc @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "parquet/encryption.h" + +namespace parquet { + +using schema::ColumnPath; + +namespace test { + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithOwnKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(false, column_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, column_props_1->key()); + ASSERT_EQ("kc1", column_props_1->key_metadata()); +} + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithFooterKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(true, column_props_1->is_encrypted_with_footer_key()); +} + +// Encrypt all columns and the footer with the same key. 
+// (uniform encryption) +TEST(TestEncryptionProperties, UniformEncryption) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ("kf", props->footer_key_metadata()); + + std::shared_ptr column_path = + parquet::schema::ColumnPath::FromDotString("a_column"); + std::shared_ptr out_col_props = + props->column_properties(column_path); + + ASSERT_EQ(true, out_col_props->is_encrypted()); + ASSERT_EQ(true, out_col_props->is_encrypted_with_footer_key()); +} + +// Encrypt two columns with their own keys and the same key for +// the footer and other columns +TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr 
out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+// (plaintext footer mode, readable by legacy readers) +TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.set_plaintext_footer(); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(false, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + 
ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + // other columns: encrypted with footer, footer is not encrypted + // so column is not encrypted as well + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Use aad_prefix +TEST(TestEncryptionProperties, UseAadPrefix) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->algorithm().aad.aad_prefix); + ASSERT_EQ(false, props->algorithm().aad.supply_aad_prefix); +} + +// Use aad_prefix and +// disable_aad_prefix_storage. +TEST(TestEncryptionProperties, UseAadPrefixNotStoreInFile) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + builder.disable_store_aad_prefix_storage(); + std::shared_ptr props = builder.build(); + + ASSERT_EQ("", props->algorithm().aad.aad_prefix); + ASSERT_EQ(true, props->algorithm().aad.supply_aad_prefix); +} + +// Use AES_GCM_CTR_V1 algorithm +TEST(TestEncryptionProperties, UseAES_GCM_CTR_V1Algorithm) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.algorithm(ParquetCipher::AES_GCM_CTR_V1); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, props->algorithm().algorithm); +} + +TEST(TestDecryptionProperties, UseKeyRetriever) { + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder builder; + builder.key_retriever(kr1); + std::shared_ptr props = builder.build(); + + auto out_key_retriever = props->key_retriever(); + 
ASSERT_EQ(kFooterEncryptionKey, out_key_retriever->GetKey("kf")); + ASSERT_EQ(kColumnEncryptionKey1, out_key_retriever->GetKey("kc1")); + ASSERT_EQ(kColumnEncryptionKey2, out_key_retriever->GetKey("kc2")); +} + +TEST(TestDecryptionProperties, SupplyAadPrefix) { + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->aad_prefix()); +} + +TEST(ColumnDecryptionProperties, SetKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + col_builder_1.key(kColumnEncryptionKey1); + + auto props = col_builder_1.build(); + ASSERT_EQ(kColumnEncryptionKey1, props->key()); +} + +TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + parquet::ColumnDecryptionProperties::Builder col_builder_2(column_path_2); + + decryption_cols[column_path_1] = col_builder_1.key(kColumnEncryptionKey1)->build(); + decryption_cols[column_path_2] = col_builder_2.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.column_properties(decryption_cols); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, props->column_key(column_path_1)); + ASSERT_EQ(kColumnEncryptionKey2, props->column_key(column_path_2)); +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption.cc 
b/cpp/src/parquet/encryption.cc new file mode 100644 index 00000000000..69da7eeae56 --- /dev/null +++ b/cpp/src/parquet/encryption.cc @@ -0,0 +1,411 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption.h" + +#include +#include +#include + +#include + +#include "arrow/util/logging.h" +#include "arrow/util/utf8.h" + +namespace parquet { + +// integer key retriever +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { + key_map_.insert({key_id, key}); +} + +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { + uint32_t key_id; + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + + return key_map_[key_id]; +} + +// string key retriever +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { + key_map_.insert({key_id, key}); +} + +const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { + return key_map_[key_id]; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key( + std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; + return this; +} + +ColumnEncryptionProperties::Builder* 
ColumnEncryptionProperties::Builder::key_metadata( + const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("key id should be in UTF8 encoding"); + } + + DCHECK(!key_id.empty()); + this->key_metadata(key_id); + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( + const ColumnPathToDecryptionPropertiesMap& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (const auto& element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + + column_properties_ = column_properties; + return this; +} + +void FileDecryptionProperties::WipeOutDecryptionKeys() { + footer_key_.clear(); + + for (const auto& element : column_properties_) { + element.second->WipeOutDecryptionKey(); + } +} + +bool FileDecryptionProperties::is_utilized() { + if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + return false; + + return utilized_; +} + +std::shared_ptr FileDecryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + ColumnPathToDecryptionPropertiesMap column_properties_map_copy; + + for (const auto& element : column_properties_) { + column_properties_map_copy.insert( + {element.second->column_path(), element.second->DeepClone()}); + } + + if (new_aad_prefix.empty()) 
new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix, + aad_prefix_verifier_, column_properties_map_copy, plaintext_files_allowed_)); +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key( + const std::string footer_key) { + if (footer_key.empty()) { + return this; + } + DCHECK(footer_key_.empty()); + footer_key_ = footer_key; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever( + const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier( + std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; +} + +ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key( + const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; +} + +std::shared_ptr ColumnDecryptionProperties::Builder::build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); +} + +void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); } + +std::shared_ptr ColumnDecryptionProperties::DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); +} + +FileEncryptionProperties::Builder* 
FileEncryptionProperties::Builder::footer_key_metadata( + const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( + const ColumnPathToEncryptionPropertiesMap& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (const auto& element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + column_properties_ = column_properties; + return this; +} + +void FileEncryptionProperties::WipeOutEncryptionKeys() { + footer_key_.clear(); + for (const auto& element : column_properties_) { + element.second->WipeOutEncryptionKey(); + } +} + +std::shared_ptr FileEncryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + ColumnPathToEncryptionPropertiesMap column_properties_map_copy; + + for (const auto& element : column_properties_) { + column_properties_map_copy.insert( + {element.second->column_path(), element.second->DeepClone()}); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; +} + +FileEncryptionProperties::Builder* 
+FileEncryptionProperties::Builder::disable_store_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; +} + +ColumnEncryptionProperties::ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata) + : column_path_(column_path) { + // column encryption properties object (with a column key) can be used for writing only + // one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). + utilized_ = false; + + DCHECK(column_path != nullptr); + if (!encrypted) { + DCHECK(key.empty() && key_metadata.empty()); + } + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) { + DCHECK(key_metadata.empty()); + } + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; +} + +ColumnDecryptionProperties::ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key) + : column_path_(column_path) { + utilized_ = false; + DCHECK(column_path != nullptr); + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + key_ = key; +} + +const std::string& FileDecryptionProperties::column_key( + const std::shared_ptr& column_path) { + if (column_properties_.find(column_path) != column_properties_.end()) { + auto column_prop = column_properties_[column_path]; + if (column_prop != nullptr) { + return column_prop->key(); + } + } + return empty_string_; +} + +FileDecryptionProperties::FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_properties, + bool 
plaintext_files_allowed) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || + 0 != column_properties.size()); + + if (!footer_key.empty()) { + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + } + if (footer_key.empty() && check_plaintext_footer_integrity) { + DCHECK(NULLPTR != key_retriever); + } + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_properties_ = column_properties; + plaintext_files_allowed_ = plaintext_files_allowed; + utilized_ = false; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("footer key id should be in UTF8 encoding"); + } + + if (key_id.empty()) { + return this; + } + + return footer_key_metadata(key_id); +} + +std::shared_ptr FileEncryptionProperties::column_properties( + const std::shared_ptr& column_path) { + if (column_properties_.size() == 0) { + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder(column_path)); + return builder->build(); + } + if (column_properties_.find(column_path) != column_properties_.end()) { + return column_properties_[column_path]; + } + + return nullptr; +} + +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& column_properties) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + 
aad_prefix_(aad_prefix), + store_aad_prefix_in_file_(store_aad_prefix_in_file), + column_properties_(column_properties) { + // file encryption properties object can be used for writing only one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). + utilized_ = false; + + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + + uint8_t aad_file_unique[kAadFileUniqueLength]; + memset(aad_file_unique, 0, kAadFileUniqueLength); + RAND_bytes(aad_file_unique, sizeof(kAadFileUniqueLength)); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + kAadFileUniqueLength); + + bool supply_aad_prefix = false; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { + file_aad_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } +} + +} // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h new file mode 100644 index 00000000000..4958faac2ae --- /dev/null +++ b/cpp/src/parquet/encryption.h @@ -0,0 +1,513 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_ENCRYPTION_H +#define PARQUET_ENCRYPTION_H + +#include +#include +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace parquet { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = + ParquetCipher::AES_GCM_V1; +static constexpr int32_t kMaximalAadMetadataLength = 256; +static constexpr bool kDefaultEncryptedFooter = true; +static constexpr bool kDefaultCheckSignature = true; +static constexpr bool kDefaultAllowPlaintextFiles = false; +static constexpr int32_t kAadFileUniqueLength = 8; + +class ColumnDecryptionProperties; +using ColumnPathToDecryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + +class ColumnEncryptionProperties; +using ColumnPathToEncryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + +class PARQUET_EXPORT DecryptionKeyRetriever { + public: + virtual const std::string& GetKey(const std::string& key_metadata) = 0; + virtual ~DecryptionKeyRetriever() {} +}; + +/// Simple integer key retriever +class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +// Simple string key retriever +class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(const std::string& key_id, const std::string& key); + const std::string& 
GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +class PARQUET_EXPORT HiddenColumnException : public ParquetException { + public: + explicit HiddenColumnException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + +class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { + public: + explicit KeyAccessDeniedException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + +inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + /// Convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + + /// Convenience builder for encrypted columns. + explicit Builder(const std::shared_ptr& path) + : Builder(path, true) {} + + /// Set a column-specific key. + /// If key is not set on an encrypted column, the column will + /// be encrypted with the footer key. + /// keyBytes Key length must be either 16, 24 or 32 bytes. + /// The key is cloned, and will be wiped out (array values set to 0) upon completion + /// of file writing. + /// Caller is responsible for wiping out the input key array. + Builder* key(std::string column_key); + + /// Set a key retrieval metadata. + /// use either key_metadata() or key_id(), not both + Builder* key_metadata(const std::string& key_metadata); + + /// A convenience function to set key metadata using a string id. + /// Set a key retrieval metadata (converted from String). + /// use either key_metadata() or key_id(), not both + /// key_id will be converted to metadata (UTF-8 array). 
+ Builder* key_id(const std::string& key_id); + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); + } + + private: + const std::shared_ptr column_path_; + bool encrypted_; + std::string key_; + std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} + }; + + const std::shared_ptr& column_path() { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } + + /// Upon completion of file writing, the encryption key + /// will be wiped out. + void WipeOutEncryptionKey() { key_.clear(); } + + bool is_utilized() { + if (key_.empty()) + return false; // can re-use column properties without encryption keys + return utilized_; + } + + /// ColumnEncryptionProperties object can be used for writing one file only. + /// Mark ColumnEncryptionProperties as utilized once it is used in + /// FileEncryptionProperties as the encryption key will be wiped out upon + /// completion of file writing. 
+ void set_utilized() { utilized_ = true; } + + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, column_path_, key_copy, key_metadata_)); + } + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + private: + const std::shared_ptr column_path_; + bool encrypted_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + bool utilized_; + explicit ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata); +}; + +class PARQUET_EXPORT ColumnDecryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + /// convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name)) {} + + explicit Builder(const std::shared_ptr& path) + : column_path_(path) {} + + /// Set an explicit column key. If applied on a file that contains + /// key metadata for this column the metadata will be ignored, + /// the column will be decrypted with this key. + /// key length must be either 16, 24 or 32 bytes. + Builder* key(const std::string& key); + + std::shared_ptr build(); + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& column_path() { return column_path_; } + const std::string& key() const { return key_; } + bool is_utilized() { return utilized_; } + + /// ColumnDecryptionProperties object can be used for reading one file only. 
+ /// Mark ColumnDecryptionProperties as utilized once it is used in + /// FileDecryptionProperties as the encryption key will be wiped out upon + /// completion of file reading. + void set_utilized() { utilized_ = true; } + + /// Upon completion of file reading, the encryption key + /// will be wiped out. + void WipeOutDecryptionKey(); + + std::shared_ptr DeepClone(); + + private: + const std::shared_ptr column_path_; + std::string key_; + bool utilized_; + + /// This class is only required for setting explicit column decryption keys - + /// to override key retriever (or to provide keys when key metadata and/or + /// key retriever are not available) + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + /// Verifies identity (AAD Prefix) of individual file, + /// or of file collection in a data set. + /// Throws exception if an AAD prefix is wrong. + /// In a data set, AAD Prefixes should be collected, + /// and then checked for missing files. + virtual void Verify(const std::string& aad_prefix) = 0; + virtual ~AADPrefixVerifier() {} +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + Builder() { + check_plaintext_footer_integrity_ = kDefaultCheckSignature; + plaintext_files_allowed_ = kDefaultAllowPlaintextFiles; + } + + /// Set an explicit footer key. If applied on a file that contains + /// footer key metadata the metadata will be ignored, the footer + /// will be decrypted/verified with this key. + /// If explicit key is not set, footer key will be fetched from + /// key retriever. + /// With explicit keys or AAD prefix, new encryption properties object must be + /// created for each encrypted file. + /// Explicit encryption keys (footer and column) are cloned. + /// Upon completion of file reading, the cloned encryption keys in the properties + /// will be wiped out (array values set to 0). 
+ /// Caller is responsible for wiping out the input key array. + /// param footerKey Key length must be either 16, 24 or 32 bytes. + Builder* footer_key(const std::string footer_key); + + /// Set explicit column keys (decryption properties). + /// Its also possible to set a key retriever on this property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. + Builder* column_properties( + const ColumnPathToDecryptionPropertiesMap& column_properties); + + /// Set a key retriever callback. Its also possible to + /// set explicit footer or column keys on this file property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. + Builder* key_retriever(const std::shared_ptr& key_retriever); + + /// Skip integrity verification of plaintext footers. + /// If not called, integrity of plaintext footers will be checked in runtime, + /// and an exception will be thrown in the following situations: + /// - footer signing key is not available + /// (not passed, or not found by key retriever) + /// - footer content and signature don't match + Builder* disable_footer_signature_verification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + /// Explicitly supply the file AAD prefix. + /// A must when a prefix is used for file encryption, but not stored in file. + /// If AAD prefix is stored in file, it will be compared to the explicitly + /// supplied value and an exception will be thrown if they differ. + Builder* aad_prefix(const std::string& aad_prefix); + + /// Set callback for verification of AAD Prefixes stored in file. 
+ Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier); + + /// By default, reading plaintext (unencrypted) files is not + /// allowed when using a decryptor + /// - in order to detect files that were not encrypted by mistake. + /// However, the default behavior can be overriden by calling this method. + /// The caller should use then a different method to ensure encryption + /// of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + ColumnPathToDecryptionPropertiesMap column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + }; + + const std::string& column_key(const std::shared_ptr& column_path); + + const std::string& footer_key() { return footer_key_; } + + const std::string& aad_prefix() { return aad_prefix_; } + std::shared_ptr key_retriever() { return key_retriever_; } + + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } + + bool plaintext_files_allowed() { return plaintext_files_allowed_; } + + const std::shared_ptr& aad_prefix_verifier() { + return aad_prefix_verifier_; + } + + /// Upon completion of file reading, the encryption keys in the properties + /// will be wiped out (array values set to 0). + void WipeOutDecryptionKeys(); + + bool is_utilized(); + + /// FileDecryptionProperties object can be used for reading one file only. + /// Mark FileDecryptionProperties as utilized once it is used to read a file as the + /// encryption keys will be wiped out upon completion of file reading. 
+ void set_utilized() { utilized_ = true; } + + /// FileDecryptionProperties object can be used for reading one file only. + /// (unless this object keeps the keyRetrieval callback only, and no explicit + /// keys or aadPrefix). + /// At the end, keys are wiped out in the memory. + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + const std::string empty_string_ = ""; + ColumnPathToDecryptionPropertiesMap column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + bool utilized_; + + FileDecryptionProperties(const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_properties, + bool plaintext_files_allowed); +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(kDefaultEncryptionAlgorithm), + encrypted_footer_(kDefaultEncryptedFooter) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + /// Create files with plaintext footer. + /// If not called, the files will be created with encrypted footer (default). + Builder* set_plaintext_footer() { + encrypted_footer_ = false; + return this; + } + + /// Set encryption algorithm. + /// If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* algorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; + return this; + } + + /// Set a key retrieval metadata (converted from String). 
+ /// use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(const std::string& key_id); + + /// Set a key retrieval metadata. + /// use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_metadata(const std::string& footer_key_metadata); + + /// Set the file AAD Prefix. + Builder* aad_prefix(const std::string& aad_prefix); + + /// Skip storing AAD Prefix in file. + /// If not called, and if AAD Prefix is set, it will be stored. + Builder* disable_store_aad_prefix_storage(); + + /// Set the list of encrypted columns and their properties (keys etc). + /// If not called, all columns will be encrypted with the footer key. + /// If called, the file columns not in the list will be left unencrypted. + Builder* column_properties( + const ColumnPathToEncryptionPropertiesMap& column_properties); + + std::shared_ptr build() { + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, column_properties_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap column_properties_; + }; + bool encrypted_footer() const { return encrypted_footer_; } + + const EncryptionAlgorithm algorithm() { return algorithm_; } + + const std::string& footer_key() { return footer_key_; } + + const std::string& footer_key_metadata() { return footer_key_metadata_; } + + const std::string& file_aad() const { return file_aad_; } + + std::shared_ptr column_properties( + const std::shared_ptr& column_path); + + bool is_utilized() { return utilized_; } + + /// FileEncryptionProperties object can be used for writing one file only. 
+ /// Mark FileEncryptionProperties as utilized once it is used to write a file as the + /// encryption keys will be wiped out upon completion of file writing. + void set_utilized() { utilized_ = true; } + + /// Upon completion of file writing, the encryption keys + /// will be wiped out (array values set to 0). + void WipeOutEncryptionKeys(); + + /// FileEncryptionProperties object can be used for writing one file only. + /// (at the end, keys are wiped out in the memory). + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); + + private: + EncryptionAlgorithm algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + bool encrypted_footer_; + std::string file_aad_; + std::string aad_prefix_; + bool utilized_; + bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap column_properties_; + + FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& column_properties); +}; + +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index bf3239d42c4..695b9b5db72 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -389,9 +389,8 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -AesEncryptor* AesEncryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors) { +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && 
ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,9 +398,7 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) { - all_encryptors->push_back(encryptor); - } + if (all_encryptors != NULLPTR) all_encryptors->push_back(encryptor); return encryptor; } @@ -409,9 +406,8 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata))} {} -AesDecryptor* AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors) { +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index af668dc4136..9fe82bd28da 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors); + std::vector* all_encryptors); ~AesEncryptor(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors); + std::vector* all_decryptors); ~AesDecryptor(); void WipeOut(); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index d0ca9ca809d..da859bccd36 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -34,18 +34,27 @@ #include "parquet/column_scanner.h" #include 
"parquet/deprecated_io.h" #include "parquet/exception.h" +#include "parquet/file_writer.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class InternalFileDecryptor; +} +#endif + namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; -static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; @@ -83,8 +92,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, int row_group_number, - const ReaderProperties& props) - : source_(source), file_metadata_(file_metadata), properties_(props) { + const ReaderProperties& props, + InternalFileDecryptor* file_decryptor = NULLPTR) + : source_(source), + file_metadata_(file_metadata), + properties_(props), + row_group_ordinal_((int16_t)row_group_number), + file_decryptor_(file_decryptor) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -94,7 +108,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i); + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && @@ -119,8 +133,48 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); + +#ifdef PARQUET_ENCRYPTION + 
std::unique_ptr crypto_metadata = col->crypto_metadata(); + + // Column is encrypted only if crypto_metadata exists. + if (!crypto_metadata) { + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), NULLPTR, NULLPTR}; + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool(), &ctx); + } + + // The column is encrypted + + // The column is encrypted with footer key + if (crypto_metadata->encrypted_with_footer_key()) { + auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); + auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool(), &ctx); + } + + // The column is encrypted with its own key + std::string column_key_metadata = crypto_metadata->key_metadata(); + std::shared_ptr column_path = + std::make_shared(crypto_metadata->path_in_schema()); + + auto meta_decryptor = + file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); + auto data_decryptor = + file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool(), &ctx); +#else return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool()); +#endif } private: @@ -128,6 +182,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { FileMetaData* file_metadata_; std::unique_ptr row_group_metadata_; ReaderProperties properties_; + int16_t row_group_ordinal_; + InternalFileDecryptor* file_decryptor_; }; // ---------------------------------------------------------------------- @@ -142,11 
+198,27 @@ class SerializedFile : public ParquetFileReader::Contents { const ReaderProperties& props = default_reader_properties()) : source_(source), properties_(props) {} - void Close() override {} + ~SerializedFile() override { + try { + Close(); + } catch (...) { + } + } + + void Close() override { +#ifdef PARQUET_ENCRYPTION + if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); +#endif + } std::shared_ptr GetRowGroup(int i) override { +#ifdef PARQUET_ENCRYPTION + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); +#else std::unique_ptr contents( new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); +#endif return std::make_shared(std::move(contents)); } @@ -175,42 +247,247 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(file_size - footer_read_size, footer_read_size, &footer_buffer)); // Check if all bytes are read. Check if last 4 bytes read have the magic bits +#ifdef PARQUET_ENCRYPTION if (footer_buffer->size() != footer_read_size || - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0) { + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); } - - uint32_t metadata_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); - int64_t metadata_start = file_size - kFooterSize - metadata_len; - if (kFooterSize + metadata_len > file_size) { +#else + if (footer_buffer->size() != footer_read_size || + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + throw ParquetException("Invalid parquet file. 
Corrupt footer."); + } else if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == + 0) { throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); + "Encrypted parquet file. " + "Should build with parquet encryption support."); } - - std::shared_ptr metadata_buffer; - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + kFooterSize)) { - metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); +#endif + +#if PARQUET_ENCRYPTION + // No encryption or encryption with plaintext footer mode. + if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { + std::shared_ptr metadata_buffer; + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); + + auto file_decryption_properties = properties_.file_decryption_properties(); + if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } + } else { + // Encrypted file with plaintext footer mode. + ParseMetaDataOfEncryptedFileWithPlaintextFooter( + file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); } + } else { + // Encrypted file with Encrypted footer. 
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, + file_size); } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); +#else // not defined PARQUET_ENCRYPTION + std::shared_ptr metadata_buffer; + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); +#endif } private: std::shared_ptr source_; std::shared_ptr file_metadata_; ReaderProperties properties_; + +#ifdef PARQUET_ENCRYPTION + std::unique_ptr file_decryptor_; +#endif + + void ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, + int64_t footer_read_size, int64_t file_size, + std::shared_ptr* metadata_buffer, + uint32_t* metadata_len, uint32_t* read_metadata_len); + +#if PARQUET_ENCRYPTION + std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, + EncryptionAlgorithm& algo); + + void ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len); + + void ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size); +#endif }; +void SerializedFile::ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size, std::shared_ptr* metadata_buffer, uint32_t* metadata_len, + uint32_t* read_metadata_len) { + *metadata_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = file_size - kFooterSize - *metadata_len; + if (kFooterSize + *metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (*metadata_len + kFooterSize)) { + *metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - *metadata_len - kFooterSize, *metadata_len); + } else { + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_start, *metadata_len, metadata_buffer)); + if ((*metadata_buffer)->size() != *metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + + *read_metadata_len = *metadata_len; + file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len); +} + +#ifdef PARQUET_ENCRYPTION +void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size) { + // encryption with encrypted footer + // both metadata & crypto metadata length + uint32_t footer_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; + if (kFooterSize + footer_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (footer_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != footer_len) { + throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); + } + } + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == nullptr) { + throw ParquetException( + "No decryption properties are provided. Could not read " + "encrypted footer metadata"); + } + uint32_t crypto_metadata_len = footer_len; + std::shared_ptr file_crypto_metadata = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + // Handle AAD prefix + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata(), properties_.memory_pool())); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + std::shared_ptr metadata_buffer; + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException( + "Invalid encrypted parquet file. " + "Could not read footer metadata bytes."); + } + + auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); +} + +void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len) { + // Providing decryption properties in plaintext footer mode is not mandatory, for + // example when reading by legacy reader. 
+ if (file_decryption_properties != NULLPTR) { + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + // Handle AAD prefix + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata(), properties_.memory_pool())); + + if (file_decryption_properties->check_plaintext_footer_integrity()) { + if (metadata_len - read_metadata_len != 28) { + throw ParquetException( + "Invalid parquet file. Cannot verify plaintext mode footer."); + } + + auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); + if (!file_metadata_->VerifySignature(encryptor, + metadata_buffer->data() + read_metadata_len)) { + throw ParquetException( + "Invalid parquet file. Could not verify plaintext " + "footer metadata"); + } + } + } +} + +std::string SerializedFile::HandleAadPrefix( + FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) { + std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properties; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + + if (file_has_aad_prefix) { + if (!aad_prefix_in_properties.empty()) { + if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) { + throw ParquetException( + "AAD Prefix in file and in properties " + "is not the same"); + } + } + aad_prefix = aad_prefix_in_file; + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->Verify(aad_prefix); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); + } + } + return aad_prefix + algo.aad.aad_file_unique; +} +#endif // PARQUET_ENCRYPTION + // ---------------------------------------------------------------------- // ParquetFileReader public API @@ -296,6 +573,7 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) { DCHECK(i < metadata()->num_row_groups()) << "The file only has " << metadata()->num_row_groups() << "row groups, requested reader for: " << i; + return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index f2f42f38441..3fca600c285 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -25,15 +25,21 @@ #include "parquet/platform.h" #include "parquet/schema.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include 
"parquet/internal_file_encryptor.h" +#else +namespace parquet { +class InternalFileEncryptor; +} +#endif + using arrow::MemoryPool; using parquet::schema::GroupNode; namespace parquet { -// FIXME: copied from reader-internal.cc -static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; - // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -77,16 +83,19 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) { class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, - RowGroupMetaDataBuilder* metadata, - const WriterProperties* properties, bool buffered_row_group = false) + RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, + const WriterProperties* properties, bool buffered_row_group = false, + InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), properties_(properties), total_bytes_written_(0), closed_(false), + row_group_ordinal_(row_group_ordinal), current_column_index_(0), num_rows_(0), - buffered_row_group_(buffered_row_group) { + buffered_row_group_(buffered_row_group), + file_encryptor_(file_encryptor) { if (buffered_row_group) { InitColumns(); } else { @@ -122,9 +131,24 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++current_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), col_meta, - properties_->memory_pool()); +#ifdef PARQUET_ENCRYPTION + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, static_cast(current_column_index_ - 1), + properties_->memory_pool(), false, meta_encryptor, data_encryptor); +#else + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, static_cast(current_column_index_ - 1), + properties_->memory_pool(), false); +#endif column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -179,7 +203,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { // Ensures all columns have been written metadata_->set_num_rows(num_rows_); - metadata_->Finish(total_bytes_written_); + metadata_->Finish(total_bytes_written_, row_group_ordinal_); } } @@ -189,9 +213,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const WriterProperties* properties_; int64_t total_bytes_written_; bool closed_; + int16_t row_group_ordinal_; int current_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; + InternalFileEncryptor* file_encryptor_; void CheckRowsWritten() const { // verify when only one column is written at a time @@ -219,9 +245,25 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); +#ifdef PARQUET_ENCRYPTION + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + static_cast(row_group_ordinal_), + static_cast(current_column_index_), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor); +#else std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, properties_->memory_pool(), buffered_row_group_); + col_meta, static_cast(row_group_ordinal_), + static_cast(current_column_index_), + properties_->memory_pool(), buffered_row_group_); +#endif column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -260,9 +302,20 @@ class FileSerializer : public ParquetFileWriter::Contents { } row_group_writer_.reset(); +#ifdef PARQUET_ENCRYPTION // Write magic bytes and metadata + auto file_encryption_properties = properties_->file_encryption_properties(); + + if (file_encryption_properties == nullptr) { // Non encrypted file. 
+ file_metadata_ = metadata_->Finish(); + WriteFileMetaData(*file_metadata_, sink_.get()); + } else { // Encrypted file + CloseEncryptedFile(file_encryption_properties); + } +#else file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); +#endif } } @@ -282,8 +335,15 @@ class FileSerializer : public ParquetFileWriter::Contents { } num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); +#ifdef PARQUET_ENCRYPTION + std::unique_ptr contents(new RowGroupSerializer( + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), + buffered_row_group, file_encryptor_.get())); +#else std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, properties_.get(), buffered_row_group)); + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), + buffered_row_group)); +#endif row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } @@ -314,6 +374,38 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } +#ifdef PARQUET_ENCRYPTION + void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { + // Encrypted file with encrypted footer + if (file_encryption_properties->encrypted_footer()) { + // encrypted footer + file_metadata_ = metadata_->Finish(); + + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint64_t metadata_start = static_cast(position); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + + auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true); + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint32_t footer_and_crypto_len = static_cast(position - metadata_start); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); + } else { // 
Encrypted file with plaintext footer + file_metadata_ = metadata_->Finish(); + auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor, + false); + } + if (file_encryptor_) { + file_encryptor_->WipeOutEncryptionKeys(); + } + } +#endif + std::shared_ptr sink_; bool is_open_; const std::shared_ptr properties_; @@ -323,9 +415,30 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; +#ifdef PARQUET_ENCRYPTION + std::unique_ptr file_encryptor_; +#endif + void StartFile() { - // Parquet files always start with PAR1 - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); +#ifdef PARQUET_ENCRYPTION + auto file_encryption_properties = properties_->file_encryption_properties(); + if (file_encryption_properties == nullptr) { + // Unencrypted parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); + } else { + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, + properties_->memory_pool())); + if (file_encryption_properties->encrypted_footer()) { + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); + } else { + // Encrypted file with plaintext footer mode. 
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); + } + } +#else + // Unencrypted parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); +#endif } }; @@ -362,10 +475,9 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); - - // Write MetaData uint32_t metadata_len = static_cast(position); file_metadata.WriteTo(sink); @@ -374,18 +486,59 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); +} + +void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); + return WriteFileMetaData(file_metadata, sink); +} + +#ifdef PARQUET_ENCRYPTION +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + if (encrypt_footer) { // Encrypted file with encrypted footer + // encrypt and write to sink + file_metadata.WriteTo(sink, encryptor); + } else { // Encrypted file with plaintext footer mode. 
+ int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); + file_metadata.WriteTo(sink, encryptor); + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; + + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); + } } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, bool encrypt_footer) { ParquetOutputWrapper wrapper(sink); return WriteFileMetaData(file_metadata, &wrapper); } -void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); - return WriteFileMetaData(file_metadata, sink); +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + ParquetOutputWrapper wrapper(sink); + return WriteEncryptedFileMetadata(file_metadata, &wrapper, encryptor, encrypt_footer); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ArrowOutputStream* sink) { + crypto_metadata.WriteTo(sink); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink) { + ParquetOutputWrapper wrapper(sink); + crypto_metadata.WriteTo(&wrapper); } +#endif // PARQUET_ENCRYPTION const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index cdc787f15de..50f6cac2a2e 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -44,6 +44,10 @@ namespace parquet { class ColumnWriter; class OutputStream; +// FIXME: copied from reader-internal.cc +static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static 
constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; + class PARQUET_EXPORT RowGroupWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more @@ -117,6 +121,25 @@ PARQUET_EXPORT void WriteMetaDataFile(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink); +#ifdef PARQUET_ENCRYPTION +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer); + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink); +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor = NULLPTR, + bool encrypt_footer = false); +PARQUET_EXPORT +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ::arrow::io::OutputStream* sink); +#endif + class PARQUET_EXPORT ParquetFileWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc new file mode 100644 index 00000000000..9af59ae01e3 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/internal_file_decryptor.h" +#include "parquet/encryption.h" +#include "parquet/encryption_internal.h" + +namespace parquet { + +// FooterSigningEncryptor + +FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, + const std::string& key, + const std::string& file_aad, + const std::string& aad) + : key_(key), file_aad_(file_aad), aad_(aad) { + aes_encryptor_.reset(encryption::AesEncryptor::Make( + algorithm, static_cast(key_.size()), true, NULLPTR)); +} + +int FooterSigningEncryptor::CiphertextSizeDelta() { + return aes_encryptor_->CiphertextSizeDelta(); +} + +int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + uint8_t* nonce, + uint8_t* encrypted_footer) { + return aes_encryptor_->SignedFooterEncrypt( + footer, footer_len, str2bytes(key_), static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), nonce, encrypted_footer); +} + +// Decryptor +Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_decryptor_(aes_decryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} + +int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } + +int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, + uint8_t* plaintext) { + return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), plaintext); +} + 
+// InternalFileDecryptor +InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool) + : properties_(properties), + file_aad_(file_aad), + algorithm_(algorithm), + footer_key_metadata_(footer_key_metadata), + pool_(pool) { + if (properties_->is_utilized()) { + throw ParquetException( + "Re-using decryption properties with explicit keys for another file"); + } + properties_->set_utilized(); +} + +void InternalFileDecryptor::WipeOutDecryptionKeys() { + properties_->WipeOutDecryptionKeys(); + for (auto const& i : all_decryptors_) { + i->WipeOut(); + } +} + +std::shared_ptr +InternalFileDecryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; + std::string footer_key = properties_->footer_key(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata_.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->key_retriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); + } catch (KeyAccessDeniedException& e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException( + "Footer key unavailable. 
Could not verify " + "plaintext footer metadata"); + } + + std::string aad = encryption::CreateFooterAad(file_aad_); + + footer_signing_encryptor_ = + std::make_shared(algorithm_, footer_key, file_aad_, aad); + return footer_signing_encryptor_; +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { + std::string aad = encryption::CreateFooterAad(file_aad_); + return GetFooterDecryptor(aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( + const std::string& aad) { + return GetFooterDecryptor(aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData( + const std::string& aad) { + return GetFooterDecryptor(aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( + const std::string& aad, bool metadata) { + if (metadata) { + if (footer_metadata_decryptor_ != NULLPTR) return footer_metadata_decryptor_; + } else { + if (footer_data_decryptor_ != NULLPTR) return footer_data_decryptor_; + } + + std::string footer_key = properties_->footer_key(); + if (footer_key.empty()) { + if (footer_key_metadata_.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->key_retriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); + } catch (KeyAccessDeniedException& e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException( + "Invalid footer encryption key. " + "Could not parse footer metadata"); + } + + // Create both data and metadata decryptors to avoid redundant retrieval of key + // from the key_retriever. 
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); + + footer_metadata_decryptor_ = std::make_shared( + aes_metadata_decryptor, footer_key, file_aad_, aad, pool_); + footer_data_decryptor_ = + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad, pool_); + + if (metadata) return footer_metadata_decryptor_; + return footer_data_decryptor_; +} + +std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad) { + return GetColumnDecryptor(column_path, column_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad) { + return GetColumnDecryptor(column_path, column_key_metadata, aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad, bool metadata) { + std::string column_key; + // first look if we already got the decryptor from before + if (metadata) { + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); + } + } else { + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); + } + } + + column_key = properties_->column_key(column_path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + properties_->key_retriever() != nullptr) { + try { + column_key = properties_->key_retriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException& e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " << e.what() + << "\n"; + throw HiddenColumnException(ss.str()); + } + } + if (column_key.empty()) { + throw HiddenColumnException("HiddenColumnException, path=" + + column_path->ToDotString()); + } + + // Create both data and metadata decryptors to avoid redundant retrieval of key + // using the key_retriever. + auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(column_key.size()); + + column_metadata_map_[column_path] = std::make_shared( + aes_metadata_decryptor, column_key, file_aad_, aad, pool_); + column_data_map_[column_path] = + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad, pool_); + + if (metadata) return column_metadata_map_[column_path]; + return column_data_map_[column_path]; +} + +int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("decryption key must be 16, 24 or 32 bytes in length"); +} + +encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (meta_decryptor_[index] == NULLPTR) { + meta_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_)); + } + return meta_decryptor_[index].get(); +} + +encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (data_decryptor_[index] == NULLPTR) { + 
data_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_)); + } + return data_decryptor_[index].get(); +} + +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h new file mode 100644 index 00000000000..76033700329 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef INTERNAL_FILE_DECRYPTOR_H +#define INTERNAL_FILE_DECRYPTOR_H + +#include +#include +#include +#include + +#include "parquet/schema.h" + +namespace parquet { + +namespace encryption { +class AesDecryptor; +class AesEncryptor; +} // namespace encryption + +class FileDecryptionProperties; + +class FooterSigningEncryptor { + public: + FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, + const std::string& file_aad, const std::string& aad); + int CiphertextSizeDelta(); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* nonce, + uint8_t* encrypted_footer); + + private: + std::string key_; + std::string file_aad_; + std::string aad_; + + std::shared_ptr aes_encryptor_; +}; + +class Decryptor { + public: + Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); + + const std::string& file_aad() const { return file_aad_; } + void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } + + int CiphertextSizeDelta(); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); + + private: + encryption::AesDecryptor* aes_decryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; + ::arrow::MemoryPool* pool_; +}; + +class InternalFileDecryptor { + public: + explicit InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool); + + std::string& file_aad() { return file_aad_; } + + ParquetCipher::type algorithm() { return algorithm_; } + + std::string& footer_key_metadata() { return footer_key_metadata_; } + + std::shared_ptr GetFooterSigningEncryptor(); + + FileDecryptionProperties* properties() { return properties_; } + + void WipeOutDecryptionKeys(); + + std::shared_ptr GetFooterDecryptor(); + 
std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); + std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); + std::shared_ptr GetColumnMetaDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad = ""); + std::shared_ptr GetColumnDataDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad = ""); + + private: + FileDecryptionProperties* properties_; + // Concatenation of aad_prefix (if exists) and aad_file_unique + std::string file_aad_; + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + column_data_map_; + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + column_metadata_map_; + + std::shared_ptr footer_metadata_decryptor_; + std::shared_ptr footer_data_decryptor_; + ParquetCipher::type algorithm_; + std::string footer_key_metadata_; + std::shared_ptr footer_signing_encryptor_; + std::vector all_decryptors_; + + /// Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_decryptors and data_decryptors. 
+ std::unique_ptr meta_decryptor_[3]; + std::unique_ptr data_decryptor_[3]; + + ::arrow::MemoryPool* pool_; + + std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); + std::shared_ptr GetColumnDecryptor( + std::shared_ptr column_path, + const std::string& column_key_metadata, const std::string& aad, + bool metadata = false); + + encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + + int MapKeyLenToDecryptorArrayIndex(int key_len); +}; + +} // namespace parquet + +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc new file mode 100644 index 00000000000..63a84557c9d --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/internal_file_encryptor.h" +#include "parquet/encryption.h" +#include "parquet/encryption_internal.h" + +namespace parquet { + +// Encryptor +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_encryptor_(aes_encryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} + +int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } + +int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { + return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), ciphertext); +} + +// InternalFileEncryptor +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties, + ::arrow::MemoryPool* pool) + : properties_(properties), pool_(pool) { + if (properties_->is_utilized()) { + throw ParquetException("Re-using encryption properties for another file"); + } + properties_->set_utilized(); +} + +void InternalFileEncryptor::WipeOutEncryptionKeys() { + properties_->WipeOutEncryptionKeys(); + + for (auto const& i : all_encryptors_) { + i->WipeOut(); + } +} + +std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + if (footer_encryptor_ != NULLPTR) { + return footer_encryptor_; + } + + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); + std::string footer_key = properties_->footer_key(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); + footer_encryptor_ = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_); + return footer_encryptor_; +} + +std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) { + return footer_signing_encryptor_; + 
} + + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); + std::string footer_signing_key = properties_->footer_key(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); + footer_signing_encryptor_ = std::make_shared( + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_); + return footer_signing_encryptor_; +} + +std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, true); +} + +std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, false); +} + +std::shared_ptr +InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::shared_ptr& column_path, bool metadata) { + // first look if we already got the encryptor from before + if (metadata) { + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); + } + } else { + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); + } + } + auto column_prop = properties_->column_properties(column_path); + if (column_prop == NULLPTR) { + return nullptr; + } + + std::string key; + if (column_prop->is_encrypted_with_footer_key()) { + key = properties_->footer_key(); + } else { + key = column_prop->key(); + } + + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + auto aes_encryptor = metadata ? 
GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + + std::string file_aad = properties_->file_aad(); + std::shared_ptr encryptor = + std::make_shared(aes_encryptor, key, file_aad, "", pool_); + if (metadata) + column_metadata_map_[column_path] = encryptor; + else + column_data_map_[column_path] = encryptor; + + return encryptor; +} + +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (meta_encryptor_[index] == NULLPTR) { + meta_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_)); + } + return meta_encryptor_[index].get(); +} + +encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (data_encryptor_[index] == NULLPTR) { + data_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_)); + } + return data_encryptor_[index].get(); +} + +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h new file mode 100644 index 00000000000..7d2ce7f4f12 --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef INTERNAL_FILE_ENCRYPTOR_H +#define INTERNAL_FILE_ENCRYPTOR_H + +#include +#include +#include +#include + +#include "parquet/encryption.h" +#include "parquet/schema.h" + +namespace parquet { + +namespace encryption { +class AesEncryptor; +} // namespace encryption + +class FileEncryptionProperties; +class ColumnEncryptionProperties; + +class Encryptor { + public: + Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); + const std::string& file_aad() { return file_aad_; } + void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } + + int CiphertextSizeDelta(); + int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + + bool EncryptColumnMetaData( + bool encrypted_footer, + const std::shared_ptr& column_encryption_properties) { + // if column is not encrypted then do not encrypt the column metadata + if (!column_encryption_properties || !column_encryption_properties->is_encrypted()) + return false; + // if plaintext footer then encrypt the column metadata + if (!encrypted_footer) return true; + // if column is not encrypted with footer key then encrypt the column metadata + return !column_encryption_properties->is_encrypted_with_footer_key(); + } + + private: + encryption::AesEncryptor* aes_encryptor_; + 
std::string key_; + std::string file_aad_; + std::string aad_; + ::arrow::MemoryPool* pool_; +}; + +class InternalFileEncryptor { + public: + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, + ::arrow::MemoryPool* pool); + + std::shared_ptr GetFooterEncryptor(); + std::shared_ptr GetFooterSigningEncryptor(); + std::shared_ptr GetColumnMetaEncryptor( + const std::shared_ptr& column_path); + std::shared_ptr GetColumnDataEncryptor( + const std::shared_ptr& column_path); + void WipeOutEncryptionKeys(); + + private: + FileEncryptionProperties* properties_; + + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + column_data_map_; + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + column_metadata_map_; + + std::shared_ptr footer_signing_encryptor_; + std::shared_ptr footer_encryptor_; + + std::vector all_encryptors_; + + // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_encryptors and data_encryptors. 
+ std::unique_ptr meta_encryptor_[3]; + std::unique_ptr data_encryptor_[3]; + + ::arrow::MemoryPool* pool_; + + std::shared_ptr GetColumnEncryptor( + const std::shared_ptr& column_path, bool metadata); + + encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + + int MapKeyLenToEncryptorArrayIndex(int key_len); +}; + +} // namespace parquet + +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e6764862a57..3e085df6111 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -22,6 +22,8 @@ #include "arrow/util/logging.h" +#include +#include // IWYU pragma: keep #include "parquet/exception.h" #include "parquet/metadata.h" #include "parquet/platform.h" @@ -30,7 +32,13 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" -#include // IWYU pragma: keep +#ifdef PARQUET_ENCRYPTION +#include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class Decryptor; +} +#endif namespace parquet { @@ -111,31 +119,117 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d } // MetaData Accessor + +#ifdef PARQUET_ENCRYPTION +// ColumnCryptoMetaData +class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { + public: + explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata) + : crypto_metadata_(crypto_metadata) {} + + ~ColumnCryptoMetaDataImpl() {} + + bool encrypted_with_footer_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY; + } + bool encrypted_with_column_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY; + } + const std::vector& path_in_schema() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema; + } + const std::string& key_metadata() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + } 
+ + private: + const format::ColumnCryptoMetaData* crypto_metadata_; +}; + +std::unique_ptr ColumnCryptoMetaData::Make( + const uint8_t* metadata) { + return std::unique_ptr(new ColumnCryptoMetaData(metadata)); +} + +ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) + : impl_(new ColumnCryptoMetaDataImpl( + reinterpret_cast(metadata))) {} + +ColumnCryptoMetaData::~ColumnCryptoMetaData() {} + +const std::vector& ColumnCryptoMetaData::path_in_schema() const { + return impl_->path_in_schema(); +} +bool ColumnCryptoMetaData::encrypted_with_footer_key() const { + return impl_->encrypted_with_footer_key(); +} +const std::string& ColumnCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); +} +#endif // PARQUET_ENCRYPTION + // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { - const format::ColumnMetaData& meta_data = column->meta_data; - for (auto encoding : meta_data.encodings) { - encodings_.push_back(FromThrift(encoding)); +#ifdef PARQUET_ENCRYPTION + if (column->__isset.crypto_metadata) { // column metadata is encrypted + format::ColumnCryptoMetaData ccmd = column->crypto_metadata; + + if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + is_metadata_set_ = false; + if (file_decryptor != NULLPTR && file_decryptor->properties() != NULLPTR) { + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + + std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), 
encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, + aad_column_metadata); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &decrypted_metadata_, decryptor); + is_metadata_set_ = true; + } + } else { + is_metadata_set_ = true; + } + } else { // column metadata is not encrypted + is_metadata_set_ = true; + } +#else + is_metadata_set_ = true; +#endif // PARQUET_ENCRYPTION + + if (is_metadata_set_) { + const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + for (auto encoding : meta_data.encodings) { + encodings_.push_back(FromThrift(encoding)); + } } possible_stats_ = nullptr; } - // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() const { return FromThrift(column_->meta_data.type); } + inline bool is_metadata_set() const { return is_metadata_set_; } + inline Type::type type() const { return FromThrift(GetMetadataIfSet().type); } - inline int64_t num_values() const { return column_->meta_data.num_values; } + inline int64_t num_values() const { return GetMetadataIfSet().num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(column_->meta_data.path_in_schema); + return std::make_shared(GetMetadataIfSet().path_in_schema); } // Check if statistics are set and are valid @@ -145,12 +239,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!column_->meta_data.__isset.statistics || - descr_->sort_order() == SortOrder::UNKNOWN) { + const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + if 
(!meta_data.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(column_->meta_data, descr_); + possible_stats_ = MakeColumnStats(meta_data, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -162,66 +256,108 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } inline Compression::type compression() const { - return FromThrift(column_->meta_data.codec); + return FromThrift(GetMetadataIfSet().codec); } - const std::vector& encodings() const { return encodings_; } + const std::vector& encodings() const { + GetMetadataIfSet(); + return encodings_; + } inline bool has_dictionary_page() const { - return column_->meta_data.__isset.dictionary_page_offset; + return GetMetadataIfSet().__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return column_->meta_data.dictionary_page_offset; + return GetMetadataIfSet().dictionary_page_offset; } - inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; } + inline int64_t data_page_offset() const { return GetMetadataIfSet().data_page_offset; } inline bool has_index_page() const { - return column_->meta_data.__isset.index_page_offset; + return GetMetadataIfSet().__isset.index_page_offset; } inline int64_t index_page_offset() const { - return column_->meta_data.index_page_offset; + return GetMetadataIfSet().index_page_offset; } inline int64_t total_compressed_size() const { - return column_->meta_data.total_compressed_size; + return GetMetadataIfSet().total_compressed_size; } inline int64_t total_uncompressed_size() const { - return column_->meta_data.total_uncompressed_size; + return GetMetadataIfSet().total_uncompressed_size; + } + +#ifdef PARQUET_ENCRYPTION + inline std::unique_ptr crypto_metadata() const { + if (column_->__isset.crypto_metadata) { + return 
ColumnCryptoMetaData::Make( + reinterpret_cast(&column_->crypto_metadata)); + } else { + return nullptr; + } } +#endif private: mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; + format::ColumnMetaData decrypted_metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; + bool is_metadata_set_; + + inline const format::ColumnMetaData& GetMetadataIfSet() const { +#ifdef PARQUET_ENCRYPTION + if (column_->__isset.crypto_metadata && + column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (!is_metadata_set_) { + throw ParquetException( + "Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); + } else { + return decrypted_metadata_; + } + } else { + return column_->meta_data; + } +#else + return column_->meta_data; +#endif + } }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) { + const ApplicationVersion* writer_version, int16_t row_group_ordinal, + int16_t column_ordinal, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, + writer_version, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - writer_version))} {} -ColumnChunkMetaData::~ColumnChunkMetaData() {} + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} +ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } const 
std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); } // column metadata +bool ColumnChunkMetaData::is_metadata_set() const { return impl_->is_metadata_set(); } + Type::type ColumnChunkMetaData::type() const { return impl_->type(); } int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); } @@ -270,6 +406,12 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } +#ifdef PARQUET_ENCRYPTION +std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { + return impl_->crypto_metadata(); +} +#endif + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: @@ -284,9 +426,16 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + inline int64_t file_offset() const { return row_group_->file_offset; } + + inline int64_t total_compressed_size() const { + return row_group_->total_compressed_size; + } + inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i) { + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -294,7 +443,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_); + writer_version_, row_group_ordinal, (int16_t)i, + file_decryptor); } private: @@ -325,8 +475,9 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const { - return impl_->ColumnChunk(i); +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal, InternalFileDecryptor* 
file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } // file metadata @@ -334,11 +485,12 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get()); + metadata_.get(), decryptor); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -352,6 +504,30 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } +#ifdef PARQUET_ENCRYPTION + bool VerifySignature(std::shared_ptr encryptor, + const void* signature) { + // serialize the footer + uint8_t* serialized_data; + uint32_t serialized_len = metadata_len_; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt with nonce + uint8_t* nonce = const_cast(reinterpret_cast(signature)); + uint8_t* tag = const_cast(reinterpret_cast(signature)) + + encryption::kNonceLength; + + std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + + serialized_len); + uint32_t encrypted_len = encryptor->SignedFooterEncrypt( + serialized_data, serialized_len, nonce, encrypted_buffer.data()); + return 0 == + memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); + } +#endif + inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } inline int64_t num_rows() const { return metadata_->num_rows; } @@ -364,11 +540,52 @@ class FileMetaData::FileMetaDataImpl { return static_cast(metadata_->schema.size()); } +#ifdef PARQUET_ENCRYPTION + inline bool is_encryption_algorithm_set() const { + return 
metadata_->__isset.encryption_algorithm; + } + inline EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + inline const std::string& footer_signing_key_metadata() { + return metadata_->footer_signing_key_metadata; + } +#endif + const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst) const { + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor) const { ThriftSerializer serializer; +#ifdef PARQUET_ENCRYPTION + // Only in encrypted files with plaintext footers the + // encryption_algorithm is set in footer + if (is_encryption_algorithm_set()) { + uint8_t* serialized_data; + uint32_t serialized_len; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); + + // write unencrypted footer + PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); + // Write signature (nonce and tag) + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + 4, encryption::kNonceLength)); + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength)); + } else { // either plaintext file (when encryptor is null) + // or encrypted file with encrypted footer + serializer.Serialize(metadata_.get(), dst, encryptor); + } +#else serializer.Serialize(metadata_.get(), dst); +#endif // PARQUET_ENCRYPTION } std::unique_ptr RowGroup(int i) { @@ -452,15 +669,18 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len) { +std::shared_ptr FileMetaData::Make( + const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& 
decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); + return std::shared_ptr( + new FileMetaData(metadata, metadata_len, decryptor)); } -FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor) : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len))} {} + new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -471,6 +691,13 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } +#ifdef PARQUET_ENCRYPTION +bool FileMetaData::VerifySignature(std::shared_ptr encryptor, + const void* signature) { + return impl_->VerifySignature(encryptor, signature); +} +#endif + uint32_t FileMetaData::size() const { return impl_->size(); } int FileMetaData::num_columns() const { return impl_->num_columns(); } @@ -479,6 +706,20 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +#ifdef PARQUET_ENCRYPTION +bool FileMetaData::is_encryption_algorithm_set() const { + return impl_->is_encryption_algorithm_set(); +} + +EncryptionAlgorithm FileMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileMetaData::footer_signing_key_metadata() const { + return impl_->footer_signing_key_metadata(); +} +#endif // PARQUET_ENCRYPTION + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: @@ -512,10 +753,66 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { - return impl_->WriteTo(dst); +void 
FileMetaData::WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor) const { + return impl_->WriteTo(dst, encryptor); +} + +#ifdef PARQUET_ENCRYPTION +class FileCryptoMetaData::FileCryptoMetaDataImpl { + public: + FileCryptoMetaDataImpl() {} + + explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) { + metadata_.reset(new format::FileCryptoMetaData); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + metadata_len_ = *metadata_len; + } + + ~FileCryptoMetaDataImpl() {} + + EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + const std::string& key_metadata() { return metadata_->key_metadata; } + void WriteTo(::arrow::io::OutputStream* dst) const { + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); + } + + private: + friend FileMetaDataBuilder; + std::unique_ptr metadata_; + uint32_t metadata_len_; +}; + +EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); +} + +std::shared_ptr FileCryptoMetaData::Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len) { + return std::shared_ptr( + new FileCryptoMetaData(serialized_metadata, metadata_len)); } +FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, + uint32_t* metadata_len) + : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} + +FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {} + +FileCryptoMetaData::~FileCryptoMetaData() {} + +void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + impl_->WriteTo(dst); +} +#endif // PARQUET_ENCRYPTION + ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) : application_(application), version{major, minor, patch, "", "", ""} {} @@ 
-652,7 +949,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -667,6 +964,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_data_page_offset(data_page_offset); column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); column_chunk_->meta_data.__set_total_compressed_size(compressed_size); + std::vector thrift_encodings; if (has_dictionary) { thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); @@ -685,6 +983,62 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_chunk_->meta_data.__set_encodings(thrift_encodings); + +#ifdef PARQUET_ENCRYPTION + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + // column is encrypted + if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { + column_chunk_->__isset.crypto_metadata = true; + format::ColumnCryptoMetaData ccmd; + if (encrypt_md->is_encrypted_with_footer_key()) { + // encrypted with footer key + ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; + ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); + } else { // encrypted with column key + format::EncryptionWithColumnKey eck; + eck.__set_key_metadata(encrypt_md->key_metadata()); + eck.__set_path_in_schema(column_->path()->ToDotVector()); + ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; + ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); + } + 
column_chunk_->__set_crypto_metadata(ccmd); + + bool encrypted_footer = + properties_->file_encryption_properties()->encrypted_footer(); + bool encrypt_metadata = + !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); + if (encrypt_metadata) { + ThriftSerializer serializer; + // Serialize and encrypt ColumnMetadata separately + // Thrift-serialize the ColumnMetaData structure, + // encrypt it with the column key, and write to encrypted_column_metadata + uint8_t* serialized_data; + uint32_t serialized_len; + + serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len, + &serialized_data); + + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); + + const char* temp = + const_cast(reinterpret_cast(encrypted_data.data())); + std::string encrypted_column_metadata(temp, encrypted_len); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); + + if (encrypted_footer) { + column_chunk_->__isset.meta_data = false; + } else { + // Keep redacted metadata version for old readers + column_chunk_->__isset.meta_data = true; + column_chunk_->meta_data.__isset.statistics = false; + column_chunk_->meta_data.__isset.encoding_stats = false; + } + } + } +#endif // PARQUET_ENCRYPTION } void WriteTo(::arrow::io::OutputStream* sink) { @@ -693,10 +1047,14 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } const ColumnDescriptor* descr() const { return column_; } + int64_t total_compressed_size() const { + return column_chunk_->meta_data.total_compressed_size; + } private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; + column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type())); column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector()); column_chunk_->meta_data.__set_codec( @@ -747,9 +1105,11 @@ void 
ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, + const std::shared_ptr& encryptor) { impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { @@ -764,6 +1124,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { + return impl_->total_compressed_size(); +} + class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, @@ -790,27 +1154,48 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int current_column() { return current_column_; } - void Finish(int64_t total_bytes_written) { + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { if (!(current_column_ == schema_->num_columns())) { std::stringstream ss; ss << "Only " << current_column_ - 1 << " out of " << schema_->num_columns() << " columns are initialized"; throw ParquetException(ss.str()); } - int64_t total_byte_size = 0; - + // int64_t total_byte_size = 0; + + // for (int i = 0; i < schema_->num_columns(); i++) { + // if (!(row_group_->columns[i].file_offset >= 0)) { + // std::stringstream ss; + // ss << "Column " << i << " is not complete."; + // throw ParquetException(ss.str()); + // } + // total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + // } + // DCHECK(total_bytes_written == total_byte_size) + // << "Total bytes in this RowGroup does not match with compressed sizes of + // columns"; + + 
// row_group_->__set_total_byte_size(total_byte_size); + int64_t file_offset = 0; + int64_t total_compressed_size = 0; for (int i = 0; i < schema_->num_columns(); i++) { if (!(row_group_->columns[i].file_offset >= 0)) { std::stringstream ss; ss << "Column " << i << " is not complete."; throw ParquetException(ss.str()); } - total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + if (i == 0) { + file_offset = row_group_->columns[0].file_offset; + } + // sometimes column metadata is encrypted and not available to read, + // so we must get total_compressed_size from column builder + total_compressed_size += column_builders_[i]->total_compressed_size(); } - DCHECK(total_bytes_written == total_byte_size) - << "Total bytes in this RowGroup does not match with compressed sizes of columns"; - row_group_->__set_total_byte_size(total_byte_size); + row_group_->__set_file_offset(file_offset); + row_group_->__set_total_compressed_size(total_compressed_size); + row_group_->__set_total_byte_size(total_bytes_written); + row_group_->__set_ordinal(row_group_ordinal); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -858,8 +1243,9 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { - impl_->Finish(total_bytes_written); +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, + int16_t row_group_ordinal) { + impl_->Finish(total_bytes_written, row_group_ordinal); } // file metadata @@ -871,6 +1257,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); +#ifdef PARQUET_ENCRYPTION + if (props->file_encryption_properties() != nullptr && + props->file_encryption_properties()->encrypted_footer()) { + crypto_metadata_.reset(new 
format::FileCryptoMetaData()); + } +#endif } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -926,6 +1318,27 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; +#ifdef PARQUET_ENCRYPTION + // if plaintext footer, set footer signing algorithm + auto file_encryption_properties = properties_->file_encryption_properties(); + if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { + EncryptionAlgorithm signing_algorithm; + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); + signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix; + signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + + metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm)); + const std::string& footer_signing_key_metadata = + file_encryption_properties->footer_key_metadata(); + if (footer_signing_key_metadata.size() > 0) { + metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); + } + } +#endif // PARQUET_ENCRYPTION + parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), &metadata_->schema); @@ -936,8 +1349,35 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } +#ifdef PARQUET_ENCRYPTION + std::unique_ptr BuildFileCryptoMetaData() { + if (crypto_metadata_ == nullptr) { + return nullptr; + } + + auto file_encryption_properties = properties_->file_encryption_properties(); + + crypto_metadata_->__set_encryption_algorithm( + ToThrift(file_encryption_properties->algorithm())); + std::string key_metadata = file_encryption_properties->footer_key_metadata(); + + if (!key_metadata.empty()) { + crypto_metadata_->__set_key_metadata(key_metadata); + } + + std::unique_ptr 
file_crypto_metadata = + std::unique_ptr(new FileCryptoMetaData()); + file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_); + + return file_crypto_metadata; + } +#endif + protected: std::unique_ptr metadata_; +#ifdef PARQUET_ENCRYPTION + std::unique_ptr crypto_metadata_; +#endif private: const std::shared_ptr properties_; @@ -969,4 +1409,10 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +#ifdef PARQUET_ENCRYPTION +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { + return impl_->BuildFileCryptoMetaData(); +} +#endif + } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index b3a5f7b808a..aa34a885bec 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -19,6 +19,7 @@ #define PARQUET_FILE_METADATA_H #include +#include #include #include #include @@ -28,6 +29,7 @@ #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/schema.h" #include "parquet/types.h" namespace parquet { @@ -37,6 +39,12 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; +class FileCryptoMetaData; +class InternalFileDecryptor; +class Decryptor; +class Encryptor; +class FooterSigningEncryptor; + namespace schema { class ColumnPath; @@ -98,12 +106,31 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = SortOrder::SIGNED) const; }; +#ifdef PARQUET_ENCRYPTION +class PARQUET_EXPORT ColumnCryptoMetaData { + public: + static std::unique_ptr Make(const uint8_t* metadata); + ~ColumnCryptoMetaData(); + + const std::vector& path_in_schema() const; + bool encrypted_with_footer_key() const; + const std::string& key_metadata() const; + + private: + explicit ColumnCryptoMetaData(const uint8_t* metadata); + + class ColumnCryptoMetaDataImpl; + std::unique_ptr impl_; +}; +#endif + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData 
accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -114,6 +141,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::string& file_path() const; // column metadata + bool is_metadata_set() const; Type::type type() const; int64_t num_values() const; std::shared_ptr path_in_schema() const; @@ -128,10 +156,15 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; +#ifdef PARQUET_ENCRYPTION + std::unique_ptr crypto_metadata() const; +#endif private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version = NULLPTR, + InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -152,7 +185,10 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i) const; + + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -167,10 +203,19 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len); + + static std::shared_ptr Make( + const void* serialized_metadata, uint32_t* 
metadata_len, + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); +#ifdef PARQUET_ENCRYPTION + /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted + /// (plaintext footer). + /// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata + bool VerifySignature(std::shared_ptr encryptor, + const void* signature); +#endif // file metadata uint32_t size() const; @@ -181,10 +226,16 @@ class PARQUET_EXPORT FileMetaData { const std::string& created_by() const; int num_schema_elements() const; std::unique_ptr RowGroup(int i) const; - const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst) const; +#ifdef PARQUET_ENCRYPTION + bool is_encryption_algorithm_set() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; +#endif + + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -199,7 +250,9 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); + + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = NULLPTR); // PIMPL Idiom FileMetaData(); @@ -207,6 +260,30 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +#ifdef PARQUET_ENCRYPTION +class PARQUET_EXPORT FileCryptoMetaData { + public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + ~FileCryptoMetaData(); + + EncryptionAlgorithm encryption_algorithm() const; + const std::string& key_metadata() const; + + void WriteTo(::arrow::io::OutputStream* dst) const; + + private: + friend FileMetaDataBuilder; + 
FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + + // PIMPL Idiom + FileCryptoMetaData(); + class FileCryptoMetaDataImpl; + std::unique_ptr impl_; +}; +#endif + // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { public: @@ -227,11 +304,15 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + + int64_t total_compressed_size() const; // commit the metadata + void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback); + bool dictionary_fallback, + const std::shared_ptr& encryptor = NULLPTR); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; @@ -266,7 +347,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { void set_num_rows(int64_t num_rows); // commit the metadata - void Finish(int64_t total_bytes_written); + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1); private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, @@ -291,6 +372,11 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); +#ifdef PARQUET_ENCRYPTION + // crypto metadata + std::unique_ptr GetCryptoMetaData(); +#endif + private: explicit FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d08d7b0c8fe..406d1b8bb75 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -21,6 +21,17 @@ #include #include #include +#include + +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption.h" +#else +namespace parquet { +class FileEncryptionProperties; +class FileDecryptionProperties; +class ColumnEncryptionProperties; +} // 
namespace parquet +#endif #include "parquet/exception.h" #include "parquet/parquet_version.h" @@ -60,10 +71,22 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } +#ifdef PARQUET_ENCRYPTION + void file_decryption_properties( + const std::shared_ptr& decryption) { + file_decryption_properties_ = decryption; + } + + FileDecryptionProperties* file_decryption_properties() { + return file_decryption_properties_.get(); + } +#endif + private: ::arrow::MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + std::shared_ptr file_decryption_properties_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -268,6 +291,14 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } +#ifdef PARQUET_ENCRYPTION + Builder* encryption( + const std::shared_ptr& file_encryption_properties) { + file_encryption_properties_ = file_encryption_properties; + return this; + } +#endif + Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); return this; @@ -313,10 +344,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_properties_), + default_column_properties_, column_properties)); } private: @@ -328,6 +359,8 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; std::string created_by_; + std::shared_ptr file_encryption_properties_; + // Settings used for each column unless overridden in any of the maps below ColumnProperties 
default_column_properties_; std::unordered_map encodings_; @@ -393,11 +426,27 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } +#ifdef PARQUET_ENCRYPTION + inline FileEncryptionProperties* file_encryption_properties() const { + return file_encryption_properties_.get(); + } + + std::shared_ptr column_encryption_properties( + const std::shared_ptr& path) const { + if (file_encryption_properties_) { + return file_encryption_properties_->column_properties(path); + } else { + return NULLPTR; + } + } +#endif + private: explicit WriterProperties( ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, + std::shared_ptr file_encryption_properties, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -407,6 +456,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), + file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -417,6 +467,9 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + + std::shared_ptr file_encryption_properties_; + ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 740edbc4904..50a19342a10 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -90,6 +90,13 @@ class PARQUET_EXPORT ColumnPath { std::string ToDotString() const; const std::vector& ToDotVector() const; + struct CmpColumnPath { + bool operator()(const std::shared_ptr& a, + const std::shared_ptr& b) const { + return a->ToDotString() < b->ToDotString(); + } + }; + protected: 
std::vector path_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index c7b62073df5..3d498fc0253 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -28,6 +28,7 @@ #include #endif #include +#include // TCompactProtocol requires some #defines to work right. #define SIGNED_RIGHT_SHIFT_IS 1 @@ -41,12 +42,24 @@ #include #include "arrow/util/logging.h" + #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/statistics.h" +#include "parquet/types.h" #include "parquet/parquet_types.h" // IYWU pragma: export +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" +#else +namespace parquet { +class Encryptor; +class Decryptor; +} // namespace parquet +#endif namespace parquet { // Check if thrift version < 0.11.0 @@ -81,6 +94,31 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } +static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { + return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix}; +} + +static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { + return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix}; +} + +static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + EncryptionAlgorithm encryption_algorithm; + + if (encryption.__isset.AES_GCM_V1) { + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); + } else if (encryption.__isset.AES_GCM_CTR_V1) { + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); + } else { + throw ParquetException("Unsupported algorithm"); + } + return encryption_algorithm; +} + static inline format::Type::type 
ToThrift(Type::type type) { return static_cast(type); } @@ -131,16 +169,46 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { return statistics; } +static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { + format::AesGcmV1 aesGcmV1; + // aad_file_unique is always set + aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + } + return aesGcmV1; +} + +static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { + format::AesGcmCtrV1 aesGcmCtrV1; + // aad_file_unique is always set + aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + } + return aesGcmCtrV1; +} + +static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { + format::EncryptionAlgorithm encryption_algorithm; + if (encryption.algorithm == ParquetCipher::AES_GCM_V1) { + encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad)); + } else { + encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad)); + } + return encryption_algorithm; +} + // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; -// Deserialize a thrift message from buf/len. buf/len must at least contain -// all the bytes needed to store the thrift message. On return, len will be -// set to the actual length of the header. template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { +inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, + T* deserialized_msg) { // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -158,6 +226,39 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = *len - bytes_left; } +// Deserialize a thrift message from buf/len. buf/len must at least contain +// all the bytes needed to store the thrift message. On return, len will be +// set to the actual length of the header. +template +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + const std::shared_ptr& decryptor = NULLPTR) { +#ifdef PARQUET_ENCRYPTION + // thrift message is not encrypted + if (decryptor == NULLPTR) { + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); + } else { // thrift message is encrypted + uint32_t clen; + clen = *len; + // decrypt + std::shared_ptr decrypted_buffer = + std::static_pointer_cast(AllocateBuffer( + decryptor->pool(), + static_cast(clen - decryptor->CiphertextSizeDelta()))); + const uint8_t* cipher_buf = buf; + uint32_t decrypted_buffer_len = + decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data()); + if (decrypted_buffer_len <= 0) { + throw ParquetException("Couldn't decrypt buffer\n"); + } + *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); + DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, + deserialized_msg); + } +#else + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); +#endif // PARQUET_ENCRYPTION +} + /// Utility class to serialize thrift objects to a binary format. This object /// should be reused if possible to reuse the underlying memory. 
/// Note: thrift will encode NULLs into the serialized buffer so it is not valid @@ -186,12 +287,24 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); + +#ifdef PARQUET_ENCRYPTION + // obj is not encrypted + if (encryptor == NULLPTR) { + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); + } else { // obj is encrypted + return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); + } +#else PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); +#endif } private: @@ -207,6 +320,22 @@ class ThriftSerializer { } } +#ifdef PARQUET_ENCRYPTION + int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, + uint32_t out_length, + const std::shared_ptr& encryptor) { + std::shared_ptr cipher_buffer = + std::static_pointer_cast(AllocateBuffer( + encryptor->pool(), + static_cast(encryptor->CiphertextSizeDelta() + out_length))); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); + + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } +#endif + shared_ptr mem_buffer_; shared_ptr protocol_; }; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 0bfaf99b381..92d006965bd 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -443,10 +443,21 @@ struct Compression { PARQUET_EXPORT std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec); -struct Encryption { +struct ParquetCipher { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct AadMetadata { + std::string aad_prefix; + std::string aad_file_unique; + bool supply_aad_prefix; +}; + +struct 
EncryptionAlgorithm { + ParquetCipher::type algorithm; + AadMetadata aad; +}; + // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 };