diff --git a/.travis.yml b/.travis.yml index 32f0a87edc3..da6ece9e2ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -65,6 +65,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_S3=1 - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 @@ -135,6 +136,7 @@ matrix: - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_S3=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 # TODO(ARROW-4763): llvm and llvmdev packages are in conflict: # https://github.com/conda-forge/llvmdev-feedstock/issues/60 # - ARROW_TRAVIS_GANDIVA=1 @@ -279,6 +281,7 @@ matrix: dist: xenial env: - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 - ARROW_TRAVIS_USE_SYSTEM=1 - ARROW_TRAVIS_MIMALLOC=1 before_install: diff --git a/ci/appveyor-cpp-build-mingw.bat b/ci/appveyor-cpp-build-mingw.bat index 7f108b7c008..0684040f7e2 100644 --- a/ci/appveyor-cpp-build-mingw.bat +++ b/ci/appveyor-cpp-build-mingw.bat @@ -52,6 +52,7 @@ cmake ^ -DARROW_WITH_SNAPPY=ON ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_PARQUET=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=ON ^ -DARROW_PYTHON=ON ^ -DARROW_USE_GLOG=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index d80a4dcda4b..7ecb989b32c 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -79,6 +79,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_S3=%ARROW_S3% ^ -DARROW_MIMALLOC=ON ^ -DARROW_PARQUET=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=ON ^ -DPARQUET_BUILD_EXECUTABLES=ON ^ -DARROW_PYTHON=ON ^ .. 
|| exit /B diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 16eadd6001e..5204218dea9 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -141,6 +141,10 @@ if [ "$ARROW_TRAVIS_PARQUET" == "1" ]; then -DPARQUET_BUILD_EXECUTABLES=ON" fi +if [ "$ARROW_TRAVIS_PARQUET_ENCRYPTION" == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DPARQUET_REQUIRE_ENCRYPTION=ON" +fi + if [ "$ARROW_TRAVIS_GANDIVA" == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" if [ "$ARROW_TRAVIS_GANDIVA_JAVA" == "1" ]; then diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index eb96b6bc5a3..db4f8d80275 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -141,6 +141,7 @@ cmake -GNinja \ -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_BROTLI=ON \ -DARROW_PARQUET=on \ + -DPARQUET_REQUIRE_ENCRYPTION=on \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ diff --git a/ci/windows-pkg-arrow-for-r.sh b/ci/windows-pkg-arrow-for-r.sh index 718ce0f436b..23a518bad83 100644 --- a/ci/windows-pkg-arrow-for-r.sh +++ b/ci/windows-pkg-arrow-for-r.sh @@ -60,6 +60,10 @@ mkdir deps40 && cd deps40 # double-conversion is only available in the Rtools4.0 builds, but apparently that's ok wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-double-conversion-3.1.2-1-any.pkg.tar.xz wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-double-conversion-3.1.2-1-any.pkg.tar.xz + +wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-openssl-1.1.1.a-1-any.pkg.tar.xz +wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-openssl-1.1.1.a-1-any.pkg.tar.xz + # These are the other Rtools 4.0 packages, for future reference # wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-boost-1.67.0-9002-any.pkg.tar.xz # wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-boost-1.67.0-9002-any.pkg.tar.xz diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake index c7cedebff97..07a79187304 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -946,7 +946,6 @@ if(ARROW_WITH_BROTLI) include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) endif() -set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) set(PARQUET_REQUIRE_ENCRYPTION OFF) endif() @@ -959,19 +958,16 @@ if(BREW_BIN AND NOT OPENSSL_ROOT_DIR) set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX}) endif() endif() + +set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT OR ARROW_S3) # This must work find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) set(ARROW_USE_OPENSSL ON) -elseif(ARROW_PARQUET) - # Enable Parquet encryption if OpenSSL is there, but don't fail if it's not - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} QUIET) - if(OPENSSL_FOUND) - set(ARROW_USE_OPENSSL ON) - endif() endif() if(ARROW_USE_OPENSSL) + message(STATUS "Found OpenSSL Crypto Library: ${OPENSSL_CRYPTO_LIBRARY}") message(STATUS "Building with OpenSSL (Version: ${OPENSSL_VERSION}) support") # OpenSSL::SSL and OpenSSL::Crypto were not added to diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 641a5055756..86d8d870e1a 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -21,6 +21,15 @@ add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) +if (PARQUET_REQUIRE_ENCRYPTION) + add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) + add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) + target_include_directories(parquet-encryption-example PRIVATE low-level-api/) + 
target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) + target_link_libraries(parquet-encryption-example parquet_static) + target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) +endif() + # Prefer shared linkage but use static if shared build is deactivated if (ARROW_BUILD_SHARED) set(PARQUET_EXAMPLE_LINK_LIBS parquet_shared) @@ -36,3 +45,9 @@ add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 parquet-arrow-example) + +if (PARQUET_REQUIRE_ENCRYPTION) + add_dependencies(parquet + parquet-encryption-example + parquet-encryption-example-all-crypto-options) +endif() diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc new file mode 100644 index 00000000000..0daa1920d6c --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -0,0 +1,663 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + + +/* + * This file contains samples for writing and reading encrypted Parquet files in different + * encryption and decryption configurations. + * Each sample section is dedicated to an independent configuration and shows its creation + * from beginning to end. + * The samples have the following goals: + * 1) Demonstrate usage of different options for data encryption and decryption. + * 2) Produce encrypted files for interoperability tests with other (eg parquet-mr) + * readers that support encryption. + * 3) Produce encrypted files with plaintext footer, for testing the ability of legacy + * readers to parse the footer and read unencrypted columns. + * 4) Perform interoperability tests with other (eg parquet-mr) writers, by reading + * encrypted files produced by these writers. + * + * Each write sample produces new independent parquet file, encrypted with a different + * encryption configuration as described below. + * The name of each file is in the form of: + * tester.parquet.encrypted. + * + * The read sample creates a set of decryption configurations and then uses each of them + * to read all encrypted files in the input directory. + * + * The different encryption and decryption configurations are listed below. + * + * Usage: ./encryption-interop-tests + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates files with four columns in the following + * encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. 
+ * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * + * The read sample uses each of the following decryption configurations to read every + * encrypted files in the input directory: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). + */ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; +const std::string fileName = "tester"; + +using FileClass = ::arrow::io::FileOutputStream; +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + +void PrintDecryptionConfiguration(int configuration); +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg); +// Returns true if FileName ends with suffix. Otherwise returns false. +// Used to skip unencrypted parquet files. 
+bool FileNameEndsWith(std::string file_name, std::string suffix); + +std::vector GetDirectoryFiles(const std::string& path) { + std::vector files; + struct dirent* entry; + DIR* dir = opendir(path.c_str()); + + if (dir == NULL) { + exit(-1); + } + while ((entry = readdir(dir)) != NULL) { + files.push_back(std::string(entry->d_name)); + } + closedir(dir); + return files; +} + +static std::shared_ptr SetupSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, + ConvertedType::NONE)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} + +void InteropTestWriteEncryptedParquetFiles(std::string root_path) { + /********************************************************************************** + Creating a number of Encryption configurations + **********************************************************************************/ + + // This vector will hold various encryption configuraions. + std::vector> + vector_of_encryption_configurations; + + // Encryption configuration 1: Encrypt all columns and the footer with the same key. 
+ // (uniform encryption) + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey); + // Add to list of encryption configurations. + vector_of_encryption_configurations.push_back( + file_encryption_builder_1.footer_key_metadata("kf")->build()); + + // Encryption configuration 2: Encrypt two columns and the footer, with different keys. + std::map> + encryption_cols2; + std::string path1 = "double_field"; + std::string path2 = "float_field"; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20(path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21(path2); + encryption_col_builder_20.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols2[path1] = encryption_col_builder_20.build(); + encryption_cols2[path2] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_2.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols2) + ->build()); + + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. 
+ // (plaintext footer mode, readable by legacy readers) + std::map> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30(path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31(path2); + encryption_col_builder_30.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols3[path1] = encryption_col_builder_30.build(); + encryption_cols3[path2] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_3.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Encryption configuration 4: Encrypt two columns and the footer, with different keys. + // Use aad_prefix. + std::map> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40(path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41(path2); + encryption_col_builder_40.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols4[path1] = encryption_col_builder_40.build(); + encryption_cols4[path2] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_4.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols4) + ->aad_prefix(fileName) + ->build()); + + // Encryption configuration 5: Encrypt two columns and the footer, with different keys. + // Use aad_prefix and disable_aad_prefix_storage. 
+ std::map> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50(path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51(path2); + encryption_col_builder_50.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols5[path1] = encryption_col_builder_50.build(); + encryption_cols5[path2] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_5.encrypted_columns(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(fileName) + ->disable_aad_prefix_storage() + ->build()); + + // Encryption configuration 6: Encrypt two columns and the footer, with different keys. + // Use AES_GCM_CTR_V1 algorithm. + std::map> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(path2); + encryption_col_builder_60.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols6[path1] = encryption_col_builder_60.build(); + encryption_cols6[path2] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_6.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + + // Iterate over the encryption configurations and for each one write a parquet file. 
+ for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); + ++example_id) { + std::stringstream ss; + ss << example_id + 1; + std::string test_number_string = ss.str(); + try { + // Create a local file output stream instance. + std::shared_ptr out_file; + std::string file = + root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; + std::cout << "Write " << file << std::endl; + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? 
true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return; + } + } +} + +void InteropTestReadEncryptedParquetFiles(std::string root_path) { + std::vector files_in_directory = GetDirectoryFiles(root_path); + + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations; + + // Decryption configuration 1: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey); + string_kr2->PutKey("kc1", kColumnEncryptionKey1); + string_kr2->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
+ std::string path_double = "double_field"; + std::string path_float = "float_field"; + std::map> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31(path_double); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float); + + decryption_cols[path_double] = + decryption_col_builder31.key(kColumnEncryptionKey1)->build(); + decryption_cols[path_float] = + decryption_col_builder32.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey) + ->column_keys(decryption_cols) + ->build()); + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Iterate over the decryption configurations and use each one to read every files + // in the input directory. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size(); + ++example_id) { + PrintDecryptionConfiguration(example_id + 1); + for (auto const& file : files_in_directory) { + std::string exception_msg = ""; + if (!FileNameEndsWith(file, "parquet.encrypted")) // Skip non encrypted files + continue; + try { + std::cout << "--> Read file " << file << std::endl; + + parquet::ReaderProperties reader_properties = + parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + vector_of_decryption_configurations[example_id]->DeepClone()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(root_path + file, false, + reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 4); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(2); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(3); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(file, example_id, exception_msg); + std::cout << "file [" << file << "] Parquet Reading Complete" << std::endl; + } + } +} + +void PrintDecryptionConfiguration(int configuration) { + std::cout << "\n\nDecryption configuration "; + if (configuration == 1) + std::cout << "1: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key." + << std::endl; + else if (configuration == 2) + std::cout << "2: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key. Pass aad_prefix." + << std::endl; + else if (configuration == 3) + std::cout << "3: \n\nDecrypt using explicit column and footer keys." << std::endl; + else { + std::cout << "Unknown configuraion" << std::endl; + exit(-1); + } + std::cout << std::endl; +} + +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg) { + int encryption_configuration_number; + std::regex r("tester([0-9]+)\\.parquet.encrypted"); + std::smatch m; + std::regex_search(file, m, r); + if (m.size() == 0) { + std::cerr + << "Error: Error parsing filename to extract encryption configuration number. " + << std::endl; + } + std::string encryption_configuration_number_str = m.str(1); + encryption_configuration_number = atoi(encryption_configuration_number_str.c_str()); + if (encryption_configuration_number < 1 || encryption_configuration_number > 6) { + std::cerr << "Error: Unknown encryption configuration number. 
" << std::endl; + } + + int decryption_configuration_number = example_id + 1; + + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) + std::cout << "Error: Expecting AAD related exception."; + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected to + // be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) { + std::cout << "Error: Expecting AAD related exception." << std::endl; + } + return; + } + } + if (!exception_msg.empty()) + std::cout << "Error: Unexpected exception was thrown." 
<< exception_msg; +} + +bool FileNameEndsWith(std::string file_name, std::string suffix) { + std::string::size_type idx = file_name.find_first_of('.'); + + if (idx != std::string::npos) { + std::string extension = file_name.substr(idx + 1); + if (extension.compare(suffix) == 0) return true; + } + return false; +} + +int main(int argc, char** argv) { + enum Operation { write, read }; + std::string root_path; + Operation operation = write; + if (argc < 3) { + std::cout << "Usage: encryption-reader-writer-all-crypto-options " + "" + << std::endl; + exit(1); + } + root_path = argv[1]; + if (root_path.compare("read") == 0) { + operation = read; + } + + root_path = argv[2]; + std::cout << "Root path is: " << root_path << std::endl; + + if (operation == write) { + InteropTestWriteEncryptedParquetFiles(root_path); + } else + InteropTestReadEncryptedParquetFiles(root_path); + + return 0; +} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc new file mode 100644 index 00000000000..92d7f951bba --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -0,0 +1,451 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include + +/* + * This file contains sample for writing and reading encrypted Parquet file with + * basic encryption configuration. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates a file with eight columns where two of the columns and the + * footer are encrypted. + * + * The read sample decrypts using key retriever that holds the keys of two encrypted + * columns and the footer key. + */ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; +const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; + +int main(int argc, char** argv) { + + /********************************************************************************** + PARQUET ENCRYPTION WRITER EXAMPLE + **********************************************************************************/ + + try { + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add encryption properties + // Encryption configuration: Encrypt two columns and the footer. 
+ std::map> + encryption_cols; + + parquet::SchemaDescriptor schema_desc; + schema_desc.Init(schema); + auto column_path1 = schema_desc.Column(5)->path()->ToDotString(); + auto column_path2 = schema_desc.Column(4)->path()->ToDotString(); + + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0(column_path1); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder1(column_path2); + encryption_col_builder0.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder1.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols[column_path1] = encryption_col_builder0.build(); + encryption_cols[column_path2] = encryption_col_builder1.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder( + kFooterEncryptionKey); + + parquet::WriterProperties::Builder builder; + // Add the current encryption configuration to WriterProperties. + builder.encryption(file_encryption_builder.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols) + ->build()); + + // Add other writer properties + builder.compression(parquet::Compression::SNAPPY); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } + + /********************************************************************************** + PARQUET ENCRYPTION READER EXAMPLE + **********************************************************************************/ + + // Decryption configuration: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder; + + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. + reader_properties.file_decryption_properties( + file_decryption_builder.key_retriever(kr1)->build()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; + } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; + return 0; +} diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 549326bf22b..0659767a578 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -155,8 +155,11 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc + encryption.cc file_reader.cc file_writer.cc + internal_file_decryptor.cc + internal_file_encryptor.cc metadata.cc murmur3.cc parquet_constants.cpp @@ -168,6 +171,12 @@ set(PARQUET_SRCS statistics.cc types.cc) +if(PARQUET_REQUIRE_ENCRYPTION) + set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal.cc) +else() + set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal_nossl.cc) +endif() + # Ensure that thrift compilation is done before using its generated headers # in parquet code. add_custom_target(parquet-thrift-deps ALL DEPENDS ${THRIFT_OUTPUT_FILES}) @@ -306,6 +315,15 @@ add_parquet_test(arrow-test arrow/arrow_schema_test.cc test_util.cc) +if(PARQUET_REQUIRE_ENCRYPTION) + add_parquet_test(encryption-test + SOURCES + encryption_write_configurations_test.cc + encryption_read_configurations_test.cc + encryption_properties_test.cc + test_util.cc) +endif() + # Those tests need to use static linking as they access thrift-generated # symbols which are not exported by parquet.dll on Windows (PARQUET-1420). 
add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc test_util.cc) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index df33eeeb651..b1a9eb33895 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include "arrow/array.h" #include "arrow/builder.h" @@ -38,6 +40,8 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift_internal.h" // IWYU pragma: keep @@ -107,6 +111,8 @@ ReaderProperties default_reader_properties() { // SerializedPageReader deserializes Thrift metadata and pages that have been // assembled in a serialized stream for storing in a Parquet files +static constexpr int16_t kNonPageOrdinal = static_cast(-1); + // This subclass delimits pages appearing in a serialized stream, each preceded // by a serialized Thrift format::PageHeader indicating the type of each page // and the page metadata. 
@@ -114,11 +120,17 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - ::arrow::MemoryPool* pool) + ::arrow::MemoryPool* pool, const CryptoContext* crypto_ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + page_ordinal_(0), seen_num_rows_(0), - total_num_rows_(total_num_rows) { + total_num_rows_(total_num_rows), + decryption_buffer_(AllocateBuffer(pool, 0)) { + if (crypto_ctx != nullptr) { + crypto_ctx_ = *crypto_ctx; + InitDecryption(); + } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodec(codec); } @@ -129,6 +141,11 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: + void UpdateDecryption(const std::shared_ptr& decryptor, int8_t module_type, + const std::string& page_aad); + + void InitDecryption(); + std::shared_ptr stream_; format::PageHeader current_page_header_; @@ -138,6 +155,18 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; + // The fields below are used for calculation of AAD (additional authenticated data) + // suffix which is part of the Parquet Modular Encryption. + // The AAD suffix for a parquet module is built internally by + // concatenating different parts some of which include + // the row group ordinal, column ordinal and page ordinal. + // Please refer to the encryption specification for more details: + // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data + + // The ordinal fields in the context below are used for AAD suffix calculation. 
+ CryptoContext crypto_ctx_; + int16_t page_ordinal_; // page ordinal does not count the dictionary page + // Maximum allowed page size uint32_t max_page_header_size_; @@ -146,11 +175,52 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; + + // data_page_aad_ and data_page_header_aad_ contain the AAD for data page and data page + // header in a single column respectively. + // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. + std::string data_page_aad_; + std::string data_page_header_aad_; + // Encryption + std::shared_ptr decryption_buffer_; }; +void SerializedPageReader::InitDecryption() { + // Prepare the AAD for quick update later. + if (crypto_ctx_.data_decryptor != nullptr) { + DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty()); + data_page_aad_ = encryption::CreateModuleAad( + crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage, + crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal); + } + if (crypto_ctx_.meta_decryptor != nullptr) { + DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty()); + data_page_header_aad_ = encryption::CreateModuleAad( + crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader, + crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal); + } +} + +void SerializedPageReader::UpdateDecryption(const std::shared_ptr& decryptor, + int8_t module_type, + const std::string& page_aad) { + DCHECK(decryptor != nullptr); + if (crypto_ctx_.start_decrypt_with_dictionary_page) { + std::string aad = encryption::CreateModuleAad( + decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal, + crypto_ctx_.column_ordinal, kNonPageOrdinal); + decryptor->UpdateAad(aad); + } else { + encryption::QuickUpdatePageAad(page_aad, page_ordinal_); + decryptor->UpdateAad(page_aad); + } +} + std::shared_ptr SerializedPageReader::NextPage() { // Loop here 
because there may be unhandled page types that we skip until // finding a page that we do know what to do with + while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; @@ -168,8 +238,13 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { + if (crypto_ctx_.meta_decryptor != nullptr) { + UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader, + data_page_header_aad_); + } DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_); + &header_size, ¤t_page_header_, + crypto_ctx_.meta_decryptor); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -187,7 +262,10 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - + if (crypto_ctx_.data_decryptor != nullptr) { + UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage, + data_page_aad_); + } // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -198,6 +276,15 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } + // Decrypt it if we need to + if (crypto_ctx_.data_decryptor != nullptr) { + PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( + compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta())); + compressed_len = crypto_ctx_.data_decryptor->Decrypt( + page_buffer->data(), compressed_len, decryption_buffer_->mutable_data()); + + page_buffer = decryption_buffer_; + } // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. 
@@ -211,6 +298,7 @@ std::shared_ptr SerializedPageReader::NextPage() { } if (current_page_header_.type == format::PageType::DICTIONARY_PAGE) { + crypto_ctx_.start_decrypt_with_dictionary_page = false; const format::DictionaryPageHeader& dict_header = current_page_header_.dictionary_page_header; @@ -220,6 +308,7 @@ std::shared_ptr SerializedPageReader::NextPage() { FromThrift(dict_header.encoding), is_sorted); } else if (current_page_header_.type == format::PageType::DATA_PAGE) { + ++page_ordinal_; const format::DataPageHeader& header = current_page_header_.data_page_header; EncodedStatistics page_statistics; @@ -246,6 +335,7 @@ std::shared_ptr SerializedPageReader::NextPage() { FromThrift(header.definition_level_encoding), FromThrift(header.repetition_level_encoding), page_statistics); } else if (current_page_header_.type == format::PageType::DATA_PAGE_V2) { + ++page_ordinal_; const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2; bool is_compressed = header.__isset.is_compressed ? 
header.is_compressed : false; @@ -266,9 +356,9 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool) { + Compression::type codec, ::arrow::MemoryPool* pool, const CryptoContext* ctx) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool)); + new SerializedPageReader(stream, total_num_rows, codec, pool, ctx)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 2b6ec9f3681..91ae6d733db 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -43,6 +43,7 @@ class RleDecoder; namespace parquet { +class Decryptor; class Page; // 16 MB is the default maximum page header size @@ -72,6 +73,23 @@ class PARQUET_EXPORT LevelDecoder { std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; }; +struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(meta), + data_decryptor(data) {} + CryptoContext() {} + + bool start_decrypt_with_dictionary_page = false; + int16_t row_group_ordinal = -1; + int16_t column_ordinal = -1; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + // Abstract page iterator interface. 
This way, we can feed column pages to the // ColumnReader through whatever mechanism we choose class PARQUET_EXPORT PageReader { @@ -80,8 +98,8 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const CryptoContext* ctx = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index da434bcedb9..5f38cb3fad2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,8 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -51,6 +54,8 @@ using arrow::Status; using arrow::compute::Datum; using arrow::internal::checked_cast; +static constexpr int16_t kNonPageOrdinal = static_cast(-1); + using BitWriter = arrow::BitUtil::BitWriter; using RleEncoder = arrow::util::RleEncoder; @@ -137,8 +142,11 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, int compression_level, - ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = arrow::default_memory_pool()) + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t column_chunk_ordinal, + MemoryPool* pool = arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = nullptr, + std::shared_ptr data_encryptor = nullptr) : sink_(sink), metadata_(metadata), pool_(pool), @@ -146,14 +154,22 @@ class SerializedPageWriter : public PageWriter { 
dictionary_page_offset_(0), data_page_offset_(0), total_uncompressed_size_(0), - total_compressed_size_(0) { + total_compressed_size_(0), + page_ordinal_(0), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_chunk_ordinal), + meta_encryptor_(meta_encryptor), + data_encryptor_(data_encryptor) { + if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) { + InitEncryption(); + } compressor_ = GetCodec(codec, compression_level); thrift_serializer_.reset(new ThriftSerializer); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { int64_t uncompressed_size = page.size(); - std::shared_ptr compressed_data = nullptr; + std::shared_ptr compressed_data; if (has_compressor()) { auto buffer = std::static_pointer_cast( AllocateBuffer(pool_, uncompressed_size)); @@ -168,10 +184,23 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer; + if (data_encryptor_.get()) { + UpdateEncryption(encryption::kDictionaryPage); + encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( + pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -180,11 +209,17 @@ class 
SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data)); + + if (meta_encryptor_) { + UpdateEncryption(encryption::kDictionaryPageHeader); + } + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; int64_t final_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(&final_pos)); @@ -192,11 +227,13 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { + if (meta_encryptor_ != nullptr) { + UpdateEncryption(encryption::kColumnMetaData); + } // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, - fallback); - + fallback, meta_encryptor_); // Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -225,7 +262,6 @@ class SerializedPageWriter : public PageWriter { int64_t WriteDataPage(const CompressedDataPage& page) override { int64_t uncompressed_size = page.uncompressed_size(); std::shared_ptr compressed_data = page.buffer(); - format::DataPageHeader data_page_header; data_page_header.__set_num_values(page.num_values()); data_page_header.__set_encoding(ToThrift(page.encoding())); @@ -235,10 +271,23 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = 
static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); + if (data_encryptor_.get()) { + UpdateEncryption(encryption::kDataPage); + PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( + data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_data_page_header(data_page_header); // TODO(PARQUET-594) crc checksum @@ -248,13 +297,18 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data)); + if (meta_encryptor_) { + UpdateEncryption(encryption::kDataPageHeader); + } + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); + ++page_ordinal_; int64_t current_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(¤t_pos)); return current_pos - start_pos; @@ -273,6 +327,55 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: + void InitEncryption() { + // Prepare the AAD for quick update later. 
+ if (data_encryptor_ != nullptr) { + data_page_aad_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, kNonPageOrdinal); + } + if (meta_encryptor_ != nullptr) { + data_page_header_aad_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, kNonPageOrdinal); + } + } + + void UpdateEncryption(int8_t module_type) { + switch (module_type) { + case encryption::kColumnMetaData: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + kNonPageOrdinal)); + break; + } + case encryption::kDataPage: { + encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_); + data_encryptor_->UpdateAad(data_page_aad_); + break; + } + case encryption::kDataPageHeader: { + encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_); + meta_encryptor_->UpdateAad(data_page_header_aad_); + break; + } + case encryption::kDictionaryPageHeader: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + kNonPageOrdinal)); + break; + } + case encryption::kDictionaryPage: { + data_encryptor_->UpdateAad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + kNonPageOrdinal)); + break; + } + default: + throw ParquetException("Unknown module type in UpdateEncryption"); + } + } + std::shared_ptr sink_; ColumnChunkMetaDataBuilder* metadata_; MemoryPool* pool_; @@ -281,11 +384,20 @@ class SerializedPageWriter : public PageWriter { int64_t data_page_offset_; int64_t total_uncompressed_size_; int64_t total_compressed_size_; + int16_t page_ordinal_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; std::unique_ptr thrift_serializer_; // Compression codec to use. 
std::unique_ptr compressor_; + + std::string data_page_aad_; + std::string data_page_header_aad_; + + std::shared_ptr meta_encryptor_; + std::shared_ptr data_encryptor_; }; // This implementation of the PageWriter writes to the final sink on Close . @@ -293,12 +405,16 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, int compression_level, - ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = arrow::default_memory_pool()) + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t current_column_ordinal, + MemoryPool* pool = arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = nullptr, + std::shared_ptr data_encryptor = nullptr) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr(new SerializedPageWriter( - in_memory_sink_, codec, compression_level, metadata, pool)); + in_memory_sink_, codec, compression_level, metadata, row_group_ordinal, + current_column_ordinal, pool, meta_encryptor, data_encryptor)); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -342,14 +458,18 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, - int compression_level, ColumnChunkMetaDataBuilder* metadata, MemoryPool* pool, - bool buffered_row_group) { + int compression_level, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool, + bool buffered_row_group, std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { - return std::unique_ptr( - new BufferedPageWriter(sink, codec, compression_level, metadata, pool)); + return std::unique_ptr(new BufferedPageWriter( + sink, codec, compression_level, metadata, row_group_ordinal, column_chunk_ordinal, + pool, meta_encryptor, data_encryptor)); } else { - return 
std::unique_ptr( - new SerializedPageWriter(sink, codec, compression_level, metadata, pool)); + return std::unique_ptr(new SerializedPageWriter( + sink, codec, compression_level, metadata, row_group_ordinal, column_chunk_ordinal, + pool, meta_encryptor, data_encryptor)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 609ee8d13c8..a5f9f58c857 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -46,6 +46,7 @@ class ColumnDescriptor; class CompressedDataPage; class DictionaryPage; class ColumnChunkMetaDataBuilder; +class Encryptor; class WriterProperties; class PARQUET_EXPORT LevelEncoder { @@ -85,8 +86,11 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, int compression_level, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool buffered_row_group = false); + bool buffered_row_group = false, + std::shared_ptr header_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR); // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc new file mode 100644 index 00000000000..0143cb5c5e5 --- /dev/null +++ b/cpp/src/parquet/encryption.cc @@ -0,0 +1,414 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption.h" + +#include +#include +#include + +#include "arrow/util/logging.h" +#include "arrow/util/utf8.h" + +#include "parquet/encryption_internal.h" + +namespace parquet { + +// integer key retriever +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { + key_map_.insert({key_id, key}); +} + +std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) const { + uint32_t key_id; + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + + return key_map_.at(key_id); +} + +// string key retriever +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { + key_map_.insert({key_id, key}); +} + +std::string StringKeyIdRetriever::GetKey(const std::string& key_id) const { + return key_map_.at(key_id); +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key( + std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; + return this; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata( + const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, 
key_id.size())) { + throw ParquetException("key id should be in UTF8 encoding"); + } + + DCHECK(!key_id.empty()); + this->key_metadata(key_id); + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys( + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) { + if (column_decryption_properties.size() == 0) return this; + + if (column_decryption_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (const auto& element : column_decryption_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + + column_decryption_properties_ = column_decryption_properties; + return this; +} + +void FileDecryptionProperties::WipeOutDecryptionKeys() { + footer_key_.clear(); + + for (const auto& element : column_decryption_properties_) { + element.second->WipeOutDecryptionKey(); + } +} + +bool FileDecryptionProperties::is_utilized() { + if (footer_key_.empty() && column_decryption_properties_.size() == 0 && + aad_prefix_.empty()) + return false; + + return utilized_; +} + +std::shared_ptr FileDecryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy; + + for (const auto& element : column_decryption_properties_) { + column_decryption_properties_map_copy.insert( + {element.second->column_path(), element.second->DeepClone()}); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix, + aad_prefix_verifier_, column_decryption_properties_map_copy, + plaintext_files_allowed_)); +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key( + const std::string footer_key) { + if 
(footer_key.empty()) { + return this; + } + DCHECK(footer_key_.empty()); + footer_key_ = footer_key; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever( + const std::shared_ptr& key_retriever) { + if (key_retriever == nullptr) return this; + + DCHECK(key_retriever_ == nullptr); + key_retriever_ = key_retriever; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier( + std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == nullptr) return this; + + DCHECK(aad_prefix_verifier_ == nullptr); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; +} + +ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key( + const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; +} + +std::shared_ptr ColumnDecryptionProperties::Builder::build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); +} + +void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); } + +std::shared_ptr ColumnDecryptionProperties::DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata( + const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns( + const ColumnPathToEncryptionPropertiesMap& encrypted_columns) { + if 
(encrypted_columns.size() == 0) return this; + + if (encrypted_columns_.size() != 0) + throw ParquetException("Column properties already set"); + + for (const auto& element : encrypted_columns) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + encrypted_columns_ = encrypted_columns; + return this; +} + +void FileEncryptionProperties::WipeOutEncryptionKeys() { + footer_key_.clear(); + for (const auto& element : encrypted_columns_) { + element.second->WipeOutEncryptionKey(); + } +} + +std::shared_ptr FileEncryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy; + + for (const auto& element : encrypted_columns_) { + encrypted_columns_map_copy.insert( + {element.second->column_path(), element.second->DeepClone()}); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; +} + +FileEncryptionProperties::Builder* +FileEncryptionProperties::Builder::disable_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; +} + +ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted, + const std::string& column_path, + const std::string& key, + const std::string& key_metadata) + : column_path_(column_path) { + // column encryption properties object (with a column key) can be used for writing only + // one 
file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). + utilized_ = false; + + DCHECK(!column_path.empty()); + if (!encrypted) { + DCHECK(key.empty() && key_metadata.empty()); + } + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) { + DCHECK(key_metadata.empty()); + } + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; +} + +ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path, + const std::string& key) + : column_path_(column_path) { + utilized_ = false; + DCHECK(!column_path.empty()); + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + key_ = key; +} + +std::string FileDecryptionProperties::column_key(const std::string& column_path) const { + if (column_decryption_properties_.find(column_path) != + column_decryption_properties_.end()) { + auto column_prop = column_decryption_properties_.at(column_path); + if (column_prop != nullptr) { + return column_prop->key(); + } + } + return empty_string_; +} + +FileDecryptionProperties::FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties, + bool plaintext_files_allowed) { + DCHECK(!footer_key.empty() || nullptr != key_retriever || + 0 != column_decryption_properties.size()); + + if (!footer_key.empty()) { + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + } + if (footer_key.empty() && check_plaintext_footer_integrity) { + DCHECK(nullptr != key_retriever); + } + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = 
footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_decryption_properties_ = column_decryption_properties; + plaintext_files_allowed_ = plaintext_files_allowed; + utilized_ = false; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("footer key id should be in UTF8 encoding"); + } + + if (key_id.empty()) { + return this; + } + + return footer_key_metadata(key_id); +} + +std::shared_ptr +FileEncryptionProperties::column_encryption_properties(const std::string& column_path) { + if (encrypted_columns_.size() == 0) { + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder(column_path)); + return builder->build(); + } + if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) { + return encrypted_columns_[column_path]; + } + + return nullptr; +} + +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& encrypted_columns) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + aad_prefix_(aad_prefix), + store_aad_prefix_in_file_(store_aad_prefix_in_file), + encrypted_columns_(encrypted_columns) { + // file encryption properties object can be used for writing only one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory).
+ utilized_ = false; + + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + + uint8_t aad_file_unique[kAadFileUniqueLength]; + memset(aad_file_unique, 0, kAadFileUniqueLength); + encryption::RandBytes(aad_file_unique, kAadFileUniqueLength); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + kAadFileUniqueLength); + + bool supply_aad_prefix = false; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { + file_aad_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } +} + +} // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h new file mode 100644 index 00000000000..6a5eb93bf04 --- /dev/null +++ b/cpp/src/parquet/encryption.h @@ -0,0 +1,512 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#ifndef PARQUET_ENCRYPTION_H +#define PARQUET_ENCRYPTION_H + +#include +#include +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace parquet { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = + ParquetCipher::AES_GCM_V1; +static constexpr int32_t kMaximalAadMetadataLength = 256; +static constexpr bool kDefaultEncryptedFooter = true; +static constexpr bool kDefaultCheckSignature = true; +static constexpr bool kDefaultAllowPlaintextFiles = false; +static constexpr int32_t kAadFileUniqueLength = 8; + +class ColumnDecryptionProperties; +using ColumnPathToDecryptionPropertiesMap = + std::map>; + +class ColumnEncryptionProperties; +using ColumnPathToEncryptionPropertiesMap = + std::map>; + +class PARQUET_EXPORT DecryptionKeyRetriever { + public: + virtual std::string GetKey(const std::string& key_metadata) const = 0; + virtual ~DecryptionKeyRetriever() {} +}; + +/// Simple integer key retriever +class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(uint32_t key_id, const std::string& key); + std::string GetKey(const std::string& key_metadata) const; + + private: + std::map key_map_; +}; + +// Simple string key retriever +class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(const std::string& key_id, const std::string& key); + std::string GetKey(const std::string& key_metadata) const; + + private: + std::map key_map_; +}; + +class PARQUET_EXPORT HiddenColumnException : public ParquetException { + public: + explicit HiddenColumnException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + +class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { + public: + explicit KeyAccessDeniedException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + +inline const uint8_t* str2bytes(const std::string& str) { + 
if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + /// Convenience builder for encrypted columns. + explicit Builder(const std::string& name) : Builder(name, true) {} + + /// Convenience builder for encrypted columns. + explicit Builder(const std::shared_ptr& path) + : Builder(path->ToDotString(), true) {} + + /// Set a column-specific key. + /// If key is not set on an encrypted column, the column will + /// be encrypted with the footer key. + /// keyBytes Key length must be either 16, 24 or 32 bytes. + /// The key is cloned, and will be wiped out (array values set to 0) upon completion + /// of file writing. + /// Caller is responsible for wiping out the input key array. + Builder* key(std::string column_key); + + /// Set a key retrieval metadata. + /// use either key_metadata() or key_id(), not both + Builder* key_metadata(const std::string& key_metadata); + + /// A convenience function to set key metadata using a string id. + /// Set a key retrieval metadata (converted from String). + /// use either key_metadata() or key_id(), not both + /// key_id will be converted to metadata (UTF-8 array). 
+ Builder* key_id(const std::string& key_id); + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); + } + + private: + const std::string column_path_; + bool encrypted_; + std::string key_; + std::string key_metadata_; + + Builder(const std::string path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} + }; + + std::string column_path() const { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + std::string key() const { return key_; } + std::string key_metadata() const { return key_metadata_; } + + /// Upon completion of file writing, the encryption key + /// will be wiped out. + void WipeOutEncryptionKey() { key_.clear(); } + + bool is_utilized() { + if (key_.empty()) + return false; // can re-use column properties without encryption keys + return utilized_; + } + + /// ColumnEncryptionProperties object can be used for writing one file only. + /// Mark ColumnEncryptionProperties as utilized once it is used in + /// FileEncryptionProperties as the encryption key will be wiped out upon + /// completion of file writing. 
+ void set_utilized() { utilized_ = true; } + + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, column_path_, key_copy, key_metadata_)); + } + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + private: + const std::string column_path_; + bool encrypted_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + bool utilized_; + explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path, + const std::string& key, + const std::string& key_metadata); +}; + +class PARQUET_EXPORT ColumnDecryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + explicit Builder(const std::string& name) : column_path_(name) {} + + explicit Builder(const std::shared_ptr& path) + : Builder(path->ToDotString()) {} + + /// Set an explicit column key. If applied on a file that contains + /// key metadata for this column the metadata will be ignored, + /// the column will be decrypted with this key. + /// key length must be either 16, 24 or 32 bytes. + Builder* key(const std::string& key); + + std::shared_ptr build(); + + private: + const std::string column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + std::string column_path() const { return column_path_; } + std::string key() const { return key_; } + bool is_utilized() { return utilized_; } + + /// ColumnDecryptionProperties object can be used for reading one file only. + /// Mark ColumnDecryptionProperties as utilized once it is used in + /// FileDecryptionProperties as the encryption key will be wiped out upon + /// completion of file reading. 
+ void set_utilized() { utilized_ = true; } + + /// Upon completion of file reading, the encryption key + /// will be wiped out. + void WipeOutDecryptionKey(); + + std::shared_ptr DeepClone(); + + private: + const std::string column_path_; + std::string key_; + bool utilized_; + + /// This class is only required for setting explicit column decryption keys - + /// to override key retriever (or to provide keys when key metadata and/or + /// key retriever are not available) + explicit ColumnDecryptionProperties(const std::string& column_path, + const std::string& key); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + /// Verifies identity (AAD Prefix) of individual file, + /// or of file collection in a data set. + /// Throws exception if an AAD prefix is wrong. + /// In a data set, AAD Prefixes should be collected, + /// and then checked for missing files. + virtual void Verify(const std::string& aad_prefix) = 0; + virtual ~AADPrefixVerifier() {} +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + Builder() { + check_plaintext_footer_integrity_ = kDefaultCheckSignature; + plaintext_files_allowed_ = kDefaultAllowPlaintextFiles; + } + + /// Set an explicit footer key. If applied on a file that contains + /// footer key metadata the metadata will be ignored, the footer + /// will be decrypted/verified with this key. + /// If explicit key is not set, footer key will be fetched from + /// key retriever. + /// With explicit keys or AAD prefix, new encryption properties object must be + /// created for each encrypted file. + /// Explicit encryption keys (footer and column) are cloned. + /// Upon completion of file reading, the cloned encryption keys in the properties + /// will be wiped out (array values set to 0). + /// Caller is responsible for wiping out the input key array. + /// footer_key length must be either 16, 24 or 32 bytes.
+ Builder* footer_key(const std::string footer_key); + + /// Set explicit column keys (decryption properties). + /// It's also possible to set a key retriever on this property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. + Builder* column_keys( + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties); + + /// Set a key retriever callback. It's also possible to + /// set explicit footer or column keys on this file property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. + Builder* key_retriever(const std::shared_ptr& key_retriever); + + /// Skip integrity verification of plaintext footers. + /// If not called, integrity of plaintext footers will be checked at runtime, + /// and an exception will be thrown in the following situations: + /// - footer signing key is not available + /// (not passed, or not found by key retriever) + /// - footer content and signature don't match + Builder* disable_footer_signature_verification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + /// Explicitly supply the file AAD prefix. + /// A must when a prefix is used for file encryption, but not stored in file. + /// If AAD prefix is stored in file, it will be compared to the explicitly + /// supplied value and an exception will be thrown if they differ. + Builder* aad_prefix(const std::string& aad_prefix); + + /// Set callback for verification of AAD Prefixes stored in file.
+ Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier); + + /// By default, reading plaintext (unencrypted) files is not + /// allowed when using a decryptor + /// - in order to detect files that were not encrypted by mistake. + /// However, the default behavior can be overridden by calling this method. + /// The caller should then use a different method to ensure encryption + /// of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_)); + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + }; + + std::string column_key(const std::string& column_path) const; + + std::string footer_key() const { return footer_key_; } + + std::string aad_prefix() const { return aad_prefix_; } + + std::shared_ptr key_retriever() { return key_retriever_; } + + bool check_plaintext_footer_integrity() const { + return check_plaintext_footer_integrity_; + } + + bool plaintext_files_allowed() const { return plaintext_files_allowed_; } + + const std::shared_ptr& aad_prefix_verifier() { + return aad_prefix_verifier_; + } + + /// Upon completion of file reading, the encryption keys in the properties + /// will be wiped out (array values set to 0). + void WipeOutDecryptionKeys(); + + bool is_utilized(); + + /// FileDecryptionProperties object can be used for reading one file only. + /// Mark FileDecryptionProperties as utilized once it is used to read a file as the + /// encryption keys will be wiped out upon completion of file reading.
+ void set_utilized() { utilized_ = true; } + + /// FileDecryptionProperties object can be used for reading one file only. + /// (unless this object keeps the keyRetrieval callback only, and no explicit + /// keys or aadPrefix). + /// At the end, keys are wiped out in the memory. + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + const std::string empty_string_ = ""; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + bool utilized_; + + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties, + bool plaintext_files_allowed); +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + class PARQUET_EXPORT Builder { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(kDefaultEncryptionAlgorithm), + encrypted_footer_(kDefaultEncryptedFooter) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + /// Create files with plaintext footer. + /// If not called, the files will be created with encrypted footer (default). + Builder* set_plaintext_footer() { + encrypted_footer_ = false; + return this; + } + + /// Set encryption algorithm. + /// If not called, files will be encrypted with AES_GCM_V1 (default). 
+ Builder* algorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; + return this; + } + + /// Set a key retrieval metadata (converted from String). + /// use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(const std::string& key_id); + + /// Set a key retrieval metadata. + /// use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_metadata(const std::string& footer_key_metadata); + + /// Set the file AAD Prefix. + Builder* aad_prefix(const std::string& aad_prefix); + + /// Skip storing AAD Prefix in file. + /// If not called, and if AAD Prefix is set, it will be stored. + Builder* disable_aad_prefix_storage(); + + /// Set the list of encrypted columns and their properties (keys etc). + /// If not called, all columns will be encrypted with the footer key. + /// If called, the file columns not in the list will be left unencrypted. + Builder* encrypted_columns( + const ColumnPathToEncryptionPropertiesMap& encrypted_columns); + + std::shared_ptr build() { + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap encrypted_columns_; + }; + bool encrypted_footer() const { return encrypted_footer_; } + + EncryptionAlgorithm algorithm() const { return algorithm_; } + + std::string footer_key() const { return footer_key_; } + + std::string footer_key_metadata() const { return footer_key_metadata_; } + + std::string file_aad() const { return file_aad_; } + + std::shared_ptr column_encryption_properties( + const std::string& column_path); + + bool is_utilized() const { return utilized_; } + + /// 
FileEncryptionProperties object can be used for writing one file only. + /// Mark FileEncryptionProperties as utilized once it is used to write a file as the + /// encryption keys will be wiped out upon completion of file writing. + void set_utilized() { utilized_ = true; } + + /// Upon completion of file writing, the encryption keys + /// will be wiped out (array values set to 0). + void WipeOutEncryptionKeys(); + + /// FileEncryptionProperties object can be used for writing one file only. + /// (at the end, keys are wiped out in the memory). + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); + + ColumnPathToEncryptionPropertiesMap encrypted_columns() const { + return encrypted_columns_; + } + + private: + EncryptionAlgorithm algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + bool encrypted_footer_; + std::string file_aad_; + std::string aad_prefix_; + bool utilized_; + bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap encrypted_columns_; + + FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& encrypted_columns); +}; + +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index bf3239d42c4..87656fca1b1 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -54,22 +54,22 @@ class AesEncryptor::AesEncryptorImpl { explicit AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata); ~AesEncryptorImpl() { - if (NULLPTR != ctx_) { + if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); 
- ctx_ = NULLPTR; + ctx_ = nullptr; } } - int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* ciphertext); + int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext); - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer); void WipeOut() { - if (NULLPTR != ctx_) { + if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); - ctx_ = NULLPTR; + ctx_ = nullptr; } } @@ -81,11 +81,12 @@ class AesEncryptor::AesEncryptorImpl { int key_length_; int ciphertext_size_delta_; - int GcmEncrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* nonce, uint8_t* aad, int aad_len, uint8_t* ciphertext); + int GcmEncrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, const uint8_t* aad, int aad_len, + uint8_t* ciphertext); - int CtrEncrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* nonce, uint8_t* ciphertext); + int CtrEncrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, uint8_t* ciphertext); }; AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, @@ -134,11 +135,9 @@ AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int } } -int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(const uint8_t* footer, - int footer_len, uint8_t* key, - int key_len, uint8_t* aad, - int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer) { +int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt( + const uint8_t* 
footer, int footer_len, const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, const uint8_t* nonce, uint8_t* encrypted_footer) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". Should be " << key_length_; @@ -154,8 +153,9 @@ int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(const uint8_t* footer, } int AesEncryptor::AesEncryptorImpl::Encrypt(const uint8_t* plaintext, int plaintext_len, - uint8_t* key, int key_len, uint8_t* aad, - int aad_len, uint8_t* ciphertext) { + const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". Should be " << key_length_; @@ -176,9 +176,10 @@ int AesEncryptor::AesEncryptorImpl::Encrypt(const uint8_t* plaintext, int plaint } int AesEncryptor::AesEncryptorImpl::GcmEncrypt(const uint8_t* plaintext, - int plaintext_len, uint8_t* key, - int key_len, uint8_t* nonce, uint8_t* aad, - int aad_len, uint8_t* ciphertext) { + int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, + const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { int len; int ciphertext_len; @@ -230,8 +231,8 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(const uint8_t* plaintext, } int AesEncryptor::AesEncryptorImpl::CtrEncrypt(const uint8_t* plaintext, - int plaintext_len, uint8_t* key, - int key_len, uint8_t* nonce, + int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, uint8_t* ciphertext) { int len; int ciphertext_len; @@ -277,11 +278,12 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(const uint8_t* plaintext, return kBufferSizeLength + buffer_size; } -AesEncryptor::~AesEncryptor() { impl_->~AesEncryptorImpl(); } +AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, - uint8_t* nonce, uint8_t* encrypted_footer) { 
+int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + const uint8_t* key, int key_len, const uint8_t* aad, + int aad_len, const uint8_t* nonce, + uint8_t* encrypted_footer) { return impl_->SignedFooterEncrypt(footer, footer_len, key, key_len, aad, aad_len, nonce, encrypted_footer); } @@ -290,8 +292,9 @@ void AesEncryptor::WipeOut() { impl_->WipeOut(); } int AesEncryptor::CiphertextSizeDelta() { return impl_->ciphertext_size_delta(); } -int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { return impl_->Encrypt(plaintext, plaintext_len, key, key_len, aad, aad_len, ciphertext); } @@ -304,19 +307,19 @@ class AesDecryptor::AesDecryptorImpl { explicit AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata); ~AesDecryptorImpl() { - if (NULLPTR != ctx_) { + if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); - ctx_ = NULLPTR; + ctx_ = nullptr; } } - int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); void WipeOut() { - if (NULLPTR != ctx_) { + if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); - ctx_ = NULLPTR; + ctx_ = nullptr; } } @@ -327,21 +330,22 @@ class AesDecryptor::AesDecryptorImpl { int aes_mode_; int key_length_; int ciphertext_size_delta_; - int GcmDecrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int GcmDecrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); - int 
CtrDecrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* plaintext); + int CtrDecrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, uint8_t* plaintext); }; -int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { return impl_->Decrypt(plaintext, plaintext_len, key, key_len, aad, aad_len, ciphertext); } void AesDecryptor::WipeOut() { impl_->WipeOut(); } -AesDecryptor::~AesDecryptor() { impl_->~AesDecryptorImpl(); } +AesDecryptor::~AesDecryptor() {} AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata) { @@ -389,9 +393,8 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -AesEncryptor* AesEncryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors) { +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,9 +402,7 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) { - all_encryptors->push_back(encryptor); - } + if (all_encryptors != nullptr) all_encryptors->push_back(encryptor); return encryptor; } @@ -409,9 +410,8 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata))} {} -AesDecryptor* AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - 
std::shared_ptr> all_decryptors) { +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -419,7 +419,7 @@ AesDecryptor* AesDecryptor::Make( } AesDecryptor* decryptor = new AesDecryptor(alg_id, key_len, metadata); - if (all_decryptors != NULLPTR) { + if (all_decryptors != nullptr) { all_decryptors->push_back(decryptor); } return decryptor; @@ -428,9 +428,9 @@ AesDecryptor* AesDecryptor::Make( int AesDecryptor::CiphertextSizeDelta() { return impl_->ciphertext_size_delta(); } int AesDecryptor::AesDecryptorImpl::GcmDecrypt(const uint8_t* ciphertext, - int ciphertext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, - uint8_t* plaintext) { + int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, + int aad_len, uint8_t* plaintext) { int len; int plaintext_len; @@ -490,7 +490,7 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(const uint8_t* ciphertext, } int AesDecryptor::AesDecryptorImpl::CtrDecrypt(const uint8_t* ciphertext, - int ciphertext_len, uint8_t* key, + int ciphertext_len, const uint8_t* key, int key_len, uint8_t* plaintext) { int len; int plaintext_len; @@ -542,8 +542,9 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(const uint8_t* ciphertext, } int AesDecryptor::AesDecryptorImpl::Decrypt(const uint8_t* ciphertext, int ciphertext_len, - uint8_t* key, int key_len, uint8_t* aad, - int aad_len, uint8_t* plaintext) { + const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, + uint8_t* plaintext) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". 
Should be " << key_length_; @@ -606,5 +607,14 @@ void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) { reinterpret_cast(page_ordinal_bytes.c_str()), 2); } +// Fills buf with num random bytes obtained from OpenSSL RAND_bytes. +// RAND_bytes reports success only by returning 1; any other result is raised as a +// ParquetException so that callers never continue with an unfilled buffer. +void RandBytes(unsigned char* buf, int num) { + if (1 != RAND_bytes(buf, num)) { + throw ParquetException("Failed to generate random bytes"); + } +} + } // namespace encryption } // namespace parquet diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index af668dc4136..3bb8011bb84 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors); + std::vector* all_encryptors); ~AesEncryptor(); @@ -56,13 +56,13 @@ class AesEncryptor { /// Encrypts plaintext with the key and aad. Key length is passed only for validation. /// If different from value in constructor, exception will be thrown. - int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* ciphertext); + int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext); /// Encrypts plaintext footer, in order to compute footer signature (tag). 
- int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer); void WipeOut(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors); + std::vector* all_decryptors); ~AesDecryptor(); void WipeOut(); @@ -88,8 +88,8 @@ class AesDecryptor { /// Decrypts ciphertext with the key and aad. Key length is passed only for /// validation. If different from value in constructor, exception will be thrown. - int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); private: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. @@ -108,6 +108,9 @@ std::string CreateFooterAad(const std::string& aad_prefix_bytes); // Update last two bytes of page (or page header) module AAD void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal); +// Wraps OpenSSL RAND_bytes function +void RandBytes(unsigned char* buf, int num); + } // namespace encryption } // namespace parquet diff --git a/cpp/src/parquet/encryption_internal_nossl.cc b/cpp/src/parquet/encryption_internal_nossl.cc new file mode 100644 index 00000000000..9e43c9c555e --- /dev/null +++ b/cpp/src/parquet/encryption_internal_nossl.cc @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption_internal.h" +#include "parquet/exception.h" + +namespace parquet { +namespace encryption { + +void ThrowOpenSSLRequiredException() { + throw ParquetException( + "Calling encryption method in Arrow/Parquet built without OpenSSL"); +} + +class AesEncryptor::AesEncryptorImpl {}; + +AesEncryptor::~AesEncryptor() {} + +int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + const uint8_t* key, int key_len, const uint8_t* aad, + int aad_len, const uint8_t* nonce, + uint8_t* encrypted_footer) { + ThrowOpenSSLRequiredException(); + return -1; +} + +void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); } + +int AesEncryptor::CiphertextSizeDelta() { + ThrowOpenSSLRequiredException(); + return -1; +} + +int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { + ThrowOpenSSLRequiredException(); + return -1; +} + +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) { + ThrowOpenSSLRequiredException(); +} + +class AesDecryptor::AesDecryptorImpl {}; + +int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { + ThrowOpenSSLRequiredException(); + return -1; +} + +void AesDecryptor::WipeOut() { 
ThrowOpenSSLRequiredException(); } + +AesDecryptor::~AesDecryptor() {} + +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { + return NULLPTR; +} + +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) { + ThrowOpenSSLRequiredException(); +} + +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { + return NULLPTR; +} + +int AesDecryptor::CiphertextSizeDelta() { + ThrowOpenSSLRequiredException(); + return -1; +} + +std::string CreateModuleAad(const std::string& file_aad, int8_t module_type, + int16_t row_group_ordinal, int16_t column_ordinal, + int16_t page_ordinal) { + ThrowOpenSSLRequiredException(); + return ""; +} + +std::string CreateFooterAad(const std::string& aad_prefix_bytes) { + ThrowOpenSSLRequiredException(); + return ""; +} + +void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) { + ThrowOpenSSLRequiredException(); +} + +void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); } + +} // namespace encryption +} // namespace parquet diff --git a/cpp/src/parquet/encryption_properties_test.cc b/cpp/src/parquet/encryption_properties_test.cc new file mode 100644 index 00000000000..2f06b2052d1 --- /dev/null +++ b/cpp/src/parquet/encryption_properties_test.cc @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/encryption.h" + +namespace parquet { + +namespace test { + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithOwnKey) { + std::string column_path_1 = "column_1"; + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1, column_props_1->column_path()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(false, column_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, column_props_1->key()); + ASSERT_EQ("kc1", column_props_1->key_metadata()); +} + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithFooterKey) { + std::string column_path_1 = "column_1"; + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1, column_props_1->column_path()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(true, column_props_1->is_encrypted_with_footer_key()); +} + +// Encrypt all columns and the footer with the same key. 
+// (uniform encryption) +TEST(TestEncryptionProperties, UniformEncryption) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ("kf", props->footer_key_metadata()); + + std::shared_ptr column_path = + parquet::schema::ColumnPath::FromDotString("a_column"); + std::shared_ptr out_col_props = + props->column_encryption_properties(column_path->ToDotString()); + + ASSERT_EQ(true, out_col_props->is_encrypted()); + ASSERT_EQ(true, out_col_props->is_encrypted_with_footer_key()); +} + +// Encrypt two columns with their own keys and the same key for +// the footer and other columns +TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1->ToDotString()); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2->ToDotString()); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map> encrypted_columns; + encrypted_columns[column_path_1->ToDotString()] = column_builder_1.build(); + encrypted_columns[column_path_2->ToDotString()] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.encrypted_columns(encrypted_columns); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, 
props->footer_key()); + + std::shared_ptr out_col_props_1 = + props->column_encryption_properties(column_path_1->ToDotString()); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_encryption_properties(column_path_2->ToDotString()); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_encryption_properties(column_path_3->ToDotString()); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+// (plaintext footer mode, readable by legacy readers) +TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map> encrypted_columns; + encrypted_columns[column_path_1->ToDotString()] = column_builder_1.build(); + encrypted_columns[column_path_2->ToDotString()] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.set_plaintext_footer(); + builder.encrypted_columns(encrypted_columns); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(false, props->encrypted_footer()); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr out_col_props_1 = + props->column_encryption_properties(column_path_1->ToDotString()); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_encryption_properties(column_path_2->ToDotString()); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + 
ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + // other columns: encrypted with footer, footer is not encrypted + // so column is not encrypted as well + std::string column_path_3 = "column_3"; + std::shared_ptr out_col_props_3 = + props->column_encryption_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Use aad_prefix +TEST(TestEncryptionProperties, UseAadPrefix) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->algorithm().aad.aad_prefix); + ASSERT_EQ(false, props->algorithm().aad.supply_aad_prefix); +} + +// Use aad_prefix and +// disable_aad_prefix_storage. +TEST(TestEncryptionProperties, UseAadPrefixNotStoreInFile) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + builder.disable_aad_prefix_storage(); + std::shared_ptr props = builder.build(); + + ASSERT_EQ("", props->algorithm().aad.aad_prefix); + ASSERT_EQ(true, props->algorithm().aad.supply_aad_prefix); +} + +// Use AES_GCM_CTR_V1 algorithm +TEST(TestEncryptionProperties, UseAES_GCM_CTR_V1Algorithm) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.algorithm(ParquetCipher::AES_GCM_CTR_V1); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, props->algorithm().algorithm); +} + +TEST(TestDecryptionProperties, UseKeyRetriever) { + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder builder; + builder.key_retriever(kr1); + std::shared_ptr props = builder.build(); + + auto out_key_retriever = props->key_retriever(); + ASSERT_EQ(kFooterEncryptionKey, 
out_key_retriever->GetKey("kf")); + ASSERT_EQ(kColumnEncryptionKey1, out_key_retriever->GetKey("kc1")); + ASSERT_EQ(kColumnEncryptionKey2, out_key_retriever->GetKey("kc2")); +} + +TEST(TestDecryptionProperties, SupplyAadPrefix) { + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->aad_prefix()); +} + +TEST(ColumnDecryptionProperties, SetKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + col_builder_1.key(kColumnEncryptionKey1); + + auto props = col_builder_1.build(); + ASSERT_EQ(kColumnEncryptionKey1, props->key()); +} + +TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { + std::string column_path_1 = "column_1"; + std::string column_path_2 = "column_2"; + std::map> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + parquet::ColumnDecryptionProperties::Builder col_builder_2(column_path_2); + + decryption_cols[column_path_1] = col_builder_1.key(kColumnEncryptionKey1)->build(); + decryption_cols[column_path_2] = col_builder_2.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.column_keys(decryption_cols); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, props->column_key(column_path_1)); + ASSERT_EQ(kColumnEncryptionKey2, props->column_key(column_path_2)); +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption_read_configurations_test.cc b/cpp/src/parquet/encryption_read_configurations_test.cc new file mode 100644 index 00000000000..fe2da4bdc67 --- /dev/null +++ b/cpp/src/parquet/encryption_read_configurations_test.cc @@ 
-0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "arrow/io/file.h" + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test_encryption_util.h" +#include "parquet/test_util.h" + +/* + * This file contains a unit-test for reading encrypted Parquet files with + * different decryption configurations. + * + * The unit-test is called multiple times, each time to decrypt parquet files using + * different decryption configuration as described below. + * In each call two encrypted files are read: one temporary file that was generated using + * encryption-write-configurations-test.cc test and will be deleted upon + * reading it, while the second resides in + * parquet-testing/data repository. Those two encrypted files were encrypted using the + * same encryption configuration. + * The encrypted parquet file names are passed as parameter to the unit-test. 
+ * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The following decryption configurations are used to decrypt each parquet file: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). + * - Decryption configuration 4: Plaintext footer mode - test legacy reads, + * read the footer + all non-encrypted columns. + * (pairs with encryption configuration 3) + * + * Each encrypted parquet file that is read was encrypted using one of the configurations + * below: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don't encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
+ + */ + +namespace parquet { +namespace test { + +class TestDecryptionConfiguration + : public testing::TestWithParam> { + public: + void SetUp() { CreateDecryptionConfigurations(); } + + protected: + std::string path_to_double_field_ = "double_field"; + std::string path_to_float_field_ = "float_field"; + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void CreateDecryptionConfigurations() { + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // Decryption configuration 1: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. 
+ std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey_); + string_kr2->PutKey("kc1", kColumnEncryptionKey1_); + string_kr2->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. + std::string path_float_ptr = "float_field"; + std::string path_double_ptr = "double_field"; + std::map> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( + path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey_) + ->column_keys(decryption_cols) + ->build()); + + // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext + // columns. 
+ vector_of_decryption_configurations_.push_back(NULL); + } + + void DecryptFile(std::string file, int decryption_config_num) { + std::string exception_msg; + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + // if we get decryption_config_num = x then it means the actual number is x+1 + // and since we want decryption_config_num=4 we set the condition to 3 + if (decryption_config_num != 3) { + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); + } + + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? 
true : false; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + ASSERT_EQ(value, i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + ASSERT_EQ(value, expected_value); + if ((i % 2) == 0) { + ASSERT_EQ(repetition_level, 1); + } else { + ASSERT_EQ(repetition_level, 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + ASSERT_EQ(value.value[j], expected_value.value[j]); + } + i++; + } + + if (decryption_config_num != 3) { + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); + } else { + // There are NULL values in the rows written + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + i++; + } + } + file_reader->Close(); + } + + // Check that the decryption result is as expected. + void CheckResults(const std::string file_name, unsigned decryption_config_num, + unsigned encryption_config_num) { + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_config_num == 5) { + if (decryption_config_num == 1 || decryption_config_num == 3) { + EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected + // to be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_config_num == 2) { + if (encryption_config_num != 5 && encryption_config_num != 4) { + EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + return; + } + } + + // decryption config 4 can only work when the encryption configuration is 3 + if (decryption_config_num == 4 && encryption_config_num != 3) { + return; + } + EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); + } + + // Returns true if file exists. Otherwise returns false. + bool fexists(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + return ifile.good(); + } +}; + +// Read encrypted parquet file. 
+// The test reads two parquet files that were encrypted using the same encryption +// configuration: +// one was generated in encryption-write-configurations-test.cc tests and is deleted +// once the file is read and the second exists in parquet-testing/data folder. +// The name of the files are passed as parameters to the unit-test. +TEST_P(TestDecryptionConfiguration, TestDecryption) { + int encryption_config_num = std::get<0>(GetParam()); + const char* param_file_name = std::get<1>(GetParam()); + // Decrypt parquet file that was generated in encryption-write-configurations-test.cc + // test. + std::string tmp_file_name = "tmp_" + std::string(param_file_name); + std::string file_name = data_file(tmp_file_name.c_str()); + if (!fexists(file_name)) { + std::stringstream ss; + ss << "File " << file_name << " is missing from parquet-testing repo."; + throw ParquetTestException(ss.str()); + } + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { + unsigned decryption_config_num = index + 1; + CheckResults(file_name, decryption_config_num, encryption_config_num); + } + // Delete temporary test file. + ASSERT_EQ(std::remove(file_name.c_str()), 0); + + // Decrypt parquet file that resides in parquet-testing/data directory. + file_name = data_file(param_file_name); + + if (!fexists(file_name)) { + std::stringstream ss; + ss << "File " << file_name << " is missing from parquet-testing repo."; + throw ParquetTestException(ss.str()); + } + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. 
+ for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { + unsigned decryption_config_num = index + 1; + CheckResults(file_name, decryption_config_num, encryption_config_num); + } +} + +INSTANTIATE_TEST_CASE_P( + DecryptionTests, TestDecryptionConfiguration, + ::testing::Values( + std::make_tuple(1, "uniform_encryption.parquet.encrypted"), + std::make_tuple(2, "encrypt_columns_and_footer.parquet.encrypted"), + std::make_tuple(3, "encrypt_columns_plaintext_footer.parquet.encrypted"), + std::make_tuple(4, "encrypt_columns_and_footer_aad.parquet.encrypted"), + std::make_tuple( + 5, "encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"), + std::make_tuple(6, "encrypt_columns_and_footer_ctr.parquet.encrypted"))); + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption_write_configurations_test.cc b/cpp/src/parquet/encryption_write_configurations_test.cc new file mode 100644 index 00000000000..42a1d08d417 --- /dev/null +++ b/cpp/src/parquet/encryption_write_configurations_test.cc @@ -0,0 +1,379 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test_encryption_util.h" +#include "parquet/test_util.h" + +/* + * This file contains unit-tests for writing encrypted Parquet files with + * different encryption configurations. + * The files are saved in parquet-testing/data folder and will be deleted after reading + * them in encryption_read_configurations_test.cc test. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * Each unit-test creates a single parquet file with eight columns using one of the + * following encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don't encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
+ */ + +namespace parquet { +namespace test { + +using FileClass = ::arrow::io::FileOutputStream; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + } + + protected: + std::string path_to_double_field_ = "double_field"; + std::string path_to_float_field_ = "float_field"; + std::string file_name_; + int num_rgs = 5; + int rows_per_rowgroup_ = 50; + std::shared_ptr schema_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file_name) { + std::shared_ptr out_file; + + std::string file = data_file(file_name.c_str()); + + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + for (int r = 0; r < num_rgs; r++) { + auto row_group_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. 
Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + } + + // Close the ParquetFileWriter + file_writer->Close(); + + return; + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", 
Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration 1: Encrypt all columns and the footer with the same key. +// (uniform encryption) +TEST_F(TestEncryptionConfiguration, UniformEncryption) { + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), + "tmp_uniform_encryption.parquet.encrypted"); +} + +// Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map> + encryption_cols2; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols2) + ->build(), + "tmp_encrypt_columns_and_footer.parquet.encrypted"); +} + +// Encryption configuration 3: Encrypt two columns, with different keys. +// Don’t encrypt footer. +// (plaintext footer mode, readable by legacy readers) +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { + std::map> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( + path_to_float_field_); + encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); + encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols3) + ->set_plaintext_footer() + ->build(), + "tmp_encrypt_columns_plaintext_footer.parquet.encrypted"); +} + +// Encryption 
configuration 4: Encrypt two columns and the footer, with different keys. +// Use aad_prefix. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { + std::map> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( + path_to_float_field_); + encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); + encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->encrypted_columns(encryption_cols4) + ->aad_prefix(kFileName_) + ->build(), + "tmp_encrypt_columns_and_footer_aad.parquet.encrypted"); +} + +// Encryption configuration 5: Encrypt two columns and the footer, with different keys. +// Use aad_prefix and disable_aad_prefix_storage. 
+TEST_F(TestEncryptionConfiguration, + EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { + std::map> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( + path_to_float_field_); + encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); + encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey_); + + this->EncryptFile( + file_encryption_builder_5.encrypted_columns(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_aad_prefix_storage() + ->build(), + "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"); +} + +// Encryption configuration 6: Encrypt two columns and the footer, with different keys. +// Use AES_GCM_CTR_V1 algorithm. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) {
+  std::map>  // NOTE(review): template arguments look lost in extraction - confirm upstream
+      encryption_cols6;
+  parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(
+      path_to_double_field_);
+  parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(
+      path_to_float_field_);
+  encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1");
+  encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2");
+
+  encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build();
+  encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build();
+  parquet::FileEncryptionProperties::Builder file_encryption_builder_6(
+      kFooterEncryptionKey_);
+
+  EXPECT_NO_THROW(
+      this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf")
+                            ->encrypted_columns(encryption_cols6)
+                            ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1)
+                            ->build(),
+                        "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted"));
+}
+
+}  // namespace test
+}  // namespace parquet
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 6a11baedf82..e02bf01327c 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -32,7 +32,10 @@
 #include "parquet/column_reader.h"
 #include "parquet/column_scanner.h"
 #include "parquet/deprecated_io.h"
+#include "parquet/encryption_internal.h"
 #include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/internal_file_decryptor.h"
 #include "parquet/metadata.h"
 #include "parquet/platform.h"
 #include "parquet/properties.h"
@@ -44,7 +47,6 @@ namespace parquet {
 // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
 static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
 static constexpr uint32_t kFooterSize = 8;
-static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
 
 // For PARQUET-816
 static constexpr int64_t kMaxDictHeaderSize = 100;
@@ -82,8 +84,13 @@ class SerializedRowGroup : public RowGroupReader::Contents {
  public:
   SerializedRowGroup(const std::shared_ptr& source,
                      FileMetaData* file_metadata, int row_group_number,
-                     const ReaderProperties& props)
-      : source_(source), file_metadata_(file_metadata), properties_(props) {
+                     const ReaderProperties& props,
+                     InternalFileDecryptor* file_decryptor = nullptr)
+      : source_(source),
+        file_metadata_(file_metadata),
+        properties_(props),
+        row_group_ordinal_(row_group_number),  // NOTE(review): int narrowed to int16_t
+        file_decryptor_(file_decryptor) {  // not owned; may be null
     row_group_metadata_ = file_metadata->RowGroup(row_group_number);
   }
 
@@ -93,7 +100,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
 
   std::unique_ptr GetColumnPageReader(int i) override {
     // Read column chunk from the file
-    auto col = row_group_metadata_->ColumnChunk(i);
+    auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_);
 
     int64_t col_start = col->data_page_offset();
     if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 &&
@@ -118,8 +125,41 @@ class SerializedRowGroup : public RowGroupReader::Contents {
 
     std::shared_ptr stream =
         properties_.GetStream(source_, col_start, col_length);
+
+    std::unique_ptr crypto_metadata = col->crypto_metadata();
+
+    // Column is encrypted only if crypto_metadata exists.
+    if (!crypto_metadata) {
+      return PageReader::Open(stream, col->num_values(), col->compression(),
+                              properties_.memory_pool());
+    }
+
+    if (!file_decryptor_) throw ParquetException("No file decryptor for encrypted column");
+    std::shared_ptr meta_decryptor;
+    std::shared_ptr data_decryptor;
+    // The column is encrypted with footer key
+    if (crypto_metadata->encrypted_with_footer_key()) {
+      meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
+      data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
+      CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+                        static_cast(i), meta_decryptor, data_decryptor);
+      return PageReader::Open(stream, col->num_values(), col->compression(),
+                              properties_.memory_pool(), &ctx);
+    }
+
+    // The column is encrypted with its own key
+    std::string column_key_metadata = crypto_metadata->key_metadata();
+    const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
+
+    meta_decryptor =
+        file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
+    data_decryptor =
+        file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
+
+    CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+                      static_cast(i), meta_decryptor, data_decryptor);
     return PageReader::Open(stream, col->num_values(), col->compression(),
-                            properties_.memory_pool());
+                            properties_.memory_pool(), &ctx);
   }
 
  private:
@@ -127,6 +167,8 @@ class SerializedRowGroup : public RowGroupReader::Contents {
   FileMetaData* file_metadata_;
   std::unique_ptr row_group_metadata_;
   ReaderProperties properties_;
+  int16_t row_group_ordinal_;
+  InternalFileDecryptor* file_decryptor_;
 };
 
 // ----------------------------------------------------------------------
@@ -141,11 +183,21 @@ class SerializedFile : public ParquetFileReader::Contents {
                  const ReaderProperties& props = default_reader_properties())
       : source_(source), properties_(props) {}
 
-  void Close() override {}
+  ~SerializedFile() override {
+    try {
+      Close();
+    } catch (...) {  // swallow: nothing may escape the destructor
+    }
+  }
+
+  void Close() override {
+    if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
+  }
 
   std::shared_ptr GetRowGroup(int i) override {
     std::unique_ptr contents(
-        new SerializedRowGroup(source_, file_metadata_.get(), i, properties_));
+        new SerializedRowGroup(source_, file_metadata_.get(), static_cast(i),
+                               properties_, file_decryptor_.get()));
     return std::make_shared(std::move(contents));
   }
 
@@ -175,41 +227,221 @@ class SerializedFile : public ParquetFileReader::Contents {
 
     // Check if all bytes are read. Check if last 4 bytes read have the magic bits
     if (footer_buffer->size() != footer_read_size ||
-        memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0) {
+        (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
+         memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
       throw ParquetException("Invalid parquet file. Corrupt footer.");
     }
 
-    uint32_t metadata_len = arrow::util::SafeLoadAs(
-        reinterpret_cast(footer_buffer->data()) + footer_read_size -
-        kFooterSize);
-    int64_t metadata_start = file_size - kFooterSize - metadata_len;
-    if (kFooterSize + metadata_len > file_size) {
-      throw ParquetException(
-          "Invalid parquet file. File is less than "
-          "file metadata size.");
+    if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0) {
+      // Encrypted file with Encrypted footer.
+      ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size,
+                                                      file_size);
+      return;
     }
 
+    // No encryption or encryption with plaintext footer mode.
     std::shared_ptr metadata_buffer;
-    // Check if the footer_buffer contains the entire metadata
-    if (footer_read_size >= (metadata_len + kFooterSize)) {
-      metadata_buffer = SliceBuffer(
-          footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
-    } else {
-      PARQUET_THROW_NOT_OK(
-          source_->ReadAt(metadata_start, metadata_len, &metadata_buffer));
-      if (metadata_buffer->size() != metadata_len) {
-        throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+    uint32_t metadata_len, read_metadata_len;
+    ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size,
+                                 &metadata_buffer, &metadata_len, &read_metadata_len);
+
+    auto file_decryption_properties = properties_.file_decryption_properties();
+    if (!file_metadata_->is_encryption_algorithm_set()) {  // Non encrypted file.
+      if (file_decryption_properties != nullptr) {
+        if (!file_decryption_properties->plaintext_files_allowed()) {
+          throw ParquetException("Applying decryption properties on plaintext file");
+        }
       }
+    } else {
+      // Encrypted file with plaintext footer mode.
+      ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+          file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
     }
-    file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len);
   }
 
  private:
   std::shared_ptr source_;
   std::shared_ptr file_metadata_;
   ReaderProperties properties_;
+
+  std::unique_ptr file_decryptor_;  // stays null unless decryption properties are supplied
+
+  void ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer,
+                                    int64_t footer_read_size, int64_t file_size,
+                                    std::shared_ptr* metadata_buffer,
+                                    uint32_t* metadata_len, uint32_t* read_metadata_len);
+
+  std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
+                              EncryptionAlgorithm& algo);
+
+  void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+      FileDecryptionProperties* file_decryption_properties,
+      const std::shared_ptr& metadata_buffer, uint32_t metadata_len,
+      uint32_t read_metadata_len);
+
+  void ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+      const std::shared_ptr& footer_buffer, int64_t footer_read_size,
+      int64_t file_size);
 };
 
+void SerializedFile::ParseUnencryptedFileMetadata(
+    const std::shared_ptr& footer_buffer, int64_t footer_read_size,
+    int64_t file_size, std::shared_ptr* metadata_buffer, uint32_t* metadata_len,
+    uint32_t* read_metadata_len) {
+  *metadata_len = arrow::util::SafeLoadAs(
+      reinterpret_cast(footer_buffer->data()) + footer_read_size -
+      kFooterSize);
+  int64_t metadata_start = file_size - kFooterSize - *metadata_len;
+  if (static_cast<int64_t>(kFooterSize) + *metadata_len > file_size) {  // 64-bit: no wrap
+    throw ParquetException(
+        "Invalid parquet file. File is less than "
+        "file metadata size.");
+  }
+
+  // Check if the footer_buffer contains the entire metadata
+  if (footer_read_size >= (static_cast<int64_t>(*metadata_len) + kFooterSize)) {
+    *metadata_buffer = SliceBuffer(
+        footer_buffer, footer_read_size - *metadata_len - kFooterSize, *metadata_len);
+  } else {
+    PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_start, *metadata_len, metadata_buffer));
+    if ((*metadata_buffer)->size() != *metadata_len) {
+      throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+    }
+  }
+
+  *read_metadata_len = *metadata_len;  // passed by pointer to Make, which may update it
+  file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len);
+}
+
+void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+    const std::shared_ptr& footer_buffer, int64_t footer_read_size,
+    int64_t file_size) {
+  // encryption with encrypted footer
+  // both metadata & crypto metadata length
+  uint32_t footer_len = arrow::util::SafeLoadAs(
+      reinterpret_cast(footer_buffer->data()) + footer_read_size -
+      kFooterSize);
+  int64_t crypto_metadata_start = file_size - kFooterSize - footer_len;
+  if (static_cast<int64_t>(kFooterSize) + footer_len > file_size) {  // 64-bit: no wrap
+    throw ParquetException(
+        "Invalid parquet file. File is less than "
+        "file metadata size.");
+  }
+  std::shared_ptr crypto_metadata_buffer;
+  // Check if the footer_buffer contains the entire metadata
+  if (footer_read_size >= (static_cast<int64_t>(footer_len) + kFooterSize)) {
+    crypto_metadata_buffer = SliceBuffer(
+        footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len);
+  } else {
+    PARQUET_THROW_NOT_OK(
+        source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer));
+    if (crypto_metadata_buffer->size() != footer_len) {
+      throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+    }
+  }
+  auto file_decryption_properties = properties_.file_decryption_properties();
+  if (file_decryption_properties == nullptr) {
+    throw ParquetException(
+        "No decryption properties are provided. Could not read "
+        "encrypted footer metadata");
+  }
+  uint32_t crypto_metadata_len = footer_len;
+  std::shared_ptr file_crypto_metadata =
+      FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
+  // Handle AAD prefix
+  EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
+  std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+  file_decryptor_.reset(new InternalFileDecryptor(
+      file_decryption_properties, file_aad, algo.algorithm,
+      file_crypto_metadata->key_metadata(), properties_.memory_pool()));
+  int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len;
+  uint32_t metadata_len = footer_len - crypto_metadata_len;  // assumes Make left len <= footer_len
+  std::shared_ptr metadata_buffer;
+  PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer));
+  if (metadata_buffer->size() != metadata_len) {
+    throw ParquetException(
+        "Invalid encrypted parquet file. "
+        "Could not read footer metadata bytes.");
+  }
+
+  auto footer_decryptor = file_decryptor_->GetFooterDecryptor();
+  file_metadata_ =
+      FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor);
+}
+
+void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+    FileDecryptionProperties* file_decryption_properties,
+    const std::shared_ptr& metadata_buffer, uint32_t metadata_len,
+    uint32_t read_metadata_len) {
+  // Providing decryption properties in plaintext footer mode is not mandatory, for
+  // example when reading by legacy reader.
+  if (file_decryption_properties != nullptr) {
+    EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
+    // Handle AAD prefix
+    std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+    file_decryptor_.reset(new InternalFileDecryptor(
+        file_decryption_properties, file_aad, algo.algorithm,
+        file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()));
+
+    if (file_decryption_properties->check_plaintext_footer_integrity()) {
+      if (metadata_len - read_metadata_len !=
+          (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
+        throw ParquetException(
+            "Invalid parquet file. Cannot verify plaintext mode footer.");
+      }
+
+      if (!file_metadata_->VerifySignature(file_decryptor_.get(),
+                                           metadata_buffer->data() + read_metadata_len)) {
+        throw ParquetException(
+            "Invalid parquet file. Could not verify plaintext "
+            "footer metadata");
+      }
+    }
+  }
+}
+
+std::string SerializedFile::HandleAadPrefix(
+    FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
+  std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
+  std::string aad_prefix = aad_prefix_in_properties;
+  bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
+  std::string aad_prefix_in_file = algo.aad.aad_prefix;
+
+  if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
+    throw ParquetException(
+        "AAD prefix used for file encryption, "
+        "but not stored in file and not supplied "
+        "in decryption properties");
+  }
+
+  if (file_has_aad_prefix) {
+    if (!aad_prefix_in_properties.empty()) {
+      if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) {
+        throw ParquetException(
+            "AAD Prefix in file and in properties "
+            "is not the same");
+      }
+    }
+    aad_prefix = aad_prefix_in_file;
+    std::shared_ptr aad_prefix_verifier =
+        file_decryption_properties->aad_prefix_verifier();
+    if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
+  } else {
+    if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
+      throw ParquetException(
+          "AAD Prefix set in decryption properties, but was not used "
+          "for file encryption");
+    }
+    std::shared_ptr aad_prefix_verifier =
+        file_decryption_properties->aad_prefix_verifier();
+    if (aad_prefix_verifier != nullptr) {
+      throw ParquetException(
+          "AAD Prefix Verifier is set, but AAD Prefix not found in file");
+    }
+  }
+  return aad_prefix + algo.aad.aad_file_unique;  // file AAD = prefix followed by the unique part
+}
+
 // ----------------------------------------------------------------------
 // ParquetFileReader public API
 
@@ -295,6 +527,7 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) {
   DCHECK(i < metadata()->num_row_groups())
       << "The file only has " << metadata()->num_row_groups()
       << "row groups, requested reader for: " << i;
+
   return contents_->GetRowGroup(i);
 }
 
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index e2121648e97..181a1fb8e99 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -19,12 +19,15 @@
 #include
 #include
+#include
 #include
 
 #include "parquet/column_writer.h"
 #include "parquet/deprecated_io.h"
+#include "parquet/encryption_internal.h"
 #include "parquet/exception.h"
+#include "parquet/internal_file_encryptor.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
@@ -35,9 +38,6 @@ using parquet::schema::GroupNode;
 
 namespace parquet {
 
-// FIXME: copied from reader-internal.cc
-static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'};
-
 // ----------------------------------------------------------------------
 // RowGroupWriter public API
 
@@ -81,16 +81,19 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) {
 class RowGroupSerializer : public RowGroupWriter::Contents {
  public:
   RowGroupSerializer(const std::shared_ptr& sink,
-                     RowGroupMetaDataBuilder* metadata,
-                     const WriterProperties* properties, bool buffered_row_group = false)
+                     RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
+                     const WriterProperties* properties, bool buffered_row_group = false,
+                     InternalFileEncryptor* file_encryptor = nullptr)
       : sink_(sink),
         metadata_(metadata),
         properties_(properties),
         total_bytes_written_(0),
         closed_(false),
+        row_group_ordinal_(row_group_ordinal),
         next_column_index_(0),
         num_rows_(0),
-        buffered_row_group_(buffered_row_group) {
+        buffered_row_group_(buffered_row_group),
+        file_encryptor_(file_encryptor) {  // not owned; null when the file is not encrypted
     if (buffered_row_group) {
       InitColumns();
     } else {
@@ -126,9 +129,16 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
     ++next_column_index_;
 
     const auto& path = col_meta->descr()->path();
+    auto meta_encryptor =
+        file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+                        : nullptr;
+    auto data_encryptor =
+        file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+                        : nullptr;
     std::unique_ptr pager = PageWriter::Open(
         sink_, properties_->compression(path), properties_->compression_level(path),
-        col_meta, properties_->memory_pool());
+        col_meta, row_group_ordinal_, static_cast(next_column_index_ - 1),
+        properties_->memory_pool(), false, meta_encryptor, data_encryptor);
     column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
     return column_writers_[0].get();
   }
@@ -183,7 +193,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
 
       // Ensures all columns have been written
       metadata_->set_num_rows(num_rows_);
-      metadata_->Finish(total_bytes_written_);
+      metadata_->Finish(total_bytes_written_, row_group_ordinal_);
     }
   }
 
@@ -193,9 +203,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
   const WriterProperties* properties_;
   int64_t total_bytes_written_;
   bool closed_;
+  int16_t row_group_ordinal_;
   int next_column_index_;
   mutable int64_t num_rows_;
   bool buffered_row_group_;
+  InternalFileEncryptor* file_encryptor_;
 
   void CheckRowsWritten() const {
     // verify when only one column is written at a time
@@ -223,9 +235,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
     for (int i = 0; i < num_columns(); i++) {
       auto col_meta = metadata_->NextColumnChunk();
       const auto& path = col_meta->descr()->path();
+      auto meta_encryptor =
+          file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+                          : nullptr;
+      auto data_encryptor =
+          file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+                          : nullptr;
       std::unique_ptr pager = PageWriter::Open(
           sink_, properties_->compression(path), properties_->compression_level(path),
-          col_meta, properties_->memory_pool(), buffered_row_group_);
+          col_meta, static_cast(row_group_ordinal_),
+          static_cast<int16_t>(i), properties_->memory_pool(),  // i-th column's own ordinal
+          buffered_row_group_, meta_encryptor, data_encryptor);
       column_writers_.push_back(
           ColumnWriter::Make(col_meta, std::move(pager), properties_));
     }
@@ -265,8 +285,14 @@ class FileSerializer : public ParquetFileWriter::Contents {
       row_group_writer_.reset();
 
       // Write magic bytes and metadata
-      file_metadata_ = metadata_->Finish();
-      WriteFileMetaData(*file_metadata_, sink_.get());
+      auto file_encryption_properties = properties_->file_encryption_properties();
+
+      if (file_encryption_properties == nullptr) {  // Non encrypted file.
+        file_metadata_ = metadata_->Finish();
+        WriteFileMetaData(*file_metadata_, sink_.get());
+      } else {  // Encrypted file
+        CloseEncryptedFile(file_encryption_properties);
+      }
     }
   }
 
@@ -287,7 +313,8 @@ class FileSerializer : public ParquetFileWriter::Contents {
     num_row_groups_++;
     auto rg_metadata = metadata_->AppendRowGroup();
     std::unique_ptr contents(new RowGroupSerializer(
-        sink_, rg_metadata, properties_.get(), buffered_row_group));
+        sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(),
+        buffered_row_group, file_encryptor_.get()));
     row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
     return row_group_writer_.get();
   }
@@ -318,6 +345,36 @@ class FileSerializer : public ParquetFileWriter::Contents {
     StartFile();
   }
 
+  void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
+    // Encrypted file with encrypted footer
+    if (file_encryption_properties->encrypted_footer()) {
+      // encrypted footer
+      file_metadata_ = metadata_->Finish();
+
+      int64_t position = -1;
+      PARQUET_THROW_NOT_OK(sink_->Tell(&position));
+      uint64_t metadata_start = static_cast(position);
+      auto crypto_metadata = metadata_->GetCryptoMetaData();
+      WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
+
+      auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
+      WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
+      PARQUET_THROW_NOT_OK(sink_->Tell(&position));
+      uint32_t footer_and_crypto_len = static_cast(position - metadata_start);
+      PARQUET_THROW_NOT_OK(
+          sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4));
+      PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+    } else {  // Encrypted file with plaintext footer
+      file_metadata_ = metadata_->Finish();
+      auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
+      WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
+                                 false);
+    }
+    if (file_encryptor_) {  // NOTE(review): already dereferenced in both branches above
+      file_encryptor_->WipeOutEncryptionKeys();
+    }
+  }
+
   std::shared_ptr sink_;
   bool is_open_;
   const std::shared_ptr properties_;
@@ -327,9 +384,44 @@ class FileSerializer : public ParquetFileWriter::Contents {
   // Only one of the row group writers is active at a time
   std::unique_ptr row_group_writer_;
 
+  std::unique_ptr file_encryptor_;  // created in StartFile() only for encrypted files
+
   void StartFile() {
-    // Parquet files always start with PAR1
-    PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4));
+    auto file_encryption_properties = properties_->file_encryption_properties();
+    if (file_encryption_properties == nullptr) {
+      // Unencrypted parquet files always start with PAR1
+      PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+    } else {
+      // Check that all columns in columnEncryptionProperties exist in the schema.
+      auto encrypted_columns = file_encryption_properties->encrypted_columns();
+      // if columnEncryptionProperties is empty, every column in file schema will be
+      // encrypted with footer key.
+      if (encrypted_columns.size() != 0) {
+        std::vector column_path_vec;
+        // First, save all column paths in schema.
+        for (int i = 0; i < num_columns(); i++) {
+          column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
+        }
+        // Check if column exists in schema.
+        for (const auto& elem : encrypted_columns) {
+          auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
+          if (it == column_path_vec.end()) {
+            std::stringstream ss;
+            ss << "Encrypted column " + elem.first + " not in file schema";
+            throw ParquetException(ss.str());
+          }
+        }
+      }
+
+      file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
+                                                      properties_->memory_pool()));
+      if (file_encryption_properties->encrypted_footer()) {
+        PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+      } else {
+        // Encrypted file with plaintext footer mode.
+        PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+      }
+    }
   }
 };
 
@@ -366,10 +458,9 @@ std::unique_ptr ParquetFileWriter::Open(
 }
 
 void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+  // Write MetaData
   int64_t position = -1;
   PARQUET_THROW_NOT_OK(sink->Tell(&position));
-
-  // Write MetaData
   uint32_t metadata_len = static_cast(position);
 
   file_metadata.WriteTo(sink);
@@ -378,17 +469,56 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin
 
   // Write Footer
   PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4));
-  PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4));
+  PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+}
+
+void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+  PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+  return WriteFileMetaData(file_metadata, sink);
+}
+
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+                                ArrowOutputStream* sink,
+                                const std::shared_ptr& encryptor,
+                                bool encrypt_footer) {
+  if (encrypt_footer) {  // Encrypted file with encrypted footer
+    // encrypt and write to sink
+    file_metadata.WriteTo(sink, encryptor);  // length and magic are written by the caller
+  } else {  // Encrypted file with plaintext footer mode.
+    int64_t position = -1;
+    PARQUET_THROW_NOT_OK(sink->Tell(&position));
+    uint32_t metadata_len = static_cast(position);
+    file_metadata.WriteTo(sink, encryptor);
+    PARQUET_THROW_NOT_OK(sink->Tell(&position));
+    metadata_len = static_cast(position) - metadata_len;
+
+    PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4));
+    PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+  }
 }
 
-void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) {
+void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink,
+                       const std::shared_ptr& encryptor, bool encrypt_footer) {  // NOTE(review): both unused
   ParquetOutputWrapper wrapper(sink);
   return WriteFileMetaData(file_metadata, &wrapper);
 }
 
-void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
-  PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4));
-  return WriteFileMetaData(file_metadata, sink);
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, OutputStream* sink,
+                                const std::shared_ptr& encryptor,
+                                bool encrypt_footer) {
+  ParquetOutputWrapper wrapper(sink);
+  return WriteEncryptedFileMetadata(file_metadata, &wrapper, encryptor, encrypt_footer);
+}
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+                             ArrowOutputStream* sink) {
+  crypto_metadata.WriteTo(sink);
+}
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+                             OutputStream* sink) {
+  ParquetOutputWrapper wrapper(sink);
+  crypto_metadata.WriteTo(&wrapper);
 }
 
 const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h
index 585e74465e3..693eba1e898 100644
--- a/cpp/src/parquet/file_writer.h
+++ b/cpp/src/parquet/file_writer.h
@@ -31,6 +31,10 @@ namespace parquet {
 class ColumnWriter;
 class OutputStream;
 
+// FIXME: copied from reader-internal.cc
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
+static constexpr uint8_t
kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
+
 class PARQUET_EXPORT RowGroupWriter {
  public:
   // Forward declare a virtual class 'Contents' to aid dependency injection and more
@@ -104,6 +108,23 @@ PARQUET_EXPORT
 void WriteMetaDataFile(const FileMetaData& file_metadata,
                        ::arrow::io::OutputStream* sink);
 
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+                                ArrowOutputStream* sink,
+                                const std::shared_ptr& encryptor,
+                                bool encrypt_footer);
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,  // NOTE(review): no PARQUET_EXPORT, unlike siblings
+                             OutputStream* sink);
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+                                ::arrow::io::OutputStream* sink,
+                                const std::shared_ptr& encryptor = NULLPTR,
+                                bool encrypt_footer = false);
+PARQUET_EXPORT
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+                             ::arrow::io::OutputStream* sink);
+
 class PARQUET_EXPORT ParquetFileWriter {
  public:
   // Forward declare a virtual class 'Contents' to aid dependency injection and more
diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc
new file mode 100644
index 00000000000..328067e7ae9
--- /dev/null
+++ b/cpp/src/parquet/internal_file_decryptor.cc
@@ -0,0 +1,240 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/internal_file_decryptor.h"
+#include "parquet/encryption.h"
+#include "parquet/encryption_internal.h"
+
+namespace parquet {
+
+// Decryptor: pairs an AesDecryptor with the key and AAD it is called with
+Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
+                     const std::string& file_aad, const std::string& aad,
+                     ::arrow::MemoryPool* pool)
+    : aes_decryptor_(aes_decryptor),
+      key_(key),
+      file_aad_(file_aad),
+      aad_(aad),
+      pool_(pool) {}
+
+int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
+
+int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+                       uint8_t* plaintext) {
+  return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
+                                 static_cast(key_.size()), str2bytes(aad_),
+                                 static_cast(aad_.size()), plaintext);
+}
+
+// InternalFileDecryptor: creates and caches the footer and per-column Decryptors of one file
+InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
+                                             const std::string& file_aad,
+                                             ParquetCipher::type algorithm,
+                                             const std::string& footer_key_metadata,
+                                             ::arrow::MemoryPool* pool)
+    : properties_(properties),
+      file_aad_(file_aad),
+      algorithm_(algorithm),
+      footer_key_metadata_(footer_key_metadata),
+      pool_(pool) {
+  if (properties_->is_utilized()) {  // a properties object is accepted for one file only
+    throw ParquetException(
+        "Re-using decryption properties with explicit keys for another file");
+  }
+  properties_->set_utilized();
+}
+
+void InternalFileDecryptor::WipeOutDecryptionKeys() {
+  properties_->WipeOutDecryptionKeys();
+  for (auto const& i : all_decryptors_) {
+    i->WipeOut();
+  }
+}
+
+std::string InternalFileDecryptor::GetFooterKey() {
+  std::string footer_key = properties_->footer_key();
+  // ignore footer key metadata if footer key is explicitly set via API
+  if (footer_key.empty()) {
+    if (footer_key_metadata_.empty())
+      throw ParquetException("No footer key or key metadata");
+    if (properties_->key_retriever() == nullptr)
+      throw ParquetException("No footer key or key retriever");
+    try {
+      footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+    } catch (const KeyAccessDeniedException& e) {
+      std::stringstream ss;
+      ss << "Footer key: access denied " << e.what() << "\n";
+      throw ParquetException(ss.str());
+    }
+  }
+  if (footer_key.empty()) {
+    throw ParquetException(
+        "Footer key unavailable. Could not verify "
+        "plaintext footer metadata");
+  }
+  return footer_key;
+}
+
+std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() {
+  std::string aad = encryption::CreateFooterAad(file_aad_);
+  return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
+    const std::string& aad) {
+  return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData(
+    const std::string& aad) {
+  return GetFooterDecryptor(aad, false);
+}
+
+std::shared_ptr InternalFileDecryptor::GetFooterDecryptor(
+    const std::string& aad, bool metadata) {
+  if (metadata) {  // a cached decryptor is returned as-is; aad is used only on first creation
+    if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
+  } else {
+    if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
+  }
+
+  std::string footer_key = properties_->footer_key();  // same lookup as GetFooterKey()
+  if (footer_key.empty()) {
+    if (footer_key_metadata_.empty())
+      throw ParquetException("No footer key or key metadata");
+    if (properties_->key_retriever() == nullptr)
+      throw ParquetException("No footer key or key retriever");
+    try {
+      footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+    } catch (const KeyAccessDeniedException& e) {
+      std::stringstream ss;
+      ss << "Footer key: access denied " << e.what() << "\n";
+      throw ParquetException(ss.str());
+    }
+  }
+  if (footer_key.empty()) {
+    throw ParquetException(
+        "Invalid footer encryption key. "
+        "Could not parse footer metadata");
+  }
+
+  // Create both data and metadata decryptors to avoid redundant retrieval of key
+  // from the key_retriever.
+  auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
+  auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
+
+  footer_metadata_decryptor_ = std::make_shared(
+      aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
+  footer_data_decryptor_ =
+      std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
+
+  if (metadata) return footer_metadata_decryptor_;
+  return footer_data_decryptor_;
+}
+
+std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor(
+    const std::string& column_path, const std::string& column_key_metadata,
+    const std::string& aad) {
+  return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
+}
+
+std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor(
+    const std::string& column_path, const std::string& column_key_metadata,
+    const std::string& aad) {
+  return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
+}
+
+std::shared_ptr InternalFileDecryptor::GetColumnDecryptor(
+    const std::string& column_path, const std::string& column_key_metadata,
+    const std::string& aad, bool metadata) {
+  std::string column_key;
+  // first look if we already got the decryptor from before
+  if (metadata) {
+    if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+      auto res(column_metadata_map_.at(column_path));
+      res->UpdateAad(aad);  // cached per column path; AAD refreshed on every request
+      return res;
+    }
+  } else {
+    if (column_data_map_.find(column_path) != column_data_map_.end()) {
+      auto res(column_data_map_.at(column_path));
+      res->UpdateAad(aad);
+      return res;
+    }
+  }
+
+  column_key = properties_->column_key(column_path);
+  // No explicit column key given via API. Retrieve via key metadata.
+  if (column_key.empty() && !column_key_metadata.empty() &&
+      properties_->key_retriever() != nullptr) {
+    try {
+      column_key = properties_->key_retriever()->GetKey(column_key_metadata);
+    } catch (const KeyAccessDeniedException& e) {
+      std::stringstream ss;
+      ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
+      throw HiddenColumnException(ss.str());
+    }
+  }
+  if (column_key.empty()) {  // no explicit key and none retrieved: column cannot be read
+    throw HiddenColumnException("HiddenColumnException, path=" + column_path);
+  }
+
+  // Create both data and metadata decryptors to avoid redundant retrieval of key
+  // using the key_retriever.
+  auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
+  auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
+
+  column_metadata_map_[column_path] = std::make_shared(
+      aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
+  column_data_map_[column_path] =
+      std::make_shared(aes_data_decryptor, column_key, file_aad_, aad, pool_);
+
+  if (metadata) return column_metadata_map_[column_path];
+  return column_data_map_[column_path];
+}
+
+int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
+  if (key_len == 16)
+    return 0;
+  else if (key_len == 24)
+    return 1;
+  else if (key_len == 32)
+    return 2;
+  throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
+  int key_len = static_cast(key_size);
+  int index = MapKeyLenToDecryptorArrayIndex(key_len);
+  if (meta_decryptor_[index] == nullptr) {  // created on first use, one per key length
+    meta_decryptor_[index].reset(
+        encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
+  }
+  return meta_decryptor_[index].get();
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
+  int key_len = static_cast(key_size);
+  int index = MapKeyLenToDecryptorArrayIndex(key_len);
+  if (data_decryptor_[index] == nullptr) {  // created on first use, one per key length
+    data_decryptor_[index].reset(
+        encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
+  }
+  return data_decryptor_[index].get();
+}
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h
new file mode 100644
index 00000000000..cdce19647ea
--- /dev/null
+++ b/cpp/src/parquet/internal_file_decryptor.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#ifndef INTERNAL_FILE_DECRYPTOR_H +#define INTERNAL_FILE_DECRYPTOR_H + +#include +#include +#include +#include + +#include "parquet/schema.h" + +namespace parquet { + +namespace encryption { +class AesDecryptor; +class AesEncryptor; +} // namespace encryption + +class FileDecryptionProperties; + +class PARQUET_EXPORT Decryptor { + public: + Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); + + const std::string& file_aad() const { return file_aad_; } + void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } + + int CiphertextSizeDelta(); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); + + private: + encryption::AesDecryptor* aes_decryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; + ::arrow::MemoryPool* pool_; +}; + +class InternalFileDecryptor { + public: + explicit InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool); + + std::string& file_aad() { return file_aad_; } + + std::string GetFooterKey(); + + ParquetCipher::type algorithm() { return algorithm_; } + + std::string& footer_key_metadata() { return footer_key_metadata_; } + + FileDecryptionProperties* properties() { return properties_; } + + void WipeOutDecryptionKeys(); + + ::arrow::MemoryPool* pool() { return pool_; } + + std::shared_ptr GetFooterDecryptor(); + std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); + std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); + std::shared_ptr GetColumnMetaDecryptor( + const std::string& column_path, const std::string& column_key_metadata, + const std::string& aad = ""); + std::shared_ptr GetColumnDataDecryptor( + const std::string& column_path, const std::string& 
column_key_metadata, + const std::string& aad = ""); + + private: + FileDecryptionProperties* properties_; + // Concatenation of aad_prefix (if exists) and aad_file_unique + std::string file_aad_; + std::map> column_data_map_; + std::map> column_metadata_map_; + + std::shared_ptr footer_metadata_decryptor_; + std::shared_ptr footer_data_decryptor_; + ParquetCipher::type algorithm_; + std::string footer_key_metadata_; + std::vector all_decryptors_; + + /// Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_decryptors and data_decryptors. + std::unique_ptr meta_decryptor_[3]; + std::unique_ptr data_decryptor_[3]; + + ::arrow::MemoryPool* pool_; + + std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); + std::shared_ptr GetColumnDecryptor(const std::string& column_path, + const std::string& column_key_metadata, + const std::string& aad, + bool metadata = false); + + encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + + int MapKeyLenToDecryptorArrayIndex(int key_len); +}; + +} // namespace parquet + +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc new file mode 100644 index 00000000000..b634ed677f3 --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/internal_file_encryptor.h" +#include "parquet/encryption.h" +#include "parquet/encryption_internal.h" + +namespace parquet { + +// Encryptor +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_encryptor_(aes_encryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} + +int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } + +int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { + return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), ciphertext); +} + +// InternalFileEncryptor +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties, + ::arrow::MemoryPool* pool) + : properties_(properties), pool_(pool) { + if (properties_->is_utilized()) { + throw ParquetException("Re-using encryption properties for another file"); + } + properties_->set_utilized(); +} + +void InternalFileEncryptor::WipeOutEncryptionKeys() { + properties_->WipeOutEncryptionKeys(); + + for (auto const& i : all_encryptors_) { + i->WipeOut(); + } +} + +std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + if (footer_encryptor_ != nullptr) { + return footer_encryptor_; + } + + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); + 
std::string footer_key = properties_->footer_key(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); + footer_encryptor_ = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_); + return footer_encryptor_; +} + +std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != nullptr) { + return footer_signing_encryptor_; + } + + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); + std::string footer_signing_key = properties_->footer_key(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); + footer_signing_encryptor_ = std::make_shared( + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_); + return footer_signing_encryptor_; +} + +std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( + const std::string& column_path) { + return GetColumnEncryptor(column_path, true); +} + +std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( + const std::string& column_path) { + return GetColumnEncryptor(column_path, false); +} + +std::shared_ptr +InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::string& column_path, bool metadata) { + // first look if we already got the encryptor from before + if (metadata) { + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); + } + } else { + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); + } + } + auto column_prop = properties_->column_encryption_properties(column_path); + if (column_prop == nullptr) { + return nullptr; + } + + std::string key; + if (column_prop->is_encrypted_with_footer_key()) { + key = properties_->footer_key(); + } else { + key = column_prop->key(); + } + + ParquetCipher::type algorithm = 
properties_->algorithm().algorithm; + auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + + std::string file_aad = properties_->file_aad(); + std::shared_ptr encryptor = + std::make_shared(aes_encryptor, key, file_aad, "", pool_); + if (metadata) + column_metadata_map_[column_path] = encryptor; + else + column_data_map_[column_path] = encryptor; + + return encryptor; +} + +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (meta_encryptor_[index] == nullptr) { + meta_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_)); + } + return meta_encryptor_[index].get(); +} + +encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (data_encryptor_[index] == nullptr) { + data_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_)); + } + return data_encryptor_[index].get(); +} + +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h new file mode 100644 index 00000000000..b1ddea02424 --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef INTERNAL_FILE_ENCRYPTOR_H +#define INTERNAL_FILE_ENCRYPTOR_H + +#include +#include +#include +#include + +#include "parquet/encryption.h" +#include "parquet/schema.h" + +namespace parquet { + +namespace encryption { +class AesEncryptor; +} // namespace encryption + +class FileEncryptionProperties; +class ColumnEncryptionProperties; + +class PARQUET_EXPORT Encryptor { + public: + Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); + const std::string& file_aad() { return file_aad_; } + void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } + + int CiphertextSizeDelta(); + int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + + bool EncryptColumnMetaData( + bool encrypted_footer, + const std::shared_ptr& column_encryption_properties) { + // if column is not encrypted then do not encrypt the column metadata + if (!column_encryption_properties || !column_encryption_properties->is_encrypted()) + return false; + // if plaintext footer then encrypt the column metadata + if (!encrypted_footer) return true; + // if column is not encrypted with footer key then encrypt the column metadata + 
return !column_encryption_properties->is_encrypted_with_footer_key(); + } + + private: + encryption::AesEncryptor* aes_encryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; + ::arrow::MemoryPool* pool_; +}; + +class InternalFileEncryptor { + public: + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, + ::arrow::MemoryPool* pool); + + std::shared_ptr GetFooterEncryptor(); + std::shared_ptr GetFooterSigningEncryptor(); + std::shared_ptr GetColumnMetaEncryptor(const std::string& column_path); + std::shared_ptr GetColumnDataEncryptor(const std::string& column_path); + void WipeOutEncryptionKeys(); + + private: + FileEncryptionProperties* properties_; + + std::map> column_data_map_; + std::map> column_metadata_map_; + + std::shared_ptr footer_signing_encryptor_; + std::shared_ptr footer_encryptor_; + + std::vector all_encryptors_; + + // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_encryptors and data_encryptors. 
+ std::unique_ptr meta_encryptor_[3]; + std::unique_ptr data_encryptor_[3]; + + ::arrow::MemoryPool* pool_; + + std::shared_ptr GetColumnEncryptor(const std::string& column_path, + bool metadata); + + encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + + int MapKeyLenToEncryptorArrayIndex(int key_len); +}; + +} // namespace parquet + +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 582d4c895bd..a8e9c48912b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -23,7 +23,10 @@ #include "arrow/util/logging.h" +#include +#include "parquet/encryption_internal.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -122,31 +125,106 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d } // MetaData Accessor + +// ColumnCryptoMetaData +class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { + public: + explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata) + : crypto_metadata_(crypto_metadata) {} + + ~ColumnCryptoMetaDataImpl() {} + + bool encrypted_with_footer_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY; + } + bool encrypted_with_column_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY; + } + std::shared_ptr path_in_schema() const { + return std::make_shared( + crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + } + const std::string& key_metadata() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + } + + private: + const format::ColumnCryptoMetaData* crypto_metadata_; +}; + +std::unique_ptr ColumnCryptoMetaData::Make( + const uint8_t* metadata) { + return std::unique_ptr(new 
ColumnCryptoMetaData(metadata)); +} + +ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) + : impl_(new ColumnCryptoMetaDataImpl( + reinterpret_cast(metadata))) {} + +ColumnCryptoMetaData::~ColumnCryptoMetaData() {} + +std::shared_ptr ColumnCryptoMetaData::path_in_schema() const { + return impl_->path_in_schema(); +} +bool ColumnCryptoMetaData::encrypted_with_footer_key() const { + return impl_->encrypted_with_footer_key(); +} +const std::string& ColumnCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); +} + // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = nullptr) : column_(column), descr_(descr), writer_version_(writer_version) { - const format::ColumnMetaData& meta_data = column->meta_data; - for (auto encoding : meta_data.encodings) { + column_metadata_ = &column->meta_data; + if (column->__isset.crypto_metadata) { // column metadata is encrypted + format::ColumnCryptoMetaData ccmd = column->crypto_metadata; + + if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) { + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + + std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor( + path->ToDotString(), key_metadata, aad_column_metadata); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + 
reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &decrypted_metadata_, decryptor); + column_metadata_ = &decrypted_metadata_; + } else { + throw ParquetException( + "Cannot decrypt ColumnMetadata." + " FileDecryption is not setup correctly"); + } + } + } + for (auto encoding : column_metadata_->encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; } - // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } - // column metadata - inline Type::type type() const { return FromThrift(column_->meta_data.type); } + inline Type::type type() const { return FromThrift(column_metadata_->type); } - inline int64_t num_values() const { return column_->meta_data.num_values; } + inline int64_t num_values() const { return column_metadata_->num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(column_->meta_data.path_in_schema); + return std::make_shared(column_metadata_->path_in_schema); } // Check if statistics are set and are valid @@ -156,12 +234,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!column_->meta_data.__isset.statistics || + if (!column_metadata_->__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(column_->meta_data, descr_); + possible_stats_ = MakeColumnStats(*column_metadata_, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -173,66 +251,79 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } inline Compression::type compression() const { - return FromThrift(column_->meta_data.codec); + return FromThrift(column_metadata_->codec); 
} const std::vector& encodings() const { return encodings_; } inline bool has_dictionary_page() const { - return column_->meta_data.__isset.dictionary_page_offset; + return column_metadata_->__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return column_->meta_data.dictionary_page_offset; + return column_metadata_->dictionary_page_offset; } - inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; } + inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; } inline bool has_index_page() const { - return column_->meta_data.__isset.index_page_offset; + return column_metadata_->__isset.index_page_offset; } - inline int64_t index_page_offset() const { - return column_->meta_data.index_page_offset; - } + inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; } inline int64_t total_compressed_size() const { - return column_->meta_data.total_compressed_size; + return column_metadata_->total_compressed_size; } inline int64_t total_uncompressed_size() const { - return column_->meta_data.total_uncompressed_size; + return column_metadata_->total_uncompressed_size; + } + + inline std::unique_ptr crypto_metadata() const { + if (column_->__isset.crypto_metadata) { + return ColumnCryptoMetaData::Make( + reinterpret_cast(&column_->crypto_metadata)); + } else { + return nullptr; + } } private: mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; + const format::ColumnMetaData* column_metadata_; + format::ColumnMetaData decrypted_metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) { + const ApplicationVersion* writer_version, int16_t row_group_ordinal, + int16_t column_ordinal, InternalFileDecryptor* file_decryptor) { return 
std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, + writer_version, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - writer_version))} {} -ColumnChunkMetaData::~ColumnChunkMetaData() {} + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} +ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); } -// column metadata Type::type ColumnChunkMetaData::type() const { return impl_->type(); } int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); } @@ -281,6 +372,10 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } +std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { + return impl_->crypto_metadata(); +} + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: @@ -295,9 +390,16 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + inline int64_t file_offset() const { return row_group_->file_offset; } + + inline int64_t total_compressed_size() const { + return row_group_->total_compressed_size; + } + inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i) { + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = nullptr) { if (!(i < num_columns())) { 
std::stringstream ss; ss << "The file only has " << num_columns() @@ -305,7 +407,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_); + writer_version_, row_group_ordinal, (int16_t)i, + file_decryptor); } private: @@ -336,8 +439,9 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const { - return impl_->ColumnChunk(i); +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } // file metadata @@ -345,11 +449,12 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get()); + metadata_.get(), decryptor); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -363,6 +468,39 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } + bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature) { + // serialize the footer + uint8_t* serialized_data; + uint32_t serialized_len = metadata_len_; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt with nonce + uint8_t* nonce = const_cast(reinterpret_cast(signature)); + uint8_t* tag = const_cast(reinterpret_cast(signature)) + + encryption::kNonceLength; + + std::string key = 
file_decryptor->GetFooterKey(); + std::string aad = encryption::CreateFooterAad(file_decryptor->file_aad()); + + auto aes_encryptor = encryption::AesEncryptor::Make( + file_decryptor->algorithm(), static_cast(key.size()), true, nullptr); + + std::shared_ptr encrypted_buffer = std::static_pointer_cast( + AllocateBuffer(file_decryptor->pool(), + aes_encryptor->CiphertextSizeDelta() + serialized_len)); + uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( + serialized_data, serialized_len, str2bytes(key), static_cast(key.size()), + str2bytes(aad), static_cast(aad.size()), nonce, + encrypted_buffer->mutable_data()); + // Delete AES encryptor object. It was created only to verify the footer signature. + aes_encryptor->WipeOut(); + delete aes_encryptor; + return 0 == + memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); + } + inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } inline int64_t num_rows() const { return metadata_->num_rows; } @@ -375,11 +513,46 @@ class FileMetaData::FileMetaDataImpl { return static_cast(metadata_->schema.size()); } + inline bool is_encryption_algorithm_set() const { + return metadata_->__isset.encryption_algorithm; + } + inline EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + inline const std::string& footer_signing_key_metadata() { + return metadata_->footer_signing_key_metadata; + } + const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst) const { + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst); + // Only in encrypted files with plaintext footers the + // encryption_algorithm is set in footer + if (is_encryption_algorithm_set()) { + uint8_t* serialized_data; + 
uint32_t serialized_len; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); + + // write unencrypted footer + PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); + // Write signature (nonce and tag) + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + 4, encryption::kNonceLength)); + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength)); + } else { // either plaintext file (when encryptor is null) + // or encrypted file with encrypted footer + serializer.Serialize(metadata_.get(), dst, encryptor); + } } std::unique_ptr RowGroup(int i) { @@ -463,15 +636,18 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len) { +std::shared_ptr FileMetaData::Make( + const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); + return std::shared_ptr( + new FileMetaData(metadata, metadata_len, decryptor)); } -FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor) : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len))} {} + new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -482,6 +658,11 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } +bool FileMetaData::VerifySignature(InternalFileDecryptor* 
file_decryptor, + const void* signature) { + return impl_->VerifySignature(file_decryptor, signature); +} + uint32_t FileMetaData::size() const { return impl_->size(); } int FileMetaData::num_columns() const { return impl_->num_columns(); } @@ -490,6 +671,18 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +bool FileMetaData::is_encryption_algorithm_set() const { + return impl_->is_encryption_algorithm_set(); +} + +EncryptionAlgorithm FileMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileMetaData::footer_signing_key_metadata() const { + return impl_->footer_signing_key_metadata(); +} + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: @@ -523,8 +716,62 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { - return impl_->WriteTo(dst); +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor) const { + return impl_->WriteTo(dst, encryptor); +} + +class FileCryptoMetaData::FileCryptoMetaDataImpl { + public: + FileCryptoMetaDataImpl() {} + + explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) { + metadata_.reset(new format::FileCryptoMetaData); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + metadata_len_ = *metadata_len; + } + + ~FileCryptoMetaDataImpl() {} + + EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + const std::string& key_metadata() { return metadata_->key_metadata; } + void WriteTo(::arrow::io::OutputStream* dst) const { + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); + } + + private: + friend FileMetaDataBuilder; + std::unique_ptr metadata_; + uint32_t metadata_len_; +}; + 
+EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); +} + +std::shared_ptr FileCryptoMetaData::Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len) { + return std::shared_ptr( + new FileCryptoMetaData(serialized_metadata, metadata_len)); +} + +FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, + uint32_t* metadata_len) + : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} + +FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {} + +FileCryptoMetaData::~FileCryptoMetaData() {} + +void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + impl_->WriteTo(dst); } ApplicationVersion::ApplicationVersion(const std::string& application, int major, @@ -663,7 +910,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -678,6 +925,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_data_page_offset(data_page_offset); column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); column_chunk_->meta_data.__set_total_compressed_size(compressed_size); + std::vector thrift_encodings; if (has_dictionary) { thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); @@ -696,6 +944,61 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { 
thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_chunk_->meta_data.__set_encodings(thrift_encodings); + + const auto& encrypt_md = + properties_->column_encryption_properties(column_->path()->ToDotString()); + // column is encrypted + if (encrypt_md != nullptr && encrypt_md->is_encrypted()) { + column_chunk_->__isset.crypto_metadata = true; + format::ColumnCryptoMetaData ccmd; + if (encrypt_md->is_encrypted_with_footer_key()) { + // encrypted with footer key + ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; + ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); + } else { // encrypted with column key + format::EncryptionWithColumnKey eck; + eck.__set_key_metadata(encrypt_md->key_metadata()); + eck.__set_path_in_schema(column_->path()->ToDotVector()); + ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; + ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); + } + column_chunk_->__set_crypto_metadata(ccmd); + + bool encrypted_footer = + properties_->file_encryption_properties()->encrypted_footer(); + bool encrypt_metadata = + !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); + if (encrypt_metadata) { + ThriftSerializer serializer; + // Serialize and encrypt ColumnMetadata separately + // Thrift-serialize the ColumnMetaData structure, + // encrypt it with the column key, and write to encrypted_column_metadata + uint8_t* serialized_data; + uint32_t serialized_len; + + serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len, + &serialized_data); + + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); + + const char* temp = + const_cast(reinterpret_cast(encrypted_data.data())); + std::string encrypted_column_metadata(temp, encrypted_len); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); + + if (encrypted_footer) { + column_chunk_->__isset.meta_data = false; 
+ } else { + // Keep redacted metadata version for old readers + column_chunk_->__isset.meta_data = true; + column_chunk_->meta_data.__isset.statistics = false; + column_chunk_->meta_data.__isset.encoding_stats = false; + } + } + } } void WriteTo(::arrow::io::OutputStream* sink) { @@ -704,10 +1007,14 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } const ColumnDescriptor* descr() const { return column_; } + int64_t total_compressed_size() const { + return column_chunk_->meta_data.total_compressed_size; + } private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; + column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type())); column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector()); column_chunk_->meta_data.__set_codec( @@ -758,9 +1065,11 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, + const std::shared_ptr& encryptor) { impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { @@ -775,6 +1084,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { + return impl_->total_compressed_size(); +} + class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, @@ -801,27 +1114,34 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int current_column() { return next_column_ - 1; } - void Finish(int64_t 
total_bytes_written) { + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { if (!(next_column_ == schema_->num_columns())) { std::stringstream ss; ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns() << " columns are initialized"; throw ParquetException(ss.str()); } - int64_t total_byte_size = 0; + int64_t file_offset = 0; + int64_t total_compressed_size = 0; for (int i = 0; i < schema_->num_columns(); i++) { if (!(row_group_->columns[i].file_offset >= 0)) { std::stringstream ss; ss << "Column " << i << " is not complete."; throw ParquetException(ss.str()); } - total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + if (i == 0) { + file_offset = row_group_->columns[0].file_offset; + } + // sometimes column metadata is encrypted and not available to read, + // so we must get total_compressed_size from column builder + total_compressed_size += column_builders_[i]->total_compressed_size(); } - DCHECK(total_bytes_written == total_byte_size) - << "Total bytes in this RowGroup does not match with compressed sizes of columns"; - row_group_->__set_total_byte_size(total_byte_size); + row_group_->__set_file_offset(file_offset); + row_group_->__set_total_compressed_size(total_compressed_size); + row_group_->__set_total_byte_size(total_bytes_written); + row_group_->__set_ordinal(row_group_ordinal); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -869,8 +1189,9 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { - impl_->Finish(total_bytes_written); +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, + int16_t row_group_ordinal) { + impl_->Finish(total_bytes_written, row_group_ordinal); } // file metadata @@ -882,6 +1203,10 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), 
key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); + if (props->file_encryption_properties() != nullptr && + props->file_encryption_properties()->encrypted_footer()) { + crypto_metadata_.reset(new format::FileCryptoMetaData()); + } } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -937,6 +1262,25 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; + // if plaintext footer, set footer signing algorithm + auto file_encryption_properties = properties_->file_encryption_properties(); + if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { + EncryptionAlgorithm signing_algorithm; + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); + signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix; + signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + + metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm)); + const std::string& footer_signing_key_metadata = + file_encryption_properties->footer_key_metadata(); + if (footer_signing_key_metadata.size() > 0) { + metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); + } + } + parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), &metadata_->schema); @@ -947,8 +1291,31 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } + std::unique_ptr BuildFileCryptoMetaData() { + if (crypto_metadata_ == nullptr) { + return nullptr; + } + + auto file_encryption_properties = properties_->file_encryption_properties(); + + crypto_metadata_->__set_encryption_algorithm( + ToThrift(file_encryption_properties->algorithm())); + std::string key_metadata = 
file_encryption_properties->footer_key_metadata(); + + if (!key_metadata.empty()) { + crypto_metadata_->__set_key_metadata(key_metadata); + } + + std::unique_ptr file_crypto_metadata = + std::unique_ptr(new FileCryptoMetaData()); + file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_); + + return file_crypto_metadata; + } + protected: std::unique_ptr metadata_; + std::unique_ptr crypto_metadata_; private: const std::shared_ptr properties_; @@ -980,4 +1347,8 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { + return impl_->BuildFileCryptoMetaData(); +} + } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 7df4f6d9e2a..68ba73f82cc 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -19,6 +19,7 @@ #define PARQUET_FILE_METADATA_H #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/schema.h" #include "parquet/types.h" namespace parquet { @@ -36,6 +38,12 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; +class FileCryptoMetaData; +class InternalFileDecryptor; +class Decryptor; +class Encryptor; +class FooterSigningEncryptor; + namespace schema { class ColumnPath; @@ -97,12 +105,29 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = SortOrder::SIGNED) const; }; +class PARQUET_EXPORT ColumnCryptoMetaData { + public: + static std::unique_ptr Make(const uint8_t* metadata); + ~ColumnCryptoMetaData(); + + std::shared_ptr path_in_schema() const; + bool encrypted_with_footer_key() const; + const std::string& key_metadata() const; + + private: + explicit ColumnCryptoMetaData(const uint8_t* metadata); + + class ColumnCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + class PARQUET_EXPORT ColumnChunkMetaData { 
public: // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -113,6 +138,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::string& file_path() const; // column metadata + bool is_metadata_set() const; Type::type type() const; int64_t num_values() const; std::shared_ptr path_in_schema() const; @@ -127,10 +153,13 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; + std::unique_ptr crypto_metadata() const; private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version = NULLPTR, + InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -151,7 +180,10 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i) const; + + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -166,11 +198,18 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len); + + static std::shared_ptr Make( + const void* serialized_metadata, 
uint32_t* metadata_len, + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); + /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted + /// (plaintext footer). + /// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata + bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature); + // file metadata uint32_t size() const; @@ -183,10 +222,14 @@ class PARQUET_EXPORT FileMetaData { const std::string& created_by() const; int num_schema_elements() const; std::unique_ptr RowGroup(int i) const; - const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst) const; + bool is_encryption_algorithm_set() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; + + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -201,7 +244,9 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); + + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = NULLPTR); // PIMPL Idiom FileMetaData(); @@ -209,6 +254,28 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +class PARQUET_EXPORT FileCryptoMetaData { + public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + ~FileCryptoMetaData(); + + EncryptionAlgorithm encryption_algorithm() const; + const std::string& key_metadata() const; + + void WriteTo(::arrow::io::OutputStream* dst) const; + + private: + friend FileMetaDataBuilder; + FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + + // PIMPL 
Idiom + FileCryptoMetaData(); + class FileCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { public: @@ -229,11 +296,15 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + + int64_t total_compressed_size() const; // commit the metadata + void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback); + bool dictionary_fallback, + const std::shared_ptr& encryptor = NULLPTR); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; @@ -268,7 +339,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { void set_num_rows(int64_t num_rows); // commit the metadata - void Finish(int64_t total_bytes_written); + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1); private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, @@ -293,6 +364,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); + // crypto metadata + std::unique_ptr GetCryptoMetaData(); + private: explicit FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a834be8b211..0aae6dcd183 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -22,10 +22,12 @@ #include #include #include +#include #include "arrow/type.h" #include "arrow/util/compression.h" +#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -64,10 +66,20 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } + void 
file_decryption_properties( + const std::shared_ptr& decryption) { + file_decryption_properties_ = decryption; + } + + FileDecryptionProperties* file_decryption_properties() { + return file_decryption_properties_.get(); + } + private: MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + std::shared_ptr file_decryption_properties_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -329,6 +341,12 @@ class PARQUET_EXPORT WriterProperties { return this->compression_level(path->ToDotString(), compression_level); } + Builder* encryption( + const std::shared_ptr& file_encryption_properties) { + file_encryption_properties_ = file_encryption_properties; + return this; + } + Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); return this; @@ -376,10 +394,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_properties_), + default_column_properties_, column_properties)); } private: @@ -391,6 +409,8 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; std::string created_by_; + std::shared_ptr file_encryption_properties_; + // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; std::unordered_map encodings_; @@ -461,11 +481,26 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } + inline FileEncryptionProperties* file_encryption_properties() const { + return 
file_encryption_properties_.get(); + } + + std::shared_ptr column_encryption_properties( + const std::string& path) const { + if (file_encryption_properties_) { + return file_encryption_properties_->column_encryption_properties(path); + } else { + return NULLPTR; + } + } + private: explicit WriterProperties( MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, - const std::string& created_by, const ColumnProperties& default_column_properties, + const std::string& created_by, + std::shared_ptr file_encryption_properties, + const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), @@ -474,6 +509,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), + file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -484,6 +520,9 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + + std::shared_ptr file_encryption_properties_; + ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; diff --git a/cpp/src/parquet/test_encryption_util.h b/cpp/src/parquet/test_encryption_util.h new file mode 100644 index 00000000000..e430246eb27 --- /dev/null +++ b/cpp/src/parquet/test_encryption_util.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/testing/util.h" + +#include "parquet/column_page.h" +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/encoding.h" +#include "parquet/platform.h" +#include "parquet/test_util.h" + +namespace parquet { +namespace test { + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +inline std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 9886503216c..5b76e687ee6 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -28,6 +28,7 @@ #include #endif #include +#include // TCompactProtocol requires some #defines to work right. 
#define SIGNED_RIGHT_SHIFT_IS 1 @@ -41,9 +42,13 @@ #include #include "arrow/util/logging.h" + #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/statistics.h" +#include "parquet/types.h" #include "parquet/parquet_types.h" // IYWU pragma: export @@ -77,6 +82,31 @@ static inline Encoding::type FromThrift(format::Encoding::type type) { return static_cast(type); } +static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { + return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix}; +} + +static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { + return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix}; +} + +static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + EncryptionAlgorithm encryption_algorithm; + + if (encryption.__isset.AES_GCM_V1) { + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); + } else if (encryption.__isset.AES_GCM_CTR_V1) { + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); + } else { + throw ParquetException("Unsupported algorithm"); + } + return encryption_algorithm; +} + static inline format::Type::type ToThrift(Type::type type) { return static_cast(type); } @@ -167,16 +197,46 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { return statistics; } +static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { + format::AesGcmV1 aesGcmV1; + // aad_file_unique is always set + aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + } + return aesGcmV1; +} + +static inline 
format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { + format::AesGcmCtrV1 aesGcmCtrV1; + // aad_file_unique is always set + aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + } + return aesGcmCtrV1; +} + +static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { + format::EncryptionAlgorithm encryption_algorithm; + if (encryption.algorithm == ParquetCipher::AES_GCM_V1) { + encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad)); + } else { + encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad)); + } + return encryption_algorithm; +} + // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; -// Deserialize a thrift message from buf/len. buf/len must at least contain -// all the bytes needed to store the thrift message. On return, len will be -// set to the actual length of the header. template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { +inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, + T* deserialized_msg) { // Deserialize msg bytes into c++ thrift msg using memory transport. shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -194,6 +254,35 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = *len - bytes_left; } +// Deserialize a thrift message from buf/len. buf/len must at least contain +// all the bytes needed to store the thrift message. On return, len will be +// set to the actual length of the header. 
+template +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + const std::shared_ptr& decryptor = NULLPTR) { + // thrift message is not encrypted + if (decryptor == NULLPTR) { + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); + } else { // thrift message is encrypted + uint32_t clen; + clen = *len; + // decrypt + std::shared_ptr decrypted_buffer = + std::static_pointer_cast(AllocateBuffer( + decryptor->pool(), + static_cast(clen - decryptor->CiphertextSizeDelta()))); + const uint8_t* cipher_buf = buf; + uint32_t decrypted_buffer_len = + decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data()); + if (decrypted_buffer_len <= 0) { + throw ParquetException("Couldn't decrypt buffer\n"); + } + *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); + DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, + deserialized_msg); + } +} + /// Utility class to serialize thrift objects to a binary format. This object /// should be reused if possible to reuse the underlying memory. 
/// Note: thrift will encode NULLs into the serialized buffer so it is not valid @@ -222,12 +311,19 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); - return static_cast(out_length); + + // obj is not encrypted + if (encryptor == NULLPTR) { + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); + } else { // obj is encrypted + return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); + } } private: @@ -243,6 +339,20 @@ class ThriftSerializer { } } + int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, + uint32_t out_length, + const std::shared_ptr& encryptor) { + std::shared_ptr cipher_buffer = + std::static_pointer_cast(AllocateBuffer( + encryptor->pool(), + static_cast(encryptor->CiphertextSizeDelta() + out_length))); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); + + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } + shared_ptr mem_buffer_; shared_ptr protocol_; }; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 0d3b3ba492c..ebb8c2446e3 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -466,10 +466,21 @@ std::unique_ptr GetCodec(Compression::type codec); PARQUET_EXPORT std::unique_ptr GetCodec(Compression::type codec, int compression_level); -struct Encryption { +struct ParquetCipher { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct AadMetadata { + std::string aad_prefix; + std::string aad_file_unique; + bool supply_aad_prefix; +}; + +struct EncryptionAlgorithm { + ParquetCipher::type algorithm; + AadMetadata 
aad; +}; + // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh index 142e6e0b377..9330075cb43 100755 --- a/dev/gen_apidocs/create_documents.sh +++ b/dev/gen_apidocs/create_documents.sh @@ -59,6 +59,7 @@ cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DARROW_PYTHON=ON \ -DARROW_PLASMA=ON \ -DARROW_PARQUET=ON \ + -DPARQUET_REQUIRE_ENCRYPTION=ON \ -DARROW_ORC=ON \ -DARROW_BUILD_TESTS=OFF \ -GNinja \ diff --git a/dev/lint/run_iwyu.sh b/dev/lint/run_iwyu.sh index 46517cfa91c..a4040b08a10 100755 --- a/dev/lint/run_iwyu.sh +++ b/dev/lint/run_iwyu.sh @@ -26,6 +26,7 @@ cmake -GNinja \ -DARROW_FLIGHT=ON \ -DARROW_GANDIVA=ON \ -DARROW_PARQUET=ON \ + -DPARQUET_REQUIRE_ENCRYPTION=ON \ -DARROW_PYTHON=ON \ -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0' \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index b88fcc33e17..5bfae7ba714 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -226,6 +226,7 @@ ${ARROW_CMAKE_OPTIONS:-} -DARROW_PYTHON=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON +-DPARQUET_REQUIRE_ENCRYPTION=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON diff --git a/docs/source/developers/cpp.rst b/docs/source/developers/cpp.rst index c188e258312..6522208c65e 100644 --- a/docs/source/developers/cpp.rst +++ b/docs/source/developers/cpp.rst @@ -940,7 +940,9 @@ Apache Parquet Development ========================== To build the C++ libraries for Apache Parquet, add the flag -``-DARROW_PARQUET=ON`` when invoking CMake. The Parquet libraries and unit tests +``-DARROW_PARQUET=ON`` when invoking CMake. +To build Apache Parquet with encryption support, add the flag +``-DPARQUET_REQUIRE_ENCRYPTION=ON`` when invoking CMake. 
The Parquet libraries and unit tests can be built with the ``parquet`` make target: .. code-block:: shell diff --git a/r/configure.win b/r/configure.win index b7bb0ff8fc9..44dbb4c5f7c 100644 --- a/r/configure.win +++ b/r/configure.win @@ -38,9 +38,10 @@ fi # R version, e.g. if the R build is a patch release, so find what the dir is # actually called: RWINLIB="../windows/$(ls windows/ | grep ^arrow-)" +OPENSSL_LIBS="-lcrypto -lcrypt32" PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" -PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lws2_32" +PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" echo "*** Writing Makevars.win" sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars.win