From 17bd705a26f2fb724b6822ff6cc980c44080ca8c Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Fri, 20 Sep 2024 16:30:51 +0800 Subject: [PATCH 01/18] GH-43535: [C++] support the AWS S3 SSE-C encryption --- cpp/src/arrow/filesystem/s3_internal.h | 40 +++++++++++++++++++ cpp/src/arrow/filesystem/s3fs.cc | 40 +++++++++++++------ cpp/src/arrow/filesystem/s3fs.h | 3 ++ cpp/src/arrow/filesystem/s3fs_test.cc | 55 ++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index 54da3d5987e..dcffdc57398 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -29,11 +30,13 @@ #include #include #include +#include #include #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/s3fs.h" #include "arrow/status.h" +#include "arrow/util/base64.h" #include "arrow/util/logging.h" #include "arrow/util/print.h" #include "arrow/util/string.h" @@ -291,6 +294,43 @@ class ConnectRetryStrategy : public Aws::Client::RetryStrategy { int32_t max_retry_duration_; }; +/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 encoded) +/// \param sse_customer_key is the input sse key +/// \return the base64 encoded MD5 for the input key +inline Result CalculateSSECustomerKeyMD5( + const std::string& sse_customer_key) { + // the key needs to be 256 bits (32 bytes) according to + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption + if (sse_customer_key.length() != 32) { + return Status::Invalid("32 bytes sse-c key is expected"); + } + + // Convert the raw binary key to an Aws::String + Aws::String sse_customer_key_aws_string(sse_customer_key.data(), + sse_customer_key.length()); + + // Compute the MD5 hash of the raw binary key + Aws::Utils::ByteBuffer sse_customer_key_md5 = + Aws::Utils::HashingUtils::CalculateMD5(sse_customer_key_aws_string); + + // Base64-encode the MD5 hash + return arrow::util::base64_encode(std::string_view( + reinterpret_cast(sse_customer_key_md5.GetUnderlyingData()), + sse_customer_key_md5.GetLength())); +} + +template +Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer_key) { + if (sse_customer_key.empty()) { + return Status::OK(); // do nothing if the sse_customer_key is not configured + } + ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); + request.SetSSECustomerKeyMD5(md5); + request.SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); + request.SetSSECustomerAlgorithm("AES256"); + return Status::OK(); +} + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 13d6ead6ef6..39303c4cf21 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -160,6 +160,7 @@ using internal::IsNotFound; using internal::OutcomeToResult; using internal::OutcomeToStatus; using internal::S3Backend; +using internal::SetSSECustomerKey; using internal::ToAwsString; using internal::ToURLEncodedAwsString; @@ -439,7 +440,8 @@ bool S3Options::Equals(const S3Options& other) const { background_writes == other.background_writes && allow_bucket_creation == other.allow_bucket_creation && allow_bucket_deletion == other.allow_bucket_deletion && - default_metadata_equals && GetAccessKey() == other.GetAccessKey() && + sse_customer_key == other.sse_customer_key && default_metadata_equals && + GetAccessKey() == other.GetAccessKey() && GetSecretKey() == other.GetSecretKey() && GetSessionToken() == other.GetSessionToken()); } @@ -1292,11 +1294,14 @@ Aws::IOStreamFactory AwsWriteableStreamFactory(void* data, int64_t nbytes) { } Result GetObjectRange(Aws::S3::S3Client* client, - const S3Path& path, int64_t start, - int64_t length, void* out) { + const S3Path& path, + const std::string& sse_customer_key, + int64_t start, int64_t length, + void* out) { S3Model::GetObjectRequest req; req.SetBucket(ToAwsString(path.bucket)); req.SetKey(ToAwsString(path.key)); + RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key)); req.SetRange(ToAwsString(FormatRange(start, length))); req.SetResponseStreamFactory(AwsWriteableStreamFactory(out, length)); return OutcomeToResult("GetObject", client->GetObject(req)); @@ -1433,11 +1438,13 @@ bool IsDirectory(std::string_view key, const S3Model::HeadObjectResult& result) class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile(std::shared_ptr holder, const io::IOContext& io_context, - const S3Path& path, int64_t size = kNoSize) + const S3Path& path, int64_t size = kNoSize, + const std::string& sse_customer_key = "") : holder_(std::move(holder)), io_context_(io_context), path_(path), - content_length_(size) {} + content_length_(size), + sse_customer_key_(sse_customer_key) {} Status Init() { // Issue a HEAD Object to get the content-length and ensure any @@ -1450,6 +1457,7 @@ class ObjectInputFile final : public io::RandomAccessFile { S3Model::HeadObjectRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); + RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); auto outcome = client_lock.Move()->HeadObject(req); @@ -1534,9 +1542,9 @@ class ObjectInputFile final : public io::RandomAccessFile { // Read the desired range of bytes ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); - ARROW_ASSIGN_OR_RAISE( - S3Model::GetObjectResult result, - GetObjectRange(client_lock.get(), path_, position, nbytes, out)); + ARROW_ASSIGN_OR_RAISE(S3Model::GetObjectResult result, + GetObjectRange(client_lock.get(), path_, sse_customer_key_, + position, nbytes, out)); auto& stream = result.GetBody(); stream.ignore(nbytes); @@ -1584,6 +1592,7 @@ class ObjectInputFile final : public io::RandomAccessFile { int64_t pos_ = 0; int64_t content_length_ = kNoSize; std::shared_ptr metadata_; + std::string sse_customer_key_; }; // Upload size per part. While AWS and Minio support different sizes for each @@ -1620,7 +1629,8 @@ class ObjectOutputStream final : public io::OutputStream { metadata_(metadata), default_metadata_(options.default_metadata), background_writes_(options.background_writes), - allow_delayed_open_(options.allow_delayed_open) {} + allow_delayed_open_(options.allow_delayed_open), + sse_customer_key_(options.sse_customer_key) {} ~ObjectOutputStream() override { // For compliance with the rest of the IO stack, Close rather than Abort, @@ -1668,6 +1678,7 @@ class ObjectOutputStream final : public io::OutputStream { S3Model::CreateMultipartUploadRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); + RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); RETURN_NOT_OK(SetMetadataInRequest(&req)); auto outcome = client_lock.Move()->CreateMultipartUpload(req); @@ -1769,6 +1780,7 @@ class ObjectOutputStream final : public io::OutputStream { S3Model::CompleteMultipartUploadRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); + RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); req.SetUploadId(multipart_upload_id_); req.SetMultipartUpload(std::move(completed_upload)); @@ -1958,6 +1970,7 @@ class ObjectOutputStream final : public io::OutputStream { req.SetKey(ToAwsString(path_.key)); req.SetBody(std::make_shared(data, nbytes)); req.SetContentLength(nbytes); + RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); if (!background_writes_) { req.SetBody(std::make_shared(data, nbytes)); @@ -2171,6 +2184,7 @@ class ObjectOutputStream final : public io::OutputStream { Future<> pending_uploads_completed = Future<>::MakeFinished(Status::OK()); }; std::shared_ptr upload_state_; + std::string sse_customer_key_; }; // This function assumes info->path() is already set @@ -2338,6 +2352,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_this(holder_, fs->io_context(), path); + auto ptr = std::make_shared(holder_, fs->io_context(), path, kNoSize, + fs->options().sse_customer_key); RETURN_NOT_OK(ptr->Init()); return ptr; } @@ -3002,8 +3018,8 @@ class S3FileSystem::Impl : public std::enable_shared_from_this(holder_, fs->io_context(), path, info.size()); + auto ptr = std::make_shared( + holder_, fs->io_context(), path, info.size(), fs->options().sse_customer_key); RETURN_NOT_OK(ptr->Init()); return ptr; } diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 85d5ff8fed5..46aa5430ed9 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -196,6 +196,9 @@ struct ARROW_EXPORT S3Options { /// delay between retries. std::shared_ptr retry_strategy; + /// the SSE-C customized key (raw 32 bytes key). + std::string sse_customer_key; + S3Options(); /// Configure with the default AWS credentials provider chain. diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 43091aaa986..121af8ca33e 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -80,6 +80,7 @@ using ::arrow::internal::ToChars; using ::arrow::internal::Zip; using ::arrow::util::UriEscape; +using ::arrow::fs::internal::CalculateSSECustomerKeyMD5; using ::arrow::fs::internal::ConnectRetryStrategy; using ::arrow::fs::internal::ErrorToStatus; using ::arrow::fs::internal::OutcomeToStatus; @@ -530,9 +531,9 @@ class TestS3FS : public S3TestMixin { } Result> MakeNewFileSystem( - io::IOContext io_context = io::default_io_context()) { + io::IOContext io_context = io::default_io_context(), bool use_https = false) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "http"; + options_.scheme = use_https ? "https" : "http"; options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared(); @@ -540,7 +541,9 @@ class TestS3FS : public S3TestMixin { return S3FileSystem::Make(options_, io_context); } - void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); } + void MakeFileSystem(bool use_https = false) { + ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context(), use_https)); + } template void AssertMetadataRoundtrip(const std::string& path, @@ -1298,6 +1301,36 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +TEST_F(TestS3FS, SSECustomerKeyMatch) { + // normal write/read with correct SSEC key + std::shared_ptr stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(true); // need to use https, otherwise get 'InvalidRequest Message: + // Requests specifying Server Side Encryption with Customer + // provided keys must be made over a secure connection.' + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4)); + AssertBufferEqual(*buf, "some"); + ASSERT_OK(RestoreTestBucket()); +} + +TEST_F(TestS3FS, SSECustomerKeyMismatch) { + std::shared_ptr stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(true); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + + options_.sse_customer_key = "87654321876543218765432187654321"; + MakeFileSystem(true); + ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK(RestoreTestBucket()); +} + struct S3OptionsTestParameters { bool background_writes{false}; bool allow_delayed_open{false}; @@ -1593,5 +1626,21 @@ TEST(S3GlobalOptions, DefaultsLogLevel) { } } +TEST(CalculateSSECustomerKeyMD5, Sanity) { + ASSERT_RAISES(Invalid, CalculateSSECustomerKeyMD5("")); // invalid length + ASSERT_RAISES(Invalid, + CalculateSSECustomerKeyMD5( + "1234567890123456789012345678901234567890")); // invalid length + // valid case, with some non-ASCII character and a null byte in the sse_customer_key + char sse_customer_key[32] = {}; + sse_customer_key[0] = '\x40'; // '@' character + sse_customer_key[1] = '\0'; // null byte + sse_customer_key[2] = '\xFF'; // non-ASCII + sse_customer_key[31] = '\xFA'; // non-ASCII + std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); + ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) + ASSERT_STREQ(md5.c_str(), "97FTa6lj0hE7lshKdBy61g=="); // valid case +} + } // namespace fs } // namespace arrow From 3c7a583f4f95b43c773738d1dd349e8750d0ac02 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Wed, 25 Sep 2024 23:52:56 -0400 Subject: [PATCH 02/18] support to start the minio test server with https, and default use https --- cpp/src/arrow/filesystem/s3_test_cert.h | 76 ++++++++++++++++++++++++ cpp/src/arrow/filesystem/s3_test_util.cc | 49 +++++++++++++-- cpp/src/arrow/filesystem/s3_test_util.h | 3 +- cpp/src/arrow/filesystem/s3fs_test.cc | 19 +++--- 4 files changed, 132 insertions(+), 15 deletions(-) create mode 100644 cpp/src/arrow/filesystem/s3_test_cert.h diff --git a/cpp/src/arrow/filesystem/s3_test_cert.h b/cpp/src/arrow/filesystem/s3_test_cert.h new file mode 100644 index 00000000000..852bfec701f --- /dev/null +++ b/cpp/src/arrow/filesystem/s3_test_cert.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace arrow { +namespace fs { + +const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- +MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCqwKYHsTSciGqP +uU3qkTWpnXIi3iC0eeW7JSzJHGFs880WdR5JdK4WufPK+1xzgiYjMEPfAcuSWz3b +qYyCI61q+a9Iu2nj7cFTW9bfZrmWlnI0YOLJc+q0AAdAjF1lvRKenH8tbjz/2jyl +i/cYQ+I5Tg4nngrX8OmOfluNzwD/nwGLq6/DVbzDUdPI9q1XtVT/0Vf7qwbDG1HD +NkIzKT5B+YdSLaOCRYNK3x7RPsfazKIBrTmRy1v454wKe8TjTmTB7+m5wKqfCJcq +lI253WHcK0lsw6zCNtX/kahPAvm/8mniPolW4qxoD6xwebgMVkrNTs3ztcPIG9O4 +pmCbATijAgMBAAECggEACL5swiAU7Z8etdVrZAOjl9f0LEzrp9JGLVst++50Hrwt +WGUO8/wBnjBPh6lvhoq3oT2rfBP/dLMva7w28cMZ8kxu6W6PcZiPOdGOI0qDXm69 +0mjTtDU3Y5hMxsVpUvhnp6+j45Otk/x89o1ATgHL59tTZjv1mjFABIf78DsVdgF9 +CMi2q6Lv7NLftieyWmz1K3p109z9+xkDNSOkVrv1JFChviKqWgIS0rdFjySvTgoy +rHYT+TweDliKJrZCeoUJmNB0uVW/dM9lXhcvkvkJZKPPurylx1oH5a7K/sWFPf7A +Ed1vjvZQFlaXu/bOUUSOZtkErAir/oCxrUDsHxGsAQKBgQDZghyy7jNGNdjZe1Xs +On1ZVgIS3Nt+OLGCVH7tTsfZsCOb+SkrhB1RQva3YzPMfgoZScI9+bN/pRVf49Pj +qGEHkW/wozutUve7UMzeTOm1aWxUuaKSrmYST7muvAnlYEtO7agd0wrcusYXlMoG +KQwghkufO9I7wXcrudMKXZalIwKBgQDI+FaUwhgfThkgq6bRbdMEeosgohrCM9Wm +E5JMePQq4VaGcgGveWUoNOgT8kvJa0qQwQOqLZj7kUIdj+SCRt0u+Wu3p5IMqdOq +6tMnLNQ3wzUC2KGFLSfISR3L/bo5Bo6Jqz4hVtjMk3PV9bu50MNTNaofYb2xlf/f +/WgiEG0WgQKBgAr8RVLMMQ7EvXUOg6Jwuc//Rg+J1BQl7OE2P0rhBbr66HGCPhAS +liB6j1dnzT/wxbXNQeA7clNqFRBIw3TmFjB5qfuvYt44KIbvZ8l6fPtKncwRrCJY +aJNYL3qhyKYrHOKZojoPZKcNT9/1BdcVz6T842jhbpbSCKDOu9f0Lh2dAoGATZeM +Hh0eISAPFY0QeDV1znnds3jC6g4HQ/q0dnAQnWmo9XmY6v3sr2xV2jWnSxnwjRjo +aFD4itBXfYBr0ly30wYbr6mz+s2q2oeVhL+LJAhrNDEdk4SOooaQSY0p1BCTAdYq +w8Z7J+kaRRZ+J0zRzROgHkOncKQgSYPWK6i55YECgYAC+ECrHhUlPsfusjKpFsEe +stW1HCt3wXtKQn6SJ6IAesbxwALZS6Da/ZC2x1mdBHS3GwWvtGLc0BPnPVfJjr9V +m82qkgJ+p5d7qp7pRA7SFD+5809yVqRnEF3rSLafgGet9ah0ZjZvQ3fwnYZNnNH9 +t9pJcv2E5xY7/nFNIorpKg== +-----END PRIVATE KEY----- +)"; + +const char* kMinioPublicCert = R"(-----BEGIN CERTIFICATE----- +MIIDiTCCAnGgAwIBAgIUXbHZ6FAhKSXg4WSGUQySlSyE4U0wDQYJKoZIhvcNAQEL +BQAwXzELMAkGA1UEBhMCVVMxCzAJBgNVBAgMAlZBMQ4wDAYDVQQHDAVBcnJvdzEO +MAwGA1UECgwFQXJyb3cxDjAMBgNVBAsMBUFycm93MRMwEQYDVQQDDApBcnJyb3dU +ZXN0MB4XDTI0MDkyNDA5MzUxNloXDTM0MDkyMjA5MzUxNlowXzELMAkGA1UEBhMC +VVMxCzAJBgNVBAgMAlZBMQ4wDAYDVQQHDAVBcnJvdzEOMAwGA1UECgwFQXJyb3cx +DjAMBgNVBAsMBUFycm93MRMwEQYDVQQDDApBcnJyb3dUZXN0MIIBIjANBgkqhkiG +9w0BAQEFAAOCAQ8AMIIBCgKCAQEAqsCmB7E0nIhqj7lN6pE1qZ1yIt4gtHnluyUs +yRxhbPPNFnUeSXSuFrnzyvtcc4ImIzBD3wHLkls926mMgiOtavmvSLtp4+3BU1vW +32a5lpZyNGDiyXPqtAAHQIxdZb0Snpx/LW48/9o8pYv3GEPiOU4OJ54K1/Dpjn5b +jc8A/58Bi6uvw1W8w1HTyPatV7VU/9FX+6sGwxtRwzZCMyk+QfmHUi2jgkWDSt8e +0T7H2syiAa05kctb+OeMCnvE405kwe/pucCqnwiXKpSNud1h3CtJbMOswjbV/5Go +TwL5v/Jp4j6JVuKsaA+scHm4DFZKzU7N87XDyBvTuKZgmwE4owIDAQABoz0wOzAa +BgNVHREEEzARhwR/AAABgglsb2NhbGhvc3QwHQYDVR0OBBYEFOUNqUSfROf1dz3o +hAVBhgd3UIvKMA0GCSqGSIb3DQEBCwUAA4IBAQBSwWJ2dSw3jlHU0l2V3ozqthTt +XFo07AyWGw8AWNCM6mQ+GKBf0JJ1d7e4lyTf2lCobknS94EgGPORWeiucKYAoCjS +dh1eKGsSevz1rNbp7wsO7DoiRPciK+S95DbsPowloGI6fvOeE12Cf1udeNIpEYWs +OBFwN0HxfYqdPALCtw7l0icpTrJ2Us06UfL9kbkdZwQhXvOscG7JDRtNjBxl9XNm +TFeMNKROmrEPCWaYr6MJ+ItHtb5Cawapea4THz9GCjR9eLq2CbMqLezZ8xBHPzc4 +ixI2l0uCfg7ZUSA+90yaScc7bhEQ8CMiPtJgNKaKIqB58DpY7028xJpW7Ma2 +-----END CERTIFICATE----- +)"; +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index db0c60f2e80..59602d47404 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -19,6 +19,7 @@ # include #endif +#include "arrow/filesystem/s3_test_cert.h" #include "arrow/filesystem/s3_test_util.h" #include "arrow/filesystem/s3fs.h" #include "arrow/testing/process.h" @@ -31,6 +32,11 @@ namespace arrow { namespace fs { +using ::arrow::internal::FileClose; +using ::arrow::internal::FileDescriptor; +using ::arrow::internal::FileOpenWritable; +using ::arrow::internal::FileWrite; +using ::arrow::internal::PlatformFilename; using ::arrow::internal::TemporaryDir; namespace { @@ -50,6 +56,7 @@ std::string GenerateConnectString() { return GetListenAddress(); } struct MinioTestServer::Impl { std::unique_ptr temp_dir_; + std::unique_ptr temp_dir_ca_; std::string connect_string_; std::string access_key_ = kMinioAccessKey; std::string secret_key_ = kMinioSecretKey; @@ -69,7 +76,28 @@ std::string MinioTestServer::access_key() const { return impl_->access_key_; } std::string MinioTestServer::secret_key() const { return impl_->secret_key_; } -Status MinioTestServer::Start() { +void MinioTestServer::GenerateCertificateFile() { + PlatformFilename public_crt_file, private_key_file; + ASSERT_OK_AND_ASSIGN(public_crt_file, + PlatformFilename::FromString( + impl_->temp_dir_ca_->path().ToString() + "/public.crt")); + ASSERT_OK_AND_ASSIGN(FileDescriptor public_cert_fd, FileOpenWritable(public_crt_file)); + ASSERT_OK(FileWrite(public_cert_fd.fd(), + reinterpret_cast(kMinioPublicCert), + strlen(kMinioPublicCert))); + ASSERT_OK(public_cert_fd.Close()); + + ASSERT_OK_AND_ASSIGN(private_key_file, + PlatformFilename::FromString( + impl_->temp_dir_ca_->path().ToString() + "/private.key")); + ASSERT_OK_AND_ASSIGN(FileDescriptor private_key_fd, FileOpenWritable(private_key_file)); + ASSERT_OK(FileWrite(private_key_fd.fd(), + reinterpret_cast(kMinioPrivateKey), + strlen(kMinioPrivateKey))); + ASSERT_OK(private_key_fd.Close()); +} + +Status MinioTestServer::Start(bool enable_tls) { const char* connect_str = std::getenv(kEnvConnectString); const char* access_key = std::getenv(kEnvAccessKey); const char* secret_key = std::getenv(kEnvSecretKey); @@ -91,9 +119,22 @@ Status MinioTestServer::Start() { impl_->connect_string_ = GenerateConnectString(); ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); // NOTE: --quiet makes startup faster by suppressing remote version check - impl_->server_process_->SetArgs({"server", "--quiet", "--compat", "--address", - impl_->connect_string_, - impl_->temp_dir_->path().ToString()}); + std::vector start_args = {"server", "--quiet", "--compat", "--address", + impl_->connect_string_}; + if (enable_tls) { + // create the dedicated folder for certificate file, rather than reuse the data + // folder, since there is case to check whether the folder is empty. + ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca")); + GenerateCertificateFile(); + start_args.push_back("--certs-dir"); + start_args.push_back(impl_->temp_dir_ca_->path().ToString()); + arrow::fs::FileSystemGlobalOptions global_options; + global_options.tls_ca_dir_path = impl_->temp_dir_ca_->path().ToString(); + ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); + } + // apply the data path at the end since some minio version only support it at the end + start_args.push_back(impl_->temp_dir_->path().ToString()); + impl_->server_process_->SetArgs(start_args); ARROW_RETURN_NOT_OK(impl_->server_process_->Execute()); return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index e270a6e1c46..19824deb2a3 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -40,7 +40,7 @@ class MinioTestServer { MinioTestServer(); ~MinioTestServer(); - Status Start(); + Status Start(bool enable_tls = true); Status Stop(); @@ -51,6 +51,7 @@ class MinioTestServer { std::string secret_key() const; private: + void GenerateCertificateFile(); struct Impl; std::unique_ptr impl_; }; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 121af8ca33e..0d9ce61baef 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -206,7 +206,8 @@ class S3TestMixin : public AwsTestMixin { ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv()->GetOneServer()); client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); - client_config_->scheme = Aws::Http::Scheme::HTTP; + client_config_->scheme = Aws::Http::Scheme::HTTPS; + client_config_->verifySSL = false; client_config_->retryStrategy = std::make_shared(kRetryInterval, kMaxRetryDuration); credentials_ = {ToAwsString(minio_->access_key()), ToAwsString(minio_->secret_key())}; @@ -531,7 +532,7 @@ class TestS3FS : public S3TestMixin { } Result> MakeNewFileSystem( - io::IOContext io_context = io::default_io_context(), bool use_https = false) { + io::IOContext io_context = io::default_io_context(), bool use_https = true) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); options_.scheme = use_https ? "https" : "http"; options_.endpoint_override = minio_->connect_string(); @@ -541,7 +542,7 @@ class TestS3FS : public S3TestMixin { return S3FileSystem::Make(options_, io_context); } - void MakeFileSystem(bool use_https = false) { + void MakeFileSystem(bool use_https = true) { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context(), use_https)); } @@ -1305,9 +1306,7 @@ TEST_F(TestS3FS, SSECustomerKeyMatch) { // normal write/read with correct SSEC key std::shared_ptr stream; options_.sse_customer_key = "12345678123456781234567812345678"; - MakeFileSystem(true); // need to use https, otherwise get 'InvalidRequest Message: - // Requests specifying Server Side Encryption with Customer - // provided keys must be made over a secure connection.' + MakeFileSystem(); ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); ASSERT_OK(stream->Write("some")); ASSERT_OK(stream->Close()); @@ -1320,7 +1319,7 @@ TEST_F(TestS3FS, SSECustomerKeyMatch) { TEST_F(TestS3FS, SSECustomerKeyMismatch) { std::shared_ptr stream; options_.sse_customer_key = "12345678123456781234567812345678"; - MakeFileSystem(true); + MakeFileSystem(); ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); ASSERT_OK(stream->Write("some")); ASSERT_OK(stream->Close()); @@ -1453,7 +1452,7 @@ TEST_F(TestS3FS, FileSystemFromUri) { std::stringstream ss; ss << "s3://" << minio_->access_key() << ":" << minio_->secret_key() << "@bucket/somedir/subdir/subfile" - << "?scheme=http&endpoint_override=" << UriEscape(minio_->connect_string()); + << "?scheme=https&endpoint_override=" << UriEscape(minio_->connect_string()); std::string path; ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path)); @@ -1555,7 +1554,7 @@ class TestS3FSGeneric : public S3TestMixin, public GenericFileSystemTest { } options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "http"; + options_.scheme = "https"; options_.endpoint_override = minio_->connect_string(); options_.retry_strategy = std::make_shared(); ASSERT_OK_AND_ASSIGN(s3fs_, S3FileSystem::Make(options_)); @@ -1639,7 +1638,7 @@ TEST(CalculateSSECustomerKeyMD5, Sanity) { sse_customer_key[31] = '\xFA'; // non-ASCII std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) - ASSERT_STREQ(md5.c_str(), "97FTa6lj0hE7lshKdBy61g=="); // valid case + ASSERT_EQ(md5, "97FTa6lj0hE7lshKdBy61g=="); // valid case } } // namespace fs From e668e62703e3a22548cb93c05943716bba4a26c6 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Fri, 27 Sep 2024 03:12:21 -0400 Subject: [PATCH 03/18] always start the minio server with tls for ut --- cpp/src/arrow/filesystem/s3_internal.h | 1 - cpp/src/arrow/filesystem/s3_test_cert.h | 6 +++- cpp/src/arrow/filesystem/s3_test_util.cc | 41 ++++++++++++------------ cpp/src/arrow/filesystem/s3_test_util.h | 4 ++- cpp/src/arrow/filesystem/s3fs_test.cc | 12 +++---- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index dcffdc57398..85425b12601 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include #include diff --git a/cpp/src/arrow/filesystem/s3_test_cert.h b/cpp/src/arrow/filesystem/s3_test_cert.h index 852bfec701f..59b8d83c5ab 100644 --- a/cpp/src/arrow/filesystem/s3_test_cert.h +++ b/cpp/src/arrow/filesystem/s3_test_cert.h @@ -20,6 +20,10 @@ namespace arrow { namespace fs { +// The below two static strings are generated according to +// https://github.com/minio/minio/tree/RELEASE.2024-09-22T00-33-43Z/docs/tls#323-generate-a-self-signed-certificate +// `openssl req -new -x509 -nodes -days 36500 -keyout private.key -out public.crt -config +// openssl.conf` const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCqwKYHsTSciGqP uU3qkTWpnXIi3iC0eeW7JSzJHGFs880WdR5JdK4WufPK+1xzgiYjMEPfAcuSWz3b @@ -50,7 +54,7 @@ t9pJcv2E5xY7/nFNIorpKg== -----END PRIVATE KEY----- )"; -const char* kMinioPublicCert = R"(-----BEGIN CERTIFICATE----- +const char* kMinioCert = R"(-----BEGIN CERTIFICATE----- MIIDiTCCAnGgAwIBAgIUXbHZ6FAhKSXg4WSGUQySlSyE4U0wDQYJKoZIhvcNAQEL BQAwXzELMAkGA1UEBhMCVVMxCzAJBgNVBAgMAlZBMQ4wDAYDVQQHDAVBcnJvdzEO MAwGA1UECgwFQXJyb3cxDjAMBgNVBAsMBUFycm93MRMwEQYDVQQDDApBcnJyb3dU diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 59602d47404..760962c72c9 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -76,28 +76,30 @@ std::string MinioTestServer::access_key() const { return impl_->access_key_; } std::string MinioTestServer::secret_key() const { return impl_->secret_key_; } +std::string MinioTestServer::ca_path() const { + return impl_->temp_dir_ca_->path().ToString(); +} + void MinioTestServer::GenerateCertificateFile() { - PlatformFilename public_crt_file, private_key_file; - ASSERT_OK_AND_ASSIGN(public_crt_file, + ASSERT_OK_AND_ASSIGN(auto public_crt_file, PlatformFilename::FromString( impl_->temp_dir_ca_->path().ToString() + "/public.crt")); - ASSERT_OK_AND_ASSIGN(FileDescriptor public_cert_fd, FileOpenWritable(public_crt_file)); - ASSERT_OK(FileWrite(public_cert_fd.fd(), - reinterpret_cast(kMinioPublicCert), - strlen(kMinioPublicCert))); + ASSERT_OK_AND_ASSIGN(auto public_cert_fd, FileOpenWritable(public_crt_file)); + ASSERT_OK(FileWrite(public_cert_fd.fd(), reinterpret_cast(kMinioCert), + strlen(kMinioCert))); ASSERT_OK(public_cert_fd.Close()); - ASSERT_OK_AND_ASSIGN(private_key_file, + ASSERT_OK_AND_ASSIGN(auto private_key_file, PlatformFilename::FromString( impl_->temp_dir_ca_->path().ToString() + "/private.key")); - ASSERT_OK_AND_ASSIGN(FileDescriptor private_key_fd, FileOpenWritable(private_key_file)); + ASSERT_OK_AND_ASSIGN(auto private_key_fd, FileOpenWritable(private_key_file)); ASSERT_OK(FileWrite(private_key_fd.fd(), reinterpret_cast(kMinioPrivateKey), strlen(kMinioPrivateKey))); ASSERT_OK(private_key_fd.Close()); } -Status MinioTestServer::Start(bool enable_tls) { +Status MinioTestServer::Start() { const char* connect_str = std::getenv(kEnvConnectString); const char* access_key = std::getenv(kEnvAccessKey); const char* secret_key = std::getenv(kEnvSecretKey); @@ -121,17 +123,16 @@ Status MinioTestServer::Start(bool enable_tls) { // NOTE: --quiet makes startup faster by suppressing remote version check std::vector start_args = {"server", "--quiet", "--compat", "--address", impl_->connect_string_}; - if (enable_tls) { - // create the dedicated folder for certificate file, rather than reuse the data - // folder, since there is case to check whether the folder is empty. - ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca")); - GenerateCertificateFile(); - start_args.push_back("--certs-dir"); - start_args.push_back(impl_->temp_dir_ca_->path().ToString()); - arrow::fs::FileSystemGlobalOptions global_options; - global_options.tls_ca_dir_path = impl_->temp_dir_ca_->path().ToString(); - ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); - } + + // create the dedicated folder for certificate file, rather than reuse the data + // folder, since there is test case to check whether the folder is empty. + ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); + GenerateCertificateFile(); + start_args.push_back("--certs-dir"); + start_args.push_back(ca_path()); + arrow::fs::FileSystemGlobalOptions global_options; + global_options.tls_ca_dir_path = ca_path(); + ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); // apply the data path at the end since some minio version only support it at the end start_args.push_back(impl_->temp_dir_->path().ToString()); impl_->server_process_->SetArgs(start_args); diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index 19824deb2a3..4b35c9ba488 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -40,7 +40,7 @@ class MinioTestServer { MinioTestServer(); ~MinioTestServer(); - Status Start(bool enable_tls = true); + Status Start(); Status Stop(); @@ -50,6 +50,8 @@ class MinioTestServer { std::string secret_key() const; + std::string ca_path() const; + private: void GenerateCertificateFile(); struct Impl; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 0d9ce61baef..d8649a659de 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -207,7 +207,7 @@ class S3TestMixin : public AwsTestMixin { client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); client_config_->scheme = Aws::Http::Scheme::HTTPS; - client_config_->verifySSL = false; + client_config_->caPath = ToAwsString(minio_->ca_path()); client_config_->retryStrategy = std::make_shared(kRetryInterval, kMaxRetryDuration); credentials_ = {ToAwsString(minio_->access_key()), ToAwsString(minio_->secret_key())}; @@ -532,9 +532,9 @@ class TestS3FS : public S3TestMixin { } Result> MakeNewFileSystem( - io::IOContext io_context = io::default_io_context(), bool use_https = true) { + io::IOContext io_context = io::default_io_context()) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = use_https ? "https" : "http"; + options_.scheme = "https"; options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared(); @@ -542,8 +542,8 @@ class TestS3FS : public S3TestMixin { return S3FileSystem::Make(options_, io_context); } - void MakeFileSystem(bool use_https = true) { - ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context(), use_https)); + void MakeFileSystem() { + ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context())); } template @@ -1325,7 +1325,7 @@ TEST_F(TestS3FS, SSECustomerKeyMismatch) { ASSERT_OK(stream->Close()); options_.sse_customer_key = "87654321876543218765432187654321"; - MakeFileSystem(true); + MakeFileSystem(); ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); ASSERT_OK(RestoreTestBucket()); } From d51a77fd467ed8f3ec3deb638783f37e1684a377 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Fri, 27 Sep 2024 05:54:29 -0400 Subject: [PATCH 04/18] switch the benchmark case also to https --- cpp/src/arrow/filesystem/s3_test_util.cc | 21 +++++++-------------- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 5 +++-- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 760962c72c9..9985aca778b 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -82,16 +82,14 @@ std::string MinioTestServer::ca_path() const { void MinioTestServer::GenerateCertificateFile() { ASSERT_OK_AND_ASSIGN(auto public_crt_file, - PlatformFilename::FromString( - impl_->temp_dir_ca_->path().ToString() + "/public.crt")); + PlatformFilename::FromString(ca_path() + "/public.crt")); ASSERT_OK_AND_ASSIGN(auto public_cert_fd, FileOpenWritable(public_crt_file)); ASSERT_OK(FileWrite(public_cert_fd.fd(), reinterpret_cast(kMinioCert), strlen(kMinioCert))); ASSERT_OK(public_cert_fd.Close()); ASSERT_OK_AND_ASSIGN(auto private_key_file, - PlatformFilename::FromString( - impl_->temp_dir_ca_->path().ToString() + "/private.key")); + PlatformFilename::FromString(ca_path() + "/private.key")); ASSERT_OK_AND_ASSIGN(auto private_key_fd, FileOpenWritable(private_key_file)); ASSERT_OK(FileWrite(private_key_fd.fd(), reinterpret_cast(kMinioPrivateKey), @@ -119,23 +117,18 @@ Status MinioTestServer::Start() { // Disable the embedded console (one less listening address to care about) impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); impl_->connect_string_ = GenerateConnectString(); - ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); - // NOTE: --quiet makes startup faster by suppressing remote version check - std::vector start_args = {"server", "--quiet", "--compat", "--address", - impl_->connect_string_}; - // create the dedicated folder for certificate file, rather than reuse the data // folder, since there is test case to check whether the folder is empty. ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); GenerateCertificateFile(); - start_args.push_back("--certs-dir"); - start_args.push_back(ca_path()); arrow::fs::FileSystemGlobalOptions global_options; global_options.tls_ca_dir_path = ca_path(); ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); - // apply the data path at the end since some minio version only support it at the end - start_args.push_back(impl_->temp_dir_->path().ToString()); - impl_->server_process_->SetArgs(start_args); + ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); + // NOTE: --quiet makes startup faster by suppressing remote version check + impl_->server_process_->SetArgs({"server", "--quiet", "--compat", "--certs-dir", + ca_path(), "--address", impl_->connect_string_, + impl_->temp_dir_->path().ToString()}); ARROW_RETURN_NOT_OK(impl_->server_process_->Execute()); return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 21216429639..f4e4b4101ce 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -81,7 +81,8 @@ class MinioFixture : public benchmark::Fixture { } client_config_.endpointOverride = ToAwsString(minio_->connect_string()); - client_config_.scheme = Aws::Http::Scheme::HTTP; + client_config_.scheme = Aws::Http::Scheme::HTTPS; + client_config_.caPath = ToAwsString(minio_->ca_path()); if (!region_.empty()) { client_config_.region = ToAwsString(region_); } @@ -110,7 +111,7 @@ class MinioFixture : public benchmark::Fixture { void MakeFileSystem() { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "http"; + options_.scheme = "https"; if (!region_.empty()) { options_.region = region_; } From d83f0b29663c3dc9c7525f15b3a9b6371f1f7507 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Sat, 28 Sep 2024 22:53:03 -0400 Subject: [PATCH 05/18] install the self-signed certificate for windows, as there is no interface to set the ca for windows --- cpp/src/arrow/filesystem/s3_test_util.cc | 58 ++++++++++++++-------- cpp/src/arrow/filesystem/s3_test_util.h | 2 +- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 6 +++ cpp/src/arrow/filesystem/s3fs_test.cc | 6 +++ 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 9985aca778b..4617bb6f9d9 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -80,21 +80,41 @@ std::string MinioTestServer::ca_path() const { return impl_->temp_dir_ca_->path().ToString(); } -void MinioTestServer::GenerateCertificateFile() { - ASSERT_OK_AND_ASSIGN(auto public_crt_file, - PlatformFilename::FromString(ca_path() + "/public.crt")); - ASSERT_OK_AND_ASSIGN(auto public_cert_fd, FileOpenWritable(public_crt_file)); - ASSERT_OK(FileWrite(public_cert_fd.fd(), reinterpret_cast(kMinioCert), - strlen(kMinioCert))); - ASSERT_OK(public_cert_fd.Close()); - - ASSERT_OK_AND_ASSIGN(auto private_key_file, - PlatformFilename::FromString(ca_path() + "/private.key")); - ASSERT_OK_AND_ASSIGN(auto private_key_fd, FileOpenWritable(private_key_file)); - ASSERT_OK(FileWrite(private_key_fd.fd(), - reinterpret_cast(kMinioPrivateKey), - strlen(kMinioPrivateKey))); - ASSERT_OK(private_key_fd.Close()); +Status MinioTestServer::GenerateCertificateFile() { + // create the dedicated folder for certificate file, rather than reuse the data + // folder, since there is test case to check whether the folder is empty. + ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); + + ARROW_ASSIGN_OR_RAISE(auto public_crt_file, + PlatformFilename::FromString(ca_path() + "/public.crt")); + ARROW_ASSIGN_OR_RAISE(auto public_cert_fd, FileOpenWritable(public_crt_file)); + ARROW_RETURN_NOT_OK(FileWrite(public_cert_fd.fd(), + reinterpret_cast(kMinioCert), + strlen(kMinioCert))); + ARROW_RETURN_NOT_OK(public_cert_fd.Close()); + + ARROW_ASSIGN_OR_RAISE(auto private_key_file, + PlatformFilename::FromString(ca_path() + "/private.key")); + ARROW_ASSIGN_OR_RAISE(auto private_key_fd, FileOpenWritable(private_key_file)); + ARROW_RETURN_NOT_OK(FileWrite(private_key_fd.fd(), + reinterpret_cast(kMinioPrivateKey), + strlen(kMinioPrivateKey))); + ARROW_RETURN_NOT_OK(private_key_fd.Close()); + + // Set the trusted CA certificate +#if defined(__linux__) + arrow::fs::FileSystemGlobalOptions global_options; + global_options.tls_ca_dir_path = ca_path(); + ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); +#elif defined(_WIN32) + // Windows does not have a standard location for CA certificates + auto import_cert_process = std::make_unique(); + ARROW_RETURN_NOT_OK(import_cert_process->SetExecutable("certutil")); + import_cert_process->SetArgs( + {"-addstore", "-f", "ArrowTest", public_crt_file.ToString()}); + ARROW_RETURN_NOT_OK(import_cert_process->Execute()); +#endif + return Status::OK(); } Status MinioTestServer::Start() { @@ -117,13 +137,7 @@ Status MinioTestServer::Start() { // Disable the embedded console (one less listening address to care about) impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); impl_->connect_string_ = GenerateConnectString(); - // create the dedicated folder for certificate file, rather than reuse the data - // folder, since there is test case to check whether the folder is empty. - ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); - GenerateCertificateFile(); - arrow::fs::FileSystemGlobalOptions global_options; - global_options.tls_ca_dir_path = ca_path(); - ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); + ARROW_RETURN_NOT_OK(GenerateCertificateFile()); ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); // NOTE: --quiet makes startup faster by suppressing remote version check impl_->server_process_->SetArgs({"server", "--quiet", "--compat", "--certs-dir", diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index 4b35c9ba488..6c1a1675c29 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -53,7 +53,7 @@ class MinioTestServer { std::string ca_path() const; private: - void GenerateCertificateFile(); + Status GenerateCertificateFile(); struct Impl; std::unique_ptr impl_; }; diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index f4e4b4101ce..d5e2e78d61d 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -82,7 +82,13 @@ class MinioFixture : public benchmark::Fixture { client_config_.endpointOverride = ToAwsString(minio_->connect_string()); client_config_.scheme = Aws::Http::Scheme::HTTPS; +// The caPath only take effect on linux according to the AWS SDK documentation +// https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/client-config.html +#if defined(__linux__) client_config_.caPath = ToAwsString(minio_->ca_path()); +#else + client_config_.verifySSL = false; +#endif if (!region_.empty()) { client_config_.region = ToAwsString(region_); } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index d8649a659de..619163a7420 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -207,7 +207,13 @@ class S3TestMixin : public AwsTestMixin { client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); client_config_->scheme = Aws::Http::Scheme::HTTPS; +// The caPath only take effect on linux according to the AWS SDK documentation +// https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/client-config.html +#if defined(__linux__) client_config_->caPath = ToAwsString(minio_->ca_path()); +#else + client_config_->verifySSL = false; +#endif client_config_->retryStrategy = std::make_shared(kRetryInterval, kMaxRetryDuration); credentials_ = {ToAwsString(minio_->access_key()), ToAwsString(minio_->secret_key())}; From b1fba0c92b7dd598912d2609b55bd4bae0ab2b2a Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Tue, 8 Oct 2024 05:05:54 -0400 Subject: [PATCH 06/18] according to https://github.com/aws/aws-sdk-cpp/commit/7989e6f3f11e1dabb81f495466522b3d82f91ca7, CompleteMultipartUploadRequest add the SSEC related function after 1.9.201 --- cpp/src/arrow/filesystem/s3_internal.h | 3 +++ cpp/src/arrow/filesystem/s3fs.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index 85425b12601..3c2e94c1196 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -323,6 +323,9 @@ Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer if (sse_customer_key.empty()) { return Status::OK(); // do nothing if the sse_customer_key is not configured } +#ifndef ARROW_S3_SUPPORT_SSEC + return Status::NotImplemented("SSE-C is not supported"); +#endif ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); request.SetSSECustomerKeyMD5(md5); request.SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 39303c4cf21..12cfce50f0d 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -105,6 +105,10 @@ # define ARROW_S3_HAS_S3CLIENT_CONFIGURATION #endif +#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 201) +# define ARROW_S3_SUPPORT_SSEC +#endif + #ifdef ARROW_S3_HAS_CRT # include # include From 8235c87471526d166838be0c599c393971cda72c Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Wed, 9 Oct 2024 05:31:14 -0400 Subject: [PATCH 07/18] expose the tls option on the s3options,only start the minio with tls in linux as the minio with mingw could NOT work well --- cpp/src/arrow/filesystem/filesystem.h | 3 ++ cpp/src/arrow/filesystem/s3_internal.h | 7 +++-- cpp/src/arrow/filesystem/s3_test_util.cc | 29 +++++++++--------- cpp/src/arrow/filesystem/s3_test_util.h | 6 ++++ cpp/src/arrow/filesystem/s3fs.cc | 23 +++++++++++++-- cpp/src/arrow/filesystem/s3fs.h | 14 +++++++++ cpp/src/arrow/filesystem/s3fs_benchmark.cc | 16 +++++----- cpp/src/arrow/filesystem/s3fs_test.cc | 34 +++++++++++++++------- 8 files changed, 93 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index d4f62f86a74..4ee5d8bbb15 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -710,6 +710,9 @@ struct FileSystemGlobalOptions { /// /// If empty, the underlying TLS library's defaults will be used. std::string tls_ca_dir_path; + + /// Controls whether to verify TLS certificates. Defaults to true. + bool tls_verify_certificates = true; }; /// EXPERIMENTAL: optional global initialization routine diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index 3c2e94c1196..fc4b4288ae6 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -323,14 +323,15 @@ Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer if (sse_customer_key.empty()) { return Status::OK(); // do nothing if the sse_customer_key is not configured } -#ifndef ARROW_S3_SUPPORT_SSEC - return Status::NotImplemented("SSE-C is not supported"); -#endif +#ifdef ARROW_S3_SUPPORT_SSEC ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); request.SetSSECustomerKeyMD5(md5); request.SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); request.SetSSECustomerAlgorithm("AES256"); return Status::OK(); +#else + return Status::NotImplemented("SSE-C is not supported"); +#endif } } // namespace internal diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 4617bb6f9d9..8b181df9ffe 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -61,6 +61,7 @@ struct MinioTestServer::Impl { std::string access_key_ = kMinioAccessKey; std::string secret_key_ = kMinioSecretKey; std::unique_ptr server_process_; + std::string scheme_ = "http"; }; MinioTestServer::MinioTestServer() : impl_(new Impl) {} @@ -80,6 +81,8 @@ std::string MinioTestServer::ca_path() const { return impl_->temp_dir_ca_->path().ToString(); } +std::string MinioTestServer::scheme() const { return impl_->scheme_; } + Status MinioTestServer::GenerateCertificateFile() { // create the dedicated folder for certificate file, rather than reuse the data // folder, since there is test case to check whether the folder is empty. @@ -101,19 +104,10 @@ Status MinioTestServer::GenerateCertificateFile() { strlen(kMinioPrivateKey))); ARROW_RETURN_NOT_OK(private_key_fd.Close()); - // Set the trusted CA certificate -#if defined(__linux__) arrow::fs::FileSystemGlobalOptions global_options; - global_options.tls_ca_dir_path = ca_path(); + global_options.tls_verify_certificates = false; ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); -#elif defined(_WIN32) - // Windows does not have a standard location for CA certificates - auto import_cert_process = std::make_unique(); - ARROW_RETURN_NOT_OK(import_cert_process->SetExecutable("certutil")); - import_cert_process->SetArgs( - {"-addstore", "-f", "ArrowTest", public_crt_file.ToString()}); - ARROW_RETURN_NOT_OK(import_cert_process->Execute()); -#endif + return Status::OK(); } @@ -137,12 +131,19 @@ Status MinioTestServer::Start() { // Disable the embedded console (one less listening address to care about) impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); impl_->connect_string_ = GenerateConnectString(); + std::vector minio_args({"server", "--quiet", "--compat", "--address", + impl_->connect_string_, + impl_->temp_dir_->path().ToString()}); +#ifdef MINIO_SERVER_WITH_TLS ARROW_RETURN_NOT_OK(GenerateCertificateFile()); + minio_args.push_back("--certs-dir"); + minio_args.push_back(ca_path()); + impl_->scheme_ = "https"; +#endif // MINIO_SERVER_WITH_TLS + ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); // NOTE: --quiet makes startup faster by suppressing remote version check - impl_->server_process_->SetArgs({"server", "--quiet", "--compat", "--certs-dir", - ca_path(), "--address", impl_->connect_string_, - impl_->temp_dir_->path().ToString()}); + impl_->server_process_->SetArgs(minio_args); ARROW_RETURN_NOT_OK(impl_->server_process_->Execute()); return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index 6c1a1675c29..0dfdcf9d783 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -30,6 +30,10 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" +#if defined(__linux__) +# define MINIO_SERVER_WITH_TLS +#endif // Linux + namespace arrow { namespace fs { @@ -52,6 +56,8 @@ class MinioTestServer { std::string ca_path() const; + std::string scheme() const; + private: Status GenerateCertificateFile(); struct Impl; diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 12cfce50f0d..353b48329a4 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -408,6 +408,13 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { } else if (kv.first == "allow_bucket_deletion") { ARROW_ASSIGN_OR_RAISE(options.allow_bucket_deletion, ::arrow::internal::ParseBoolean(kv.second)); + } else if (kv.first == "tls_ca_file_path") { + options.tls_ca_file_path = kv.second; + } else if (kv.first == "tls_ca_dir_path") { + options.tls_ca_dir_path = kv.second; + } else if (kv.first == "tls_verify_certificates") { + ARROW_ASSIGN_OR_RAISE(options.tls_verify_certificates, + ::arrow::internal::ParseBoolean(kv.second)); } else { return Status::Invalid("Unexpected query parameter in S3 URI: '", kv.first, "'"); } @@ -444,6 +451,9 @@ bool S3Options::Equals(const S3Options& other) const { background_writes == other.background_writes && allow_bucket_creation == other.allow_bucket_creation && allow_bucket_deletion == other.allow_bucket_deletion && + tls_ca_file_path == other.tls_ca_file_path && + tls_ca_dir_path == other.tls_ca_dir_path && + tls_verify_certificates == other.tls_verify_certificates && sse_customer_key == other.sse_customer_key && default_metadata_equals && GetAccessKey() == other.GetAccessKey() && GetSecretKey() == other.GetSecretKey() && @@ -1131,12 +1141,21 @@ class ClientBuilder { } else { client_config_.retryStrategy = std::make_shared(); } - if (!internal::global_options.tls_ca_file_path.empty()) { + if (!options_.tls_ca_file_path.empty()) { + client_config_.caFile = ToAwsString(options_.tls_ca_file_path); + } else if (!internal::global_options.tls_ca_file_path.empty()) { client_config_.caFile = ToAwsString(internal::global_options.tls_ca_file_path); } - if (!internal::global_options.tls_ca_dir_path.empty()) { + if (!options_.tls_ca_dir_path.empty()) { + client_config_.caPath = ToAwsString(options_.tls_ca_dir_path); + } else if (!internal::global_options.tls_ca_dir_path.empty()) { client_config_.caPath = ToAwsString(internal::global_options.tls_ca_dir_path); } + if (!options_.tls_verify_certificates) { + client_config_.verifySSL = options_.tls_verify_certificates; + } else if (!internal::global_options.tls_verify_certificates) { + client_config_.verifySSL = internal::global_options.tls_verify_certificates; + } // Set proxy options if provided if (!options_.proxy_options.scheme.empty()) { diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 46aa5430ed9..a1c0a98a800 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -199,6 +199,20 @@ struct ARROW_EXPORT S3Options { /// the SSE-C customized key (raw 32 bytes key). std::string sse_customer_key; + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_file_path; + + /// Path to a directory holding TLS CA certificates in individual PEM files + /// named along the OpenSSL "hashed" format. + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_dir_path; + + /// Controls whether to verify TLS certificates. Defaults to true. + bool tls_verify_certificates = true; + S3Options(); /// Configure with the default AWS credentials provider chain. diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index d5e2e78d61d..9dd2bf0228b 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -81,14 +81,12 @@ class MinioFixture : public benchmark::Fixture { } client_config_.endpointOverride = ToAwsString(minio_->connect_string()); - client_config_.scheme = Aws::Http::Scheme::HTTPS; -// The caPath only take effect on linux according to the AWS SDK documentation -// https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/client-config.html -#if defined(__linux__) - client_config_.caPath = ToAwsString(minio_->ca_path()); -#else - client_config_.verifySSL = false; -#endif + if (minio_->scheme() == "https") { + client_config_.scheme = Aws::Http::Scheme::HTTPS; + client_config_.verifySSL = false; + } else { + client_config_.scheme = Aws::Http::Scheme::HTTP; + } if (!region_.empty()) { client_config_.region = ToAwsString(region_); } @@ -117,7 +115,7 @@ class MinioFixture : public benchmark::Fixture { void MakeFileSystem() { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "https"; + options_.scheme = minio_->scheme(); if (!region_.empty()) { options_.region = region_; } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 619163a7420..ce4b9781b9b 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -206,14 +206,12 @@ class S3TestMixin : public AwsTestMixin { ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv()->GetOneServer()); client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); - client_config_->scheme = Aws::Http::Scheme::HTTPS; -// The caPath only take effect on linux according to the AWS SDK documentation -// https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/client-config.html -#if defined(__linux__) - client_config_->caPath = ToAwsString(minio_->ca_path()); -#else - client_config_->verifySSL = false; -#endif + if (minio_->scheme() == "https") { + client_config_->scheme = Aws::Http::Scheme::HTTPS; + client_config_->verifySSL = false; + } else { + client_config_->scheme = Aws::Http::Scheme::HTTP; + } client_config_->retryStrategy = std::make_shared(kRetryInterval, kMaxRetryDuration); credentials_ = {ToAwsString(minio_->access_key()), ToAwsString(minio_->secret_key())}; @@ -309,6 +307,16 @@ TEST_F(S3OptionsTest, FromUri) { ASSERT_EQ(options.endpoint_override, "localhost"); ASSERT_EQ(path, "mybucket/foo/bar"); + // Explicit tls related configuration + ASSERT_OK_AND_ASSIGN( + options, + S3Options::FromUri("s3://mybucket/foo/bar/?tls_ca_dir_path=/test&tls_ca_file_path=/" + "test/test.pem&tls_verify_certificates=false", + &path)); + ASSERT_EQ(options.tls_ca_dir_path, "/test"); + ASSERT_EQ(options.tls_ca_file_path, "/test/test.pem"); + ASSERT_EQ(options.tls_verify_certificates, false); + // Missing bucket name ASSERT_RAISES(Invalid, S3Options::FromUri("s3:///foo/bar/", &path)); @@ -451,6 +459,7 @@ class TestS3FS : public S3TestMixin { // Most tests will create buckets options_.allow_bucket_creation = true; options_.allow_bucket_deletion = true; + options_.tls_verify_certificates = false; MakeFileSystem(); // Set up test bucket { @@ -540,7 +549,7 @@ class TestS3FS : public S3TestMixin { Result> MakeNewFileSystem( io::IOContext io_context = io::default_io_context()) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "https"; + options_.scheme = minio_->scheme(); options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared(); @@ -1308,6 +1317,7 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +#ifdef MINIO_SERVER_WITH_TLS TEST_F(TestS3FS, SSECustomerKeyMatch) { // normal write/read with correct SSEC key std::shared_ptr stream; @@ -1335,6 +1345,7 @@ TEST_F(TestS3FS, SSECustomerKeyMismatch) { ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); ASSERT_OK(RestoreTestBucket()); } +#endif // MINIO_SERVER_WITH_TLS struct S3OptionsTestParameters { bool background_writes{false}; @@ -1458,7 +1469,8 @@ TEST_F(TestS3FS, FileSystemFromUri) { std::stringstream ss; ss << "s3://" << minio_->access_key() << ":" << minio_->secret_key() << "@bucket/somedir/subdir/subfile" - << "?scheme=https&endpoint_override=" << UriEscape(minio_->connect_string()); + << "?scheme=" << minio_->scheme() + << "&endpoint_override=" << UriEscape(minio_->connect_string()); std::string path; ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path)); @@ -1560,7 +1572,7 @@ class TestS3FSGeneric : public S3TestMixin, public GenericFileSystemTest { } options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "https"; + options_.scheme = minio_->scheme(); options_.endpoint_override = minio_->connect_string(); options_.retry_strategy = std::make_shared(); ASSERT_OK_AND_ASSIGN(s3fs_, S3FileSystem::Make(options_)); From 4eb9ace63ba2d2a845a45b6229490759085313de Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Wed, 9 Oct 2024 22:14:32 -0400 Subject: [PATCH 08/18] refine code according to review comments --- cpp/src/arrow/filesystem/filesystem.h | 3 - cpp/src/arrow/filesystem/s3_internal.h | 31 +++++++-- ...s3_test_cert.h => s3_test_cert_internal.h} | 11 ++-- cpp/src/arrow/filesystem/s3_test_util.cc | 26 +++++--- cpp/src/arrow/filesystem/s3_test_util.h | 9 ++- cpp/src/arrow/filesystem/s3fs.cc | 22 ++----- cpp/src/arrow/filesystem/s3fs.h | 8 ++- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 9 +-- cpp/src/arrow/filesystem/s3fs_test.cc | 65 ++++++++++++------- 9 files changed, 110 insertions(+), 74 deletions(-) rename cpp/src/arrow/filesystem/{s3_test_cert.h => s3_test_cert_internal.h} (95%) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 4ee5d8bbb15..d4f62f86a74 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -710,9 +710,6 @@ struct FileSystemGlobalOptions { /// /// If empty, the underlying TLS library's defaults will be used. std::string tls_ca_dir_path; - - /// Controls whether to verify TLS certificates. Defaults to true. - bool tls_verify_certificates = true; }; /// EXPERIMENTAL: optional global initialization routine diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index fc4b4288ae6..f86c7cddec7 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -40,6 +40,27 @@ #include "arrow/util/print.h" #include "arrow/util/string.h" +#ifndef ARROW_AWS_SDK_VERSION_CHECK +// AWS_SDK_VERSION_{MAJOR,MINOR,PATCH} are available since 1.9.7. +# if defined(AWS_SDK_VERSION_MAJOR) && defined(AWS_SDK_VERSION_MINOR) && \ + defined(AWS_SDK_VERSION_PATCH) +// Redundant "(...)" are for suppressing "Weird number of spaces at +// line-start. Are you using a 2-space indent? [whitespace/indent] +// [3]" errors... +# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) \ + ((AWS_SDK_VERSION_MAJOR > (major) || \ + (AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR > (minor)) || \ + ((AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR == (minor) && \ + AWS_SDK_VERSION_PATCH >= (patch))))) +# else +# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) 0 +# endif +#endif // !ARROW_AWS_SDK_VERSION_CHECK + +#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 201) +# define ARROW_S3_HAS_SSE_C +#endif + namespace arrow { namespace fs { namespace internal { @@ -319,15 +340,15 @@ inline Result CalculateSSECustomerKeyMD5( } template -Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer_key) { +Status SetSSECustomerKey(S3RequestType* request, const std::string& sse_customer_key) { if (sse_customer_key.empty()) { return Status::OK(); // do nothing if the sse_customer_key is not configured } -#ifdef ARROW_S3_SUPPORT_SSEC +#ifdef ARROW_S3_HAS_SSE_C ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); - request.SetSSECustomerKeyMD5(md5); - request.SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); - request.SetSSECustomerAlgorithm("AES256"); + request->SetSSECustomerKeyMD5(md5); + request->SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); + request->SetSSECustomerAlgorithm("AES256"); return Status::OK(); #else return Status::NotImplemented("SSE-C is not supported"); diff --git a/cpp/src/arrow/filesystem/s3_test_cert.h b/cpp/src/arrow/filesystem/s3_test_cert_internal.h similarity index 95% rename from cpp/src/arrow/filesystem/s3_test_cert.h rename to cpp/src/arrow/filesystem/s3_test_cert_internal.h index 59b8d83c5ab..0a69ade7d0e 100644 --- a/cpp/src/arrow/filesystem/s3_test_cert.h +++ b/cpp/src/arrow/filesystem/s3_test_cert_internal.h @@ -17,14 +17,12 @@ #pragma once -namespace arrow { -namespace fs { - +namespace arrow::fs { // The below two static strings are generated according to // https://github.com/minio/minio/tree/RELEASE.2024-09-22T00-33-43Z/docs/tls#323-generate-a-self-signed-certificate // `openssl req -new -x509 -nodes -days 36500 -keyout private.key -out public.crt -config // openssl.conf` -const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- +static constexpr const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCqwKYHsTSciGqP uU3qkTWpnXIi3iC0eeW7JSzJHGFs880WdR5JdK4WufPK+1xzgiYjMEPfAcuSWz3b qYyCI61q+a9Iu2nj7cFTW9bfZrmWlnI0YOLJc+q0AAdAjF1lvRKenH8tbjz/2jyl @@ -54,7 +52,7 @@ t9pJcv2E5xY7/nFNIorpKg== -----END PRIVATE KEY----- )"; -const char* kMinioCert = R"(-----BEGIN CERTIFICATE----- +static constexpr const char* kMinioCert = R"(-----BEGIN CERTIFICATE----- MIIDiTCCAnGgAwIBAgIUXbHZ6FAhKSXg4WSGUQySlSyE4U0wDQYJKoZIhvcNAQEL BQAwXzELMAkGA1UEBhMCVVMxCzAJBgNVBAgMAlZBMQ4wDAYDVQQHDAVBcnJvdzEO MAwGA1UECgwFQXJyb3cxDjAMBgNVBAsMBUFycm93MRMwEQYDVQQDDApBcnJyb3dU @@ -76,5 +74,4 @@ TFeMNKROmrEPCWaYr6MJ+ItHtb5Cawapea4THz9GCjR9eLq2CbMqLezZ8xBHPzc4 ixI2l0uCfg7ZUSA+90yaScc7bhEQ8CMiPtJgNKaKIqB58DpY7028xJpW7Ma2 -----END CERTIFICATE----- )"; -} // namespace fs -} // namespace arrow +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 8b181df9ffe..15b124f321a 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -19,7 +19,7 @@ # include #endif -#include "arrow/filesystem/s3_test_cert.h" +#include "arrow/filesystem/s3_test_cert_internal.h" #include "arrow/filesystem/s3_test_util.h" #include "arrow/filesystem/s3fs.h" #include "arrow/testing/process.h" @@ -77,10 +77,14 @@ std::string MinioTestServer::access_key() const { return impl_->access_key_; } std::string MinioTestServer::secret_key() const { return impl_->secret_key_; } -std::string MinioTestServer::ca_path() const { +std::string MinioTestServer::ca_dir_path() const { return impl_->temp_dir_ca_->path().ToString(); } +std::string MinioTestServer::ca_file_path() const { + return impl_->temp_dir_ca_->path().ToString() + "/public.crt"; +} + std::string MinioTestServer::scheme() const { return impl_->scheme_; } Status MinioTestServer::GenerateCertificateFile() { @@ -89,7 +93,7 @@ Status MinioTestServer::GenerateCertificateFile() { ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); ARROW_ASSIGN_OR_RAISE(auto public_crt_file, - PlatformFilename::FromString(ca_path() + "/public.crt")); + PlatformFilename::FromString(ca_dir_path() + "/public.crt")); ARROW_ASSIGN_OR_RAISE(auto public_cert_fd, FileOpenWritable(public_crt_file)); ARROW_RETURN_NOT_OK(FileWrite(public_cert_fd.fd(), reinterpret_cast(kMinioCert), @@ -97,7 +101,7 @@ Status MinioTestServer::GenerateCertificateFile() { ARROW_RETURN_NOT_OK(public_cert_fd.Close()); ARROW_ASSIGN_OR_RAISE(auto private_key_file, - PlatformFilename::FromString(ca_path() + "/private.key")); + PlatformFilename::FromString(ca_dir_path() + "/private.key")); ARROW_ASSIGN_OR_RAISE(auto private_key_fd, FileOpenWritable(private_key_file)); ARROW_RETURN_NOT_OK(FileWrite(private_key_fd.fd(), reinterpret_cast(kMinioPrivateKey), @@ -105,13 +109,13 @@ Status MinioTestServer::GenerateCertificateFile() { ARROW_RETURN_NOT_OK(private_key_fd.Close()); arrow::fs::FileSystemGlobalOptions global_options; - global_options.tls_verify_certificates = false; + global_options.tls_ca_file_path = ca_file_path(); ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); return Status::OK(); } -Status MinioTestServer::Start() { +Status MinioTestServer::Start(bool enable_tls_if_supported) { const char* connect_str = std::getenv(kEnvConnectString); const char* access_key = std::getenv(kEnvAccessKey); const char* secret_key = std::getenv(kEnvSecretKey); @@ -134,12 +138,14 @@ Status MinioTestServer::Start() { std::vector minio_args({"server", "--quiet", "--compat", "--address", impl_->connect_string_, impl_->temp_dir_->path().ToString()}); + if (enable_tls_if_supported) { #ifdef MINIO_SERVER_WITH_TLS - ARROW_RETURN_NOT_OK(GenerateCertificateFile()); - minio_args.push_back("--certs-dir"); - minio_args.push_back(ca_path()); - impl_->scheme_ = "https"; + ARROW_RETURN_NOT_OK(GenerateCertificateFile()); + minio_args.push_back("--certs-dir"); + minio_args.push_back(ca_dir_path()); + impl_->scheme_ = "https"; #endif // MINIO_SERVER_WITH_TLS + } ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); // NOTE: --quiet makes startup faster by suppressing remote version check diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index 0dfdcf9d783..add1ffbf7fd 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -44,7 +44,10 @@ class MinioTestServer { MinioTestServer(); ~MinioTestServer(); - Status Start(); + // enable_tls_if_supported = true: start Minio with TLS if MINIO_SERVER_WITH_TLS is + // defined, Currently only enabled on Linux platfrom. enable_tls_if_supported = false: + // start Minio without TLS in all platfroms + Status Start(bool enable_tls_if_supported = true); Status Stop(); @@ -54,7 +57,9 @@ class MinioTestServer { std::string secret_key() const; - std::string ca_path() const; + std::string ca_dir_path() const; + + std::string ca_file_path() const; std::string scheme() const; diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 353b48329a4..9ad99a890d0 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -105,10 +105,6 @@ # define ARROW_S3_HAS_S3CLIENT_CONFIGURATION #endif -#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 201) -# define ARROW_S3_SUPPORT_SSEC -#endif - #ifdef ARROW_S3_HAS_CRT # include # include @@ -1151,11 +1147,7 @@ class ClientBuilder { } else if (!internal::global_options.tls_ca_dir_path.empty()) { client_config_.caPath = ToAwsString(internal::global_options.tls_ca_dir_path); } - if (!options_.tls_verify_certificates) { - client_config_.verifySSL = options_.tls_verify_certificates; - } else if (!internal::global_options.tls_verify_certificates) { - client_config_.verifySSL = internal::global_options.tls_verify_certificates; - } + client_config_.verifySSL = options_.tls_verify_certificates; // Set proxy options if provided if (!options_.proxy_options.scheme.empty()) { @@ -1324,7 +1316,7 @@ Result GetObjectRange(Aws::S3::S3Client* client, S3Model::GetObjectRequest req; req.SetBucket(ToAwsString(path.bucket)); req.SetKey(ToAwsString(path.key)); - RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key)); + RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key)); req.SetRange(ToAwsString(FormatRange(start, length))); req.SetResponseStreamFactory(AwsWriteableStreamFactory(out, length)); return OutcomeToResult("GetObject", client->GetObject(req)); @@ -1480,7 +1472,7 @@ class ObjectInputFile final : public io::RandomAccessFile { S3Model::HeadObjectRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); - RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); + RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key_)); ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock()); auto outcome = client_lock.Move()->HeadObject(req); @@ -1701,7 +1693,7 @@ class ObjectOutputStream final : public io::OutputStream { S3Model::CreateMultipartUploadRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); - RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); + RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key_)); RETURN_NOT_OK(SetMetadataInRequest(&req)); auto outcome = client_lock.Move()->CreateMultipartUpload(req); @@ -1803,9 +1795,9 @@ class ObjectOutputStream final : public io::OutputStream { S3Model::CompleteMultipartUploadRequest req; req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); - RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); req.SetUploadId(multipart_upload_id_); req.SetMultipartUpload(std::move(completed_upload)); + RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key_)); auto outcome = client_lock.Move()->CompleteMultipartUploadWithErrorFixup(std::move(req)); @@ -1993,7 +1985,7 @@ class ObjectOutputStream final : public io::OutputStream { req.SetKey(ToAwsString(path_.key)); req.SetBody(std::make_shared(data, nbytes)); req.SetContentLength(nbytes); - RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key_)); + RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key_)); if (!background_writes_) { req.SetBody(std::make_shared(data, nbytes)); @@ -2375,7 +2367,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisStart()); + ASSERT_OK(minio_->Start(false)); const char* region_str = std::getenv(kEnvAwsRegion); if (region_str) { @@ -81,12 +81,7 @@ class MinioFixture : public benchmark::Fixture { } client_config_.endpointOverride = ToAwsString(minio_->connect_string()); - if (minio_->scheme() == "https") { - client_config_.scheme = Aws::Http::Scheme::HTTPS; - client_config_.verifySSL = false; - } else { - client_config_.scheme = Aws::Http::Scheme::HTTP; - } + client_config_.scheme = Aws::Http::Scheme::HTTP; if (!region_.empty()) { client_config_.region = ToAwsString(region_); } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index ce4b9781b9b..15c9ad29cd0 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -208,7 +208,7 @@ class S3TestMixin : public AwsTestMixin { client_config_->endpointOverride = ToAwsString(minio_->connect_string()); if (minio_->scheme() == "https") { client_config_->scheme = Aws::Http::Scheme::HTTPS; - client_config_->verifySSL = false; + client_config_->caFile = ToAwsString(minio_->ca_file_path()); } else { client_config_->scheme = Aws::Http::Scheme::HTTP; } @@ -306,6 +306,7 @@ TEST_F(S3OptionsTest, FromUri) { ASSERT_EQ(options.scheme, "http"); ASSERT_EQ(options.endpoint_override, "localhost"); ASSERT_EQ(path, "mybucket/foo/bar"); + ASSERT_EQ(options.tls_verify_certificates, true); // Explicit tls related configuration ASSERT_OK_AND_ASSIGN( @@ -459,7 +460,6 @@ class TestS3FS : public S3TestMixin { // Most tests will create buckets options_.allow_bucket_creation = true; options_.allow_bucket_deletion = true; - options_.tls_verify_certificates = false; MakeFileSystem(); // Set up test bucket { @@ -557,9 +557,7 @@ class TestS3FS : public S3TestMixin { return S3FileSystem::Make(options_, io_context); } - void MakeFileSystem() { - ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context())); - } + void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); } template void AssertMetadataRoundtrip(const std::string& path, @@ -1322,28 +1320,51 @@ TEST_F(TestS3FS, SSECustomerKeyMatch) { // normal write/read with correct SSEC key std::shared_ptr stream; options_.sse_customer_key = "12345678123456781234567812345678"; - MakeFileSystem(); - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); - ASSERT_OK(stream->Write("some")); - ASSERT_OK(stream->Close()); - ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); - ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4)); - AssertBufferEqual(*buf, "some"); - ASSERT_OK(RestoreTestBucket()); + for (const auto& allow_delayed_open : {false, true}) { + ARROW_SCOPED_TRACE("allow_delayed_open = ", allow_delayed_open); + options_.allow_delayed_open = allow_delayed_open; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(5)); + AssertBufferEqual(*buf, "some"); + ASSERT_OK(RestoreTestBucket()); + } } TEST_F(TestS3FS, SSECustomerKeyMismatch) { std::shared_ptr stream; - options_.sse_customer_key = "12345678123456781234567812345678"; - MakeFileSystem(); - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); - ASSERT_OK(stream->Write("some")); - ASSERT_OK(stream->Close()); + for (const auto& allow_delayed_open : {false, true}) { + options_.allow_delayed_open = allow_delayed_open; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + options_.sse_customer_key = "87654321876543218765432187654321"; + MakeFileSystem(); + ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK(RestoreTestBucket()); + } +} - options_.sse_customer_key = "87654321876543218765432187654321"; - MakeFileSystem(); - ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); - ASSERT_OK(RestoreTestBucket()); +TEST_F(TestS3FS, SSECustomerKeyMissing) { + std::shared_ptr stream; + for (const auto& allow_delayed_open : {false, true}) { + options_.allow_delayed_open = allow_delayed_open; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + + options_.sse_customer_key = {}; + MakeFileSystem(); + ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK(RestoreTestBucket()); + } } #endif // MINIO_SERVER_WITH_TLS From a27bdd106c164be75ce1221c3b71b25cca03adc3 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Thu, 10 Oct 2024 06:05:10 -0400 Subject: [PATCH 09/18] using the fixed localhost as the endpoint, so that the client verify the host could success during the TLS handshake --- cpp/src/arrow/filesystem/s3_test_util.cc | 2 +- cpp/src/arrow/testing/util.cc | 6 ++++++ cpp/src/arrow/testing/util.h | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 15b124f321a..99a4c42afc2 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -50,7 +50,7 @@ const char* kEnvConnectString = "ARROW_TEST_S3_CONNECT_STRING"; const char* kEnvAccessKey = "ARROW_TEST_S3_ACCESS_KEY"; const char* kEnvSecretKey = "ARROW_TEST_S3_SECRET_KEY"; -std::string GenerateConnectString() { return GetListenAddress(); } +std::string GenerateConnectString() { return GetListenAddress("localhost"); } } // namespace diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index 7bef9f7d475..e5e53801df9 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -206,6 +206,12 @@ std::string GetListenAddress() { return ss.str(); } +std::string GetListenAddress(const std::string& host) { + std::stringstream ss; + ss << host << ":" << GetListenPort(); + return ss.str(); +} + const std::vector>& all_dictionary_index_types() { static std::vector> types = { int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}; diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index b4b2785a362..8cc28a8b073 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -128,6 +128,10 @@ ARROW_TESTING_EXPORT int GetListenPort(); // port conflicts. ARROW_TESTING_EXPORT std::string GetListenAddress(); +// Get a "host:port" to listen on. Compared to GetListenAddress(), this function would use +// the host passed in. +ARROW_TESTING_EXPORT std::string GetListenAddress(const std::string& host); + ARROW_TESTING_EXPORT const std::vector>& all_dictionary_index_types(); From bb1a5a555d71b0c6033e1ea09065db31aac54d52 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Sat, 12 Oct 2024 01:26:37 -0400 Subject: [PATCH 10/18] force to use http for some cases --- cpp/src/arrow/filesystem/s3_test_util.cc | 9 ++++++--- cpp/src/arrow/filesystem/s3_test_util.h | 3 ++- cpp/src/arrow/filesystem/s3fs_test.cc | 24 ++++++++++++++++++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 99a4c42afc2..d93ce2550d6 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -169,16 +169,19 @@ struct MinioTestEnvironment::Impl { } }; -MinioTestEnvironment::MinioTestEnvironment() : impl_(new Impl) {} +MinioTestEnvironment::MinioTestEnvironment(bool enable_tls_if_supported) + : impl_(new Impl), enable_tls_if_supported_(enable_tls_if_supported) {} MinioTestEnvironment::~MinioTestEnvironment() = default; void MinioTestEnvironment::SetUp() { auto pool = ::arrow::internal::GetCpuThreadPool(); - auto launch_one_server = []() -> Result> { + auto launch_one_server = + [enable_tls_if_supported = + enable_tls_if_supported_]() -> Result> { auto server = std::make_shared(); - RETURN_NOT_OK(server->Start()); + RETURN_NOT_OK(server->Start(enable_tls_if_supported)); return server; }; impl_->server_generator_ = [pool, launch_one_server]() { diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index add1ffbf7fd..db3a5f80755 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -74,7 +74,7 @@ class MinioTestServer { class MinioTestEnvironment : public ::testing::Environment { public: - MinioTestEnvironment(); + explicit MinioTestEnvironment(bool enable_tls_if_supported = true); ~MinioTestEnvironment(); void SetUp() override; @@ -84,6 +84,7 @@ class MinioTestEnvironment : public ::testing::Environment { protected: struct Impl; std::unique_ptr impl_; + bool enable_tls_if_supported_ = true; // by default, enable TLS if supported }; // A global test "environment", to ensure that the S3 API is initialized before diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 15c9ad29cd0..95cd4b08e2e 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -95,8 +95,15 @@ ::testing::Environment* s3_env = ::testing::AddGlobalTestEnvironment(new S3Envir ::testing::Environment* minio_env = ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment); -MinioTestEnvironment* GetMinioEnv() { - return ::arrow::internal::checked_cast(minio_env); +::testing::Environment* minio_env_http = + ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment(false)); + +MinioTestEnvironment* GetMinioEnv(bool enable_tls_if_supported) { + if (enable_tls_if_supported) { + return ::arrow::internal::checked_cast(minio_env); + } else { + return ::arrow::internal::checked_cast(minio_env_http); + } } class ShortRetryStrategy : public S3RetryStrategy { @@ -203,7 +210,7 @@ class S3TestMixin : public AwsTestMixin { protected: Status InitServerAndClient() { - ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv()->GetOneServer()); + ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv(enable_tls_if_supported_)->GetOneServer()); client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); if (minio_->scheme() == "https") { @@ -230,6 +237,7 @@ class S3TestMixin : public AwsTestMixin { std::unique_ptr client_config_; Aws::Auth::AWSCredentials credentials_; std::unique_ptr client_; + bool enable_tls_if_supported_ = true; }; void AssertGetObject(Aws::S3::Model::GetObjectResult& result, @@ -914,7 +922,15 @@ TEST_F(TestS3FS, GetFileInfoGenerator) { // Non-root dir case is tested by generic tests } -TEST_F(TestS3FS, GetFileInfoGeneratorStress) { +class TestS3FSHTTP : public TestS3FS { + public: + void SetUp() override { + enable_tls_if_supported_ = false; + TestS3FS::SetUp(); + } +}; + +TEST_F(TestS3FSHTTP, GetFileInfoGeneratorStress) { // This test is slow because it needs to create a bunch of seed files. However, it is // the only test that stresses listing and deleting when there are more than 1000 // files and paging is required. From c7a3c85e65ac86812a1de1da2dc8e0e4927feafb Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Sat, 12 Oct 2024 05:49:44 -0400 Subject: [PATCH 11/18] try to fix the crash on arm64 platform --- cpp/src/arrow/filesystem/s3fs_test.cc | 35 ++++++++++++++------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 95cd4b08e2e..befba785601 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -182,6 +182,24 @@ class AwsTestMixin : public ::testing::Test { #endif }; +// Test the CalculateSSECustomerKeyMD5 in AwsTestMixin fixture, since there is AWS SDK +// function call in it +TEST_F(AwsTestMixin, CalculateSSECustomerKeyMD5) { + ASSERT_RAISES(Invalid, CalculateSSECustomerKeyMD5("")); // invalid length + ASSERT_RAISES(Invalid, + CalculateSSECustomerKeyMD5( + "1234567890123456789012345678901234567890")); // invalid length + // valid case, with some non-ASCII character and a null byte in the sse_customer_key + char sse_customer_key[32] = {}; + sse_customer_key[0] = '\x40'; // '@' character + sse_customer_key[1] = '\0'; // null byte + sse_customer_key[2] = '\xFF'; // non-ASCII + sse_customer_key[31] = '\xFA'; // non-ASCII + std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); + ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) + ASSERT_EQ(md5, "97FTa6lj0hE7lshKdBy61g=="); // valid case +} + class S3TestMixin : public AwsTestMixin { public: void SetUp() override { @@ -1679,22 +1697,5 @@ TEST(S3GlobalOptions, DefaultsLogLevel) { ASSERT_EQ(S3LogLevel::Fatal, arrow::fs::S3GlobalOptions::Defaults().log_level); } } - -TEST(CalculateSSECustomerKeyMD5, Sanity) { - ASSERT_RAISES(Invalid, CalculateSSECustomerKeyMD5("")); // invalid length - ASSERT_RAISES(Invalid, - CalculateSSECustomerKeyMD5( - "1234567890123456789012345678901234567890")); // invalid length - // valid case, with some non-ASCII character and a null byte in the sse_customer_key - char sse_customer_key[32] = {}; - sse_customer_key[0] = '\x40'; // '@' character - sse_customer_key[1] = '\0'; // null byte - sse_customer_key[2] = '\xFF'; // non-ASCII - sse_customer_key[31] = '\xFA'; // non-ASCII - std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); - ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) - ASSERT_EQ(md5, "97FTa6lj0hE7lshKdBy61g=="); // valid case -} - } // namespace fs } // namespace arrow From 7fb4e90ec4749d20f7f6bc3995a94d2929e29a51 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Wed, 16 Oct 2024 01:42:16 -0400 Subject: [PATCH 12/18] refine the SSE-C words in comments --- cpp/src/arrow/filesystem/s3_internal.h | 8 ++++---- cpp/src/arrow/filesystem/s3_test_util.cc | 2 +- cpp/src/arrow/filesystem/s3fs.h | 2 +- cpp/src/arrow/filesystem/s3fs_test.cc | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index f86c7cddec7..e1af8aa880e 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -314,15 +314,15 @@ class ConnectRetryStrategy : public Aws::Client::RetryStrategy { int32_t max_retry_duration_; }; -/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 encoded) -/// \param sse_customer_key is the input sse key +/// \brief calculate the MD5 of the input SSE-C key (raw key, not base64 encoded) +/// \param sse_customer_key is the input SSE-C key /// \return the base64 encoded MD5 for the input key inline Result CalculateSSECustomerKeyMD5( const std::string& sse_customer_key) { - // the key needs to be 256 bits (32 bytes) according to + // The key needs to be 256 bits (32 bytes) according to // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption if (sse_customer_key.length() != 32) { - return Status::Invalid("32 bytes sse-c key is expected"); + return Status::Invalid("32 bytes SSE-C key is expected"); } // Convert the raw binary key to an Aws::String diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index d93ce2550d6..96d4508afef 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -135,6 +135,7 @@ Status MinioTestServer::Start(bool enable_tls_if_supported) { // Disable the embedded console (one less listening address to care about) impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); impl_->connect_string_ = GenerateConnectString(); + // NOTE: --quiet makes startup faster by suppressing remote version check std::vector minio_args({"server", "--quiet", "--compat", "--address", impl_->connect_string_, impl_->temp_dir_->path().ToString()}); @@ -148,7 +149,6 @@ Status MinioTestServer::Start(bool enable_tls_if_supported) { } ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); - // NOTE: --quiet makes startup faster by suppressing remote version check impl_->server_process_->SetArgs(minio_args); ARROW_RETURN_NOT_OK(impl_->server_process_->Execute()); return Status::OK(); diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 14327793dcf..23b0f9d8862 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -196,7 +196,7 @@ struct ARROW_EXPORT S3Options { /// delay between retries. std::shared_ptr retry_strategy; - /// the SSE-C customized key (raw 32 bytes key). + /// The SSE-C customized key (raw 32 bytes key). std::string sse_customer_key; /// Path to a single PEM file holding all TLS CA certificates diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index befba785601..16d63989b97 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -1351,7 +1351,7 @@ TEST_F(TestS3FS, OpenInputFile) { #ifdef MINIO_SERVER_WITH_TLS TEST_F(TestS3FS, SSECustomerKeyMatch) { - // normal write/read with correct SSEC key + // normal write/read with correct SSE-C key std::shared_ptr stream; options_.sse_customer_key = "12345678123456781234567812345678"; for (const auto& allow_delayed_open : {false, true}) { From 8f15323d12715fff922d3418da7f3282d2d5f14e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 17 Oct 2024 16:44:55 +0200 Subject: [PATCH 13/18] Fix CopyFile and add test (+ nits) --- cpp/src/arrow/filesystem/s3_internal.h | 41 +++++++++++++++++----- cpp/src/arrow/filesystem/s3fs.cc | 12 ++++++- cpp/src/arrow/filesystem/s3fs.h | 24 ++++++++----- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 2 +- cpp/src/arrow/filesystem/s3fs_test.cc | 18 ++++++++-- 5 files changed, 76 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index e1af8aa880e..772387e5fb6 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -58,7 +58,7 @@ #endif // !ARROW_AWS_SDK_VERSION_CHECK #if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 201) -# define ARROW_S3_HAS_SSE_C +# define ARROW_S3_HAS_SSE_CUSTOMER_KEY #endif namespace arrow { @@ -339,19 +339,42 @@ inline Result CalculateSSECustomerKeyMD5( sse_customer_key_md5.GetLength())); } -template -Status SetSSECustomerKey(S3RequestType* request, const std::string& sse_customer_key) { +struct SSECustomerKeyHeaders { + std::string sse_customer_key; + std::string sse_customer_key_md5; + std::string sse_customer_algorithm; +}; + +inline Result> GetSSECustomerKeyHeaders( + const std::string& sse_customer_key) { if (sse_customer_key.empty()) { - return Status::OK(); // do nothing if the sse_customer_key is not configured + return std::nullopt; } -#ifdef ARROW_S3_HAS_SSE_C +#ifdef ARROW_S3_HAS_SSE_CUSTOMER_KEY ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); - request->SetSSECustomerKeyMD5(md5); - request->SetSSECustomerKey(arrow::util::base64_encode(sse_customer_key)); - request->SetSSECustomerAlgorithm("AES256"); + return SSECustomerKeyHeaders{arrow::util::base64_encode(sse_customer_key), md5, + "AES256"}; +#else + return Status::NotImplemented( + "SSE customer key not supported by this version of the AWS SDK"); +#endif +} + +template +Status SetSSECustomerKey(S3RequestType* request, const std::string& sse_customer_key) { + ARROW_ASSIGN_OR_RAISE(auto maybe_headers, GetSSECustomerKeyHeaders(sse_customer_key)); + if (!maybe_headers.has_value()) { + return Status::OK(); + } +#ifdef ARROW_S3_HAS_SSE_CUSTOMER_KEY + auto headers = std::move(maybe_headers).value(); + request->SetSSECustomerKey(headers.sse_customer_key); + request->SetSSECustomerKeyMD5(headers.sse_customer_key_md5); + request->SetSSECustomerAlgorithm(headers.sse_customer_algorithm); return Status::OK(); #else - return Status::NotImplemented("SSE-C is not supported"); + return Status::NotImplemented( + "SSE customer key not supported by this version of the AWS SDK"); #endif } diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 9ad99a890d0..ee47e1c7020 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2367,8 +2367,18 @@ class S3FileSystem::Impl : public std::enable_shared_from_this retry_strategy; - /// The SSE-C customized key (raw 32 bytes key). + /// Optional customer-provided key for server-side encryption (SSE-C). + /// + /// This should be the 32-byte AES-256 key, unencoded. std::string sse_customer_key; - /// Path to a single PEM file holding all TLS CA certificates + /// Optional path to a single PEM file holding all TLS CA certificates /// - /// If empty, global filesystem options will be used, if the global filesystem options - /// is also empty, the underlying TLS library's defaults will be used. + /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); + /// if the corresponding global filesystem option is also empty, the underlying + /// TLS library's defaults will be used. std::string tls_ca_file_path; - /// Path to a directory holding TLS CA certificates in individual PEM files + /// Optional path to a directory holding TLS CA + /// + /// The given directory should contain CA certificates as individual PEM files /// named along the OpenSSL "hashed" format. /// - /// If empty, global filesystem options will be used, if the global filesystem options - /// is also empty, the underlying TLS library's defaults will be used. + /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); + /// if the corresponding global filesystem option is also empty, the underlying + /// TLS library's defaults will be used. std::string tls_ca_dir_path; - /// Whether to verify the S3 endpoint's TLS certificate, if the scheme is "https". + /// Whether to verify the S3 endpoint's TLS certificate + /// + /// This option applies if the scheme is "https". bool tls_verify_certificates = true; S3Options(); diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 5a6753e84c7..2867012b866 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -61,7 +61,7 @@ class MinioFixture : public benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { minio_.reset(new MinioTestServer()); - ASSERT_OK(minio_->Start(false)); + ASSERT_OK(minio_->Start(/*enable_tls_if_supported=*/false)); const char* region_str = std::getenv(kEnvAwsRegion); if (region_str) { diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 16d63989b97..3b834798882 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -95,8 +95,8 @@ ::testing::Environment* s3_env = ::testing::AddGlobalTestEnvironment(new S3Envir ::testing::Environment* minio_env = ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment); -::testing::Environment* minio_env_http = - ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment(false)); +::testing::Environment* minio_env_http = ::testing::AddGlobalTestEnvironment( + new MinioTestEnvironment(/*enable_tls_if_supported=*/false)); MinioTestEnvironment* GetMinioEnv(bool enable_tls_if_supported) { if (enable_tls_if_supported) { @@ -1349,6 +1349,7 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +// Minio only allows Server Side Encryption on HTTPS client connections. #ifdef MINIO_SERVER_WITH_TLS TEST_F(TestS3FS, SSECustomerKeyMatch) { // normal write/read with correct SSE-C key @@ -1400,6 +1401,18 @@ TEST_F(TestS3FS, SSECustomerKeyMissing) { ASSERT_OK(RestoreTestBucket()); } } + +TEST_F(TestS3FS, SSECustomerKeyCopyFile) { + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK(fs_->CopyFile("bucket/newfile_with_sse_c", "bucket/copied_with_sse_c")); + + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/copied_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(5)); + AssertBufferEqual(*buf, "some"); + ASSERT_OK(RestoreTestBucket()); +} #endif // MINIO_SERVER_WITH_TLS struct S3OptionsTestParameters { @@ -1697,5 +1710,6 @@ TEST(S3GlobalOptions, DefaultsLogLevel) { ASSERT_EQ(S3LogLevel::Fatal, arrow::fs::S3GlobalOptions::Defaults().log_level); } } + } // namespace fs } // namespace arrow From 7be0aa2b3dd5ed63e92e45c793d151cdd820333e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 17 Oct 2024 17:17:53 +0200 Subject: [PATCH 14/18] Temporarily disable workaround --- cpp/src/arrow/filesystem/s3fs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 3b834798882..e91387fa23b 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -943,7 +943,7 @@ TEST_F(TestS3FS, GetFileInfoGenerator) { class TestS3FSHTTP : public TestS3FS { public: void SetUp() override { - enable_tls_if_supported_ = false; + enable_tls_if_supported_ = true; TestS3FS::SetUp(); } }; From 8ae70142946b177f8b29f96efe40cf90d3c2bf01 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Thu, 17 Oct 2024 21:44:00 -0400 Subject: [PATCH 15/18] only enable the https for ssec related unit test cases, meanwhile, for http mionio server, rollback to use the random host ip --- cpp/src/arrow/filesystem/s3_test_util.cc | 7 ++++-- cpp/src/arrow/filesystem/s3fs_test.cc | 30 +++++++++++++----------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 96d4508afef..191185300de 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -50,7 +50,7 @@ const char* kEnvConnectString = "ARROW_TEST_S3_CONNECT_STRING"; const char* kEnvAccessKey = "ARROW_TEST_S3_ACCESS_KEY"; const char* kEnvSecretKey = "ARROW_TEST_S3_SECRET_KEY"; -std::string GenerateConnectString() { return GetListenAddress("localhost"); } +std::string GenerateConnectString() { return GetListenAddress(); } } // namespace @@ -145,7 +145,10 @@ Status MinioTestServer::Start(bool enable_tls_if_supported) { minio_args.push_back("--certs-dir"); minio_args.push_back(ca_dir_path()); impl_->scheme_ = "https"; -#endif // MINIO_SERVER_WITH_TLS + impl_->connect_string_ = + GetListenAddress("localhost"); // for TLS enabled case, we need to use localhost + // which is the fixed hostname in the certificate +#endif // MINIO_SERVER_WITH_TLS } ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index e91387fa23b..ed99bd11472 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -255,7 +255,9 @@ class S3TestMixin : public AwsTestMixin { std::unique_ptr client_config_; Aws::Auth::AWSCredentials credentials_; std::unique_ptr client_; - bool enable_tls_if_supported_ = true; + bool enable_tls_if_supported_ = + false; // by default, use the HTTP for all the test cases, since there is the + // random failure when there is stress test against the minio HTTPS Server }; void AssertGetObject(Aws::S3::Model::GetObjectResult& result, @@ -940,15 +942,7 @@ TEST_F(TestS3FS, GetFileInfoGenerator) { // Non-root dir case is tested by generic tests } -class TestS3FSHTTP : public TestS3FS { - public: - void SetUp() override { - enable_tls_if_supported_ = true; - TestS3FS::SetUp(); - } -}; - -TEST_F(TestS3FSHTTP, GetFileInfoGeneratorStress) { +TEST_F(TestS3FS, GetFileInfoGeneratorStress) { // This test is slow because it needs to create a bunch of seed files. However, it is // the only test that stresses listing and deleting when there are more than 1000 // files and paging is required. @@ -1351,7 +1345,15 @@ TEST_F(TestS3FS, OpenInputFile) { // Minio only allows Server Side Encryption on HTTPS client connections. #ifdef MINIO_SERVER_WITH_TLS -TEST_F(TestS3FS, SSECustomerKeyMatch) { +class TestS3FSHTTPS : public TestS3FS { + public: + void SetUp() override { + enable_tls_if_supported_ = true; + TestS3FS::SetUp(); + } +}; + +TEST_F(TestS3FSHTTPS, SSECustomerKeyMatch) { // normal write/read with correct SSE-C key std::shared_ptr stream; options_.sse_customer_key = "12345678123456781234567812345678"; @@ -1369,7 +1371,7 @@ TEST_F(TestS3FS, SSECustomerKeyMatch) { } } -TEST_F(TestS3FS, SSECustomerKeyMismatch) { +TEST_F(TestS3FSHTTPS, SSECustomerKeyMismatch) { std::shared_ptr stream; for (const auto& allow_delayed_open : {false, true}) { options_.allow_delayed_open = allow_delayed_open; @@ -1385,7 +1387,7 @@ TEST_F(TestS3FS, SSECustomerKeyMismatch) { } } -TEST_F(TestS3FS, SSECustomerKeyMissing) { +TEST_F(TestS3FSHTTPS, SSECustomerKeyMissing) { std::shared_ptr stream; for (const auto& allow_delayed_open : {false, true}) { options_.allow_delayed_open = allow_delayed_open; @@ -1402,7 +1404,7 @@ TEST_F(TestS3FS, SSECustomerKeyMissing) { } } -TEST_F(TestS3FS, SSECustomerKeyCopyFile) { +TEST_F(TestS3FSHTTPS, SSECustomerKeyCopyFile) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); ASSERT_OK(stream->Write("some")); ASSERT_OK(stream->Close()); From 20bf5dfcd4bd72c3f416bac020647f73dc417d67 Mon Sep 17 00:00:00 2001 From: Hang Zheng Date: Thu, 17 Oct 2024 23:00:33 -0400 Subject: [PATCH 16/18] set the correct address for the minio start args --- cpp/src/arrow/filesystem/s3_test_util.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 191185300de..14f9775afec 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -136,20 +136,24 @@ Status MinioTestServer::Start(bool enable_tls_if_supported) { impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); impl_->connect_string_ = GenerateConnectString(); // NOTE: --quiet makes startup faster by suppressing remote version check - std::vector minio_args({"server", "--quiet", "--compat", "--address", - impl_->connect_string_, - impl_->temp_dir_->path().ToString()}); + std::vector minio_args({"server", "--quiet", "--compat"}); if (enable_tls_if_supported) { #ifdef MINIO_SERVER_WITH_TLS ARROW_RETURN_NOT_OK(GenerateCertificateFile()); - minio_args.push_back("--certs-dir"); - minio_args.push_back(ca_dir_path()); + minio_args.emplace_back("--certs-dir"); + minio_args.emplace_back(ca_dir_path()); impl_->scheme_ = "https"; impl_->connect_string_ = GetListenAddress("localhost"); // for TLS enabled case, we need to use localhost // which is the fixed hostname in the certificate #endif // MINIO_SERVER_WITH_TLS } + minio_args.emplace_back("--address"); + minio_args.emplace_back( + impl_->connect_string_); // the connect_string_ differs for the http and https, + // https is fixed localhost while http is the dynamic ip + // in range 127.0.0.1/8 + minio_args.emplace_back(impl_->temp_dir_->path().ToString()); ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); impl_->server_process_->SetArgs(minio_args); From 452185cbcfbff3d7a374a879336cfe86d9e28cfe Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 3 Nov 2024 17:06:31 +0100 Subject: [PATCH 17/18] Various cleanups --- cpp/src/arrow/filesystem/s3_test_util.cc | 38 ++++++----- cpp/src/arrow/filesystem/s3_test_util.h | 12 +--- cpp/src/arrow/filesystem/s3fs.h | 4 ++ cpp/src/arrow/filesystem/s3fs_benchmark.cc | 2 +- cpp/src/arrow/filesystem/s3fs_test.cc | 75 +++++++++++++--------- 5 files changed, 69 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 14f9775afec..87359aa1832 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -50,8 +50,6 @@ const char* kEnvConnectString = "ARROW_TEST_S3_CONNECT_STRING"; const char* kEnvAccessKey = "ARROW_TEST_S3_ACCESS_KEY"; const char* kEnvSecretKey = "ARROW_TEST_S3_SECRET_KEY"; -std::string GenerateConnectString() { return GetListenAddress(); } - } // namespace struct MinioTestServer::Impl { @@ -115,7 +113,7 @@ Status MinioTestServer::GenerateCertificateFile() { return Status::OK(); } -Status MinioTestServer::Start(bool enable_tls_if_supported) { +Status MinioTestServer::Start(bool enable_tls) { const char* connect_str = std::getenv(kEnvConnectString); const char* access_key = std::getenv(kEnvAccessKey); const char* secret_key = std::getenv(kEnvSecretKey); @@ -134,25 +132,23 @@ Status MinioTestServer::Start(bool enable_tls_if_supported) { impl_->server_process_->SetEnv("MINIO_SECRET_KEY", kMinioSecretKey); // Disable the embedded console (one less listening address to care about) impl_->server_process_->SetEnv("MINIO_BROWSER", "off"); - impl_->connect_string_ = GenerateConnectString(); // NOTE: --quiet makes startup faster by suppressing remote version check std::vector minio_args({"server", "--quiet", "--compat"}); - if (enable_tls_if_supported) { -#ifdef MINIO_SERVER_WITH_TLS + if (enable_tls) { ARROW_RETURN_NOT_OK(GenerateCertificateFile()); minio_args.emplace_back("--certs-dir"); minio_args.emplace_back(ca_dir_path()); impl_->scheme_ = "https"; - impl_->connect_string_ = - GetListenAddress("localhost"); // for TLS enabled case, we need to use localhost - // which is the fixed hostname in the certificate -#endif // MINIO_SERVER_WITH_TLS + // With TLS enabled, we need the connection hostname to match the certificate's + // subject name. This also constrains the actual listening IP address. + impl_->connect_string_ = GetListenAddress("localhost"); + } else { + // Without TLS enabled, we want to minimize the likelihood of address collisions + // by varying the listening IP address (note that most tests don't enable TLS). + impl_->connect_string_ = GetListenAddress(); } minio_args.emplace_back("--address"); - minio_args.emplace_back( - impl_->connect_string_); // the connect_string_ differs for the http and https, - // https is fixed localhost while http is the dynamic ip - // in range 127.0.0.1/8 + minio_args.emplace_back(impl_->connect_string_); minio_args.emplace_back(impl_->temp_dir_->path().ToString()); ARROW_RETURN_NOT_OK(impl_->server_process_->SetExecutable(kMinioExecutableName)); @@ -168,16 +164,19 @@ Status MinioTestServer::Stop() { struct MinioTestEnvironment::Impl { std::function>()> server_generator_; + bool enable_tls_; + + explicit Impl(bool enable_tls) : enable_tls_(enable_tls) {} Result> LaunchOneServer() { auto server = std::make_shared(); - RETURN_NOT_OK(server->Start()); + RETURN_NOT_OK(server->Start(enable_tls_)); return server; } }; -MinioTestEnvironment::MinioTestEnvironment(bool enable_tls_if_supported) - : impl_(new Impl), enable_tls_if_supported_(enable_tls_if_supported) {} +MinioTestEnvironment::MinioTestEnvironment(bool enable_tls) + : impl_(new Impl(enable_tls)) {} MinioTestEnvironment::~MinioTestEnvironment() = default; @@ -185,10 +184,9 @@ void MinioTestEnvironment::SetUp() { auto pool = ::arrow::internal::GetCpuThreadPool(); auto launch_one_server = - [enable_tls_if_supported = - enable_tls_if_supported_]() -> Result> { + [enable_tls = impl_->enable_tls_]() -> Result> { auto server = std::make_shared(); - RETURN_NOT_OK(server->Start(enable_tls_if_supported)); + RETURN_NOT_OK(server->Start(enable_tls)); return server; }; impl_->server_generator_ = [pool, launch_one_server]() { diff --git a/cpp/src/arrow/filesystem/s3_test_util.h b/cpp/src/arrow/filesystem/s3_test_util.h index db3a5f80755..0a89a7a9d5a 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.h +++ b/cpp/src/arrow/filesystem/s3_test_util.h @@ -30,10 +30,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" -#if defined(__linux__) -# define MINIO_SERVER_WITH_TLS -#endif // Linux - namespace arrow { namespace fs { @@ -44,10 +40,7 @@ class MinioTestServer { MinioTestServer(); ~MinioTestServer(); - // enable_tls_if_supported = true: start Minio with TLS if MINIO_SERVER_WITH_TLS is - // defined, Currently only enabled on Linux platfrom. enable_tls_if_supported = false: - // start Minio without TLS in all platfroms - Status Start(bool enable_tls_if_supported = true); + Status Start(bool enable_tls = false); Status Stop(); @@ -74,7 +67,7 @@ class MinioTestServer { class MinioTestEnvironment : public ::testing::Environment { public: - explicit MinioTestEnvironment(bool enable_tls_if_supported = true); + explicit MinioTestEnvironment(bool enable_tls = false); ~MinioTestEnvironment(); void SetUp() override; @@ -84,7 +77,6 @@ class MinioTestEnvironment : public ::testing::Environment { protected: struct Impl; std::unique_ptr impl_; - bool enable_tls_if_supported_ = true; // by default, enable TLS if supported }; // A global test "environment", to ensure that the S3 API is initialized before diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 4b6457acaec..ac6342f26a3 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -206,6 +206,8 @@ struct ARROW_EXPORT S3Options { /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); /// if the corresponding global filesystem option is also empty, the underlying /// TLS library's defaults will be used. + /// + /// Note this option may be ignored on some systems (Windows, macOS). std::string tls_ca_file_path; /// Optional path to a directory holding TLS CA @@ -216,6 +218,8 @@ struct ARROW_EXPORT S3Options { /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); /// if the corresponding global filesystem option is also empty, the underlying /// TLS library's defaults will be used. + /// + /// Note this option may be ignored on some systems (Windows, macOS). std::string tls_ca_dir_path; /// Whether to verify the S3 endpoint's TLS certificate diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 2867012b866..b7b6dda6419 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -61,7 +61,7 @@ class MinioFixture : public benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { minio_.reset(new MinioTestServer()); - ASSERT_OK(minio_->Start(/*enable_tls_if_supported=*/false)); + ASSERT_OK(minio_->Start(/*enable_tls=*/false)); const char* region_str = std::getenv(kEnvAwsRegion); if (region_str) { diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index ed99bd11472..92e441b8d70 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -71,6 +71,12 @@ #include "arrow/util/range.h" #include "arrow/util/string.h" +// TLS tests require the ability to set a custom CA file when initiating S3 client +// connections, which the AWS SDK currently only supports on Linux. +#if defined(__linux__) +# define ENABLE_TLS_TESTS +#endif // Linux + namespace arrow { namespace fs { @@ -95,14 +101,14 @@ ::testing::Environment* s3_env = ::testing::AddGlobalTestEnvironment(new S3Envir ::testing::Environment* minio_env = ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment); -::testing::Environment* minio_env_http = ::testing::AddGlobalTestEnvironment( - new MinioTestEnvironment(/*enable_tls_if_supported=*/false)); +::testing::Environment* minio_env_https = + ::testing::AddGlobalTestEnvironment(new MinioTestEnvironment(/*enable_tls=*/true)); -MinioTestEnvironment* GetMinioEnv(bool enable_tls_if_supported) { - if (enable_tls_if_supported) { - return ::arrow::internal::checked_cast(minio_env); +MinioTestEnvironment* GetMinioEnv(bool enable_tls) { + if (enable_tls) { + return ::arrow::internal::checked_cast(minio_env_https); } else { - return ::arrow::internal::checked_cast(minio_env_http); + return ::arrow::internal::checked_cast(minio_env); } } @@ -182,24 +188,6 @@ class AwsTestMixin : public ::testing::Test { #endif }; -// Test the CalculateSSECustomerKeyMD5 in AwsTestMixin fixture, since there is AWS SDK -// function call in it -TEST_F(AwsTestMixin, CalculateSSECustomerKeyMD5) { - ASSERT_RAISES(Invalid, CalculateSSECustomerKeyMD5("")); // invalid length - ASSERT_RAISES(Invalid, - CalculateSSECustomerKeyMD5( - "1234567890123456789012345678901234567890")); // invalid length - // valid case, with some non-ASCII character and a null byte in the sse_customer_key - char sse_customer_key[32] = {}; - sse_customer_key[0] = '\x40'; // '@' character - sse_customer_key[1] = '\0'; // null byte - sse_customer_key[2] = '\xFF'; // non-ASCII - sse_customer_key[31] = '\xFA'; // non-ASCII - std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); - ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) - ASSERT_EQ(md5, "97FTa6lj0hE7lshKdBy61g=="); // valid case -} - class S3TestMixin : public AwsTestMixin { public: void SetUp() override { @@ -228,7 +216,7 @@ class S3TestMixin : public AwsTestMixin { protected: Status InitServerAndClient() { - ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv(enable_tls_if_supported_)->GetOneServer()); + ARROW_ASSIGN_OR_RAISE(minio_, GetMinioEnv(enable_tls_)->GetOneServer()); client_config_.reset(new Aws::Client::ClientConfiguration()); client_config_->endpointOverride = ToAwsString(minio_->connect_string()); if (minio_->scheme() == "https") { @@ -255,9 +243,11 @@ class S3TestMixin : public AwsTestMixin { std::unique_ptr client_config_; Aws::Auth::AWSCredentials credentials_; std::unique_ptr client_; - bool enable_tls_if_supported_ = - false; // by default, use the HTTP for all the test cases, since there is the - // random failure when there is stress test against the minio HTTPS Server + // Use plain HTTP by default, as this allows us to listen on different loopback + // addresses and thus minimize the risk of address reuse (HTTPS requires the + // hostname to match the certificate's subject name, constraining us to a + // single loopback address). + bool enable_tls_ = false; }; void AssertGetObject(Aws::S3::Model::GetObjectResult& result, @@ -283,6 +273,27 @@ void AssertObjectContents(Aws::S3::S3Client* client, const std::string& bucket, AssertGetObject(result, expected); } +//////////////////////////////////////////////////////////////////////////// +// Misc tests + +class InternalsTest : public AwsTestMixin {}; + +TEST_F(InternalsTest, CalculateSSECustomerKeyMD5) { + ASSERT_RAISES(Invalid, CalculateSSECustomerKeyMD5("")); // invalid length + ASSERT_RAISES(Invalid, + CalculateSSECustomerKeyMD5( + "1234567890123456789012345678901234567890")); // invalid length + // valid case, with some non-ASCII character and a null byte in the sse_customer_key + char sse_customer_key[32] = {}; + sse_customer_key[0] = '\x40'; // '@' character + sse_customer_key[1] = '\0'; // null byte + sse_customer_key[2] = '\xFF'; // non-ASCII + sse_customer_key[31] = '\xFA'; // non-ASCII + std::string sse_customer_key_string(sse_customer_key, sizeof(sse_customer_key)); + ASSERT_OK_AND_ASSIGN(auto md5, CalculateSSECustomerKeyMD5(sse_customer_key_string)) + ASSERT_EQ(md5, "97FTa6lj0hE7lshKdBy61g=="); // valid case +} + //////////////////////////////////////////////////////////////////////////// // S3Options tests @@ -1344,11 +1355,11 @@ TEST_F(TestS3FS, OpenInputFile) { } // Minio only allows Server Side Encryption on HTTPS client connections. -#ifdef MINIO_SERVER_WITH_TLS +#ifdef ENABLE_TLS_TESTS class TestS3FSHTTPS : public TestS3FS { public: void SetUp() override { - enable_tls_if_supported_ = true; + enable_tls_ = true; TestS3FS::SetUp(); } }; @@ -1374,6 +1385,7 @@ TEST_F(TestS3FSHTTPS, SSECustomerKeyMatch) { TEST_F(TestS3FSHTTPS, SSECustomerKeyMismatch) { std::shared_ptr stream; for (const auto& allow_delayed_open : {false, true}) { + ARROW_SCOPED_TRACE("allow_delayed_open = ", allow_delayed_open); options_.allow_delayed_open = allow_delayed_open; options_.sse_customer_key = "12345678123456781234567812345678"; MakeFileSystem(); @@ -1390,6 +1402,7 @@ TEST_F(TestS3FSHTTPS, SSECustomerKeyMismatch) { TEST_F(TestS3FSHTTPS, SSECustomerKeyMissing) { std::shared_ptr stream; for (const auto& allow_delayed_open : {false, true}) { + ARROW_SCOPED_TRACE("allow_delayed_open = ", allow_delayed_open); options_.allow_delayed_open = allow_delayed_open; options_.sse_customer_key = "12345678123456781234567812345678"; MakeFileSystem(); @@ -1415,7 +1428,7 @@ TEST_F(TestS3FSHTTPS, SSECustomerKeyCopyFile) { AssertBufferEqual(*buf, "some"); ASSERT_OK(RestoreTestBucket()); } -#endif // MINIO_SERVER_WITH_TLS +#endif // ENABLE_TLS_TESTS struct S3OptionsTestParameters { bool background_writes{false}; From ce67a270fc2bc65e10c58ce01d609bab0328ca8f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 3 Nov 2024 18:09:48 +0100 Subject: [PATCH 18/18] Avoid mutating global state, to fix TSAN failure --- cpp/src/arrow/filesystem/s3_test_util.cc | 4 ---- cpp/src/arrow/filesystem/s3fs_test.cc | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3_test_util.cc b/cpp/src/arrow/filesystem/s3_test_util.cc index 87359aa1832..0cfe038599c 100644 --- a/cpp/src/arrow/filesystem/s3_test_util.cc +++ b/cpp/src/arrow/filesystem/s3_test_util.cc @@ -106,10 +106,6 @@ Status MinioTestServer::GenerateCertificateFile() { strlen(kMinioPrivateKey))); ARROW_RETURN_NOT_OK(private_key_fd.Close()); - arrow::fs::FileSystemGlobalOptions global_options; - global_options.tls_ca_file_path = ca_file_path(); - ARROW_RETURN_NOT_OK(arrow::fs::Initialize(global_options)); - return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 92e441b8d70..3082ecb7843 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -499,6 +499,9 @@ class TestS3FS : public S3TestMixin { // Most tests will create buckets options_.allow_bucket_creation = true; options_.allow_bucket_deletion = true; + if (enable_tls_) { + options_.tls_ca_file_path = minio_->ca_file_path(); + } MakeFileSystem(); // Set up test bucket {