From 3e5ea1ee1e5feac12dc6d7e7ab27ddc826ca22bf Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Sat, 4 Dec 2021 17:53:27 +0000 Subject: [PATCH] feat(storage): capture metadata info in downloads A lot of useful metadata information is included in the metadata headers for a GCS download over REST, and as an initial `Object` proto for gRPC. This information is needed to perform consistent reads from versioned objects (or detect inconsistent reads from objects without versioning). I will need this information as part of the changes for Arrow too. --- .../storage/internal/curl_download_request.cc | 33 +++++++-- .../internal/grpc_object_read_source.cc | 10 ++- .../storage/internal/object_read_source.h | 3 + .../storage/internal/object_read_streambuf.cc | 4 + .../storage/internal/object_read_streambuf.h | 14 ++++ google/cloud/storage/object_read_stream.h | 45 +++++++++++- .../object_read_headers_integration_test.cc | 73 +++++++++++++++++++ 7 files changed, 172 insertions(+), 10 deletions(-) diff --git a/google/cloud/storage/internal/curl_download_request.cc b/google/cloud/storage/internal/curl_download_request.cc index 5a0a7feca6313..644c0f76fe100 100644 --- a/google/cloud/storage/internal/curl_download_request.cc +++ b/google/cloud/storage/internal/curl_download_request.cc @@ -43,14 +43,33 @@ std::string ExtractHashValue(std::string const& hash_header, ReadSourceResult MakeReadResult(std::size_t bytes_received, HttpResponse response) { auto r = ReadSourceResult{bytes_received, std::move(response)}; - for (auto const& kv : r.response.headers) { - if (!r.generation && kv.first == "x-goog-generation") { - r.generation = std::stoll(kv.second); - } - if (kv.first != "x-goog-hash") continue; + auto const end = r.response.headers.end(); + auto f = r.response.headers.find("x-goog-generation"); + if (f != end && !r.generation) r.generation = std::stoll(f->second); + f = r.response.headers.find("x-goog-metageneration"); + if (f != end && !r.metageneration) r.metageneration = 
std::stoll(f->second); + f = r.response.headers.find("x-goog-storage-class"); + if (f != end && !r.storage_class) r.storage_class = f->second; + f = r.response.headers.find("x-goog-stored-content-length"); + if (f != end && !r.size) r.size = std::stoull(f->second); + + // Prefer "Content-Range" over "Content-Length" because the former works for + // ranged downloads. + f = r.response.headers.find("content-range"); + if (f != end && !r.size) { + auto const l = f->second.find_last_of('/'); + if (l != std::string::npos) r.size = std::stoll(f->second.substr(l + 1)); + } + f = r.response.headers.find("content-length"); + if (f != end && !r.size) r.size = std::stoll(f->second); + + // x-goog-hash is special in that it does appear multiple times in the + // headers, and we want to accumulate all the values. + auto const range = r.response.headers.equal_range("x-goog-hash"); + for (auto i = range.first; i != range.second; ++i) { HashValues h; - h.crc32c = ExtractHashValue(kv.second, "crc32c="); - h.md5 = ExtractHashValue(kv.second, "md5="); + h.crc32c = ExtractHashValue(i->second, "crc32c="); + h.md5 = ExtractHashValue(i->second, "md5="); r.hashes = Merge(std::move(r.hashes), std::move(h)); } return r; diff --git a/google/cloud/storage/internal/grpc_object_read_source.cc b/google/cloud/storage/internal/grpc_object_read_source.cc index ed4b192932bd3..857df29988cea 100644 --- a/google/cloud/storage/internal/grpc_object_read_source.cc +++ b/google/cloud/storage/internal/grpc_object_read_source.cc @@ -91,8 +91,14 @@ StatusOr GrpcObjectReadSource::Read(char* buf, HashValues{{}, GrpcClient::MD5FromProto(checksums.md5_hash())}); } } - if (response.has_metadata() && !result.generation) { - result.generation = response.metadata().generation(); + if (response.has_metadata()) { + result.generation = + result.generation.value_or(response.metadata().generation()); + result.metageneration = result.metageneration.value_or( + response.metadata().metageneration()); + result.storage_class = 
+ result.storage_class.value_or(response.metadata().storage_class()); + result.size = result.size.value_or(response.metadata().size()); } } }; diff --git a/google/cloud/storage/internal/object_read_source.h b/google/cloud/storage/internal/object_read_source.h index a9bd84683bf2b..473cd1af72752 100644 --- a/google/cloud/storage/internal/object_read_source.h +++ b/google/cloud/storage/internal/object_read_source.h @@ -50,6 +50,9 @@ struct ReadSourceResult { HttpResponse response; HashValues hashes; absl::optional generation; + absl::optional metageneration; + absl::optional storage_class; + absl::optional size; ReadSourceResult() = default; ReadSourceResult(std::size_t b, HttpResponse r) diff --git a/google/cloud/storage/internal/object_read_streambuf.cc b/google/cloud/storage/internal/object_read_streambuf.cc index 1f099929541f9..8cbbb52023240 100644 --- a/google/cloud/storage/internal/object_read_streambuf.cc +++ b/google/cloud/storage/internal/object_read_streambuf.cc @@ -196,6 +196,10 @@ std::streamsize ObjectReadStreambuf::xsgetn(char* s, std::streamsize count) { for (auto const& kv : read->response.headers) { headers_.emplace(kv.first, kv.second); } + if (!generation_) generation_ = std::move(read->generation); + if (!metageneration_) metageneration_ = std::move(read->metageneration); + if (!storage_class_) storage_class_ = std::move(read->storage_class); + if (!size_) size_ = std::move(read->size); return run_validator_if_closed(Status()); } diff --git a/google/cloud/storage/internal/object_read_streambuf.h b/google/cloud/storage/internal/object_read_streambuf.h index 753c62dc6db8d..38a0a5396f877 100644 --- a/google/cloud/storage/internal/object_read_streambuf.h +++ b/google/cloud/storage/internal/object_read_streambuf.h @@ -69,6 +69,16 @@ class ObjectReadStreambuf : public std::basic_streambuf { return headers_; } + // See ObjectReadStream for details about these attributes. 
+ absl::optional const& generation() const { return generation_; } + absl::optional const& metageneration() const { + return metageneration_; + } + absl::optional const& storage_class() const { + return storage_class_; + } + absl::optional const& size() const { return size_; } + private: int_type ReportError(Status status); void ThrowHashMismatchDelegate(char const* function_name); @@ -88,6 +98,10 @@ class ObjectReadStreambuf : public std::basic_streambuf { std::string received_hash_; Status status_; std::multimap headers_; + absl::optional generation_; + absl::optional metageneration_; + absl::optional storage_class_; + absl::optional size_; }; } // namespace internal diff --git a/google/cloud/storage/object_read_stream.h b/google/cloud/storage/object_read_stream.h index da2ebcffb6fa5..c94ecb0096aff 100644 --- a/google/cloud/storage/object_read_stream.h +++ b/google/cloud/storage/object_read_stream.h @@ -79,7 +79,6 @@ class ObjectReadStream : public std::basic_istream { */ void Close(); - //@{ /** * Report any download errors. * @@ -137,6 +136,50 @@ class ObjectReadStream : public std::basic_istream { * next, as we find more (or different) opportunities for optimization. */ HeadersMap const& headers() const { return buf_->headers(); } + + //@{ + /** + * @name Object metadata information. + * + * When downloading an object a limited amount of information about the + * object's metadata is returned as part of the download. Some of this + * information is important for applications performing multiple downloads + * (maybe of different ranges) of the same object. Such applications may + * want to use the generation number to guarantee all the downloads are + * actually referencing the same object. One could do this by first querying + * the metadata before the first download, but this is less efficient as it + * requires one additional server roundtrip. 
+ * + * Note that all these attributes are `absl::optional<>`, as the attributes + * may not be known (or exist) if there is an error during the download. If + * the attribute is needed for the application's correctness the application + * should fetch the object metadata when the attribute is not available. + */ + /// The object's generation at the time of the download, if known. + absl::optional const& generation() const { + return buf_->generation(); + } + + /// The object's metageneration at the time of the download, if known. + absl::optional const& metageneration() const { + return buf_->metageneration(); + } + + /// The object's storage class at the time of the download, if known. + absl::optional const& storage_class() const { + return buf_->storage_class(); + } + + /** + * The object's size at the time of the download, if known. + * + * If you are using [object transcoding] this represents the stored size of + * the object, the number of downloaded bytes (after decompression) may be + * larger. 
+ * + * [object transcoding]: https://cloud.google.com/storage/docs/transcoding + */ + absl::optional const& size() const { return buf_->size(); } //@} private: diff --git a/google/cloud/storage/tests/object_read_headers_integration_test.cc b/google/cloud/storage/tests/object_read_headers_integration_test.cc index f6d55b73ecca7..62ca0251fcbf4 100644 --- a/google/cloud/storage/tests/object_read_headers_integration_test.cc +++ b/google/cloud/storage/tests/object_read_headers_integration_test.cc @@ -51,6 +51,79 @@ class ObjectReadHeadersIntegrationTest std::string bucket_name_; }; +TEST_F(ObjectReadHeadersIntegrationTest, CaptureMetadataXml) { + StatusOr client = MakeIntegrationTestClient(); + ASSERT_STATUS_OK(client); + + auto const object_name = MakeRandomObjectName(); + + auto insert = client->InsertObject(bucket_name(), object_name, LoremIpsum(), + IfGenerationMatch(0)); + ASSERT_THAT(insert, IsOk()); + ScheduleForDelete(*insert); + + auto is = client->ReadObject(bucket_name(), object_name, + Generation(insert->generation())); + EXPECT_EQ(insert->generation(), is.generation().value_or(0)); + EXPECT_EQ(insert->metageneration(), is.metageneration().value_or(0)); + EXPECT_EQ(insert->storage_class(), is.storage_class().value_or("")); + EXPECT_EQ(insert->size(), is.size().value_or(0)); + + auto const actual = std::string{std::istreambuf_iterator(is), {}}; + is.Close(); + EXPECT_THAT(is.status(), IsOk()); +} + +TEST_F(ObjectReadHeadersIntegrationTest, CaptureMetadataJson) { + StatusOr client = MakeIntegrationTestClient(); + ASSERT_STATUS_OK(client); + + auto const object_name = MakeRandomObjectName(); + + auto insert = client->InsertObject(bucket_name(), object_name, LoremIpsum(), + IfGenerationMatch(0)); + ASSERT_THAT(insert, IsOk()); + ScheduleForDelete(*insert); + + auto is = client->ReadObject( + bucket_name(), object_name, Generation(insert->generation()), + // Force JSON (if using REST) as this is not supported by the XML API. 
+ IfMetagenerationNotMatch(0)); + EXPECT_EQ(insert->generation(), is.generation().value_or(0)); + EXPECT_EQ(insert->metageneration(), is.metageneration().value_or(0)); + EXPECT_EQ(insert->storage_class(), is.storage_class().value_or("")); + EXPECT_EQ(insert->size(), is.size().value_or(0)); + + auto const actual = std::string{std::istreambuf_iterator(is), {}}; + is.Close(); + EXPECT_THAT(is.status(), IsOk()); +} + +TEST_F(ObjectReadHeadersIntegrationTest, CaptureMetadataJsonRanged) { + StatusOr client = MakeIntegrationTestClient(); + ASSERT_STATUS_OK(client); + + auto const object_name = MakeRandomObjectName(); + + auto insert = client->InsertObject(bucket_name(), object_name, LoremIpsum(), + IfGenerationMatch(0)); + ASSERT_THAT(insert, IsOk()); + ScheduleForDelete(*insert); + + auto is = client->ReadObject( + bucket_name(), object_name, Generation(insert->generation()), + // Force JSON (if using REST) as this is not supported by the XML API. + IfMetagenerationNotMatch(0), ReadFromOffset(4)); + EXPECT_EQ(insert->generation(), is.generation().value_or(0)); + EXPECT_EQ(insert->metageneration(), is.metageneration().value_or(0)); + EXPECT_EQ(insert->storage_class(), is.storage_class().value_or("")); + EXPECT_EQ(insert->size(), is.size().value_or(0)); + + auto const actual = std::string{std::istreambuf_iterator(is), {}}; + is.Close(); + EXPECT_THAT(is.status(), IsOk()); +} + TEST_F(ObjectReadHeadersIntegrationTest, SmokeTest) { StatusOr client = MakeIntegrationTestClient(); ASSERT_STATUS_OK(client);