From 1f7dc2ea350fda9a0840dc318134987c77ce1d81 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Wed, 16 Apr 2025 18:36:34 +0200 Subject: [PATCH] Make DataLake metadata more lazy --- src/Disks/ObjectStorages/IObjectStorage.cpp | 10 ++++++++++ src/Disks/ObjectStorages/IObjectStorage.h | 2 ++ .../ObjectStorage/DataLakes/IDataLakeMetadata.cpp | 10 +++++++--- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 8 +++----- .../ObjectStorage/StorageObjectStorageSource.cpp | 6 +----- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index ce5f06e8f25f..f729b7ce6913 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -97,4 +97,14 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return write_settings; } + +void RelativePathWithMetadata::loadMetadata(ObjectStoragePtr object_storage) +{ + if (!metadata) + { + const auto & path = isArchive() ? 
getPathToArchive() : getPath(); + metadata = object_storage->tryGetObjectMetadata(path); + } +} + } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 26d28906c17d..f99864ebb25c 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -83,6 +83,8 @@ struct RelativePathWithMetadata virtual bool isArchive() const { return false; } virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } + + void loadMetadata(ObjectStoragePtr object_storage); }; struct ObjectKeyWithMetadata diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 61f31766a455..6bbf81c74965 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -34,12 +34,16 @@ class KeysIterator : public IObjectIterator return nullptr; auto key = data_files[current_index]; - auto object_metadata = object_storage->getObjectMetadata(key); if (callback) - callback(FileProgress(0, object_metadata.size_bytes)); + { + /// It is too expensive to always load the object size here, + /// because it requires an API call to external storage. + /// In many cases only keys are needed.
+ callback(FileProgress(0, 1)); + } - return std::make_shared(key, std::move(object_metadata)); + return std::make_shared(key, std::nullopt); } } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index df9faa048c2b..2baed9ad4176 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -74,10 +74,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - if (!object_info->metadata) - object_info->metadata = object_storage->tryGetObjectMetadata(path); - + object_info->loadMetadata(object_storage); return object_info->metadata ? std::optional(object_info->metadata->last_modified.epochTime()) : std::nullopt; @@ -149,7 +146,6 @@ std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { auto context = getContext(); - const auto & path = current_object_info->isArchive() ? 
current_object_info->getPathToArchive() : current_object_info->getPath(); auto impl = StorageObjectStorageSource::createReadBuffer(*current_object_info, object_storage, context, getLogger("ReadBufferIterator")); const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); @@ -248,6 +244,8 @@ ReadBufferIterator::Data ReadBufferIterator::next() prev_read_keys_size = read_keys.size(); } + current_object_info->loadMetadata(object_storage); + if (query_settings.skip_empty_files && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) continue; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 5622a2605535..f0c541eddff7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -389,11 +389,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (!object_info || object_info->getPath().empty()) return {}; - if (!object_info->metadata) - { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - object_info->metadata = object_storage->getObjectMetadata(path); - } + object_info->loadMetadata(object_storage); } while (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0);