From fc2395806612fc1727a9a88a939bbe3589922a54 Mon Sep 17 00:00:00 2001 From: Daniil Ivanik <61067749+divanik@users.noreply.github.com> Date: Thu, 27 Feb 2025 12:31:59 +0000 Subject: [PATCH 01/14] Merge pull request #75010 from ClickHouse/divanik/add_sequence_id_and_implementation_refactoring Add sequence id to iceberg metadata and iceberg avro parsing refactoring --- src/Common/ErrorCodes.cpp | 1 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 174 ++++++++++++------ .../DataLakes/Iceberg/IcebergMetadata.h | 25 +-- .../DataLakes/Iceberg/IteratorWrapper.h | 29 +++ .../DataLakes/Iceberg/ManifestFile.cpp | 167 ++++++++++------- .../DataLakes/Iceberg/ManifestFile.h | 37 ++-- .../DataLakes/Iceberg/ManifestFileImpl.h | 7 +- .../DataLakes/Iceberg/Snapshot.h | 27 +-- .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 75 ++++++++ .../ObjectStorage/DataLakes/Iceberg/Utils.h | 8 + 10 files changed, 380 insertions(+), 170 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index c9ae045b7db5..b531d247066b 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -620,6 +620,7 @@ M(740, POTENTIALLY_BROKEN_DATA_PART) \ M(741, TABLE_UUID_MISMATCH) \ M(742, DELTA_KERNEL_ERROR) \ + M(743, ICEBERG_SPECIFICATION_VIOLATION) \ \ M(900, DISTRIBUTED_CACHE_ERROR) \ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 2f8e80b34e26..2f2615c36e68 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1,4 +1,5 @@ #include "Core/NamesAndTypes.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" #include "config.h" #if USE_AVRO @@ -39,10 +40,16 @@ extern const int FILE_DOESNT_EXIST; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; +extern const int ICEBERG_SPECIFICATION_VIOLATION; } using namespace Iceberg; + +constexpr const char * COLUMN_SEQ_NUMBER_NAME = "sequence_number"; +constexpr const char * COLUMN_MANIFEST_FILE_PATH_NAME = "manifest_path"; +constexpr const char * FIELD_FORMAT_VERSION_NAME = "format-version"; + std::pair parseTableSchemaFromManifestFile(const avro::DataFileReaderBase & manifest_file_reader, const String & manifest_file_name) { @@ -132,7 +139,7 @@ std::pair parseTableSchemaV1Method(const Poco::J Int32 IcebergMetadata::parseTableSchema( const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger) { - Int32 format_version = metadata_object->getValue("format-version"); + Int32 format_version = metadata_object->getValue(FIELD_FORMAT_VERSION_NAME); if (format_version == 2) { auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); @@ -242,11 +249,12 @@ bool IcebergMetadata::update(const ContextPtr & local_context) auto metadata_object = readJSON(metadata_file_path, local_context); - chassert(format_version == metadata_object->getValue("format-version")); + chassert(format_version == metadata_object->getValue(FIELD_FORMAT_VERSION_NAME)); auto manifest_list_file = getRelevantManifestList(metadata_object); - if (manifest_list_file && (!current_snapshot.has_value() || (manifest_list_file.value() != current_snapshot->getName()))) + if (manifest_list_file + && (!current_snapshot.has_value() || 
(manifest_list_file.value() != current_snapshot->manifest_list_iterator.getName()))) { current_snapshot = getSnapshot(manifest_list_file.value()); cached_unprunned_files_for_current_snapshot = std::nullopt; @@ -278,12 +286,13 @@ std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON: std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String data_path) const { - auto manifest_file_it = manifest_entry_by_data_file.find(data_path); - if (manifest_file_it == manifest_entry_by_data_file.end()) + auto manifest_file_it = manifest_file_by_data_file.find(data_path); + if (manifest_file_it == manifest_file_by_data_file.end()) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find manifest file for data file: {}", data_path); } - auto schema_id = manifest_file_it->second.getContent().getSchemaId(); + const ManifestFileContent & manifest_file = *manifest_file_it->second; + auto schema_id = manifest_file.getSchemaId(); if (schema_id == current_schema_id) return std::nullopt; return std::optional{schema_id}; @@ -314,7 +323,7 @@ DataLakeMetadataPtr IcebergMetadata::create( IcebergSchemaProcessor schema_processor; - auto format_version = object->getValue("format-version"); + auto format_version = object->getValue(FIELD_FORMAT_VERSION_NAME); auto ptr = std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object); @@ -322,7 +331,7 @@ DataLakeMetadataPtr IcebergMetadata::create( return ptr; } -ManifestList IcebergMetadata::initializeManifestList(const String & manifest_list_file) const +ManifestList IcebergMetadata::initializeManifestList(const String & filename) const { auto configuration_ptr = configuration.lock(); if (configuration_ptr == nullptr) @@ -330,46 +339,83 @@ ManifestList IcebergMetadata::initializeManifestList(const String & manifest_lis auto context = getContext(); ObjectInfo object_info( - std::filesystem::path(configuration_ptr->getPath()) / "metadata" / manifest_list_file); + std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename); auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); auto manifest_list_file_reader = std::make_unique(std::make_unique(*manifest_list_buf)); - auto data_type = AvroSchemaReader::avroNodeToDataType(manifest_list_file_reader->dataSchema().root()->leafAt(0)); - Block header{{data_type->createColumn(), data_type, "manifest_path"}}; + auto [name_to_index, name_to_data_type, header] = getColumnsAndTypesFromAvroByNames( + manifest_list_file_reader->dataSchema().root(), + {COLUMN_MANIFEST_FILE_PATH_NAME, COLUMN_SEQ_NUMBER_NAME}, + {avro::Type::AVRO_STRING, avro::Type::AVRO_LONG}); + + if (name_to_index.find(COLUMN_MANIFEST_FILE_PATH_NAME) == name_to_index.end()) + throw Exception( + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "Required columns are not found in manifest file: {}", + COLUMN_MANIFEST_FILE_PATH_NAME); + if (format_version > 1 && name_to_index.find(COLUMN_SEQ_NUMBER_NAME) == name_to_index.end()) + throw Exception( + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: sequence_number"); + + auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); - auto & col = columns.at(0); + const auto & manifest_path_col = columns.at(name_to_index.at(COLUMN_MANIFEST_FILE_PATH_NAME)); + + std::optional 
sequence_number_column = std::nullopt; + if (format_version > 1) + { + if (columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType() != TypeIndex::Int64) + { + throw Exception( + DB::ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `sequence_number` field should be Int64 type, got {}", + columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getFamilyName()); + } + sequence_number_column = assert_cast(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME)).get()); + } - if (col->getDataType() != TypeIndex::String) + if (manifest_path_col->getDataType() != TypeIndex::String) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `manifest_path` field should be String type, got {}", - col->getFamilyName()); + "The parsed column from Avro file of `{}` field should be String type, got {}", + COLUMN_MANIFEST_FILE_PATH_NAME, + manifest_path_col->getFamilyName()); } - const auto * col_str = typeid_cast(col.get()); - std::vector manifest_files; - for (size_t i = 0; i < col_str->size(); ++i) + const auto * manifest_path_col_str = typeid_cast(manifest_path_col.get()); + ManifestList manifest_list; + + + for (size_t i = 0; i < manifest_path_col_str->size(); ++i) { - const auto file_path = col_str->getDataAt(i).toView(); - const auto filename = std::filesystem::path(file_path).filename(); - String manifest_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename; - auto manifest_file_it = manifest_files_by_name.find(manifest_file); - if (manifest_file_it != manifest_files_by_name.end()) + const auto file_path = manifest_path_col_str->getDataAt(i).toView(); + const auto current_filename = std::filesystem::path(file_path).filename(); + Int64 added_sequence_number = 0; + if (format_version > 1) { - manifest_files.emplace_back(manifest_file_it); - continue; + added_sequence_number = sequence_number_column.value()->getInt(i); } - manifest_files.emplace_back(initializeManifestFile(filename, configuration_ptr)); + /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file, but also an inherited sequence number which is known only during the parsing of ManifestList + auto manifest_file_content = initializeManifestFile(current_filename, added_sequence_number); + auto [iterator, _inserted] = manifest_files_by_name.emplace(current_filename, std::move(manifest_file_content)); + auto manifest_file_iterator = ManifestFileIterator{iterator}; + for (const auto & data_file_path : manifest_file_iterator->getFiles()) + { + if (std::holds_alternative(data_file_path.file)) + manifest_file_by_data_file.emplace(std::get(data_file_path.file).file_name, manifest_file_iterator); + } + manifest_list.push_back(ManifestListFileEntry{manifest_file_iterator, added_sequence_number}); } - return ManifestList{manifest_files}; + return manifest_list; } -ManifestFileEntry IcebergMetadata::initializeManifestFile(const String & filename, const ConfigurationPtr & configuration_ptr) const +ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const { + auto configuration_ptr = configuration.lock(); String manifest_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename; ObjectInfo manifest_object_info(manifest_file); @@ -383,32 +429,49 @@ ManifestFileEntry IcebergMetadata::initializeManifestFile(const String & filenam configuration_ptr->getPath(), getFormatSettings(getContext()), schema_id, - schema_processor); - auto 
[manifest_file_iterator, _inserted] - = manifest_files_by_name.emplace(manifest_file, ManifestFileContent(std::move(manifest_file_impl))); - ManifestFileEntry manifest_file_entry{manifest_file_iterator}; - for (const auto & data_file : manifest_file_entry.getContent().getDataFiles()) - { - manifest_entry_by_data_file.emplace(data_file.data_file_name, manifest_file_entry); - } - return manifest_file_entry; + schema_processor, + inherited_sequence_number); + return ManifestFileContent(std::move(manifest_file_impl)); +} + +ManifestFileIterator IcebergMetadata::getManifestFile(const String & filename) const +{ + auto manifest_file_it = manifest_files_by_name.find(filename); + if (manifest_file_it != manifest_files_by_name.end()) + return ManifestFileIterator{manifest_file_it}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find manifest file: {}", filename); +} + +std::optional IcebergMetadata::tryGetManifestFile(const String & filename) const +{ + auto manifest_file_it = manifest_files_by_name.find(filename); + if (manifest_file_it != manifest_files_by_name.end()) + return ManifestFileIterator{manifest_file_it}; + return std::nullopt; +} + +ManifestListIterator IcebergMetadata::getManifestList(const String & filename) const +{ + auto manifest_file_it = manifest_lists_by_name.find(filename); + if (manifest_file_it != manifest_lists_by_name.end()) + return ManifestListIterator{manifest_file_it}; + auto configuration_ptr = configuration.lock(); + auto [manifest_file_iterator, _inserted] = manifest_lists_by_name.emplace(filename, initializeManifestList(filename)); + return ManifestListIterator{manifest_file_iterator}; } -IcebergSnapshot IcebergMetadata::getSnapshot(const String & manifest_list_file) const +IcebergSnapshot IcebergMetadata::getSnapshot(const String & filename) const { - const auto manifest_list_file_it = manifest_lists_by_name.find(manifest_list_file); - if (manifest_list_file_it != manifest_lists_by_name.end()) - return IcebergSnapshot(manifest_list_file_it); - return IcebergSnapshot{manifest_lists_by_name.emplace(manifest_list_file, initializeManifestList(manifest_list_file)).first}; + return IcebergSnapshot{getManifestList(filename)}; } std::vector -getRelevantPartitionColumnIds(const ManifestFileEntry & entry, const IcebergSchemaProcessor & schema_processor, Int32 current_schema_id) +getRelevantPartitionColumnIds(const ManifestFileIterator & entry, const IcebergSchemaProcessor & schema_processor, Int32 current_schema_id) { std::vector partition_column_ids; - partition_column_ids.reserve(entry.getContent().getPartitionColumnInfos().size()); - for (const auto & partition_column_info : entry.getContent().getPartitionColumnInfos()) + partition_column_ids.reserve(entry->getPartitionColumnInfos().size()); + for (const auto & partition_column_info : entry->getPartitionColumnInfos()) { std::optional name_and_type = schema_processor.tryGetFieldCharacteristics(current_schema_id, partition_column_info.source_id); @@ -430,9 +493,10 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const return cached_unprunned_files_for_current_snapshot.value(); Strings data_files; - for (const auto & manifest_entry : current_snapshot->getManifestList().getManifestFiles()) + for (const auto & manifest_list_entry : *(current_snapshot->manifest_list_iterator)) { - const auto & partition_columns_ids = getRelevantPartitionColumnIds(manifest_entry, schema_processor, current_schema_id); + const auto & partition_columns_ids + = 
getRelevantPartitionColumnIds(manifest_list_entry.manifest_file, schema_processor, current_schema_id); const auto & partition_pruning_columns_names_and_types = schema_processor.tryGetFieldsCharacteristics(current_schema_id, partition_columns_ids); @@ -441,16 +505,20 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const const KeyCondition partition_key_condition( filter_dag, getContext(), partition_pruning_columns_names_and_types.getNames(), partition_minmax_idx_expr); - const auto & data_files_in_manifest = manifest_entry.getContent().getDataFiles(); - for (const auto & data_file : data_files_in_manifest) + const auto & data_files_in_manifest = manifest_list_entry.manifest_file->getFiles(); + for (const auto & manifest_file_entry : data_files_in_manifest) { - if (data_file.status != ManifestEntryStatus::DELETED) + if (manifest_file_entry.status != ManifestEntryStatus::DELETED) { if (partition_key_condition .checkInHyperrectangle( - data_file.getPartitionRanges(partition_columns_ids), partition_pruning_columns_names_and_types.getTypes()) + manifest_file_entry.getPartitionRanges(partition_columns_ids), + partition_pruning_columns_names_and_types.getTypes()) .can_be_true) - data_files.push_back(data_file.data_file_name); + { + if (std::holds_alternative(manifest_file_entry.file)) + data_files.push_back(std::get(manifest_file_entry.file).file_name); + } else ProfileEvents::increment(ProfileEvents::IcebergPartitionPrunnedFiles); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 9de2a296a8d1..684035dfa41e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -14,11 +14,9 @@ #include #include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h" #include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" #include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" -#include namespace DB { @@ -86,22 +84,21 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext bool update(const ContextPtr & local_context) override; - Strings makePartitionPruning(const ActionsDAG & filter_dag) override; bool supportsPartitionPruning() override { return true; } private: - using ManifestEntryByDataFile = std::unordered_map; + using ManifestEntryByDataFile = std::unordered_map; const ObjectStoragePtr object_storage; const ConfigurationObserverPtr configuration; mutable IcebergSchemaProcessor schema_processor; LoggerPtr log; - mutable Iceberg::ManifestFilesByName manifest_files_by_name; - mutable Iceberg::ManifestListsByName manifest_lists_by_name; - mutable ManifestEntryByDataFile manifest_entry_by_data_file; + mutable Iceberg::ManifestFilesStorage manifest_files_by_name; + mutable Iceberg::ManifestListsStorage manifest_lists_by_name; + mutable ManifestEntryByDataFile manifest_file_by_data_file; Int32 current_metadata_version; Int32 format_version; @@ -110,21 +107,27 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext mutable std::optional cached_unprunned_files_for_current_snapshot; - Iceberg::ManifestList initializeManifestList(const String & manifest_list_file) const; + mutable std::vector positional_delete_files_for_current_query; + + Iceberg::ManifestList initializeManifestList(const String & filename) const; - Iceberg::IcebergSnapshot getSnapshot(const String & 
manifest_list_file) const; + Iceberg::ManifestListIterator getManifestList(const String & filename) const; + + Iceberg::IcebergSnapshot getSnapshot(const String & filename) const; std::optional getSchemaVersionByFileIfOutdated(String data_path) const; - Iceberg::ManifestFileEntry getManifestFile(const String & manifest_file) const; + Iceberg::ManifestFileContent initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const; - Iceberg::ManifestFileEntry initializeManifestFile(const String & filename, const ConfigurationPtr & configuration_ptr) const; + Iceberg::ManifestFileIterator getManifestFile(const String & filename) const; std::optional getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata); Poco::JSON::Object::Ptr readJSON(const String & metadata_file_path, const ContextPtr & local_context) const; Strings getDataFilesImpl(const ActionsDAG * filter_dag) const; + + std::optional tryGetManifestFile(const String & filename) const; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h new file mode 100644 index 000000000000..e64a6e95fb81 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +namespace Iceberg +{ + +template +class IteratorWrapper +{ +private: + using StorageType = std::map; + using StorageConstIterator = StorageType::const_iterator; + using StorageIterator = StorageType::iterator; + +public: + explicit IteratorWrapper(StorageConstIterator iterator_) : iterator(iterator_) { } + explicit IteratorWrapper(StorageIterator iterator_) : iterator(iterator_) { } + + String getName() const { return iterator->first; } + + const T * operator->() const { return &iterator->second; } + const T & operator*() const { return iterator->second; } + +private: + StorageIterator iterator; +}; + +} diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index ab3d2221345d..c220e68f9762 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -11,20 +11,30 @@ #include #include "DataTypes/DataTypeTuple.h" +# include namespace DB::ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; extern const int UNSUPPORTED_METHOD; +extern const int ICEBERG_SPECIFICATION_VIOLATION; } namespace Iceberg { -const std::vector & ManifestFileContent::getDataFiles() const +constexpr const char * COLUMN_STATUS_NAME = "status"; +constexpr const char * COLUMN_TUPLE_DATA_FILE_NAME = "data_file"; +constexpr const char * COLUMN_SEQ_NUMBER_NAME = "sequence_number"; + +constexpr const char * SUBCOLUMN_FILE_PATH_NAME = "file_path"; +constexpr const char * SUBCOLUMN_CONTENT_NAME = "content"; +constexpr const char * SUBCOLUMN_PARTITION_NAME = "partition"; + +const std::vector & ManifestFileContent::getFiles() const { - return impl->data_files; + return impl->files; } Int32 ManifestFileContent::getSchemaId() const @@ -32,7 +42,7 @@ Int32 ManifestFileContent::getSchemaId() const return impl->schema_id; } -std::vector DataFileEntry::getPartitionRanges(const std::vector & partition_columns_ids) const +std::vector ManifestFileEntry::getPartitionRanges(const std::vector & partition_columns_ids) const { std::vector filtered_partition_ranges; filtered_partition_ranges.reserve(partition_columns_ids.size()); @@ -46,6 +56,7 @@ std::vector 
DataFileEntry::getPartitionRanges(const std::vector & ManifestFileContent::getPartitionColumnInfos() const { + chassert(impl != nullptr); return impl->partition_column_infos; } @@ -63,97 +74,77 @@ ManifestFileContentImpl::ManifestFileContentImpl( const String & common_path, const DB::FormatSettings & format_settings, Int32 schema_id_, - const IcebergSchemaProcessor & schema_processor) + const IcebergSchemaProcessor & schema_processor, + Int64 inherited_sequence_number) { this->schema_id = schema_id_; + avro::NodePtr root_node = manifest_file_reader_->dataSchema().root(); - size_t leaves_num = root_node->leaves(); - size_t expected_min_num = format_version_ == 1 ? 3 : 2; - if (leaves_num < expected_min_num) - { - throw Exception( - DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected number of columns {}. Expected at least {}", root_node->leaves(), expected_min_num); - } - avro::NodePtr status_node = root_node->leafAt(0); - if (status_node->type() != avro::Type::AVRO_INT) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `status` field should be Int type, got {}", - magic_enum::enum_name(status_node->type())); - } + auto [name_to_index, name_to_data_type, manifest_file_header] = getColumnsAndTypesFromAvroByNames( + root_node, + {COLUMN_STATUS_NAME, COLUMN_TUPLE_DATA_FILE_NAME, COLUMN_SEQ_NUMBER_NAME}, + {avro::Type::AVRO_INT, avro::Type::AVRO_RECORD, avro::Type::AVRO_UNION}); - avro::NodePtr data_file_node = root_node->leafAt(static_cast(leaves_num) - 1); - if (data_file_node->type() != avro::Type::AVRO_RECORD) + for (const auto & column_name : {COLUMN_STATUS_NAME, COLUMN_TUPLE_DATA_FILE_NAME}) { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `data_file` field should be Tuple type, got {}", - magic_enum::enum_name(data_file_node->type())); + if (name_to_index.find(column_name) == name_to_index.end()) + throw Exception( + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: {}", column_name); } - auto status_col_data_type = AvroSchemaReader::avroNodeToDataType(status_node); - auto data_col_data_type = AvroSchemaReader::avroNodeToDataType(data_file_node); - Block manifest_file_header - = {{status_col_data_type->createColumn(), status_col_data_type, "status"}, - {data_col_data_type->createColumn(), data_col_data_type, "data_file"}}; + if (format_version_ > 1 && name_to_index.find(COLUMN_SEQ_NUMBER_NAME) == name_to_index.end()) + throw Exception( + ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: {}", COLUMN_SEQ_NUMBER_NAME); auto columns = parseAvro(*manifest_file_reader_, manifest_file_header, format_settings); - if (columns.size() != 2) - throw Exception(DB::ErrorCodes::ILLEGAL_COLUMN, "Unexpected number of columns. 
Expected 2, got {}", columns.size()); - - if (columns.at(0)->getDataType() != TypeIndex::Int32) + if (columns.at(name_to_index.at(COLUMN_STATUS_NAME))->getDataType() != TypeIndex::Int32) { throw Exception( DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `status` field should be Int32 type, got {}", - columns.at(0)->getFamilyName()); + "The parsed column from Avro file of `{}` field should be Int32 type, got {}", + COLUMN_STATUS_NAME, + columns.at(name_to_index.at(COLUMN_STATUS_NAME))->getFamilyName()); } - if (columns.at(1)->getDataType() != TypeIndex::Tuple) + if (columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME))->getDataType() != TypeIndex::Tuple) { throw Exception( DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` field should be Tuple type, got {}", - columns.at(1)->getFamilyName()); + "The parsed column from Avro file of `{}` field should be Tuple type, got {}", + COLUMN_TUPLE_DATA_FILE_NAME, + magic_enum::enum_name(columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME))->getDataType())); } - const auto * status_int_column = assert_cast(columns.at(0).get()); - const auto & data_file_tuple_type = assert_cast(*data_col_data_type.get()); - const auto * data_file_tuple_column = assert_cast(columns.at(1).get()); + const auto * status_int_column = assert_cast(columns.at(name_to_index.at(COLUMN_STATUS_NAME)).get()); - if (status_int_column->size() != data_file_tuple_column->size()) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` and `status` have different rows number: {} and {}", - status_int_column->size(), - data_file_tuple_column->size()); - } + const auto & data_file_tuple_type = assert_cast(*name_to_data_type.at(COLUMN_TUPLE_DATA_FILE_NAME).get()); + const auto * data_file_tuple_column = assert_cast(columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME)).get()); - ColumnPtr file_path_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("file_path")); + ColumnPtr file_path_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_FILE_PATH_NAME)); if (file_path_column->getDataType() != TypeIndex::String) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` field should be String type, got {}", - file_path_column->getFamilyName()); + "The parsed column from Avro file of `{}` field should be String type, got {}", + SUBCOLUMN_FILE_PATH_NAME, + magic_enum::enum_name(file_path_column->getDataType())); } const auto * file_path_string_column = assert_cast(file_path_column.get()); ColumnPtr content_column; const ColumnInt32 * content_int_column = nullptr; - if (format_version_ == 2) + if (format_version_ > 1) { - content_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("content")); + content_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_CONTENT_NAME)); if (content_column->getDataType() != TypeIndex::Int32) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `content` field should be Int type, got {}", - content_column->getFamilyName()); + "The parsed column from Avro file of `{}` field should be Int type, got {}", + SUBCOLUMN_CONTENT_NAME, + magic_enum::enum_name(content_column->getDataType())); } content_int_column = assert_cast(content_column.get()); @@ -162,13 +153,14 @@ ManifestFileContentImpl::ManifestFileContentImpl( 
Poco::JSON::Parser parser; - ColumnPtr big_partition_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("partition")); + ColumnPtr big_partition_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_PARTITION_NAME)); if (big_partition_column->getDataType() != TypeIndex::Tuple) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `partition` field should be Tuple type, got {}", - big_partition_column->getFamilyName()); + "The parsed column from Avro file of `{}` field should be Tuple type, got {}", + SUBCOLUMN_PARTITION_NAME, + magic_enum::enum_name(big_partition_column->getDataType())); } const auto * big_partition_tuple = assert_cast(big_partition_column.get()); @@ -198,13 +190,35 @@ ManifestFileContentImpl::ManifestFileContentImpl( partition_columns.push_back(removeNullable(big_partition_tuple->getColumnPtr(i))); } + std::optional sequence_number_column = std::nullopt; + if (format_version_ > 1) + { + if (columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType() != TypeIndex::Nullable) + { + throw Exception( + DB::ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `{}` field should be Nullable type, got {}", + COLUMN_SEQ_NUMBER_NAME, + magic_enum::enum_name(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType())); + } + sequence_number_column = assert_cast(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME)).get()); + if (sequence_number_column.value()->getNestedColumnPtr()->getDataType() != TypeIndex::Int64) + { + throw Exception( + DB::ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `{}` field should be Int64 type, got {}", + COLUMN_SEQ_NUMBER_NAME, + magic_enum::enum_name(sequence_number_column.value()->getNestedColumnPtr()->getDataType())); + } + } + for (size_t i = 0; i < data_file_tuple_column->size(); ++i) { - DataFileContent content_type = DataFileContent::DATA; - if (format_version_ == 2) + FileContentType content_type = FileContentType::DATA; + if (format_version_ > 1) { - content_type = DataFileContent(content_int_column->getElement(i)); - if (content_type != DataFileContent::DATA) + content_type = FileContentType(content_int_column->getElement(i)); + if (content_type != FileContentType::DATA) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, "Cannot read Iceberg table: positional and equality deletes are not supported"); } @@ -228,7 +242,30 @@ ManifestFileContentImpl::ManifestFileContentImpl( partition_columns[j], schema_processor.getFieldCharacteristics(schema_id, source_id).type)); } - this->data_files.push_back({file_path, status, content_type, partition_ranges}); + FileEntry file = FileEntry{DataFileEntry{file_path}}; + + Int64 added_sequence_number = 0; + if (format_version_ > 1) + { + switch (status) + { + case ManifestEntryStatus::ADDED: + added_sequence_number = inherited_sequence_number; + break; + case ManifestEntryStatus::EXISTING: + if (sequence_number_column.value()->isNullAt(i)) + throw Exception( + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "Data sequence number is null for the file added in another snapshot"); + else + added_sequence_number = sequence_number_column.value()->getInt(i); + break; + case ManifestEntryStatus::DELETED: + added_sequence_number = inherited_sequence_number; + break; + } + } + this->files.emplace_back(status, added_sequence_number, partition_ranges, file); } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 07a1019252bb..f3f8d227caa0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -2,11 +2,11 @@ #include "config.h" +#include #if USE_AVRO #include - -#include +#include namespace Iceberg { @@ -18,10 +18,9 @@ enum class ManifestEntryStatus : uint8_t EXISTING = 0, ADDED = 1, DELETED = 2, - }; -enum class DataFileContent : uint8_t +enum class FileContentType : uint8_t { DATA = 0, POSITION_DELETES = 1, @@ -30,11 +29,19 @@ enum class DataFileContent : uint8_t struct DataFileEntry { - String data_file_name; + String file_name; +}; + +using FileEntry = std::variant; // In the future we will add PositionalDeleteFileEntry and EqualityDeleteFileEntry here + +struct ManifestFileEntry +{ ManifestEntryStatus status; - DataFileContent content; + Int64 added_sequence_number; std::unordered_map partition_ranges; + FileEntry file; + std::vector getPartitionRanges(const std::vector & partition_columns_ids) const; }; @@ -50,9 +57,10 @@ class ManifestFileContent public: explicit ManifestFileContent(std::unique_ptr impl_); - const std::vector & getDataFiles() const; + const std::vector & getFiles() const; Int32 getSchemaId() const; const std::vector & getPartitionColumnInfos() const; + Int32 getPartitionSpecId() const; private: @@ -60,19 +68,8 @@ class ManifestFileContent }; -using ManifestFilesByName = std::map; - -struct ManifestFileEntry -{ - explicit ManifestFileEntry(const ManifestFilesByName::const_iterator & reference_) : reference(reference_) { } - const ManifestFileContent & getContent() const { return reference->second; } - const String & getName() const { return reference->first; } - - -private: - ManifestFilesByName::const_iterator reference; -}; - +using ManifestFilesStorage = std::map; +using ManifestFileIterator = IteratorWrapper; } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index 9b87fe5406f3..ffd2964a41b9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -47,16 +47,17 @@ struct ManifestFileContentImpl const String & common_path, const DB::FormatSettings & format_settings, Int32 schema_id_, - const DB::IcebergSchemaProcessor & schema_processor); + const DB::IcebergSchemaProcessor & schema_processor, + Int64 inherited_sequence_number); Int32 schema_id; + // Size - number of supported partition columns std::vector partition_column_infos; - // Size - number of files - std::vector data_files; + std::vector files; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index c857fbb78008..ba9fac631385 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -7,31 +7,22 @@ namespace Iceberg { -class ManifestList +struct ManifestListFileEntry { -public: - explicit ManifestList(std::vector manifest_files_) : manifest_files(std::move(manifest_files_)) { } - const std::vector & getManifestFiles() const { return manifest_files; } - -private: - std::vector manifest_files; + ManifestFileIterator manifest_file; + Int64 added_sequence_number; }; -using ManifestListsByName = std::map; - -class IcebergSnapshot -{ -public: - explicit IcebergSnapshot(const ManifestListsByName::const_iterator & reference_) 
: reference(reference_) { } +using ManifestList = std::vector; - const ManifestList & getManifestList() const { return reference->second; } - const String & getName() const { return reference->first; } +using ManifestListsStorage = std::map; +using ManifestListIterator = IteratorWrapper; -private: - ManifestListsByName::const_iterator reference; +struct IcebergSnapshot +{ + ManifestListIterator manifest_list_iterator; }; - } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index cd8a8fff86fc..db393373989f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -1,16 +1,25 @@ +#include #include "config.h" #if USE_AVRO #include #include +#include + +namespace DB::ErrorCodes +{ +extern const int ICEBERG_SPECIFICATION_VIOLATION; +extern const int BAD_TYPE_OF_FIELD; +} namespace Iceberg { using namespace DB; + MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & header, const FormatSettings & settings) { auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); @@ -23,9 +32,75 @@ MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & h file_reader.decr(); deserializer->deserializeRow(columns, file_reader.decoder(), ext); } + + for (size_t i = 0; i < columns.size(); ++i) + { + if (columns[0]->size() != columns[i]->size()) + { + throw Exception(DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "All columns should have the same size"); + } + } return columns; } +std::tuple getColumnsAndTypesFromAvroByNames( + avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types) +{ + NameToIndex name_to_index; + NameToDataType name_to_data_type; + + std::unordered_map> initial_index_by_name; + for (const auto & name : names) + { + initial_index_by_name.insert({name, std::nullopt}); + } + + size_t leaves_num = root_node->leaves(); + for (size_t i = 0; i < leaves_num; ++i) + { + const auto & name = root_node->nameAt(static_cast(i)); + + if (initial_index_by_name.find(name) != initial_index_by_name.end()) + initial_index_by_name[name] = i; + } + + + size_t current_new_index = 0; + ColumnsWithTypeAndName columns_to_add = {}; + for (size_t i = 0; i < names.size(); ++i) + { + const auto & name = names[i]; + if (initial_index_by_name.at(name).has_value()) + { + name_to_index.insert({name, current_new_index++}); + const auto node = root_node->leafAt(static_cast(initial_index_by_name.at(name).value())); + const size_t initial_index = initial_index_by_name.at(name).value(); + if (node->type() != expected_types.at(i)) + { + throw Exception( + ErrorCodes::BAD_TYPE_OF_FIELD, + "The parsed column from Avro file of `{}` field should be {} type, got {}", + name, + magic_enum::enum_name(expected_types[initial_index]), + magic_enum::enum_name(node->type())); + } + name_to_data_type.insert({name, AvroSchemaReader::avroNodeToDataType(node)}); + columns_to_add.push_back(ColumnWithTypeAndName{name_to_data_type.at(name)->createColumn(), name_to_data_type.at(name), name}); + } + } + + return std::make_tuple(name_to_index, name_to_data_type, Block{columns_to_add}); +} + +void checkColumnType(const DB::ColumnPtr & column, DB::TypeIndex expected_type_index) +{ + if (column->getDataType() != expected_type_index) + throw Exception( + ErrorCodes::BAD_TYPE_OF_FIELD, + "The parsed column from Avro file should be {} type, got {}", + 
magic_enum::enum_name(expected_type_index), + column->getFamilyName()); +} } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index 94aa921962d4..f3ec30ca72e5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -9,7 +9,15 @@ namespace Iceberg { +using NameToIndex = std::unordered_map; +using NameToDataType = std::unordered_map; + DB::MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const DB::Block & header, const DB::FormatSettings & settings); + +std::tuple getColumnsAndTypesFromAvroByNames( + avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types); } +void checkColumnType(const DB::ColumnPtr & column, DB::TypeIndex expected_type_index); + #endif From 63d283b1a5a34121ef3235d4633da99a0832ec44 Mon Sep 17 00:00:00 2001 From: Daniil Ivanik <61067749+divanik@users.noreply.github.com> Date: Fri, 28 Feb 2025 13:31:05 +0000 Subject: [PATCH 02/14] Merge pull request #76681 from ClickHouse/divanik/fix_path_parsing_in_iceberg Fix path parsing in iceberg --- .../DataLakes/Iceberg/IcebergMetadata.cpp | 29 +++---- .../DataLakes/Iceberg/IcebergMetadata.h | 1 + .../DataLakes/Iceberg/ManifestFile.cpp | 12 +-- .../DataLakes/Iceberg/ManifestFileImpl.h | 3 +- .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 74 ++++++++++++++++- .../ObjectStorage/DataLakes/Iceberg/Utils.h | 3 + ...ceberg_table_with_confusing_name.reference | 1 + ...3362_iceberg_table_with_confusing_name.sql | 4 + ...98c-41fa-a222-0981e71942b0-0-00001.parquet | Bin 0 -> 701 bytes ...521e92-07a7-450d-ba44-3a179b730c85-m0.avro | Bin 0 -> 7045 bytes ...-83521e92-07a7-450d-ba44-3a179b730c85.avro | Bin 0 -> 4373 bytes .../data_minio/est/metadata/v2.metadata.json | 78 ++++++++++++++++++ 12 files changed, 178 insertions(+), 27 deletions(-) create mode 100644 tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.reference create mode 100644 tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.sql create mode 100644 tests/queries/0_stateless/data_minio/est/data/00000-0-52725061-398c-41fa-a222-0981e71942b0-0-00001.parquet create mode 100644 tests/queries/0_stateless/data_minio/est/metadata/83521e92-07a7-450d-ba44-3a179b730c85-m0.avro create mode 100644 tests/queries/0_stateless/data_minio/est/metadata/snap-3858160944799047644-1-83521e92-07a7-450d-ba44-3a179b730c85.avro create mode 100644 tests/queries/0_stateless/data_minio/est/metadata/v2.metadata.json diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 2f2615c36e68..3e3c532a6aee 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -58,7 +58,7 @@ parseTableSchemaFromManifestFile(const avro::DataFileReaderBase & manifest_file_ if (avro_schema_it == avro_metadata.end()) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Cannot read Iceberg table: manifest file {} doesn't have table schema in its metadata", + "Cannot read Iceberg table: manifest file '{}' doesn't have table schema in its metadata", manifest_file_name); std::vector schema_json = avro_schema_it->second; String schema_json_string = String(reinterpret_cast(schema_json.data()), schema_json.size()); @@ -84,6 +84,7 @@ IcebergMetadata::IcebergMetadata( , log(getLogger("IcebergMetadata")) , current_metadata_version(metadata_version_) , 
format_version(format_version_) + , table_location(object->getValue("location")) { auto manifest_list_file = getRelevantManifestList(object); if (manifest_list_file) @@ -278,7 +279,7 @@ std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON: if (snapshot->getValue("snapshot-id") == current_snapshot_id) { const auto path = snapshot->getValue("manifest-list"); - return std::filesystem::path(path).filename(); + return getProperFilePathFromMetadataInfo(std::string_view(path), configuration_ptr->getPath(), table_location); } } return std::nullopt; @@ -338,8 +339,7 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); auto context = getContext(); - ObjectInfo object_info( - std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename); + StorageObjectStorage::ObjectInfo object_info(filename); auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); auto manifest_list_file_reader @@ -357,7 +357,7 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co COLUMN_MANIFEST_FILE_PATH_NAME); if (format_version > 1 && name_to_index.find(COLUMN_SEQ_NUMBER_NAME) == name_to_index.end()) throw Exception( - DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: sequence_number"); + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: `{}`", COLUMN_SEQ_NUMBER_NAME); auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); @@ -370,7 +370,8 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co { throw Exception( DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `sequence_number` field should be Int64 type, got {}", + "The parsed column from Avro file of `{}` field should be Int64 type, got `{}`", + COLUMN_SEQ_NUMBER_NAME, columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getFamilyName()); } sequence_number_column = assert_cast(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME)).get()); @@ -380,7 +381,7 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be String type, got {}", + "The parsed column from Avro file of `{}` field should be String type, got `{}`", COLUMN_MANIFEST_FILE_PATH_NAME, manifest_path_col->getFamilyName()); } @@ -391,16 +392,16 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co for (size_t i = 0; i < manifest_path_col_str->size(); ++i) { - const auto file_path = manifest_path_col_str->getDataAt(i).toView(); - const auto current_filename = std::filesystem::path(file_path).filename(); + const std::string_view file_path = manifest_path_col_str->getDataAt(i).toView(); + const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location); Int64 added_sequence_number = 0; if (format_version > 1) { added_sequence_number = sequence_number_column.value()->getInt(i); } /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file, but also an inherited sequence number which is known only during the parsing of ManifestList - auto manifest_file_content = initializeManifestFile(current_filename, added_sequence_number); - auto [iterator, _inserted] = 
manifest_files_by_name.emplace(current_filename, std::move(manifest_file_content)); + auto manifest_file_content = initializeManifestFile(manifest_file_name, added_sequence_number); + auto [iterator, _inserted] = manifest_files_by_name.emplace(manifest_file_name, std::move(manifest_file_content)); auto manifest_file_iterator = ManifestFileIterator{iterator}; for (const auto & data_file_path : manifest_file_iterator->getFiles()) { @@ -416,9 +417,8 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const { auto configuration_ptr = configuration.lock(); - String manifest_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename; - ObjectInfo manifest_object_info(manifest_file); + ObjectInfo manifest_object_info(filename); auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, getContext(), log); auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(*manifest_file_reader, filename); @@ -430,7 +430,8 @@ ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filen getFormatSettings(getContext()), schema_id, schema_processor, - inherited_sequence_number); + inherited_sequence_number, + table_location); return ManifestFileContent(std::move(manifest_file_impl)); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 684035dfa41e..d82d0817d900 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -104,6 +104,7 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext Int32 format_version; Int32 current_schema_id; std::optional current_snapshot; + String table_location; mutable std::optional cached_unprunned_files_for_current_snapshot; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index c220e68f9762..7d0df0ae5ff9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -11,12 +11,9 @@ #include #include "DataTypes/DataTypeTuple.h" -# include - namespace DB::ErrorCodes { extern const int ILLEGAL_COLUMN; -extern const int BAD_ARGUMENTS; extern const int UNSUPPORTED_METHOD; extern const int ICEBERG_SPECIFICATION_VIOLATION; } @@ -75,7 +72,8 @@ ManifestFileContentImpl::ManifestFileContentImpl( const DB::FormatSettings & format_settings, Int32 schema_id_, const IcebergSchemaProcessor & schema_processor, - Int64 inherited_sequence_number) + Int64 inherited_sequence_number, + const String & table_location) { this->schema_id = schema_id_; @@ -224,12 +222,8 @@ ManifestFileContentImpl::ManifestFileContentImpl( } const auto status = ManifestEntryStatus(status_int_column->getInt(i)); - const auto data_path = std::string(file_path_string_column->getDataAt(i).toView()); - const auto pos = data_path.find(common_path); - if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", common_path, data_path); + const auto file_path = getProperFilePathFromMetadataInfo(file_path_string_column->getDataAt(i).toView(), common_path, table_location); - const auto file_path = 
data_path.substr(pos); std::unordered_map partition_ranges; for (size_t j = 0; j < partition_columns.size(); ++j) { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index ffd2964a41b9..7d740e0f9429 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -48,7 +48,8 @@ struct ManifestFileContentImpl const DB::FormatSettings & format_settings, Int32 schema_id_, const DB::IcebergSchemaProcessor & schema_processor, - Int64 inherited_sequence_number); + Int64 inherited_sequence_number, + const std::string & table_location); Int32 schema_id; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index db393373989f..c4fc63e7f9ba 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -6,12 +6,16 @@ #include #include -#include +#include + +using namespace DB; + namespace DB::ErrorCodes { extern const int ICEBERG_SPECIFICATION_VIOLATION; extern const int BAD_TYPE_OF_FIELD; +extern const int BAD_ARGUMENTS; } namespace Iceberg @@ -22,7 +26,7 @@ using namespace DB; MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & header, const FormatSettings & settings) { - auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); + auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); MutableColumns columns = header.cloneEmptyColumns(); file_reader.init(); @@ -43,6 +47,70 @@ MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & h return columns; } + +// This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files. +// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro +// Common path should end with "" or "/". 
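+// In terms of the parameters below, the example above corresponds to
+// data_path = "s3://bucket/table_name/data/00000-1-1234567890.avro",
+// common_path = "table_name" (the table path taken from the storage configuration) and
+// table_location = "s3://bucket/table_name" (the "location" field of the metadata file),
+// and the returned value is "table_name/data/00000-1-1234567890.avro".
+// When the common path occurs more than once inside the blob path, occurrences that are
+// followed by "/data/" or "/metadata/" are preferred.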
+std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location) +{ + auto trim_backward_slash = [](std::string_view str) -> std::string_view + { + if (str.ends_with('/')) + { + return str.substr(0, str.size() - 1); + } + return str; + }; + auto trim_forward_slash = [](std::string_view str) -> std::string_view + { + if (str.starts_with('/')) + { + return str.substr(1); + } + return str; + }; + common_path = trim_backward_slash(common_path); + table_location = trim_backward_slash(table_location); + if (data_path.starts_with(table_location) && table_location.ends_with(common_path)) + { + return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size())); + } + + + auto pos = data_path.find(common_path); + size_t good_pos = std::string::npos; + while (pos != std::string::npos) + { + auto potential_position = pos + common_path.size(); + if ((std::string_view(data_path.data() + potential_position, 6) == "/data/") + || (std::string_view(data_path.data() + potential_position, 10) == "/metadata/")) + { + good_pos = pos; + break; + } + size_t new_pos = data_path.find(common_path, pos + 1); + if (new_pos == std::string::npos) + { + break; + } + pos = new_pos; + } + + + if (good_pos != std::string::npos) + { + return std::string{data_path.substr(good_pos)}; + } + else if (pos != std::string::npos) + { + return std::string{data_path.substr(pos)}; + } + else + { + throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Expected to find '{}' in data path: '{}'", common_path, data_path); + } +} + std::tuple getColumnsAndTypesFromAvroByNames( avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types) { @@ -101,7 +169,7 @@ void checkColumnType(const DB::ColumnPtr & column, DB::TypeIndex expected_type_i magic_enum::enum_name(expected_type_index), column->getFamilyName()); } -} +} #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index f3ec30ca72e5..3166a7fb6ab5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -14,6 +14,9 @@ using NameToDataType = std::unordered_map; DB::MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const DB::Block & header, const DB::FormatSettings & settings); + +std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location); + std::tuple getColumnsAndTypesFromAvroByNames( avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types); } diff --git a/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.reference b/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.reference new file mode 100644 index 000000000000..98617091f36f --- /dev/null +++ b/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.reference @@ -0,0 +1 @@ +1 Mars diff --git a/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.sql b/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.sql new file mode 100644 index 000000000000..f42dca735bde --- /dev/null +++ b/tests/queries/0_stateless/03362_iceberg_table_with_confusing_name.sql @@ -0,0 +1,4 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: Depends on AWS + +select * from icebergS3(s3_conn, filename='est') limit 10; \ No newline at end of file diff --git 
a/tests/queries/0_stateless/data_minio/est/data/00000-0-52725061-398c-41fa-a222-0981e71942b0-0-00001.parquet b/tests/queries/0_stateless/data_minio/est/data/00000-0-52725061-398c-41fa-a222-0981e71942b0-0-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..899c1518d28ec8bd0f3448cd45cc7d63bf2f3f25 GIT binary patch literal 701 zcmZ`%!D`z;5FIUQ(ZPfkGFvU^U>0m#P+V^=DoM`W;Ho}EfDY#a{O`m<@TY6#2g)T z06O2F{f6l=fCbfB<9>a+yxaK~&Ni@Dq0ZQ2!d&N`RSS_Y3a~2ME1-$ylr+HQRr+eY_p~}GX+3m)UOEc(i;nppgr}lvA&|-$>xgb zxN=&3jr7WQb*f9=l#)L!>;nBvN!y{<)^0fucPELJ@q60M;ty&bx?inxo_|^@KH^!f zjdSkvsyPK2bB7@zL9JUS@>P7va3b*XL(gTAdY%Rt92plz*=}OBbj7!mr5B zH{`mz|C3EiNlAZq0q)CXsLu+O@6C1dqtaQDE}Eh%dR@`%L`5=_O&RsYuqT2;IfxE= meH9N5dx0Dt1hEX2*cYlF45f_1r_496T0ZO(GyaMBasL49KADLC literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/est/metadata/83521e92-07a7-450d-ba44-3a179b730c85-m0.avro b/tests/queries/0_stateless/data_minio/est/metadata/83521e92-07a7-450d-ba44-3a179b730c85-m0.avro new file mode 100644 index 0000000000000000000000000000000000000000..35cf562b420fb3fb8de6b46470e8c3471533b23c GIT binary patch literal 7045 zcmcIo%Z}Vc6z#NTF#;52fj~l8l)R-oW0&XEEzJT!f+8hh$YT|aTGj6Aj2X9WZ1+S% zlodNxd_w+!EejS1Ar^=QpTJ*$*ucG2c0Ib=?$>10^Rlbz-h1xz)EC3AZrpk=56=07 z{n>uy6jv#C`c7VC)1YuVj(pP%M^4}EIOC8oeX&f zyE6XuG|V_G8M7$oo$A;qiC;LcJ90NwyVEQgfw)1CjCjLtP%6azXRkkCmsxU0>h@c| z(wczW05owrq|}Qz6BdVKo)-fi7uglQscax%gw&pZxvZGx_*U9aoG=FK%f-Ta;=b+> zV@J%={5&ZJfP>!l+=-`Agb#$JXYj{}kJ&UToIb%<_K}`_B!I@_fGZf4HftX1TgQ&W zh!67XI(y$bd&CMhKmi5@P2R|;0u`dj^gxir1-QsKr*gq431F%7sa%kgWK4>4PEcx} zxF@JWhj=Gxl7~f@#AL)HUhup_=m{9jqOiECo+;tdqn>YDBalj2ac*i+1|51*D)Jx+ zm=N^r@qO|#EY8WjCq#szKtY68dA@7Mj5vRsWD{oDIVa&Bm`3C}f)Uy$815aCWC8vl zDa#;IX5-Uy?Qnusg+RoxrqMT^JL8=*@O5SNX>}Swz)3MpBi@l_Gcsg39}&PF9i!xu zJmtYGK{?zupNFOczkti(adbuGdp%jLY&WTtQ|KT@$5XOVy6@-QXh|E(h z=xI{0NW>P&VGD{;TP)AokrO1*bP^A6i+Q@rTa7!*GA7DUSnvtvfPOV!ye>TNXmIIy z$I8T+thwOgtoot_2UvJtvS?~1tFqrT&O3$@3z$U7i!fl(Q_&Kzl%`O7HC7KaPoW$Zu$5dv=mVCo^jGc`na%;m94?EAY$nxaGK`Yp zQWyGm8O&1{VF$^0obzJcL{RM=X5tBq*VWwXn|=HV3MkkRhE79DV?J*~_^m>Bq*aop>!Ga5Ev|N_fJPG4Iz1{L4w<2xT^0b>&^jp_IE4m5DN@?clrYu$Kq$Akorgb1lrdrBx=BZkU4k#Jr zHalHOR4r50QjS$F+5FTv-%4eb3CQ$14shcxHkgdC4_L}mo zz5zrhDhI8|8MO^PWrGl-c+1*0z(L4B7T7=`v98C4X0oP?@^mIIU}WoBMj>nKEQ18w z*0MR4q|ERkEXw{ePleXC!iPYrvBC$}ZEwX=dZO&22g)v+gRE;4J*>5f0(O;6!!Txe zH7y@>*T@``7-;o%+S}9r+UnP*jBljO=IWaxxl-TPO$g$_n7ixLca#F+g4SuAy|ozK zdu9ggIytI3Mqv(juZ}sfdOoy`@&S~XvL{q4YCO7N?$+w*7_^7$&9s9+)Y{1+zC^j^ znOo_(uS?m?U|pwZ*+ZnS?AaRTWcb`xvw)a{cb=c!ET^tr9Ohbow;z1|;T!(z4?o}h z=#TK-$L+t`?e>rV{qp^{KmGIFMXS|%b0haze{T0Zdsyv)v|=UDW;qS3sAG4=27hGT?a5|W zBO!4?T;MSf689D2#3S$!JOdI3sBXKv-HzjNR+=0pDc7&Ms;lbnFIL~2zJG>qv-E;` z0g3Tn=#9m88)D6ol@Tw^e5;SF1jpi=7$@PHP_|r$5#c|qLlXHEjIZD98*MNPxRLZn zSr#TY{2M>@IN)jOVHTzd`s&#ev<+E+K3gCWhju;)eJ~x{_FW%B+Y2Nj$&E>gQiJh4 zWjI1SUPQy!1ceD&?HHjddI@k0zy@b5ThVU<%XzyLjYjB-kR#8R}) zLbf0O&~Wdi2_uCUnH|&Vqr!lqbd6X*kj(rX*$;foa347jd=im{lQ#1}Uo*tbWkaidT&Vm=$MgrP-qf8oyO^>7{vFB_Hy9At-jXL&~(8 zJKHr}lUE^S&?lnD?)Cm@Y26Aii&I=1znlNZ;*Pb)-&BY!-8RHtt3#0*ZrYp~$7~vv znNH4CW?KB4MU{!^rjF_^0=-)!DB2Jz^-#o*k%fJ|7>y)ld)x#OmHm9sAL zIL0uP_~{7t0-fx8k7!T8{Rcuec&Sgk_joDJRj0~*PYB}wmZ3U-E#oN zm?&9X!RoQVP`G1A5DF~UT{#tJ5DuH?uesFAS65z<6cuN1XWkaQw< z6ekL5j}){)l5|2YI>-~vb89?9lpNw!FUr0^(av?<)bc&rzgomq4WtpeNOJ!kO^~~| z5#Z=fDhg6v?}zeZT)sV-qJnpHu~mMKcCYeNkrJAq%PZ4!G(e`O$-k|~fSNNPP%3#@ z Date: Mon, 17 Mar 2025 21:25:23 +0000 Subject: [PATCH 03/14] Merge pull request #77403 from ClickHouse/add_iceberg_truncate Add `icebergTruncate` function and support it in partition pruning for Iceberg --- 
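For background on what the new `icebergTruncate` function needs to compute: the Iceberg specification defines the `truncate[W]` partition transform as width-based truncation, i.e. floor-style truncation for integers (the remainder must never be negative, which lines up with `positive_modulo` being added to the `division` group of `IsOperation.h` below) and a prefix of at most W characters for strings. A minimal standalone sketch of the spec's definition, assuming plain ASCII strings rather than full Unicode code-point handling, could look like this:

#include <cstdint>
#include <iostream>
#include <string>

// Iceberg truncate[W] for integers: v - (((v % W) + W) % W), so the remainder is never negative.
int64_t truncateInt(int64_t value, int64_t width)
{
    return value - (((value % width) + width) % width);
}

// Iceberg truncate[W] for strings: keep at most `width` leading characters.
std::string truncateString(const std::string & value, size_t width)
{
    return value.substr(0, width);
}

int main()
{
    std::cout << truncateInt(1, 10) << '\n';            // 0
    std::cout << truncateInt(-1, 10) << '\n';           // -10
    std::cout << truncateString("iceberg", 3) << '\n';  // ice
    return 0;
}

The real implementation in src/Functions/icebergTruncate.cpp added below operates on ClickHouse columns; the sketch only illustrates the transform semantics for the two simplest cases used in partition pruning.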
src/Common/DateLUTImpl.cpp | 12 -- src/Common/DateLUTImpl.h | 14 +- src/Functions/DateTimeTransforms.h | 64 ++++++ src/Functions/IsOperation.h | 2 +- src/Functions/icebergTruncate.cpp | 201 ++++++++++++++++++ src/Functions/toMonthNumSinceEpoch.cpp | 24 +++ src/Functions/toYearNumSinceEpoch.cpp | 24 +++ .../DataLakes/Iceberg/IcebergMetadata.cpp | 52 ++--- .../DataLakes/Iceberg/ManifestFile.cpp | 92 ++++---- .../DataLakes/Iceberg/ManifestFile.h | 21 +- .../DataLakes/Iceberg/ManifestFileImpl.h | 10 +- .../DataLakes/Iceberg/PartitionPruning.cpp | 196 +++++++++-------- .../DataLakes/Iceberg/PartitionPruning.h | 69 +++--- .../integration/test_storage_iceberg/test.py | 45 +++- .../03376_iceberg_truncate.reference | 42 ++++ .../0_stateless/03376_iceberg_truncate.sql | 56 +++++ 16 files changed, 680 insertions(+), 244 deletions(-) create mode 100644 src/Functions/icebergTruncate.cpp create mode 100644 src/Functions/toMonthNumSinceEpoch.cpp create mode 100644 src/Functions/toYearNumSinceEpoch.cpp create mode 100644 tests/queries/0_stateless/03376_iceberg_truncate.reference create mode 100644 tests/queries/0_stateless/03376_iceberg_truncate.sql diff --git a/src/Common/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp index 4117099b7944..8b7388a36644 100644 --- a/src/Common/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -291,15 +291,3 @@ namespace cctz_extension ZoneInfoSourceFactory zone_info_source_factory = custom_factory; } - -DateLUTImpl::Values DateLUTImpl::lutIndexByMonthSinceEpochStartsZeroIndexing(Int32 months) const -{ - Int16 year = 1970 + months / 12; - UInt8 month = months % 12 + 1; - return lut[makeLUTIndex(year, month, 1)]; -} - -DateLUTImpl::Values DateLUTImpl::lutIndexByYearSinceEpochStartsZeroIndexing(Int16 years) const -{ - return lut[makeLUTIndex(years + 1970, 1, 1)]; -} diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 3fa2e12406b0..252d612b3f7a 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -659,6 +659,9 @@ class DateLUTImpl template Int16 toYear(DateOrTime v) const { return lut[toLUTIndex(v)].year; } + template + Int16 toYearSinceEpoch(DateOrTime v) const { return lut[toLUTIndex(v)].year - 1970; } + /// 1-based, starts on Monday template UInt8 toDayOfWeek(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_week; } @@ -952,6 +955,13 @@ class DateLUTImpl return lut[i].year * 12 + lut[i].month; } + template + Int32 toMonthNumSinceEpoch(DateOrTime v) const + { + const LUTIndex i = toLUTIndex(v); + return (lut[i].year - 1970) * 12 + lut[i].month - 1; + } + template Int32 toRelativeQuarterNum(DateOrTime v) const { @@ -1168,10 +1178,6 @@ class DateLUTImpl return LUTIndex{std::min(index, static_cast(DATE_LUT_SIZE - 1))}; } - Values lutIndexByMonthSinceEpochStartsZeroIndexing(Int32 months) const; - - Values lutIndexByYearSinceEpochStartsZeroIndexing(Int16 years) const; - /// Create DayNum from year, month, day of month. 
ExtendedDayNum makeDayNum(Int16 year, UInt8 month, UInt8 day_of_month, Int32 default_error_day_num = 0) const { diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 93745cfd7a15..055138221568 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1734,6 +1734,38 @@ struct ToRelativeYearNumImpl using FactorTransform = ZeroTransform; }; +template +struct ToYearNumSinceEpochImpl +{ + static constexpr auto name = "toYearNumSinceEpoch"; + + static auto execute(Int64 t, const DateLUTImpl & time_zone) + { + if constexpr (precision_ == ResultPrecision::Extended) + return time_zone.toYearSinceEpoch(t); + else + return static_cast(time_zone.toYearSinceEpoch(t)); + } + static UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toYearSinceEpoch(static_cast(t)); + } + static auto execute(Int32 d, const DateLUTImpl & time_zone) + { + if constexpr (precision_ == ResultPrecision::Extended) + return time_zone.toYearSinceEpoch(ExtendedDayNum(d)); + else + return static_cast(time_zone.toYearSinceEpoch(ExtendedDayNum(d))); + } + static UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) + { + return time_zone.toYearSinceEpoch(DayNum(d)); + } + static constexpr bool hasPreimage() { return false; } + + using FactorTransform = ZeroTransform; +}; + template struct ToRelativeQuarterNumImpl { @@ -1798,6 +1830,38 @@ struct ToRelativeMonthNumImpl using FactorTransform = ZeroTransform; }; +template +struct ToMonthNumSinceEpochImpl +{ + static constexpr auto name = "toMonthNumSinceEpoch"; + + static auto execute(Int64 t, const DateLUTImpl & time_zone) + { + if constexpr (precision_ == ResultPrecision::Extended) + return time_zone.toMonthNumSinceEpoch(t); + else + return static_cast(time_zone.toMonthNumSinceEpoch(t)); + } + static UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toMonthNumSinceEpoch(static_cast(t)); + } + static auto execute(Int32 d, const DateLUTImpl & time_zone) + { + if constexpr (precision_ == ResultPrecision::Extended) + return time_zone.toMonthNumSinceEpoch(ExtendedDayNum(d)); + else + return static_cast(time_zone.toMonthNumSinceEpoch(ExtendedDayNum(d))); + } + static UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) + { + return time_zone.toMonthNumSinceEpoch(DayNum(d)); + } + static constexpr bool hasPreimage() { return false; } + + using FactorTransform = ZeroTransform; +}; + template struct ToRelativeWeekNumImpl { diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index a74df8f4dd94..3c45a061af0b 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -60,7 +60,7 @@ struct IsOperation static constexpr bool bit_hamming_distance = IsSameOperation::value; - static constexpr bool division = div_floating || int_div || int_div_or_zero || modulo; + static constexpr bool division = div_floating || int_div || int_div_or_zero || modulo || positive_modulo; // NOTE: allow_decimal should not fully contain `division` because of divInt static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; }; diff --git a/src/Functions/icebergTruncate.cpp b/src/Functions/icebergTruncate.cpp new file mode 100644 index 000000000000..a8d4d6bdf2b2 --- /dev/null +++ b/src/Functions/icebergTruncate.cpp @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int 
NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +/// This function specification https://iceberg.apache.org/spec/#truncate-transform-details +class FunctionIcebergTruncate : public IFunction +{ + +public: + static inline const char * name = "icebergTruncate"; + + explicit FunctionIcebergTruncate(ContextPtr) + { + } + + static FunctionPtr create(ContextPtr context_) + { + return std::make_shared(context_); + } + + String getName() const override + { + return name; + } + + bool isVariadic() const override + { + return false; + } + + size_t getNumberOfArguments() const override + { + return 2; + } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + /// You may ask, why use global context and not the context provided + /// in create/Constructor? Two reasons: + /// 1. We need context only to access global functions factory, that is why global context is the most suitable + /// 2. It's terribly unsafe to store ContextPtr inside function because function object is so low-level + /// that it can be stored in multiple other objects which itself stored in global context. + /// Very common example ContextPtr->Storage->KeyDescription->Expressions->Function->ContextPtr oops + /// here we have a loop and memory leak. + auto context = Context::getGlobalContextInstance(); + + if (arguments.size() != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Incorrect number of arguments: expected 2 arguments"); + + const auto & truncate_number = arguments[0]; + if (!WhichDataType(truncate_number).isNativeUInt()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument should be UInt data type"); + + const auto & truncate_type = arguments[1]; + WhichDataType which_truncate(truncate_type); + if (!which_truncate.isDecimal64() && !which_truncate.isDecimal32() && !which_truncate.isStringOrFixedString() && !which_truncate.isNativeInteger()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument must be of native integer type, String/FixedString, Decimal"); + + if (which_truncate.isStringOrFixedString()) + { + return std::make_shared(); + } + else + { + auto get_column_const = [] (const DataTypePtr data_type) + { + return ColumnWithTypeAndName(data_type->createColumnConst(1, data_type->getDefault()), data_type, ""); + }; + + ColumnsWithTypeAndName modulo_arguments; + if (which_truncate.isNativeInteger()) + { + modulo_arguments = {get_column_const(arguments[1]), get_column_const(arguments[0])}; + } + else + { + auto decimal_scaled = arguments[1]->createColumnConst(1, arguments[1]->getDefault()); + ColumnWithTypeAndName decimal_scaled_with_type(decimal_scaled, arguments[1], ""); + modulo_arguments = {get_column_const(arguments[1]), decimal_scaled_with_type}; + } + + auto modulo_func = FunctionFactory::instance().get("positiveModulo", context)->build(modulo_arguments); + auto modulo_result_type = modulo_func->getResultType(); + auto minus_arguments = {get_column_const(arguments[1]), get_column_const(modulo_result_type)}; + auto minus_func = FunctionFactory::instance().get("minus", context)->build(minus_arguments); + auto minus_result_type = minus_func->getResultType(); + + return minus_result_type; + } + + } + + bool hasInformationAboutMonotonicity() const override { return true; } + Monotonicity getMonotonicityForRange(const IDataType &, const Field &, const Field &) const override { 
return { .is_monotonic = true, .is_always_monotonic = true }; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /* result_type */, size_t input_rows_count) const override + { + auto value = (*arguments[0].column)[0].safeGet(); + if (value <= 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function icebergTruncate accepts only positive width"); + + auto context = Context::getGlobalContextInstance(); + WhichDataType which_truncate(arguments[1].type); + if (which_truncate.isStringOrFixedString()) + { + auto string_arguments = {arguments[1], arguments[0]}; + if (which_truncate.isFixedString()) + { + auto substr_func = FunctionFactory::instance().get("left", context)->build(string_arguments); + return substr_func->execute(string_arguments, std::make_shared(), input_rows_count, false); + } + else + { + auto substr_func = FunctionFactory::instance().get("leftUTF8", context)->build(string_arguments); + return substr_func->execute(string_arguments, std::make_shared(), input_rows_count, false); + } + } + else if (which_truncate.isNativeInteger() || which_truncate.isDecimal()) + { + ColumnsWithTypeAndName modulo_arguments; + if (which_truncate.isNativeInteger()) + { + modulo_arguments = {arguments[1], arguments[0]}; + } + else + { + ColumnPtr decimal_scaled; + if (const auto * decimal_type = checkDecimal(*arguments[1].type)) + decimal_scaled = arguments[1].type->createColumnConst(input_rows_count, DecimalField(value, decimal_type->getScale())); + if (const auto * decimal_type = checkDecimal(*arguments[1].type)) + decimal_scaled = arguments[1].type->createColumnConst(input_rows_count, DecimalField(value, decimal_type->getScale())); + + if (!decimal_scaled) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected decimal data type"); + + ColumnWithTypeAndName decimal_scaled_with_type(decimal_scaled, arguments[1].type, ""); + modulo_arguments = {arguments[1], decimal_scaled_with_type}; + } + + auto modulo_func = FunctionFactory::instance().get("positiveModulo", context)->build(modulo_arguments); + auto modulo_result_type = modulo_func->getResultType(); + auto modulo_result = modulo_func->execute(modulo_arguments, modulo_result_type, input_rows_count, false); + ColumnWithTypeAndName modulo_result_with_type(modulo_result, modulo_result_type, ""); + auto minus_arguments = {arguments[1], modulo_result_with_type}; + auto minus_func = FunctionFactory::instance().get("minus", context)->build(minus_arguments); + auto minus_result_type = minus_func->getResultType(); + return minus_func->execute(minus_arguments, minus_result_type, input_rows_count, false); + } + + std::unreachable(); + } + + bool useDefaultImplementationForConstants() const override + { + return true; + } + + DataTypePtr getReturnTypeForDefaultImplementationForDynamic() const override + { + return std::make_shared(); + } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } +}; + +REGISTER_FUNCTION(IcebergTruncate) +{ + FunctionDocumentation::Description description = R"(Implements logic of iceberg truncate transform: https://iceberg.apache.org/spec/#truncate-transform-details.)"; + FunctionDocumentation::Syntax syntax = "icebergTruncate(N, value)"; + FunctionDocumentation::Arguments arguments = {{"value", "String, integer or Decimal value."}}; + FunctionDocumentation::ReturnedValue returned_value = "The same type as argument"; + FunctionDocumentation::Examples examples = {{"Example", "SELECT icebergTruncate(3, 'iceberg')", 
"ice"}}; + FunctionDocumentation::Category category = {"Other"}; + + factory.registerFunction({description, syntax, arguments, returned_value, examples, category}); +} + +} + +} diff --git a/src/Functions/toMonthNumSinceEpoch.cpp b/src/Functions/toMonthNumSinceEpoch.cpp new file mode 100644 index 000000000000..906fb0c0c9ac --- /dev/null +++ b/src/Functions/toMonthNumSinceEpoch.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + + +namespace DB +{ + +using FunctionToMonthNumSinceEpoch = FunctionDateOrDateTimeToSomething>; + +REGISTER_FUNCTION(ToMonthNumSinceEpoch) +{ + FunctionDocumentation::Description description = R"(Returns amount of months passed from year 1970)"; + FunctionDocumentation::Syntax syntax = "toMonthNumSinceEpoch(date)"; + FunctionDocumentation::Arguments arguments = {{"date", "Date, DateTime or DateTime64"}}; + FunctionDocumentation::ReturnedValue returned_value = "Positive integer"; + FunctionDocumentation::Examples examples = {{"Example", "SELECT toMonthNumSinceEpoch(toDate('2024-10-01'))", "657"}}; + FunctionDocumentation::Category category = {"DateTime"}; + + factory.registerFunction({description, syntax, arguments, returned_value, examples, category}); +} + +} diff --git a/src/Functions/toYearNumSinceEpoch.cpp b/src/Functions/toYearNumSinceEpoch.cpp new file mode 100644 index 000000000000..ec764d239455 --- /dev/null +++ b/src/Functions/toYearNumSinceEpoch.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + + +namespace DB +{ + +using FunctionToYearNumSinceEpoch = FunctionDateOrDateTimeToSomething>; + +REGISTER_FUNCTION(ToYearNumSinceEpoch) +{ + FunctionDocumentation::Description description = R"(Returns amount of years passed from year 1970)"; + FunctionDocumentation::Syntax syntax = "toYearNumSinceEpoch(date)"; + FunctionDocumentation::Arguments arguments = {{"date", "Date, DateTime or DateTime64"}}; + FunctionDocumentation::ReturnedValue returned_value = "Positive integer"; + FunctionDocumentation::Examples examples = {{"Example", "SELECT toYearNumSinceEpoch(toDate('2024-10-01'))", "54"}}; + FunctionDocumentation::Category category = {"DateTime"}; + + factory.registerFunction({description, syntax, arguments, returned_value, examples, category}); +} + +} diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 3e3c532a6aee..d469da0d3c66 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -23,6 +23,7 @@ #include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" #include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include #include @@ -431,7 +432,9 @@ ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filen schema_id, schema_processor, inherited_sequence_number, - table_location); + table_location, + getContext()); + return ManifestFileContent(std::move(manifest_file_impl)); } @@ -467,24 +470,6 @@ IcebergSnapshot IcebergMetadata::getSnapshot(const String & filename) const return IcebergSnapshot{getManifestList(filename)}; } -std::vector -getRelevantPartitionColumnIds(const ManifestFileIterator & entry, const IcebergSchemaProcessor & schema_processor, Int32 current_schema_id) -{ - std::vector partition_column_ids; - partition_column_ids.reserve(entry->getPartitionColumnInfos().size()); - for (const auto & partition_column_info : entry->getPartitionColumnInfos()) - { - std::optional name_and_type - = 
schema_processor.tryGetFieldCharacteristics(current_schema_id, partition_column_info.source_id); - if (name_and_type) - { - partition_column_ids.push_back(partition_column_info.source_id); - } - } - return partition_column_ids; -} - - Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const { if (!current_snapshot) @@ -496,39 +481,30 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const Strings data_files; for (const auto & manifest_list_entry : *(current_snapshot->manifest_list_iterator)) { - const auto & partition_columns_ids - = getRelevantPartitionColumnIds(manifest_list_entry.manifest_file, schema_processor, current_schema_id); - const auto & partition_pruning_columns_names_and_types - = schema_processor.tryGetFieldsCharacteristics(current_schema_id, partition_columns_ids); - - ExpressionActionsPtr partition_minmax_idx_expr = std::make_shared( - ActionsDAG(partition_pruning_columns_names_and_types), ExpressionActionsSettings(getContext())); - const KeyCondition partition_key_condition( - filter_dag, getContext(), partition_pruning_columns_names_and_types.getNames(), partition_minmax_idx_expr); - + PartitionPruner pruner(schema_processor, current_schema_id, filter_dag, *manifest_list_entry.manifest_file, getContext()); const auto & data_files_in_manifest = manifest_list_entry.manifest_file->getFiles(); for (const auto & manifest_file_entry : data_files_in_manifest) { if (manifest_file_entry.status != ManifestEntryStatus::DELETED) { - if (partition_key_condition - .checkInHyperrectangle( - manifest_file_entry.getPartitionRanges(partition_columns_ids), - partition_pruning_columns_names_and_types.getTypes()) - .can_be_true) + if (pruner.canBePruned(manifest_file_entry)) + { + ProfileEvents::increment(ProfileEvents::IcebergPartitionPrunnedFiles); + } + else { if (std::holds_alternative(manifest_file_entry.file)) data_files.push_back(std::get(manifest_file_entry.file).file_name); } - else - ProfileEvents::increment(ProfileEvents::IcebergPartitionPrunnedFiles); } } } - if (!filter_dag) - return (cached_unprunned_files_for_current_snapshot = data_files).value(); + { + cached_unprunned_files_for_current_snapshot = data_files; + return cached_unprunned_files_for_current_snapshot.value(); + } return data_files; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 7d0df0ae5ff9..46953627c92c 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -10,12 +10,17 @@ #include #include #include "DataTypes/DataTypeTuple.h" +#include +#include +#include +#include namespace DB::ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int UNSUPPORTED_METHOD; extern const int ICEBERG_SPECIFICATION_VIOLATION; +extern const int LOGICAL_ERROR; } namespace Iceberg @@ -29,6 +34,7 @@ constexpr const char * SUBCOLUMN_FILE_PATH_NAME = "file_path"; constexpr const char * SUBCOLUMN_CONTENT_NAME = "content"; constexpr const char * SUBCOLUMN_PARTITION_NAME = "partition"; + const std::vector & ManifestFileContent::getFiles() const { return impl->files; @@ -39,24 +45,6 @@ Int32 ManifestFileContent::getSchemaId() const return impl->schema_id; } -std::vector ManifestFileEntry::getPartitionRanges(const std::vector & partition_columns_ids) const -{ - std::vector filtered_partition_ranges; - filtered_partition_ranges.reserve(partition_columns_ids.size()); - for (const auto & partition_column_id 
: partition_columns_ids) - { - filtered_partition_ranges.push_back(partition_ranges.at(partition_column_id)); - } - return filtered_partition_ranges; -} - - -const std::vector & ManifestFileContent::getPartitionColumnInfos() const -{ - chassert(impl != nullptr); - return impl->partition_column_infos; -} - ManifestFileContent::ManifestFileContent(std::unique_ptr impl_) : impl(std::move(impl_)) { @@ -73,7 +61,8 @@ ManifestFileContentImpl::ManifestFileContentImpl( Int32 schema_id_, const IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, - const String & table_location) + const String & table_location, + DB::ContextPtr context) { this->schema_id = schema_id_; @@ -167,27 +156,40 @@ ManifestFileContentImpl::ManifestFileContentImpl( std::vector partition_spec_json_bytes = avro_metadata["partition-spec"]; String partition_spec_json_string = String(reinterpret_cast(partition_spec_json_bytes.data()), partition_spec_json_bytes.size()); + Poco::Dynamic::Var partition_spec_json = parser.parse(partition_spec_json_string); const Poco::JSON::Array::Ptr & partition_specification = partition_spec_json.extract(); std::vector partition_columns; + DB::NamesAndTypesList partition_columns_description; + std::shared_ptr partition_key_ast = std::make_shared(); + partition_key_ast->name = "tuple"; + partition_key_ast->arguments = std::make_shared(); + partition_key_ast->children.push_back(partition_key_ast->arguments); for (size_t i = 0; i != partition_specification->size(); ++i) { - auto current_field = partition_specification->getObject(static_cast(i)); - - auto source_id = current_field->getValue("source-id"); - PartitionTransform transform = getTransform(current_field->getValue("transform")); - - if (transform == PartitionTransform::Unsupported || transform == PartitionTransform::Void) - { + auto partition_specification_field = partition_specification->getObject(static_cast(i)); + + auto source_id = partition_specification_field->getValue("source-id"); + /// NOTE: tricky part to support RENAME column in partition key. Instead of some name + /// we use column internal number as it's name. 
+ auto numeric_column_name = DB::backQuote(DB::toString(source_id)); + DB::NameAndTypePair manifest_file_column_characteristics = schema_processor.getFieldCharacteristics(schema_id, source_id); + auto partition_ast = getASTFromTransform(partition_specification_field->getValue("transform"), numeric_column_name); + /// Unsupported partition key expression + if (partition_ast == nullptr) continue; - } - partition_column_infos.emplace_back(transform, source_id); + partition_key_ast->arguments->children.emplace_back(std::move(partition_ast)); + partition_columns_description.emplace_back(numeric_column_name, removeNullable(manifest_file_column_characteristics.type)); partition_columns.push_back(removeNullable(big_partition_tuple->getColumnPtr(i))); + this->partition_column_ids.push_back(source_id); } + if (!partition_column_ids.empty()) + this->partition_key_description.emplace(DB::KeyDescription::getKeyFromAST(std::move(partition_key_ast), ColumnsDescription(partition_columns_description), context)); + std::optional sequence_number_column = std::nullopt; if (format_version_ > 1) { @@ -224,17 +226,12 @@ ManifestFileContentImpl::ManifestFileContentImpl( const auto file_path = getProperFilePathFromMetadataInfo(file_path_string_column->getDataAt(i).toView(), common_path, table_location); - std::unordered_map partition_ranges; - for (size_t j = 0; j < partition_columns.size(); ++j) + DB::Row partition_key_value; + for (const auto & partition_column : partition_columns) { - const Int32 source_id = partition_column_infos[j].source_id; - partition_ranges.emplace( - source_id, - getPartitionRange( - partition_column_infos[j].transform, - i, - partition_columns[j], - schema_processor.getFieldCharacteristics(schema_id, source_id).type)); + Field partition_value; + partition_column->get(i, partition_value); + partition_key_value.emplace_back(partition_value); } FileEntry file = FileEntry{DataFileEntry{file_path}}; @@ -259,10 +256,27 @@ ManifestFileContentImpl::ManifestFileContentImpl( break; } } - this->files.emplace_back(status, added_sequence_number, partition_ranges, file); + this->files.emplace_back(status, added_sequence_number, file, partition_key_value); } } +bool ManifestFileContent::hasPartitionKey() const +{ + return !impl->partition_column_ids.empty(); +} + +const DB::KeyDescription & ManifestFileContent::getPartitionKeyDescription() const +{ + if (!hasPartitionKey()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Table has no partition key, but it was requested"); + return *(impl->partition_key_description); +} + +const std::vector & ManifestFileContent::getPartitionKeyColumnIDs() const +{ + return impl->partition_column_ids; +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index f3f8d227caa0..14b3dd82294a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -5,8 +5,9 @@ #include #if USE_AVRO -#include #include +#include +#include namespace Iceberg { @@ -34,24 +35,16 @@ struct DataFileEntry using FileEntry = std::variant; // In the future we will add PositionalDeleteFileEntry and EqualityDeleteFileEntry here +/// Description of Data file in manifest file struct ManifestFileEntry { ManifestEntryStatus status; Int64 added_sequence_number; - std::unordered_map partition_ranges; FileEntry file; - - std::vector getPartitionRanges(const std::vector & partition_columns_ids) const; -}; - -struct PartitionColumnInfo -{ 
- PartitionTransform transform; - Int32 source_id; + DB::Row partition_key_value; }; - class ManifestFileContent { public: @@ -59,10 +52,10 @@ class ManifestFileContent const std::vector & getFiles() const; Int32 getSchemaId() const; - const std::vector & getPartitionColumnInfos() const; - Int32 getPartitionSpecId() const; - + bool hasPartitionKey() const; + const DB::KeyDescription & getPartitionKeyDescription() const; + const std::vector & getPartitionKeyColumnIDs() const; private: std::unique_ptr impl; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index 7d740e0f9429..f96ea285483e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "SchemaProcessor.h" @@ -49,14 +50,13 @@ struct ManifestFileContentImpl Int32 schema_id_, const DB::IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, - const std::string & table_location); + const std::string & table_location, + DB::ContextPtr context); Int32 schema_id; - - // Size - number of supported partition columns - std::vector partition_column_infos; - + std::optional partition_key_description; + std::vector partition_column_ids; // Size - number of files std::vector files; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp index 308cc7792564..9e24494e071e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp @@ -1,130 +1,144 @@ +#include "config.h" + +#if USE_AVRO + #include #include #include #include +#include +#include +#include +#include +#include +#include +#include #include - -namespace DB::ErrorCodes -{ -extern const int BAD_ARGUMENTS; -} +#include using namespace DB; namespace Iceberg { -Iceberg::PartitionTransform getTransform(const String & transform_name) +DB::ASTPtr getASTFromTransform(const String & transform_name_src, const String & column_name) { + std::string transform_name = Poco::toLower(transform_name_src); + if (transform_name == "year" || transform_name == "years") + return makeASTFunction("toYearNumSinceEpoch", std::make_shared(column_name)); + + if (transform_name == "month" || transform_name == "months") + return makeASTFunction("toMonthNumSinceEpoch", std::make_shared(column_name)); + + if (transform_name == "day" || transform_name == "date" || transform_name == "days" || transform_name == "dates") + return makeASTFunction("toRelativeDayNum", std::make_shared(column_name)); + + if (transform_name == "hour" || transform_name == "hours") + return makeASTFunction("toRelativeHourNum", std::make_shared(column_name)); + + if (transform_name == "identity") + return std::make_shared(column_name); + + if (transform_name == "void") + return makeASTFunction("tuple"); + + if (transform_name.starts_with("truncate")) { - return Iceberg::PartitionTransform::Year; - } - else if (transform_name == "month" || transform_name == "months") - { - return Iceberg::PartitionTransform::Month; - } - else if (transform_name == "day" || transform_name == "date" || transform_name == "days" || transform_name == "dates") - { - return Iceberg::PartitionTransform::Day; - } - else if (transform_name == "hour" || transform_name == "hours") - { - return Iceberg::PartitionTransform::Hour; - } - 
else if (transform_name == "identity") - { - return Iceberg::PartitionTransform::Identity; - } - else if (transform_name == "void") - { - return Iceberg::PartitionTransform::Void; + /// should look like transform[N] + + if (transform_name.back() != ']') + return nullptr; + + auto argument_start = transform_name.find('['); + + if (argument_start == std::string::npos) + return nullptr; + + auto argument_width = transform_name.length() - 2 - argument_start; + std::string width = transform_name.substr(argument_start + 1, argument_width); + size_t truncate_width; + bool parsed = DB::tryParse(truncate_width, width); + + if (!parsed) + return nullptr; + + return makeASTFunction("icebergTruncate", std::make_shared(truncate_width), std::make_shared(column_name)); } else { - return Iceberg::PartitionTransform::Unsupported; + return nullptr; } } -// This function is used to convert the value to the start of the corresponding time period (internal ClickHouse representation) -DateLUTImpl::Values getDateLUTImplValues(Int32 value, Iceberg::PartitionTransform transform) +std::unique_ptr PartitionPruner::transformFilterDagForManifest(const DB::ActionsDAG * source_dag, Int32 manifest_schema_id, const std::vector & partition_column_ids) const { - switch (transform) + if (source_dag == nullptr) + return nullptr; + + ActionsDAG dag_with_renames; + for (const auto column_id : partition_column_ids) { - case Iceberg::PartitionTransform::Year: - return DateLUT::instance().lutIndexByYearSinceEpochStartsZeroIndexing(value); - case Iceberg::PartitionTransform::Month: - return DateLUT::instance().lutIndexByMonthSinceEpochStartsZeroIndexing(static_cast(value)); - case Iceberg::PartitionTransform::Day: - return DateLUT::instance().getValues(static_cast(value)); - case Iceberg::PartitionTransform::Hour: - { - DateLUTImpl::Values values = DateLUT::instance().getValues(static_cast(value / 24)); - values.date += (value % 24) * 3600; - return values; - } - default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported partition transform for get values function: {}", transform); + auto column = schema_processor.tryGetFieldCharacteristics(current_schema_id, column_id); + + /// Columns which we dropped and doesn't exist in current schema + /// cannot be queried in WHERE expression. 
+ if (!column.has_value()) + continue; + + /// We take data type from manifest schema, not latest type + auto column_type = schema_processor.getFieldCharacteristics(manifest_schema_id, column_id).type; + auto numeric_column_name = DB::backQuote(DB::toString(column_id)); + const auto * node = &dag_with_renames.addInput(numeric_column_name, column_type); + node = &dag_with_renames.addAlias(*node, column->name); + dag_with_renames.getOutputs().push_back(node); } -} + auto result = std::make_unique(DB::ActionsDAG::merge(std::move(dag_with_renames), source_dag->clone())); + result->removeUnusedActions(); + return result; -// This function is used to convert the value to the start of the corresponding time period (in seconds) -Int64 getTime(Int32 value, Iceberg::PartitionTransform transform) -{ - DateLUTImpl::Values values = getDateLUTImplValues(value, transform); - return values.date; } -// This function is used to convert the value to the start of the corresponding date period (in days) -Int16 getDay(Int32 value, Iceberg::PartitionTransform transform) +PartitionPruner::PartitionPruner( + const DB::IcebergSchemaProcessor & schema_processor_, + Int32 current_schema_id_, + const DB::ActionsDAG * filter_dag, + const ManifestFileContent & manifest_file, + DB::ContextPtr context) + : schema_processor(schema_processor_) + , current_schema_id(current_schema_id_) { - DateLUTImpl::Time got_time = getTime(value, transform); - return DateLUT::instance().toDayNum(got_time); + if (manifest_file.hasPartitionKey()) + { + partition_key = &manifest_file.getPartitionKeyDescription(); + auto transformed_dag = transformFilterDagForManifest(filter_dag, manifest_file.getSchemaId(), manifest_file.getPartitionKeyColumnIDs()); + if (transformed_dag != nullptr) + key_condition.emplace(transformed_dag.get(), context, partition_key->column_names, partition_key->expression, true /* single_point */); + } } - -Range getPartitionRange( - Iceberg::PartitionTransform partition_transform, size_t index, ColumnPtr partition_column, DataTypePtr column_data_type) +bool PartitionPruner::canBePruned(const ManifestFileEntry & entry) const { - if (partition_transform == Iceberg::PartitionTransform::Unsupported) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported partition transform: {}", partition_transform); - } - if (partition_transform == Iceberg::PartitionTransform::Identity) - { - Field entry = (*partition_column.get())[index]; - return Range{entry, true, entry, true}; - } - if (partition_column->getDataType() != TypeIndex::Int32) + if (!key_condition.has_value()) + return false; + + const auto & partition_value = entry.partition_key_value; + std::vector index_value(partition_value.begin(), partition_value.end()); + for (auto & field : index_value) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported partition column type: {}", partition_column->getFamilyName()); + // NULL_LAST + if (field.isNull()) + field = POSITIVE_INFINITY; } - auto nested_data_type = removeNullable(column_data_type); + bool can_be_true = key_condition->mayBeTrueInRange( + partition_value.size(), index_value.data(), index_value.data(), partition_key->data_types); - const auto * casted_innner_column = assert_cast(partition_column.get()); - Int32 value = casted_innner_column->getElement(index); - - if ((WhichDataType(nested_data_type).isDate() || WhichDataType(nested_data_type).isDate32()) - && (partition_transform != Iceberg::PartitionTransform::Hour)) - { - const UInt16 begin_range_value = getDay(value, partition_transform); - const UInt16 
end_range_value = getDay(value + 1, partition_transform); - return Range{begin_range_value, true, end_range_value, false}; - } - else if (WhichDataType(nested_data_type).isDateTime64() || WhichDataType(nested_data_type).isDateTime()) - { - const UInt64 begin_range_value = getTime(value, partition_transform); - const UInt64 end_range_value = getTime(value + 1, partition_transform); - return Range{begin_range_value, true, end_range_value, false}; - } - else - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "Partition transform {} is not supported for the type: {}", partition_transform, nested_data_type); - } + return !can_be_true; } } + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h index d5252e83a15d..8b9fd80c612f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h @@ -1,44 +1,49 @@ #pragma once +#include "config.h" + +#if USE_AVRO -#include #include -#include +#include +#include +#include +#include +#include + namespace Iceberg { -enum class PartitionTransform -{ - Year, - Month, - Day, - Hour, - Identity, - Void, - Unsupported -}; +struct ManifestFileEntry; +class ManifestFileContent; + +DB::ASTPtr getASTFromTransform(const String & transform_name_src, const String & column_name); -struct SpecificSchemaPartitionInfo +/// Prune specific data files based on manifest content +class PartitionPruner { - std::vector> ranges; - DB::NamesAndTypesList partition_names_and_types; +private: + const DB::IcebergSchemaProcessor & schema_processor; + Int32 current_schema_id; + const DB::KeyDescription * partition_key; + + std::optional key_condition; + /// NOTE: tricky part to support RENAME column in partition key. 
+ /// Takes ActionDAG representation of user's WHERE expression and + /// rename columns to the their origina numeric id's in iceberg + std::unique_ptr transformFilterDagForManifest(const DB::ActionsDAG * source_dag, Int32 manifest_schema_id, const std::vector & partition_column_ids) const; +public: + PartitionPruner( + const DB::IcebergSchemaProcessor & schema_processor_, + Int32 current_schema_id_, + const DB::ActionsDAG * filter_dag, + const ManifestFileContent & manifest_file, + DB::ContextPtr context); + + bool canBePruned(const ManifestFileEntry & entry) const; + }; -Iceberg::PartitionTransform getTransform(const String & transform_name); - -// This function is used to convert the entry `partition_column[index]` to the range which type coincides with the -// `column_data_type` and represents the time period corresponding to the entry's value under the `partition_transform` -// Examples: -// partition_transform = Iceberg::PartitionTransform::Year -// index = 3 -// partition_column = [1970, 1971, 1972, 1973, 1974] -// column_data_type = Date -// Range: {1096, true, 1461, false} // 1096 is the start of the year 1973 (in days), 1461 is the start of the year 1974 (in days) (exclusive end of the year 1973) -// partition_transform = Iceberg::PartitionTransform::Month -// index = 3 -// partition_column = [1970-01, 1970-02, 1970-03, 1970-04, 1970-05] -// column_data_type = DateTime64 -// Range: {7776000, true, 10368000, false} // 7776000 is the start of the year 1970-04 (in seconds), 10368000 is the start of the month 1970-05 (in seconds) (exclusive end of the month 1970-04) -DB::Range getPartitionRange( - Iceberg::PartitionTransform partition_transform, size_t index, DB::ColumnPtr partition_column, DB::DataTypePtr column_data_type); } + +#endif diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index c755f05dfdcf..64a81febaa0c 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -2078,11 +2078,12 @@ def execute_spark_query(query: str): date2 DATE, ts TIMESTAMP, ts2 TIMESTAMP, - time_struct struct - + time_struct struct, + name VARCHAR(50), + number BIGINT ) USING iceberg - PARTITIONED BY (identity(tag), days(date), years(date2), hours(ts), months(ts2)) + PARTITIONED BY (identity(tag), days(date), years(date2), hours(ts), months(ts2), TRUNCATE(3, name), TRUNCATE(3, number)) OPTIONS('format-version'='2') """ ) @@ -2091,13 +2092,13 @@ def execute_spark_query(query: str): f""" INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', DATE '2024-01-20', - TIMESTAMP '2024-02-20 10:00:00', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-01-20', 'b', TIMESTAMP '2024-02-20 10:00:00')), + TIMESTAMP '2024-02-20 10:00:00', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-01-20', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'vasya', 5), (2, DATE '2024-01-30', DATE '2024-01-30', - TIMESTAMP '2024-03-20 15:00:00', TIMESTAMP '2024-03-20 15:00:00', named_struct('a', DATE '2024-03-20', 'b', TIMESTAMP '2024-03-20 14:00:00')), + TIMESTAMP '2024-03-20 15:00:00', TIMESTAMP '2024-03-20 15:00:00', named_struct('a', DATE '2024-03-20', 'b', TIMESTAMP '2024-03-20 14:00:00'), 'vasilisa', 6), (1, DATE '2024-02-20', DATE '2024-02-20', - TIMESTAMP '2024-03-20 20:00:00', TIMESTAMP '2024-03-20 20:00:00', named_struct('a', DATE '2024-02-20', 'b', TIMESTAMP '2024-02-20 10:00:00')), + TIMESTAMP '2024-03-20 20:00:00', TIMESTAMP '2024-03-20 20:00:00', named_struct('a', DATE '2024-02-20', 'b', 
TIMESTAMP '2024-02-20 10:00:00'), 'iceberg', 7), (2, DATE '2025-01-20', DATE '2025-01-20', - TIMESTAMP '2024-04-30 14:00:00', TIMESTAMP '2024-04-30 14:00:00', named_struct('a', DATE '2024-04-30', 'b', TIMESTAMP '2024-04-30 14:00:00')); + TIMESTAMP '2024-04-30 14:00:00', TIMESTAMP '2024-04-30 14:00:00', named_struct('a', DATE '2024-04-30', 'b', TIMESTAMP '2024-04-30 14:00:00'), 'icebreaker', 8); """ ) @@ -2206,6 +2207,34 @@ def check_validity_and_get_prunned_files(select_expression): == 2 ) + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE name == 'vasilisa' ORDER BY ALL" + ) + == 2 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE name < 'kek' ORDER BY ALL" + ) + == 2 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE number == 8 ORDER BY ALL" + ) + == 1 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE number <= 5 ORDER BY ALL" + ) + == 3 + ) + execute_spark_query(f"ALTER TABLE {TABLE_NAME} RENAME COLUMN date TO date3") assert ( @@ -2234,7 +2263,7 @@ def check_validity_and_get_prunned_files(select_expression): ) execute_spark_query( - f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'))" + f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'kek', 10)" ) assert ( diff --git a/tests/queries/0_stateless/03376_iceberg_truncate.reference b/tests/queries/0_stateless/03376_iceberg_truncate.reference new file mode 100644 index 000000000000..4d71057f4325 --- /dev/null +++ b/tests/queries/0_stateless/03376_iceberg_truncate.reference @@ -0,0 +1,42 @@ +0 +0 +0 +0 +10 +10 +-10 +-10 +-10 +-20 +0 +0 +0 +0 +10 +10 +-10 +-10 +-10 +-20 +12.3 +12.3 +12.2 +0 +-0.1 +12.3 +12.3 +12.2 +0 +-0.1 +abcde +abc +abcde +abcde +abc +abcde +0 +-10 +0 +-10 +10.5 +ice diff --git a/tests/queries/0_stateless/03376_iceberg_truncate.sql b/tests/queries/0_stateless/03376_iceberg_truncate.sql new file mode 100644 index 000000000000..894ddb5ff988 --- /dev/null +++ b/tests/queries/0_stateless/03376_iceberg_truncate.sql @@ -0,0 +1,56 @@ +-- Test taken from Iceberg repo https://github.com/apache/iceberg/blob/6e8718113c08aebf76d8e79a9e2534c89c73407a/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java + +SELECT icebergTruncate(10, 0); +SELECT icebergTruncate(10, 1); +SELECT icebergTruncate(10, 5); +SELECT icebergTruncate(10, 9); +SELECT icebergTruncate(10, 10); +SELECT icebergTruncate(10, 11); +SELECT icebergTruncate(10, -1); +SELECT icebergTruncate(10, -5); +SELECT icebergTruncate(10, -10); +SELECT icebergTruncate(10, -11); + +SELECT icebergTruncate(10, 0::Int64); +SELECT icebergTruncate(10, 1::Int64); +SELECT icebergTruncate(10, 5::Int64); +SELECT icebergTruncate(10, 9::Int64); +SELECT icebergTruncate(10, 10::Int64); +SELECT icebergTruncate(10, 11::Int64); +SELECT icebergTruncate(10, -1::Int64); +SELECT icebergTruncate(10, -5::Int64); +SELECT icebergTruncate(10, -10::Int64); +SELECT icebergTruncate(10, -11::Int64); + +SELECT icebergTruncate(10, toDecimal64('12.34', 2)); +SELECT icebergTruncate(10, toDecimal64('12.30', 2)); +SELECT icebergTruncate(10, toDecimal64('12.29', 2)); +SELECT 
icebergTruncate(10, toDecimal64('0.05', 2)); +SELECT icebergTruncate(10, toDecimal64('-0.05', 2)); + +SELECT icebergTruncate(10, toDecimal32('12.34', 2)); +SELECT icebergTruncate(10, toDecimal32('12.30', 2)); +SELECT icebergTruncate(10, toDecimal32('12.29', 2)); +SELECT icebergTruncate(10, toDecimal32('0.05', 2)); +SELECT icebergTruncate(10, toDecimal32('-0.05', 2)); + +SELECT icebergTruncate(5, 'abcdefg'); +SELECT icebergTruncate(5, 'abc'); +SELECT icebergTruncate(5, 'abcde'); + +SELECT icebergTruncate(5, toFixedString('abcdefg', 30)); +SELECT icebergTruncate(5, toFixedString('abc', 3)); +SELECT icebergTruncate(5, toFixedString('abcde', 5)); + +SELECT icebergTruncate(0, 55); --{serverError BAD_ARGUMENTS} +SELECT icebergTruncate(-1, 55); --{serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT icebergTruncate(3, 0.0); --{serverError ILLEGAL_TYPE_OF_ARGUMENT} + +-- Test taken from examples: https://iceberg.apache.org/spec/#truncate-transform-details + +SELECT icebergTruncate(10, 1); +SELECT icebergTruncate(10, -1); +SELECT icebergTruncate(10, 1::Int64); +SELECT icebergTruncate(10, -1::Int64); +SELECT icebergTruncate(50, toDecimal64('10.65', 2)); +SELECT icebergTruncate(3, 'iceberg'); From 8c558080e88c9d370a5d76e099d50df893dad78e Mon Sep 17 00:00:00 2001 From: Daniil Ivanik <61067749+divanik@users.noreply.github.com> Date: Wed, 19 Mar 2025 22:29:48 +0000 Subject: [PATCH 04/14] Merge pull request #77439 from ClickHouse/divanik/iceberg_time_travel_by_snapshots Iceberg time travel by snapshots --- .../table-engines/integrations/iceberg.md | 176 ++++++++- .../sql-reference/table-functions/iceberg.md | 173 ++++++++- src/Core/Settings.cpp | 6 + src/Core/SettingsChangesHistory.cpp | 9 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 233 +++++++++--- .../DataLakes/Iceberg/IcebergMetadata.h | 36 +- .../DataLakes/Iceberg/SchemaProcessor.cpp | 4 + .../DataLakes/Iceberg/SchemaProcessor.h | 2 + .../DataLakes/Iceberg/Snapshot.h | 1 + .../integration/test_storage_iceberg/test.py | 353 +++++++++++++++++- 10 files changed, 898 insertions(+), 95 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/iceberg.md b/docs/en/engines/table-engines/integrations/iceberg.md index 532cd9ed84bd..00c2e4626d1c 100644 --- a/docs/en/engines/table-engines/integrations/iceberg.md +++ b/docs/en/engines/table-engines/integrations/iceberg.md @@ -4,7 +4,7 @@ sidebar_position: 90 sidebar_label: Iceberg --- -# Iceberg Table Engine +# Iceberg Table Engine {#iceberg-table-engine} :::warning We recommend using the [Iceberg Table Function](/docs/sql-reference/table-functions/iceberg.md) for working with Iceberg data in ClickHouse. The Iceberg Table Function currently provides sufficient functionality, offering a partial read-only interface for Iceberg tables. @@ -34,14 +34,14 @@ CREATE TABLE iceberg_table_local ENGINE = IcebergLocal(path_to_table, [,format] [,compression_method]) ``` -**Engine arguments** +## Engine arguments {#engine-arguments} Description of the arguments coincides with description of arguments in engines `S3`, `AzureBlobStorage`, `HDFS` and `File` correspondingly. `format` stands for the format of data files in the Iceberg table. 
Engine parameters can be specified using [Named Collections](../../../operations/named-collections.md) -**Example** +### Example {#example} ```sql CREATE TABLE iceberg_table ENGINE=IcebergS3('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test') @@ -66,12 +66,11 @@ CREATE TABLE iceberg_table ENGINE=IcebergS3(iceberg_conf, filename = 'test_table ``` -**Aliases** - +## Aliases {#aliases} Table engine `Iceberg` is an alias to `IcebergS3` now. -**Schema Evolution** +## Schema Evolution {#schema-evolution} At the moment, with the help of CH, you can read iceberg tables, the schema of which has changed over time. We currently support reading tables where columns have been added and removed, and their order has changed. You can also change a column where a value is required to one where NULL is allowed. Additionally, we support permitted type casting for simple types, namely:   * int -> long * float -> double @@ -81,11 +80,172 @@ Currently, it is not possible to change nested structures or the types of elemen To read a table where the schema has changed after its creation with dynamic schema inference, set allow_dynamic_metadata_for_data_lakes = true when creating the table. -**Partition Pruning** +## Partition Pruning {#partition-pruning} ClickHouse supports partition pruning during SELECT queries for Iceberg tables, which helps optimize query performance by skipping irrelevant data files. Now it works with only identity transforms and time-based transforms (hour, day, month, year). To enable partition pruning, set `use_iceberg_partition_pruning = 1`. -### Data cache {#data-cache} + +## Time Travel {#time-travel} + +ClickHouse supports time travel for Iceberg tables, allowing you to query historical data with a specific timestamp or snapshot ID. + +### Basic usage {#basic-usage} + ```sql + SELECT * FROM example_table ORDER BY 1 + SETTINGS iceberg_timestamp_ms = 1714636800000 + ``` + + ```sql + SELECT * FROM example_table ORDER BY 1 + SETTINGS iceberg_snapshot_id = 3547395809148285433 + ``` + +Note: You cannot specify both `iceberg_timestamp_ms` and `iceberg_snapshot_id` parameters in the same query. + +### Important considerations {#important-considerations} + +- **Snapshots** are typically created when: + - New data is written to the table + - Some kind of data compaction is performed + +- **Schema changes typically don't create snapshots** - This leads to important behaviors when using time travel with tables that have undergone schema evolution. + +### Example scenarios {#example-scenarios} + +All scenarios are written in Spark because CH doesn't support writing to Iceberg tables yet. 
+ +#### Scenario 1: Schema Changes Without New Snapshots {#scenario-1} + +Consider this sequence of operations: + + ```sql + -- Create a table with two columns + CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example ( + order_number int, + product_code string + ) + USING iceberg + OPTIONS ('format-version'='2') + +-- Insert data into the table + INSERT INTO spark_catalog.db.time_travel_example VALUES + (1, 'Mars') + + ts1 = now() // A piece of pseudo code + +-- Alter table to add a new column + ALTER TABLE spark_catalog.db.time_travel_example ADD COLUMN (price double) + + ts2 = now() + +-- Insert data into the table + INSERT INTO spark_catalog.db.time_travel_example VALUES (2, 'Venus', 100) + + ts3 = now() + +-- Query the table at each timestamp + SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts1; + ++------------+------------+ +|order_number|product_code| ++------------+------------+ +| 1| Mars| ++------------+------------+ + + + SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts2; + ++------------+------------+ +|order_number|product_code| ++------------+------------+ +| 1| Mars| ++------------+------------+ + + SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts3; + ++------------+------------+-----+ +|order_number|product_code|price| ++------------+------------+-----+ +| 1| Mars| NULL| +| 2| Venus|100.0| ++------------+------------+-----+ +``` + +Query results at different timestamps: + +- At ts1 & ts2: Only the original two columns appear +- At ts3: All three columns appear, with NULL for the price of the first row + +#### Scenario 2: Historical vs. Current Schema Differences {#scenario-2} + + +A time travel query at a current moment might show a different schema than the current table: + + +```sql +-- Create a table + CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example_2 ( + order_number int, + product_code string + ) + USING iceberg + OPTIONS ('format-version'='2') + +-- Insert initial data into the table + INSERT INTO spark_catalog.db.time_travel_example_2 VALUES (2, 'Venus'); + +-- Alter table to add a new column + ALTER TABLE spark_catalog.db.time_travel_example_2 ADD COLUMN (price double); + + ts = now(); + +-- Query the table at a current moment but using timestamp syntax + + SELECT * FROM spark_catalog.db.time_travel_example_2 TIMESTAMP AS OF ts; + + +------------+------------+ + |order_number|product_code| + +------------+------------+ + | 2| Venus| + +------------+------------+ + +-- Query the table at a current moment + SELECT * FROM spark_catalog.db.time_travel_example_2; + + + +------------+------------+-----+ + |order_number|product_code|price| + +------------+------------+-----+ + | 2| Venus| NULL| + +------------+------------+-----+ +``` + +This happens because `ALTER TABLE` doesn't create a new snapshot but for the current table Spark takes value of `schema_id` from the latest metadata file, not a snapshot. + +#### Scenario 3: Historical vs. Current Schema Differences {#scenario-3} + +The second one is that while doing time travel you can't get state of table before any data was written to it: + +```sql +-- Create a table + CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example_3 ( + order_number int, + product_code string + ) + USING iceberg + OPTIONS ('format-version'='2'); + + ts = now(); + +-- Query the table at a specific timestamp + SELECT * FROM spark_catalog.db.time_travel_example_3 TIMESTAMP AS OF ts; -- Finises with error: Cannot find a snapshot older than ts. 
+``` + + +In Clickhouse the behavior is consistent with Spark. You can mentally replace Spark Select queries with Clickhouse Select queries and it will work the same way. + + +## Data cache {#data-cache} `Iceberg` table engine and table function support data caching same as `S3`, `AzureBlobStorage`, `HDFS` storages. See [here](../../../engines/table-engines/integrations/s3.md#data-cache). diff --git a/docs/en/sql-reference/table-functions/iceberg.md b/docs/en/sql-reference/table-functions/iceberg.md index 07ff0bcc810c..4e561feafbc3 100644 --- a/docs/en/sql-reference/table-functions/iceberg.md +++ b/docs/en/sql-reference/table-functions/iceberg.md @@ -4,7 +4,7 @@ sidebar_position: 90 sidebar_label: iceberg --- -# iceberg Table Function +# iceberg Table Function {#iceberg-table-function} Provides a read-only table-like interface to Apache [Iceberg](https://iceberg.apache.org/) tables in Amazon S3, Azure, HDFS or locally stored. @@ -29,10 +29,10 @@ icebergLocal(named_collection[, option=value [,..]]) Description of the arguments coincides with description of arguments in table functions `s3`, `azureBlobStorage`, `HDFS` and `file` correspondingly. `format` stands for the format of data files in the Iceberg table. -**Returned value** +### Returned value {#returned-value} A table with the specified structure for reading data in the specified Iceberg table. -**Example** +### Example {#example} ```sql SELECT * FROM icebergS3('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test') @@ -65,7 +65,7 @@ SELECT * FROM icebergS3(iceberg_conf, filename = 'test_table') DESCRIBE icebergS3(iceberg_conf, filename = 'test_table') ``` -**Schema Evolution** +## Schema Evolution {#schema-evolution} At the moment, with the help of CH, you can read iceberg tables, the schema of which has changed over time. We currently support reading tables where columns have been added and removed, and their order has changed. You can also change a column where a value is required to one where NULL is allowed. Additionally, we support permitted type casting for simple types, namely:   * int -> long * float -> double @@ -73,15 +73,174 @@ At the moment, with the help of CH, you can read iceberg tables, the schema of w Currently, it is not possible to change nested structures or the types of elements within arrays and maps. -**Partition Pruning** +## Partition Pruning {#partition-pruning} ClickHouse supports partition pruning during SELECT queries for Iceberg tables, which helps optimize query performance by skipping irrelevant data files. Now it works with only identity transforms and time-based transforms (hour, day, month, year). To enable partition pruning, set `use_iceberg_partition_pruning = 1`. -**Aliases** + +## Time Travel {#time-travel} + +ClickHouse supports time travel for Iceberg tables, allowing you to query historical data with a specific timestamp or snapshot ID. + +### Basic usage {#basic-usage} + ```sql + SELECT * FROM example_table ORDER BY 1 + SETTINGS iceberg_timestamp_ms = 1714636800000 + ``` + + ```sql + SELECT * FROM example_table ORDER BY 1 + SETTINGS iceberg_snapshot_id = 3547395809148285433 + ``` + +Note: You cannot specify both `iceberg_timestamp_ms` and `iceberg_snapshot_id` parameters in the same query. 
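
`iceberg_timestamp_ms` selects the snapshot that was current at the given moment: among the table's snapshots, the most recent one whose commit timestamp is not greater than the supplied value; if the table has no snapshot at or before that moment, the query fails (see scenario 3 below). A minimal sketch of this selection rule, in illustrative C++ rather than the actual ClickHouse implementation:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

struct SnapshotInfo
{
    int64_t snapshot_id;
    int64_t timestamp_ms; // commit time of the snapshot
};

// Return the id of the latest snapshot committed at or before query_timestamp_ms,
// or std::nullopt if no snapshot is old enough (which surfaces as an error).
std::optional<int64_t> selectSnapshot(const std::vector<SnapshotInfo> & snapshots, int64_t query_timestamp_ms)
{
    std::optional<int64_t> best;
    int64_t best_ts = 0;
    for (const auto & s : snapshots)
    {
        if (s.timestamp_ms <= query_timestamp_ms && (!best || s.timestamp_ms > best_ts))
        {
            best = s.snapshot_id;
            best_ts = s.timestamp_ms;
        }
    }
    return best;
}
```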
+
+### Important considerations {#important-considerations}
+
+- **Snapshots** are typically created when:
+    - New data is written to the table
+    - Some kind of data compaction is performed
+
+- **Schema changes typically don't create snapshots**, which leads to important behaviors when using time travel with tables that have undergone schema evolution.
+
+### Example scenarios {#example-scenarios}
+
+All scenarios are written in Spark because ClickHouse doesn't support writing to Iceberg tables yet.
+
+#### Scenario 1: Schema Changes Without New Snapshots {#scenario-1}
+
+Consider this sequence of operations:
+
+ ```sql
+ -- Create a table with two columns
+ CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example (
+ order_number bigint,
+ product_code string
+ )
+ USING iceberg
+ OPTIONS ('format-version'='2')
+
+-- Insert data into the table
+ INSERT INTO spark_catalog.db.time_travel_example VALUES
+ (1, 'Mars')
+
+ ts1 = now() // A piece of pseudo code
+
+-- Alter table to add a new column
+ ALTER TABLE spark_catalog.db.time_travel_example ADD COLUMN (price double)
+
+ ts2 = now()
+
+-- Insert data into the table
+ INSERT INTO spark_catalog.db.time_travel_example VALUES (2, 'Venus', 100)
+
+ ts3 = now()
+
+-- Query the table at each timestamp
+ SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts1;
+
++------------+------------+
+|order_number|product_code|
++------------+------------+
+|           1|        Mars|
++------------+------------+
+
+ SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts2;
+
++------------+------------+
+|order_number|product_code|
++------------+------------+
+|           1|        Mars|
++------------+------------+
+
+ SELECT * FROM spark_catalog.db.time_travel_example TIMESTAMP AS OF ts3;
+
++------------+------------+-----+
+|order_number|product_code|price|
++------------+------------+-----+
+|           1|        Mars| NULL|
+|           2|       Venus|100.0|
++------------+------------+-----+
+```
+
+Query results at different timestamps:
+
+- At ts1 & ts2: only the original two columns appear
+- At ts3: all three columns appear, with NULL for the price of the first row
+
+#### Scenario 2: Historical vs. Current Schema Differences {#scenario-2}
+
+A time travel query at the current moment might show a different schema than the current table:
+
+```sql
+-- Create a table
+ CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example_2 (
+ order_number bigint,
+ product_code string
+ )
+ USING iceberg
+ OPTIONS ('format-version'='2')
+
+-- Insert initial data into the table
+ INSERT INTO spark_catalog.db.time_travel_example_2 VALUES (2, 'Venus');
+
+-- Alter table to add a new column
+ ALTER TABLE spark_catalog.db.time_travel_example_2 ADD COLUMN (price double);
+
+ ts = now();
+
+-- Query the table at the current moment, but using timestamp syntax
+ SELECT * FROM spark_catalog.db.time_travel_example_2 TIMESTAMP AS OF ts;
+
+    +------------+------------+
+    |order_number|product_code|
+    +------------+------------+
+    |           2|       Venus|
+    +------------+------------+
+
+-- Query the table at the current moment
+ SELECT * FROM spark_catalog.db.time_travel_example_2;
+
+    +------------+------------+-----+
+    |order_number|product_code|price|
+    +------------+------------+-----+
+    |           2|       Venus| NULL|
+    +------------+------------+-----+
+```
+
+This happens because `ALTER TABLE` doesn't create a new snapshot; for the current table, Spark takes the `schema_id` from the latest metadata file rather than from a snapshot.
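+
+The same distinction is visible from ClickHouse. A minimal sketch, assuming the table is exposed as a hypothetical `time_travel_example_2` Iceberg table in ClickHouse and that `ts` has been converted to a millisecond timestamp (the literal below is a placeholder):
+
+ ```sql
+ -- Schema of the latest snapshot (taken before ALTER TABLE): two columns
+ DESCRIBE time_travel_example_2 SETTINGS iceberg_timestamp_ms = 1714636800000;
+
+ -- Schema from the latest metadata file: three columns, including price
+ DESCRIBE time_travel_example_2;
+ ```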
+#### Scenario 3: Time Travel Before Any Data Was Written {#scenario-3}
+
+Another consequence is that a time travel query cannot return the state of a table before any data was written to it:
+
+```sql
+-- Create a table
+ CREATE TABLE IF NOT EXISTS spark_catalog.db.time_travel_example_3 (
+ order_number bigint,
+ product_code string
+ )
+ USING iceberg
+ OPTIONS ('format-version'='2');
+
+ ts = now();
+
+-- Query the table at a specific timestamp
+ SELECT * FROM spark_catalog.db.time_travel_example_3 TIMESTAMP AS OF ts; -- Finishes with an error: Cannot find a snapshot older than ts.
+```
+
+In ClickHouse the behavior is consistent with Spark: you can mentally replace the Spark SELECT queries above with ClickHouse SELECT queries and they will work the same way.
+
+## Aliases {#aliases}
 
 Table function `iceberg` is an alias to `icebergS3` now.
 
-**See Also**
+## See Also {#see-also}
 
 - [Iceberg engine](/docs/engines/table-engines/integrations/iceberg.md)
 - [Iceberg cluster table function](/docs/sql-reference/table-functions/icebergCluster.md)
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 017638d2be37..7982534482be 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -5685,6 +5685,12 @@ When the ratio of rows containing NULL values to the total number of rows exceed
 )", 0) \
     DECLARE(Int64, prefer_warmed_unmerged_parts_seconds, 0, R"(
 Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see [cache_populated_by_fetch](merge-tree-settings.md/#cache_populated_by_fetch)), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for Replicated-/SharedMergeTree. Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm.
+)", 0) \
+    DECLARE(Int64, iceberg_timestamp_ms, 0, R"(
+Query Iceberg table using the snapshot that was current at a specific timestamp.
+)", 0) \
+    DECLARE(Int64, iceberg_snapshot_id, 0, R"(
+Query Iceberg table using the specific snapshot id.
 )", 0) \
     DECLARE(Bool, allow_deprecated_error_prone_window_functions, false, R"(
 Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index c074e100fe42..e3daf1c01107 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -74,6 +74,15 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
     {
         // Altinity Antalya modifications atop of 24.12
         {"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586
+        /// Release closed. Please use 25.4
+        {"allow_experimental_database_unity_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'unity'"},
+        {"allow_experimental_database_glue_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'glue'"},
+        {"use_page_cache_with_distributed_cache", false, false, "New setting"},
+        {"use_query_condition_cache", false, false, "New setting."},
+        {"iceberg_timestamp_ms", 0, 0, "New setting."},
+        {"iceberg_snapshot_id", 0, 0, "New setting."},
+        {"parallel_replicas_for_cluster_engines", false, true, "New setting."},
+        /// Release closed.
Please use 25.4 }); addSettingsChanges(settings_changes_history, "25.2", { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index d469da0d3c66..68e055b0cb69 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1,3 +1,5 @@ +#include +#include "base/types.h" #include "Core/NamesAndTypes.h" #include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" #include "config.h" @@ -44,12 +46,27 @@ extern const int LOGICAL_ERROR; extern const int ICEBERG_SPECIFICATION_VIOLATION; } +namespace Setting +{ +extern const SettingsInt64 iceberg_timestamp_ms; +extern const SettingsInt64 iceberg_snapshot_id; +} + + using namespace Iceberg; -constexpr const char * COLUMN_SEQ_NUMBER_NAME = "sequence_number"; -constexpr const char * COLUMN_MANIFEST_FILE_PATH_NAME = "manifest_path"; -constexpr const char * FIELD_FORMAT_VERSION_NAME = "format-version"; +constexpr const char * SEQUENCE_NUMBER_COLUMN = "sequence_number"; +constexpr const char * MANIFEST_FILE_PATH_COLUMN = "manifest_path"; +constexpr const char * FORMAT_VERSION_FIELD = "format-version"; +constexpr const char * CURRENT_SNAPSHOT_ID_FIELD_IN_METADATA_FILE = "current-snapshot-id"; +constexpr const char * SNAPSHOT_ID_FIELD_IN_SNAPSHOT = "snapshot-id"; +constexpr const char * MANIFEST_LIST_PATH_FIELD = "manifest-list"; +constexpr const char * SNAPSHOT_LOG_FIELD = "snapshot-log"; +constexpr const char * TIMESTAMP_FIELD_INSIDE_SNAPSHOT = "timestamp-ms"; +constexpr const char * TABLE_LOCATION_FIELD = "location"; +constexpr const char * SNAPSHOTS_FIELD = "snapshots"; + std::pair parseTableSchemaFromManifestFile(const avro::DataFileReaderBase & manifest_file_reader, const String & manifest_file_name) @@ -77,22 +94,19 @@ IcebergMetadata::IcebergMetadata( const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const Poco::JSON::Object::Ptr & object) + const Poco::JSON::Object::Ptr & metadata_object_) : WithContext(context_) , object_storage(std::move(object_storage_)) , configuration(std::move(configuration_)) , schema_processor(IcebergSchemaProcessor()) , log(getLogger("IcebergMetadata")) - , current_metadata_version(metadata_version_) + , last_metadata_version(metadata_version_) + , last_metadata_object(metadata_object_) , format_version(format_version_) - , table_location(object->getValue("location")) + , relevant_snapshot_schema_id(-1) + , table_location(last_metadata_object->getValue(TABLE_LOCATION_FIELD)) { - auto manifest_list_file = getRelevantManifestList(object); - if (manifest_list_file) - { - current_snapshot = getSnapshot(manifest_list_file.value()); - } - current_schema_id = parseTableSchema(object, schema_processor, log); + updateState(context_); } std::pair parseTableSchemaV2Method(const Poco::JSON::Object::Ptr & metadata_object) @@ -138,10 +152,36 @@ std::pair parseTableSchemaV1Method(const Poco::J return {schema, current_schema_id}; } + +void IcebergMetadata::addTableSchemaById(Int32 schema_id) +{ + if (schema_processor.hasClickhouseTableSchemaById(schema_id)) + return; + if (!last_metadata_object->has("schemas")) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema with id `{}`: 'schemas' field is missing in metadata", schema_id); + } + auto schemas = last_metadata_object->get("schemas").extract(); + for (uint32_t i = 0; i != schemas->size(); ++i) + { + auto current_schema = 
schemas->getObject(i); + if (current_schema->has("schema-id") && current_schema->getValue("schema-id") == schema_id) + { + schema_processor.addIcebergTableSchema(current_schema); + return; + } + } + throw Exception( + ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "Cannot parse Iceberg table schema with id `{}`: schema with such id is not found in metadata", + schema_id); +} + Int32 IcebergMetadata::parseTableSchema( const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger) { - Int32 format_version = metadata_object->getValue(FIELD_FORMAT_VERSION_NAME); + const auto format_version = metadata_object->getValue(FORMAT_VERSION_FIELD); if (format_version == 2) { auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); @@ -244,46 +284,121 @@ bool IcebergMetadata::update(const ContextPtr & local_context) const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); - if (metadata_version == current_metadata_version) - return false; + last_metadata_version = metadata_version; - current_metadata_version = metadata_version; + last_metadata_object = readJSON(metadata_file_path, local_context); - auto metadata_object = readJSON(metadata_file_path, local_context); + chassert(format_version == last_metadata_object->getValue(FORMAT_VERSION_FIELD)); - chassert(format_version == metadata_object->getValue(FIELD_FORMAT_VERSION_NAME)); + auto previous_snapshot_id = relevant_snapshot_id; + auto previous_snapshot_schema_id = relevant_snapshot_schema_id; + updateState(local_context); - auto manifest_list_file = getRelevantManifestList(metadata_object); - if (manifest_list_file - && (!current_snapshot.has_value() || (manifest_list_file.value() != current_snapshot->manifest_list_iterator.getName()))) + if (previous_snapshot_id != relevant_snapshot_id) { - current_snapshot = getSnapshot(manifest_list_file.value()); - cached_unprunned_files_for_current_snapshot = std::nullopt; + cached_unprunned_files_for_last_processed_snapshot = std::nullopt; + return true; } - current_schema_id = parseTableSchema(metadata_object, schema_processor, log); - return true; + return previous_snapshot_schema_id != relevant_snapshot_schema_id; } -std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata) +void IcebergMetadata::updateSnapshot() { auto configuration_ptr = configuration.lock(); - - auto snapshots = metadata->get("snapshots").extract(); - - auto current_snapshot_id = metadata->getValue("current-snapshot-id"); - + if (!last_metadata_object->has(SNAPSHOTS_FIELD)) + throw Exception( + ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "No snapshot set found in metadata for iceberg table `{}`, it is impossible to get manifest list by snapshot id `{}`", + configuration_ptr->getPath(), + relevant_snapshot_id); + auto snapshots = last_metadata_object->get(SNAPSHOTS_FIELD).extract(); for (size_t i = 0; i < snapshots->size(); ++i) { const auto snapshot = snapshots->getObject(static_cast(i)); + if (snapshot->getValue(SNAPSHOT_ID_FIELD_IN_SNAPSHOT) == relevant_snapshot_id) + { + if (!snapshot->has("manifest-list")) + throw Exception( + ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "No manifest list found for snapshot id `{}` for iceberg table `{}`", + relevant_snapshot_id, + configuration_ptr->getPath()); + relevant_snapshot = IcebergSnapshot{ + getManifestList(getProperFilePathFromMetadataInfo( + snapshot->getValue(MANIFEST_LIST_PATH_FIELD), 
configuration_ptr->getPath(), table_location)), + relevant_snapshot_id}; + if (!snapshot->has("schema-id")) + throw Exception( + ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "No schema id found for snapshot id `{}` for iceberg table `{}`", + relevant_snapshot_id, + configuration_ptr->getPath()); + relevant_snapshot_schema_id = snapshot->getValue("schema-id"); + addTableSchemaById(relevant_snapshot_schema_id); + return; + } + } + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "No manifest list is found for snapshot id `{}` in metadata for iceberg table `{}`", + relevant_snapshot_id, + configuration_ptr->getPath()); +} - if (snapshot->getValue("snapshot-id") == current_snapshot_id) +void IcebergMetadata::updateState(const ContextPtr & local_context) +{ + auto configuration_ptr = configuration.lock(); + std::optional manifest_list_file; + + bool timestamp_changed = local_context->getSettingsRef()[Setting::iceberg_timestamp_ms].changed; + bool snapshot_id_changed = local_context->getSettingsRef()[Setting::iceberg_snapshot_id].changed; + if (timestamp_changed && snapshot_id_changed) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Time travel with timestamp and snapshot id for iceberg table by path {} cannot be changed simultaneously", + configuration_ptr->getPath()); + } + if (timestamp_changed) + { + Int64 closest_timestamp = 0; + Int64 query_timestamp = local_context->getSettingsRef()[Setting::iceberg_timestamp_ms]; + if (!last_metadata_object->has(SNAPSHOT_LOG_FIELD)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot log found in metadata for iceberg table {} so it is impossible to get relevant snapshot id using timestamp", configuration_ptr->getPath()); + auto snapshots = last_metadata_object->get(SNAPSHOT_LOG_FIELD).extract(); + relevant_snapshot_id = -1; + for (size_t i = 0; i < snapshots->size(); ++i) { - const auto path = snapshot->getValue("manifest-list"); - return getProperFilePathFromMetadataInfo(std::string_view(path), configuration_ptr->getPath(), table_location); + const auto snapshot = snapshots->getObject(static_cast(i)); + Int64 snapshot_timestamp = snapshot->getValue(TIMESTAMP_FIELD_INSIDE_SNAPSHOT); + if (snapshot_timestamp <= query_timestamp && snapshot_timestamp > closest_timestamp) + { + closest_timestamp = snapshot_timestamp; + relevant_snapshot_id = snapshot->getValue(SNAPSHOT_ID_FIELD_IN_SNAPSHOT); + } } + if (relevant_snapshot_id < 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot found in snapshot log before requested timestamp for iceberg table {}", configuration_ptr->getPath()); + updateSnapshot(); + } + else if (snapshot_id_changed) + { + relevant_snapshot_id = local_context->getSettingsRef()[Setting::iceberg_snapshot_id]; + updateSnapshot(); + } + else + { + if (!last_metadata_object->has(CURRENT_SNAPSHOT_ID_FIELD_IN_METADATA_FILE)) + relevant_snapshot_id = -1; + else + relevant_snapshot_id = last_metadata_object->getValue(CURRENT_SNAPSHOT_ID_FIELD_IN_METADATA_FILE); + if (relevant_snapshot_id != -1) + { + updateSnapshot(); + } + relevant_snapshot_schema_id = parseTableSchema(last_metadata_object, schema_processor, log); } - return std::nullopt; } std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String data_path) const @@ -295,7 +410,7 @@ std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String da } const ManifestFileContent & manifest_file = *manifest_file_it->second; auto schema_id = manifest_file.getSchemaId(); - if (schema_id == current_schema_id) + if (schema_id == relevant_snapshot_schema_id) return 
std::nullopt; return std::optional{schema_id}; } @@ -325,7 +440,7 @@ DataLakeMetadataPtr IcebergMetadata::create( IcebergSchemaProcessor schema_processor; - auto format_version = object->getValue(FIELD_FORMAT_VERSION_NAME); + auto format_version = object->getValue(FORMAT_VERSION_FIELD); auto ptr = std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object); @@ -348,34 +463,36 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co auto [name_to_index, name_to_data_type, header] = getColumnsAndTypesFromAvroByNames( manifest_list_file_reader->dataSchema().root(), - {COLUMN_MANIFEST_FILE_PATH_NAME, COLUMN_SEQ_NUMBER_NAME}, + {MANIFEST_FILE_PATH_COLUMN, SEQUENCE_NUMBER_COLUMN}, {avro::Type::AVRO_STRING, avro::Type::AVRO_LONG}); - if (name_to_index.find(COLUMN_MANIFEST_FILE_PATH_NAME) == name_to_index.end()) + if (name_to_index.find(MANIFEST_FILE_PATH_COLUMN) == name_to_index.end()) throw Exception( DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: {}", - COLUMN_MANIFEST_FILE_PATH_NAME); - if (format_version > 1 && name_to_index.find(COLUMN_SEQ_NUMBER_NAME) == name_to_index.end()) + MANIFEST_FILE_PATH_COLUMN); + if (format_version > 1 && name_to_index.find(SEQUENCE_NUMBER_COLUMN) == name_to_index.end()) throw Exception( - DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: `{}`", COLUMN_SEQ_NUMBER_NAME); + DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "Required columns are not found in manifest file: `{}`", + SEQUENCE_NUMBER_COLUMN); auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); - const auto & manifest_path_col = columns.at(name_to_index.at(COLUMN_MANIFEST_FILE_PATH_NAME)); + const auto & manifest_path_col = columns.at(name_to_index.at(MANIFEST_FILE_PATH_COLUMN)); std::optional sequence_number_column = std::nullopt; if (format_version > 1) { - if (columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType() != TypeIndex::Int64) + if (columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN))->getDataType() != TypeIndex::Int64) { throw Exception( DB::ErrorCodes::ILLEGAL_COLUMN, "The parsed column from Avro file of `{}` field should be Int64 type, got `{}`", - COLUMN_SEQ_NUMBER_NAME, - columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getFamilyName()); + SEQUENCE_NUMBER_COLUMN, + columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN))->getFamilyName()); } - sequence_number_column = assert_cast(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME)).get()); + sequence_number_column = assert_cast(columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN)).get()); } if (manifest_path_col->getDataType() != TypeIndex::String) @@ -383,7 +500,7 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co throw Exception( ErrorCodes::ILLEGAL_COLUMN, "The parsed column from Avro file of `{}` field should be String type, got `{}`", - COLUMN_MANIFEST_FILE_PATH_NAME, + MANIFEST_FILE_PATH_COLUMN, manifest_path_col->getFamilyName()); } @@ -464,24 +581,18 @@ ManifestListIterator IcebergMetadata::getManifestList(const String & filename) c return ManifestListIterator{manifest_file_iterator}; } - -IcebergSnapshot IcebergMetadata::getSnapshot(const String & filename) const -{ - return IcebergSnapshot{getManifestList(filename)}; -} - Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const { - if (!current_snapshot) + if (!relevant_snapshot) return {}; - if 
(!filter_dag && cached_unprunned_files_for_current_snapshot.has_value()) - return cached_unprunned_files_for_current_snapshot.value(); + if (!filter_dag && cached_unprunned_files_for_last_processed_snapshot.has_value()) + return cached_unprunned_files_for_last_processed_snapshot.value(); Strings data_files; - for (const auto & manifest_list_entry : *(current_snapshot->manifest_list_iterator)) + for (const auto & manifest_list_entry : *(relevant_snapshot->manifest_list_iterator)) { - PartitionPruner pruner(schema_processor, current_schema_id, filter_dag, *manifest_list_entry.manifest_file, getContext()); + PartitionPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_list_entry.manifest_file, getContext()); const auto & data_files_in_manifest = manifest_list_entry.manifest_file->getFiles(); for (const auto & manifest_file_entry : data_files_in_manifest) { @@ -502,8 +613,8 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const if (!filter_dag) { - cached_unprunned_files_for_current_snapshot = data_files; - return cached_unprunned_files_for_current_snapshot.value(); + cached_unprunned_files_for_last_processed_snapshot = data_files; + return cached_unprunned_files_for_last_processed_snapshot.value(); } return data_files; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index d82d0817d900..76dc41db76a7 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -17,6 +17,8 @@ #include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" #include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include + namespace DB { @@ -36,7 +38,7 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const Poco::JSON::Object::Ptr & object); + const Poco::JSON::Object::Ptr & metadata_object); /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. @@ -45,7 +47,10 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext Strings getDataFiles() const override { return getDataFilesImpl(nullptr); } /// Get table schema parsed from metadata. - NamesAndTypesList getTableSchema() const override { return *schema_processor.getClickhouseTableSchemaById(current_schema_id); } + NamesAndTypesList getTableSchema() const override + { + return *schema_processor.getClickhouseTableSchemaById(relevant_snapshot_schema_id); + } bool operator==(const IDataLakeMetadata & other) const override { @@ -59,7 +64,6 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext const ContextPtr & local_context, bool allow_experimental_delta_kernel_rs); - size_t getVersion() const { return current_metadata_version; } std::shared_ptr getInitialSchemaByPath(const String & data_path) const override { @@ -71,7 +75,7 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext { auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); return version_if_outdated.has_value() - ? schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), current_schema_id) + ? 
schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), relevant_snapshot_schema_id) : nullptr; } @@ -100,21 +104,30 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext mutable Iceberg::ManifestListsStorage manifest_lists_by_name; mutable ManifestEntryByDataFile manifest_file_by_data_file; - Int32 current_metadata_version; + std::tuple getVersion() const { return std::make_tuple(relevant_snapshot_id, relevant_snapshot_schema_id); } + + Int32 last_metadata_version; + Poco::JSON::Object::Ptr last_metadata_object; Int32 format_version; - Int32 current_schema_id; - std::optional current_snapshot; + + + Int32 relevant_snapshot_schema_id; + std::optional relevant_snapshot; + Int64 relevant_snapshot_id{-1}; String table_location; - mutable std::optional cached_unprunned_files_for_current_snapshot; + mutable std::optional cached_unprunned_files_for_last_processed_snapshot; - mutable std::vector positional_delete_files_for_current_query; + void updateState(const ContextPtr & local_context); + + void updateSnapshot(); Iceberg::ManifestList initializeManifestList(const String & filename) const; + mutable std::vector positional_delete_files_for_current_query; - Iceberg::ManifestListIterator getManifestList(const String & filename) const; + void addTableSchemaById(Int32 schema_id); - Iceberg::IcebergSnapshot getSnapshot(const String & filename) const; + Iceberg::ManifestListIterator getManifestList(const String & filename) const; std::optional getSchemaVersionByFileIfOutdated(String data_path) const; @@ -130,7 +143,6 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext std::optional tryGetManifestFile(const String & filename) const; }; - } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp index c2383fda12f3..625feaa4cb42 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -423,4 +423,8 @@ std::shared_ptr IcebergSchemaProcessor::getClickhouseTableSch return it->second; } +bool IcebergSchemaProcessor::hasClickhouseTableSchemaById(Int32 id) const +{ + return clickhouse_table_schemas_by_ids.find(id) != clickhouse_table_schemas_by_ids.end(); +} } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h index 60706988e74e..469d7438e2af 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h @@ -83,6 +83,8 @@ class IcebergSchemaProcessor std::optional tryGetFieldCharacteristics(Int32 schema_version, Int32 source_id) const; NamesAndTypesList tryGetFieldsCharacteristics(Int32 schema_id, const std::vector & source_ids) const; + bool hasClickhouseTableSchemaById(Int32 id) const; + private: std::unordered_map iceberg_table_schemas_by_ids; std::unordered_map> clickhouse_table_schemas_by_ids; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index ba9fac631385..e6efb1eb7c00 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -22,6 +22,7 @@ using ManifestListIterator = IteratorWrapper; struct IcebergSnapshot { ManifestListIterator manifest_list_iterator; + Int64 snapshot_id; }; } diff --git 
a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 64a81febaa0c..953ec42ce214 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -1,7 +1,7 @@ import logging import os import uuid -from datetime import datetime +from datetime import datetime, timezone import pyspark import pytest @@ -283,9 +283,13 @@ def get_creation_expression( raise Exception(f"Unknown iceberg storage type: {storage_type}") -def check_schema_and_data(instance, table_expression, expected_schema, expected_data): - schema = instance.query(f"DESC {table_expression}") - data = instance.query(f"SELECT * FROM {table_expression} ORDER BY ALL") +def check_schema_and_data(instance, table_expression, expected_schema, expected_data, timestamp_ms=None): + if timestamp_ms: + schema = instance.query(f"DESC {table_expression} SETTINGS iceberg_timestamp_ms = {timestamp_ms}") + data = instance.query(f"SELECT * FROM {table_expression} ORDER BY ALL SETTINGS iceberg_timestamp_ms = {timestamp_ms}") + else: + schema = instance.query(f"DESC {table_expression}") + data = instance.query(f"SELECT * FROM {table_expression} ORDER BY ALL") schema = list( map( lambda x: x.split("\t")[:2], @@ -301,7 +305,6 @@ def check_schema_and_data(instance, table_expression, expected_schema, expected_ assert expected_schema == schema assert expected_data == data - def get_uuid_str(): return str(uuid.uuid4()).replace("-", "_") @@ -2054,8 +2057,6 @@ def test_filesystem_cache(started_cluster, storage_type): @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_partition_pruning(started_cluster, storage_type): - if is_arm() and storage_type == "hdfs": - pytest.skip("Disabled test IcebergHDFS for aarch64") instance = started_cluster.instances["node1"] spark = started_cluster.spark_session TABLE_NAME = "test_partition_pruning_" + storage_type + "_" + get_uuid_str() @@ -2272,3 +2273,341 @@ def check_validity_and_get_prunned_files(select_expression): ) == 1 ) + + +@pytest.mark.parametrize("format_version", ["1", "2"]) +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_schema_evolution_with_time_travel( + started_cluster, format_version, storage_type +): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = ( + "test_schema_evolution_with_time_travel_" + + format_version + + "_" + + storage_type + + "_" + + get_uuid_str() + ) + + def execute_spark_query(query: str): + spark.sql(query) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + return + + execute_spark_query( + f""" + DROP TABLE IF EXISTS {TABLE_NAME}; + """ + ) + + execute_spark_query( + f""" + CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( + a int NOT NULL + ) + USING iceberg + OPTIONS ('format-version'='{format_version}') + """ + ) + + table_creation_expression = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster, + table_function=True, + allow_dynamic_metadata_for_data_lakes=True, + ) + + table_select_expression = table_creation_expression + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"] + ], + [], + ) + + first_timestamp_ms = int(datetime.now().timestamp() * 1000) + + time.sleep(0.5) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES (4); + """ + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", 
"Int32"], + ], + [["4"]], + ) + + error_message = instance.query_and_get_error(f"SELECT * FROM {table_select_expression} ORDER BY ALL SETTINGS iceberg_timestamp_ms = {first_timestamp_ms}") + assert "No snapshot found in snapshot log before requested timestamp" in error_message + + + second_timestamp_ms = int(datetime.now().timestamp() * 1000) + + time.sleep(0.5) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} ADD COLUMNS ( + b double + ); + """ + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ["b", "Nullable(Float64)"] + ], + [["4", "\\N"]], + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ], + [["4"]], + timestamp_ms=second_timestamp_ms, + ) + + third_timestamp_ms = int(datetime.now().timestamp() * 1000) + + time.sleep(0.5) + + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES (7, 5.0); + """ + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ["b", "Nullable(Float64)"] + ], + [["4", "\\N"], ["7", "5"]], + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ], + [["4"]], + timestamp_ms=second_timestamp_ms, + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], ], + [["4"]], + timestamp_ms=third_timestamp_ms, + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} ADD COLUMNS ( + c double + ); + """ + ) + + time.sleep(0.5) + fourth_timestamp_ms = int(datetime.now().timestamp() * 1000) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ["b", "Nullable(Float64)"] + ], + [["4", "\\N"], ["7", "5"]], + timestamp_ms=fourth_timestamp_ms, + ) + + check_schema_and_data( + instance, + table_select_expression, + [ + ["a", "Int32"], + ["b", "Nullable(Float64)"], + ["c", "Nullable(Float64)"] + ], + [["4", "\\N", "\\N"], ["7", "5", "\\N"]], + ) + +def get_last_snapshot(path_to_table): + import json + import os + + metadata_dir = f"{path_to_table}/metadata/" + last_timestamp = 0 + last_snapshot_id = -1 + for filename in os.listdir(metadata_dir): + if filename.endswith('.json'): + filepath = os.path.join(metadata_dir, filename) + with open(filepath, 'r') as f: + data = json.load(f) + print(data) + timestamp = data.get('last-updated-ms') + if (timestamp > last_timestamp): + last_timestamp = timestamp + last_snapshot_id = data.get('current-snapshot-id') + return last_snapshot_id + + +@pytest.mark.parametrize("format_version", ["1", "2"]) +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_iceberg_snapshot_reads(started_cluster, format_version, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = ( + "test_iceberg_snapshot_reads" + + format_version + + "_" + + storage_type + + "_" + + get_uuid_str() + ) + + write_iceberg_from_df( + spark, + generate_data(spark, 0, 100), + TABLE_NAME, + mode="overwrite", + format_version=format_version, + ) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + "", + ) + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster) + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100 + snapshot1_timestamp = datetime.now(timezone.utc) + snapshot1_id = get_last_snapshot(f"/iceberg_data/default/{TABLE_NAME}/") + time.sleep(0.1) + + write_iceberg_from_df( + spark, + generate_data(spark, 100, 200), + TABLE_NAME, + mode="append", + 
format_version=format_version, + ) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + "", + ) + snapshot2_timestamp = datetime.now(timezone.utc) + snapshot2_id = get_last_snapshot(f"/iceberg_data/default/{TABLE_NAME}/") + time.sleep(0.1) + + write_iceberg_from_df( + spark, + generate_data(spark, 200, 300), + TABLE_NAME, + mode="append", + format_version=format_version, + ) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + "", + ) + snapshot3_timestamp = datetime.now(timezone.utc) + snapshot3_id = get_last_snapshot(f"/iceberg_data/default/{TABLE_NAME}/") + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 300 + assert instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY 1") == instance.query( + "SELECT number, toString(number + 1) FROM numbers(300)" + ) + + # Validate that each snapshot timestamp only sees the data inserted by that time. + assert ( + instance.query( + f""" + SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_timestamp_ms = {int(snapshot1_timestamp.timestamp() * 1000)}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(100)") + ) + + assert ( + instance.query( + f""" + SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_snapshot_id = {snapshot1_id}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(100)") + ) + + + assert ( + instance.query( + f""" + SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_timestamp_ms = {int(snapshot2_timestamp.timestamp() * 1000)}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(200)") + ) + + assert ( + instance.query( + f""" + SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_snapshot_id = {snapshot2_id}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(200)") + ) + + + assert ( + instance.query( + f"""SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_timestamp_ms = {int(snapshot3_timestamp.timestamp() * 1000)}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(300)") + ) + + assert ( + instance.query( + f""" + SELECT * FROM {TABLE_NAME} ORDER BY 1 + SETTINGS iceberg_snapshot_id = {snapshot3_id}""" + ) + == instance.query("SELECT number, toString(number + 1) FROM numbers(300)") + ) + From d2ca873be19c24a6862f81fa754bc7ca913f9ed5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 21 Mar 2025 10:15:09 +0000 Subject: [PATCH 05/14] Merge pull request #77916 from ClickHouse/add_trivial_count Refactor operations with Avro files in Iceberg-related code. 
--- .../Formats/Impl/AvroRowInputFormat.cpp | 25 +++ .../Formats/Impl/AvroRowInputFormat.h | 1 + .../Iceberg/AvroForIcebergDeserializer.cpp | 88 ++++++++ .../Iceberg/AvroForIcebergDeserializer.h | 57 +++++ .../DataLakes/Iceberg/IcebergMetadata.cpp | 113 +++------- .../DataLakes/Iceberg/IcebergMetadata.h | 6 +- .../DataLakes/Iceberg/ManifestFile.cpp | 196 +++++------------- .../DataLakes/Iceberg/ManifestFile.h | 48 ++++- .../DataLakes/Iceberg/ManifestFileImpl.h | 66 ------ .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 88 +------- .../ObjectStorage/DataLakes/Iceberg/Utils.h | 15 +- 11 files changed, 304 insertions(+), 399 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.h delete mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 0119bf95b34a..45dcde422a24 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -940,6 +940,31 @@ AvroDeserializer::Action AvroDeserializer::createAction(const Block & header, co } } + +AvroDeserializer::AvroDeserializer(DataTypePtr data_type, const std::string & column_name, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_) + : null_as_default(null_as_default_), settings(settings_) +{ + const auto & schema_root = schema.root(); + if (schema_root->type() != avro::AVRO_RECORD) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Root schema must be a record"); + + Block header; + header.insert({data_type->createColumn(), data_type, column_name}); + + column_found.resize(header.columns()); + row_action = createAction(header, schema_root, column_name); + // fail on missing fields when allow_missing_fields = false + if (!allow_missing_fields) + { + for (size_t i = 0; i < header.columns(); ++i) + { + if (!column_found[i]) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Field {} not found in Avro schema", header.getByPosition(i).name); + } + } + +} + AvroDeserializer::AvroDeserializer(const Block & header, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_) : null_as_default(null_as_default_), settings(settings_) { diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index 88cec2b6d060..6a2a95f1cef3 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -50,6 +50,7 @@ class AvroDeserializer { public: AvroDeserializer(const Block & header, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_); + AvroDeserializer(DataTypePtr data_type, const std::string & column_name, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_); void deserializeRow(MutableColumns & columns, avro::Decoder & decoder, RowReadExtension & ext) const; using DeserializeFn = std::function; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp new file mode 100644 index 000000000000..ee990f76e44b --- /dev/null +++ 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp @@ -0,0 +1,88 @@ +#include + +#if USE_AVRO + +#include +#include +#include + +namespace DB::ErrorCodes +{ + extern const int ICEBERG_SPECIFICATION_VIOLATION; +} + +namespace Iceberg +{ + +using namespace DB; + +AvroForIcebergDeserializer::AvroForIcebergDeserializer( + std::unique_ptr buffer_, + const std::string & manifest_file_path_, + const DB::FormatSettings & format_settings) + : buffer(std::move(buffer_)) + , manifest_file_path(manifest_file_path_) +{ + auto manifest_file_reader + = std::make_unique(std::make_unique(*buffer)); + + avro::NodePtr root_node = manifest_file_reader->dataSchema().root(); + auto data_type = AvroSchemaReader::avroNodeToDataType(root_node); + + MutableColumns columns; + columns.push_back(data_type->createColumn()); + AvroDeserializer deserializer(data_type, root_node->name(), manifest_file_reader->dataSchema(), true, true, format_settings); + manifest_file_reader->init(); + RowReadExtension ext; + while (manifest_file_reader->hasMore()) + { + manifest_file_reader->decr(); + deserializer.deserializeRow(columns, manifest_file_reader->decoder(), ext); + } + + metadata = manifest_file_reader->metadata(); + parsed_column = std::move(columns[0]); + parsed_column_data_type = std::dynamic_pointer_cast(data_type); +} + +size_t AvroForIcebergDeserializer::rows() const +{ + return parsed_column->size(); +} + +bool AvroForIcebergDeserializer::hasPath(const std::string & path) const +{ + return parsed_column_data_type->hasSubcolumn(path); +} + +TypeIndex AvroForIcebergDeserializer::getTypeForPath(const std::string & path) const +{ + return WhichDataType(parsed_column_data_type->getSubcolumnType(path)).idx; +} + +Field AvroForIcebergDeserializer::getValueFromRowByName(size_t row_num, const std::string & path, std::optional expected_type) const +{ + auto current_column = parsed_column_data_type->getSubcolumn(path, parsed_column); + auto current_data_type = parsed_column_data_type->getSubcolumnType(path); + + if (expected_type && WhichDataType(current_data_type).idx != *expected_type) + throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, + "Got wrong data type for key {} in manifest file {}, expected {}, got {}", + path, manifest_file_path, *expected_type, WhichDataType(current_data_type).idx); + Field result; + current_column->get(row_num, result); + return result; +} + +std::optional AvroForIcebergDeserializer::tryGetAvroMetadataValue(std::string metadata_key) const +{ + auto it = metadata.find(metadata_key); + if (it == metadata.end()) + return std::nullopt; + + return std::string{it->second.begin(), it->second.end()}; +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.h new file mode 100644 index 000000000000..cbef423c14ee --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.h @@ -0,0 +1,57 @@ +#pragma once + +#include "config.h" + +#if USE_AVRO + +#include +#include +#include +#include + +#include + +namespace Iceberg +{ + +/// In Iceberg manifest files and manifest lists are store in Avro format: https://avro.apache.org/ +/// This format is some kind of mix between JSON and binary schemaful format like protobuf. +/// It has rich types system, with it's own binary representation and it's really tricky +/// to parse some of them. +/// +/// In ClickHouse we already support avro as input format, so we can parse it. 
The main complexity +/// comes from the fact that we parse Avro files into nested Tuple column which are really hard +/// to operate in key-value fashion. That is why this class is written on top of our avro parser. +/// It allows to access files in avro files using syntax like "data_file.partition.XXXX" and return +/// Field values back. Also manages avro file metadata which is basically just mapping string -> string. +class AvroForIcebergDeserializer +{ +private: + std::unique_ptr buffer; + std::string manifest_file_path; + DB::ColumnPtr parsed_column; + std::shared_ptr parsed_column_data_type; + + std::map> metadata; +public: + + AvroForIcebergDeserializer( + std::unique_ptr buffer_, + const std::string & manifest_file_path_, + const DB::FormatSettings & format_settings); + + size_t rows() const; + + /// Allow to access avro paths like "a.b.c" + bool hasPath(const std::string & path) const; + DB::TypeIndex getTypeForPath(const std::string & path) const; + /// Allow to access avro paths like "a.b.c". + /// If expected type is provided will throw an exception if types don't match + DB::Field getValueFromRowByName(size_t row_num, const std::string & path, std::optional expected_type = std::nullopt) const; + + std::optional tryGetAvroMetadataValue(std::string metadata_key) const; +}; + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 68e055b0cb69..895583dcc24f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1,46 +1,46 @@ -#include -#include "base/types.h" -#include "Core/NamesAndTypes.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" #include "config.h" #if USE_AVRO -#include -#include -#include #include +#include #include #include #include #include -#include + #include #include -#include #include -#include "Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" +#include +#include -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include +#include #include +#include +#include #include namespace ProfileEvents { -extern const Event IcebergPartitionPrunnedFiles; + extern const Event IcebergPartitionPrunnedFiles; } + namespace DB { +namespace StorageObjectStorageSetting +{ + extern const StorageObjectStorageSettingsString iceberg_metadata_file_path; +} + namespace ErrorCodes { extern const int FILE_DOESNT_EXIST; -extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int ICEBERG_SPECIFICATION_VIOLATION; @@ -69,19 +69,16 @@ constexpr const char * SNAPSHOTS_FIELD = "snapshots"; std::pair -parseTableSchemaFromManifestFile(const avro::DataFileReaderBase & manifest_file_reader, const String & manifest_file_name) +parseTableSchemaFromManifestFile(const AvroForIcebergDeserializer & deserializer, const String & manifest_file_name) { - auto avro_metadata = manifest_file_reader.metadata(); - auto avro_schema_it = avro_metadata.find("schema"); - if (avro_schema_it == avro_metadata.end()) + auto schema_json_string = deserializer.tryGetAvroMetadataValue("schema"); + if (!schema_json_string.has_value()) throw Exception( ErrorCodes::BAD_ARGUMENTS, "Cannot read Iceberg table: manifest file '{}' doesn't have table schema in its metadata", manifest_file_name); - 
std::vector schema_json = avro_schema_it->second; - String schema_json_string = String(reinterpret_cast(schema_json.data()), schema_json.size()); Poco::JSON::Parser parser; - Poco::Dynamic::Var json = parser.parse(schema_json_string); + Poco::Dynamic::Var json = parser.parse(*schema_json_string); const Poco::JSON::Object::Ptr & schema_object = json.extract(); Int32 schema_object_id = schema_object->getValue("schema-id"); return {schema_object_id, schema_object}; @@ -454,69 +451,20 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co if (configuration_ptr == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); - auto context = getContext(); StorageObjectStorage::ObjectInfo object_info(filename); - auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); - - auto manifest_list_file_reader - = std::make_unique(std::make_unique(*manifest_list_buf)); + auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, getContext(), log); + AvroForIcebergDeserializer manifest_list_deserializer(std::move(manifest_list_buf), filename, getFormatSettings(getContext())); - auto [name_to_index, name_to_data_type, header] = getColumnsAndTypesFromAvroByNames( - manifest_list_file_reader->dataSchema().root(), - {MANIFEST_FILE_PATH_COLUMN, SEQUENCE_NUMBER_COLUMN}, - {avro::Type::AVRO_STRING, avro::Type::AVRO_LONG}); - - if (name_to_index.find(MANIFEST_FILE_PATH_COLUMN) == name_to_index.end()) - throw Exception( - DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, - "Required columns are not found in manifest file: {}", - MANIFEST_FILE_PATH_COLUMN); - if (format_version > 1 && name_to_index.find(SEQUENCE_NUMBER_COLUMN) == name_to_index.end()) - throw Exception( - DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, - "Required columns are not found in manifest file: `{}`", - SEQUENCE_NUMBER_COLUMN); - - - auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); - const auto & manifest_path_col = columns.at(name_to_index.at(MANIFEST_FILE_PATH_COLUMN)); - - std::optional sequence_number_column = std::nullopt; - if (format_version > 1) - { - if (columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN))->getDataType() != TypeIndex::Int64) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Int64 type, got `{}`", - SEQUENCE_NUMBER_COLUMN, - columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN))->getFamilyName()); - } - sequence_number_column = assert_cast(columns.at(name_to_index.at(SEQUENCE_NUMBER_COLUMN)).get()); - } - - if (manifest_path_col->getDataType() != TypeIndex::String) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be String type, got `{}`", - MANIFEST_FILE_PATH_COLUMN, - manifest_path_col->getFamilyName()); - } - - const auto * manifest_path_col_str = typeid_cast(manifest_path_col.get()); ManifestList manifest_list; - - for (size_t i = 0; i < manifest_path_col_str->size(); ++i) + for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) { - const std::string_view file_path = manifest_path_col_str->getDataAt(i).toView(); + const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, MANIFEST_FILE_PATH_COLUMN, TypeIndex::String).safeGet(); const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location); Int64 
added_sequence_number = 0; if (format_version > 1) - { - added_sequence_number = sequence_number_column.value()->getInt(i); - } + added_sequence_number = manifest_list_deserializer.getValueFromRowByName(i, SEQUENCE_NUMBER_COLUMN, TypeIndex::Int64).safeGet(); + /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file, but also an inherited sequence number which is known only during the parsing of ManifestList auto manifest_file_content = initializeManifestFile(manifest_file_name, added_sequence_number); auto [iterator, _inserted] = manifest_files_by_name.emplace(manifest_file_name, std::move(manifest_file_content)); @@ -538,21 +486,18 @@ ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filen ObjectInfo manifest_object_info(filename); auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, getContext(), log); - auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); - auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(*manifest_file_reader, filename); + AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), filename, getFormatSettings(getContext())); + auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(manifest_file_deserializer, filename); schema_processor.addIcebergTableSchema(schema_object); - auto manifest_file_impl = std::make_unique( - std::move(manifest_file_reader), + return ManifestFileContent( + manifest_file_deserializer, format_version, configuration_ptr->getPath(), - getFormatSettings(getContext()), schema_id, schema_processor, inherited_sequence_number, table_location, getContext()); - - return ManifestFileContent(std::move(manifest_file_impl)); } ManifestFileIterator IcebergMetadata::getManifestFile(const String & filename) const diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 76dc41db76a7..5790f1d132d9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -13,9 +13,9 @@ #include #include -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include +#include +#include #include diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 46953627c92c..19e6cbb49f0d 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -2,25 +2,21 @@ #if USE_AVRO -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" +#include +#include +#include -#include -#include -#include #include -#include "DataTypes/DataTypeTuple.h" #include #include -#include #include +#include namespace DB::ErrorCodes { -extern const int ILLEGAL_COLUMN; -extern const int UNSUPPORTED_METHOD; -extern const int ICEBERG_SPECIFICATION_VIOLATION; -extern const int LOGICAL_ERROR; + extern const int UNSUPPORTED_METHOD; + extern const int ICEBERG_SPECIFICATION_VIOLATION; + extern const int LOGICAL_ERROR; } namespace Iceberg @@ -30,34 +26,27 @@ constexpr const char * COLUMN_STATUS_NAME = "status"; constexpr const char * COLUMN_TUPLE_DATA_FILE_NAME = "data_file"; 
constexpr const char * COLUMN_SEQ_NUMBER_NAME = "sequence_number"; -constexpr const char * SUBCOLUMN_FILE_PATH_NAME = "file_path"; -constexpr const char * SUBCOLUMN_CONTENT_NAME = "content"; -constexpr const char * SUBCOLUMN_PARTITION_NAME = "partition"; +constexpr const char * SUBCOLUMN_FILE_PATH_NAME = "data_file.file_path"; +constexpr const char * SUBCOLUMN_CONTENT_NAME = "data_file.content"; +constexpr const char * SUBCOLUMN_PARTITION_NAME = "data_file.partition"; const std::vector & ManifestFileContent::getFiles() const { - return impl->files; + return files; } Int32 ManifestFileContent::getSchemaId() const { - return impl->schema_id; -} - - -ManifestFileContent::ManifestFileContent(std::unique_ptr impl_) : impl(std::move(impl_)) -{ + return schema_id; } using namespace DB; - -ManifestFileContentImpl::ManifestFileContentImpl( - std::unique_ptr manifest_file_reader_, +ManifestFileContent::ManifestFileContent( + const AvroForIcebergDeserializer & manifest_file_deserializer, Int32 format_version_, const String & common_path, - const DB::FormatSettings & format_settings, Int32 schema_id_, const IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, @@ -66,101 +55,26 @@ ManifestFileContentImpl::ManifestFileContentImpl( { this->schema_id = schema_id_; - avro::NodePtr root_node = manifest_file_reader_->dataSchema().root(); - - auto [name_to_index, name_to_data_type, manifest_file_header] = getColumnsAndTypesFromAvroByNames( - root_node, - {COLUMN_STATUS_NAME, COLUMN_TUPLE_DATA_FILE_NAME, COLUMN_SEQ_NUMBER_NAME}, - {avro::Type::AVRO_INT, avro::Type::AVRO_RECORD, avro::Type::AVRO_UNION}); - for (const auto & column_name : {COLUMN_STATUS_NAME, COLUMN_TUPLE_DATA_FILE_NAME}) { - if (name_to_index.find(column_name) == name_to_index.end()) + if (!manifest_file_deserializer.hasPath(column_name)) throw Exception( DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: {}", column_name); } - if (format_version_ > 1 && name_to_index.find(COLUMN_SEQ_NUMBER_NAME) == name_to_index.end()) + if (format_version_ > 1 && !manifest_file_deserializer.hasPath(COLUMN_SEQ_NUMBER_NAME)) throw Exception( ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Required columns are not found in manifest file: {}", COLUMN_SEQ_NUMBER_NAME); - auto columns = parseAvro(*manifest_file_reader_, manifest_file_header, format_settings); - if (columns.at(name_to_index.at(COLUMN_STATUS_NAME))->getDataType() != TypeIndex::Int32) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Int32 type, got {}", - COLUMN_STATUS_NAME, - columns.at(name_to_index.at(COLUMN_STATUS_NAME))->getFamilyName()); - } - if (columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME))->getDataType() != TypeIndex::Tuple) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Tuple type, got {}", - COLUMN_TUPLE_DATA_FILE_NAME, - magic_enum::enum_name(columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME))->getDataType())); - } - - const auto * status_int_column = assert_cast(columns.at(name_to_index.at(COLUMN_STATUS_NAME)).get()); - - const auto & data_file_tuple_type = assert_cast(*name_to_data_type.at(COLUMN_TUPLE_DATA_FILE_NAME).get()); - const auto * data_file_tuple_column = assert_cast(columns.at(name_to_index.at(COLUMN_TUPLE_DATA_FILE_NAME)).get()); - - ColumnPtr file_path_column = 
data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_FILE_PATH_NAME)); - - if (file_path_column->getDataType() != TypeIndex::String) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be String type, got {}", - SUBCOLUMN_FILE_PATH_NAME, - magic_enum::enum_name(file_path_column->getDataType())); - } - - const auto * file_path_string_column = assert_cast(file_path_column.get()); - - ColumnPtr content_column; - const ColumnInt32 * content_int_column = nullptr; - if (format_version_ > 1) - { - content_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_CONTENT_NAME)); - if (content_column->getDataType() != TypeIndex::Int32) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Int type, got {}", - SUBCOLUMN_CONTENT_NAME, - magic_enum::enum_name(content_column->getDataType())); - } - - content_int_column = assert_cast(content_column.get()); - } - - Poco::JSON::Parser parser; - ColumnPtr big_partition_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName(SUBCOLUMN_PARTITION_NAME)); - if (big_partition_column->getDataType() != TypeIndex::Tuple) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Tuple type, got {}", - SUBCOLUMN_PARTITION_NAME, - magic_enum::enum_name(big_partition_column->getDataType())); - } - const auto * big_partition_tuple = assert_cast(big_partition_column.get()); + auto partition_spec_json_string = manifest_file_deserializer.tryGetAvroMetadataValue("partition-spec"); + if (!partition_spec_json_string.has_value()) + throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "No partition-spec in iceberg manifest file"); - auto avro_metadata = manifest_file_reader_->metadata(); - - std::vector partition_spec_json_bytes = avro_metadata["partition-spec"]; - String partition_spec_json_string - = String(reinterpret_cast(partition_spec_json_bytes.data()), partition_spec_json_bytes.size()); - - Poco::Dynamic::Var partition_spec_json = parser.parse(partition_spec_json_string); + Poco::Dynamic::Var partition_spec_json = parser.parse(*partition_spec_json_string); const Poco::JSON::Array::Ptr & partition_specification = partition_spec_json.extract(); - std::vector partition_columns; DB::NamesAndTypesList partition_columns_description; std::shared_ptr partition_key_ast = std::make_shared(); partition_key_ast->name = "tuple"; @@ -183,56 +97,45 @@ ManifestFileContentImpl::ManifestFileContentImpl( partition_key_ast->arguments->children.emplace_back(std::move(partition_ast)); partition_columns_description.emplace_back(numeric_column_name, removeNullable(manifest_file_column_characteristics.type)); - partition_columns.push_back(removeNullable(big_partition_tuple->getColumnPtr(i))); this->partition_column_ids.push_back(source_id); } if (!partition_column_ids.empty()) this->partition_key_description.emplace(DB::KeyDescription::getKeyFromAST(std::move(partition_key_ast), ColumnsDescription(partition_columns_description), context)); - std::optional sequence_number_column = std::nullopt; - if (format_version_ > 1) - { - if (columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType() != TypeIndex::Nullable) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Nullable type, got {}", - COLUMN_SEQ_NUMBER_NAME, - 
magic_enum::enum_name(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME))->getDataType())); - } - sequence_number_column = assert_cast(columns.at(name_to_index.at(COLUMN_SEQ_NUMBER_NAME)).get()); - if (sequence_number_column.value()->getNestedColumnPtr()->getDataType() != TypeIndex::Int64) - { - throw Exception( - DB::ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `{}` field should be Int64 type, got {}", - COLUMN_SEQ_NUMBER_NAME, - magic_enum::enum_name(sequence_number_column.value()->getNestedColumnPtr()->getDataType())); - } - } - - for (size_t i = 0; i < data_file_tuple_column->size(); ++i) + for (size_t i = 0; i < manifest_file_deserializer.rows(); ++i) { FileContentType content_type = FileContentType::DATA; if (format_version_ > 1) { - content_type = FileContentType(content_int_column->getElement(i)); + content_type = FileContentType(manifest_file_deserializer.getValueFromRowByName(i, SUBCOLUMN_CONTENT_NAME, TypeIndex::Int32).safeGet()); if (content_type != FileContentType::DATA) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, "Cannot read Iceberg table: positional and equality deletes are not supported"); } - const auto status = ManifestEntryStatus(status_int_column->getInt(i)); - - const auto file_path = getProperFilePathFromMetadataInfo(file_path_string_column->getDataAt(i).toView(), common_path, table_location); + const auto status = ManifestEntryStatus(manifest_file_deserializer.getValueFromRowByName(i, COLUMN_STATUS_NAME, TypeIndex::Int32).safeGet()); + + const auto file_path = getProperFilePathFromMetadataInfo(manifest_file_deserializer.getValueFromRowByName(i, SUBCOLUMN_FILE_PATH_NAME, TypeIndex::String).safeGet(), common_path, table_location); + + /// NOTE: This is weird, because in manifest file partition looks like this: + /// { + /// ... + /// "data_file": { + /// "partition": { + /// "total_amount_trunc": { + /// "decimal_10_2": "\u0000\u0000\u0000\u0013" + /// } + /// }, + /// .... + /// However, somehow parser ignores all these nested keys like "total_amount_trunc" or "decimal_10_2" and + /// directly returns tuple of partition values. However it's exactly what we need. 
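+        /// In other words, the value read below already arrives as a flat Tuple whose elements are
+        /// the partition values themselves; we only copy them one by one into a DB::Row that is
+        /// later used together with the partition key description for partition pruning.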
+ Field partition_value = manifest_file_deserializer.getValueFromRowByName(i, SUBCOLUMN_PARTITION_NAME); + auto tuple = partition_value.safeGet(); DB::Row partition_key_value; - for (const auto & partition_column : partition_columns) - { - Field partition_value; - partition_column->get(i, partition_value); - partition_key_value.emplace_back(partition_value); - } + for (const auto & value : tuple) + partition_key_value.emplace_back(value); + FileEntry file = FileEntry{DataFileEntry{file_path}}; Int64 added_sequence_number = 0; @@ -244,13 +147,16 @@ ManifestFileContentImpl::ManifestFileContentImpl( added_sequence_number = inherited_sequence_number; break; case ManifestEntryStatus::EXISTING: - if (sequence_number_column.value()->isNullAt(i)) + { + auto value = manifest_file_deserializer.getValueFromRowByName(i, COLUMN_SEQ_NUMBER_NAME); + if (value.isNull()) throw Exception( DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Data sequence number is null for the file added in another snapshot"); else - added_sequence_number = sequence_number_column.value()->getInt(i); + added_sequence_number = value.safeGet(); break; + } case ManifestEntryStatus::DELETED: added_sequence_number = inherited_sequence_number; break; @@ -262,19 +168,19 @@ ManifestFileContentImpl::ManifestFileContentImpl( bool ManifestFileContent::hasPartitionKey() const { - return !impl->partition_column_ids.empty(); + return !partition_column_ids.empty(); } const DB::KeyDescription & ManifestFileContent::getPartitionKeyDescription() const { if (!hasPartitionKey()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Table has no partition key, but it was requested"); - return *(impl->partition_key_description); + return *(partition_key_description); } const std::vector & ManifestFileContent::getPartitionKeyColumnIDs() const { - return impl->partition_column_ids; + return partition_column_ids; } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 14b3dd82294a..8d0c1482c4d9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -6,6 +6,8 @@ #if USE_AVRO #include +#include +#include #include #include @@ -45,10 +47,44 @@ struct ManifestFileEntry DB::Row partition_key_value; }; +/** + * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' + * + * `manifest file` is different in format version V1 and V2 and has the following contents: + * v1 v2 + * status req req + * snapshot_id req opt + * sequence_number opt + * file_sequence_number opt + * data_file req req + * Example format version V1: + * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ + * 
└────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * Example format version V2: + * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ + * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * In case of partitioned data we'll have extra directory partition=value: + * ─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ + * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + */ + class ManifestFileContent { public: - explicit ManifestFileContent(std::unique_ptr impl_); + explicit ManifestFileContent( + const AvroForIcebergDeserializer & manifest_file_deserializer, + Int32 format_version_, + const String & common_path, + Int32 schema_id_, + const DB::IcebergSchemaProcessor & schema_processor, + Int64 inherited_sequence_number, + const std::string & table_location, + DB::ContextPtr context); const std::vector & getFiles() const; Int32 getSchemaId() const; @@ -57,9 +93,15 @@ class ManifestFileContent const DB::KeyDescription & getPartitionKeyDescription() const; const std::vector & getPartitionKeyColumnIDs() const; private: - std::unique_ptr impl; -}; + Int32 schema_id; + + std::optional partition_key_description; + std::vector partition_column_ids; + // Size - number of 
files + std::vector files; + +}; using ManifestFilesStorage = std::map; using ManifestFileIterator = IteratorWrapper; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h deleted file mode 100644 index f96ea285483e..000000000000 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AVRO - -#include -#include -#include -#include - -#include "SchemaProcessor.h" - -namespace Iceberg -{ - -/** - * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' - * - * `manifest file` is different in format version V1 and V2 and has the following contents: - * v1 v2 - * status req req - * snapshot_id req opt - * sequence_number opt - * file_sequence_number opt - * data_file req req - * Example format version V1: - * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ - * └────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * Example format version V2: - * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ - * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * In case of partitioned data we'll have extra directory partition=value: - * ─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ 
('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ - * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - */ -struct ManifestFileContentImpl -{ -public: - explicit ManifestFileContentImpl( - std::unique_ptr manifest_file_reader_, - Int32 format_version_, - const String & common_path, - const DB::FormatSettings & format_settings, - Int32 schema_id_, - const DB::IcebergSchemaProcessor & schema_processor, - Int64 inherited_sequence_number, - const std::string & table_location, - DB::ContextPtr context); - - Int32 schema_id; - - std::optional partition_key_description; - std::vector partition_column_ids; - // Size - number of files - std::vector files; -}; - -} - -#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index c4fc63e7f9ba..38d6829a13c3 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -13,9 +13,9 @@ using namespace DB; namespace DB::ErrorCodes { -extern const int ICEBERG_SPECIFICATION_VIOLATION; -extern const int BAD_TYPE_OF_FIELD; + extern const int BAD_ARGUMENTS; + } namespace Iceberg @@ -23,31 +23,6 @@ namespace Iceberg using namespace DB; - -MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & header, const FormatSettings & settings) -{ - auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); - MutableColumns columns = header.cloneEmptyColumns(); - - file_reader.init(); - RowReadExtension ext; - while (file_reader.hasMore()) - { - file_reader.decr(); - deserializer->deserializeRow(columns, file_reader.decoder(), ext); - } - - for (size_t i = 0; i < columns.size(); ++i) - { - if (columns[0]->size() != columns[i]->size()) - { - throw Exception(DB::ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "All columns should have the same size"); - } - } - return columns; -} - - // This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files. // For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro // Common path should end with "" or "/". 
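A minimal sketch of the path normalization described in the comment above, assuming only the substring search it mentions; the helper name properFilePathSketch is illustrative, and the real getProperFilePathFromMetadataInfo additionally takes the table location into account:

#include <stdexcept>
#include <string>
#include <string_view>

/// Locate the table's common path inside the full blob path and return the path starting from it,
/// e.g. "s3://bucket/table_name/data/00000-1-1234567890.avro" with common path "table_name/"
/// yields "table_name/data/00000-1-1234567890.avro".
std::string properFilePathSketch(std::string_view data_path, std::string_view common_path)
{
    auto pos = data_path.find(common_path);
    if (pos == std::string_view::npos)
        throw std::runtime_error("Common path not found in data path");
    return std::string(data_path.substr(pos));
}
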
@@ -111,65 +86,6 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s } } -std::tuple getColumnsAndTypesFromAvroByNames( - avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types) -{ - NameToIndex name_to_index; - NameToDataType name_to_data_type; - - std::unordered_map> initial_index_by_name; - for (const auto & name : names) - { - initial_index_by_name.insert({name, std::nullopt}); - } - - size_t leaves_num = root_node->leaves(); - for (size_t i = 0; i < leaves_num; ++i) - { - const auto & name = root_node->nameAt(static_cast(i)); - - if (initial_index_by_name.find(name) != initial_index_by_name.end()) - initial_index_by_name[name] = i; - } - - - size_t current_new_index = 0; - ColumnsWithTypeAndName columns_to_add = {}; - for (size_t i = 0; i < names.size(); ++i) - { - const auto & name = names[i]; - if (initial_index_by_name.at(name).has_value()) - { - name_to_index.insert({name, current_new_index++}); - const auto node = root_node->leafAt(static_cast(initial_index_by_name.at(name).value())); - const size_t initial_index = initial_index_by_name.at(name).value(); - if (node->type() != expected_types.at(i)) - { - throw Exception( - ErrorCodes::BAD_TYPE_OF_FIELD, - "The parsed column from Avro file of `{}` field should be {} type, got {}", - name, - magic_enum::enum_name(expected_types[initial_index]), - magic_enum::enum_name(node->type())); - } - name_to_data_type.insert({name, AvroSchemaReader::avroNodeToDataType(node)}); - columns_to_add.push_back(ColumnWithTypeAndName{name_to_data_type.at(name)->createColumn(), name_to_data_type.at(name), name}); - } - } - - return std::make_tuple(name_to_index, name_to_data_type, Block{columns_to_add}); -} - -void checkColumnType(const DB::ColumnPtr & column, DB::TypeIndex expected_type_index) -{ - if (column->getDataType() != expected_type_index) - throw Exception( - ErrorCodes::BAD_TYPE_OF_FIELD, - "The parsed column from Avro file should be {} type, got {}", - magic_enum::enum_name(expected_type_index), - column->getFamilyName()); -} - } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index 3166a7fb6ab5..432751be8832 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -2,25 +2,16 @@ #include "config.h" -#if USE_AVRO +#include +#include -#include +#if USE_AVRO namespace Iceberg { -using NameToIndex = std::unordered_map; -using NameToDataType = std::unordered_map; - -DB::MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const DB::Block & header, const DB::FormatSettings & settings); - - std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location); -std::tuple getColumnsAndTypesFromAvroByNames( - avro::NodePtr root_node, const std::vector & names, const std::vector & expected_types); } -void checkColumnType(const DB::ColumnPtr & column, DB::TypeIndex expected_type_index); - #endif From bc10f91a616b5e90d5a7573d99104cf3a5b6808f Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 24 Mar 2025 15:51:34 +0000 Subject: [PATCH 06/14] Merge pull request #78090 from ClickHouse/trivial_count_for_iceberg Trivial count optimization for iceberg --- src/Common/ProfileEvents.cpp | 1 + src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- .../ServerAsynchronousMetrics.cpp | 6 +- src/Planner/PlannerJoinTree.cpp | 2 +- 
.../QueryPlan/Optimizations/optimizeJoin.cpp | 2 +- src/Storages/IStorage.h | 4 +- .../DataLakes/DataLakeConfiguration.h | 10 +- .../DataLakes/IDataLakeMetadata.h | 2 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 81 +++++++++++++++- .../DataLakes/Iceberg/IcebergMetadata.h | 4 +- .../DataLakes/Iceberg/ManifestFile.cpp | 97 ++++++++++++++++++- .../DataLakes/Iceberg/ManifestFile.h | 15 +++ .../DataLakes/Iceberg/Snapshot.h | 2 + .../ObjectStorage/StorageObjectStorage.cpp | 13 +++ .../ObjectStorage/StorageObjectStorage.h | 5 + .../RocksDB/StorageEmbeddedRocksDB.cpp | 6 +- src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 4 +- src/Storages/StorageBuffer.cpp | 6 +- src/Storages/StorageBuffer.h | 4 +- src/Storages/StorageDistributed.cpp | 8 +- src/Storages/StorageDistributed.h | 2 +- src/Storages/StorageJoin.cpp | 6 +- src/Storages/StorageJoin.h | 4 +- src/Storages/StorageLog.cpp | 4 +- src/Storages/StorageLog.h | 4 +- src/Storages/StorageMaterializedView.cpp | 8 +- src/Storages/StorageMaterializedView.h | 4 +- src/Storages/StorageMemory.cpp | 4 +- src/Storages/StorageMemory.h | 4 +- src/Storages/StorageMerge.cpp | 8 +- src/Storages/StorageMerge.h | 4 +- src/Storages/StorageMergeTree.cpp | 4 +- src/Storages/StorageMergeTree.h | 4 +- src/Storages/StorageNull.h | 4 +- src/Storages/StorageProxy.h | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 6 +- src/Storages/StorageReplicatedMergeTree.h | 4 +- src/Storages/StorageSet.cpp | 4 +- src/Storages/StorageSet.h | 4 +- src/Storages/StorageStripeLog.cpp | 4 +- src/Storages/StorageStripeLog.h | 4 +- src/Storages/StorageTimeSeries.cpp | 8 +- src/Storages/StorageTimeSeries.h | 4 +- src/Storages/System/StorageSystemTables.cpp | 18 ++-- .../integration/test_storage_iceberg/test.py | 24 ++++- 46 files changed, 332 insertions(+), 92 deletions(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 1bdcec97d1cf..2214fb8dc92a 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -219,6 +219,7 @@ M(ExternalJoinUncompressedBytes, "Amount of data (uncompressed, before compression) written for JOIN in external memory.", ValueType::Bytes) \ \ M(IcebergPartitionPrunnedFiles, "Number of skipped files during Iceberg partition pruning", ValueType::Number) \ + M(IcebergTrivialCountOptimizationApplied, "Trivial count optimization applied while reading from Iceberg", ValueType::Number) \ M(JoinBuildTableRowCount, "Total number of rows in the build table for a JOIN operation.", ValueType::Number) \ M(JoinProbeTableRowCount, "Total number of rows in the probe table for a JOIN operation.", ValueType::Number) \ M(JoinResultRowCount, "Total number of rows in the result of a JOIN operation.", ValueType::Number) \ diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index f8129b66ac38..2cf2e06fed78 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -151,7 +151,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ContextPtr & context_, AS const auto & settings = getContext()->getSettingsRef(); if (query.if_empty) { - if (auto rows = table->totalRows(settings); rows > 0) + if (auto rows = table->totalRows(getContext()); rows > 0) throw Exception(ErrorCodes::TABLE_NOT_EMPTY, "Table {} is not empty", backQuoteIfNeed(table_id.table_name)); } checkStorageSupportsTransactionsIfNeeded(table, context_); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 
f17f63a7e6b9..619944e40caa 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2481,7 +2481,7 @@ std::optional InterpreterSelectQuery::getTrivialCount(UInt64 allow_exper /// require reading some data (but much faster than reading columns). /// Set a special flag in query info so the storage will see it and optimize count in read() method. query_info.optimize_trivial_count = optimize_trivial_count; - return storage->totalRows(settings); + return storage->totalRows(context); } // It's possible to optimize count() given only partition predicates diff --git a/src/Interpreters/ServerAsynchronousMetrics.cpp b/src/Interpreters/ServerAsynchronousMetrics.cpp index c29c227694e1..c1c6d46cad65 100644 --- a/src/Interpreters/ServerAsynchronousMetrics.cpp +++ b/src/Interpreters/ServerAsynchronousMetrics.cpp @@ -328,12 +328,10 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr if (MergeTreeData * table_merge_tree = dynamic_cast(table.get())) { - const auto & settings = getContext()->getSettingsRef(); - calculateMax(max_part_count_for_partition, table_merge_tree->getMaxPartsCountAndSizeForPartition().first); - size_t bytes = table_merge_tree->totalBytes(settings).value(); - size_t rows = table_merge_tree->totalRows(settings).value(); + size_t bytes = table_merge_tree->totalBytes(getContext()).value(); + size_t rows = table_merge_tree->totalRows(getContext()).value(); size_t parts = table_merge_tree->getActivePartsCount(); total_number_of_bytes += bytes; diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 3aaa1489158f..6a70ca4ffb5c 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -331,7 +331,7 @@ bool applyTrivialCountIfPossible( select_query_info.optimize_trivial_count = true; /// Get number of rows - std::optional num_rows = storage->totalRows(settings); + std::optional num_rows = storage->totalRows(query_context); if (!num_rows) return false; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp b/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp index bdd28e6e9738..5015c433005f 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp @@ -87,7 +87,7 @@ static std::optional estimateReadRowsCount(QueryPlan::Node & node, bool } if (const auto * reading = typeid_cast(step)) - return reading->getStorage()->totalRows(Settings{}); + return reading->getStorage()->totalRows({}); if (node.children.size() != 1) return {}; diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 89d59f6cd4e4..bce8748d299b 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -673,7 +673,7 @@ class IStorage : public std::enable_shared_from_this, public TypePromo /// - For total_rows column in system.tables /// /// Does takes underlying Storage (if any) into account. - virtual std::optional totalRows(const Settings &) const { return {}; } + virtual std::optional totalRows(ContextPtr) const { return {}; } /// Same as above but also take partition predicate into account. virtual std::optional totalRowsByPartitionPredicate(const ActionsDAG &, ContextPtr) const { return {}; } @@ -691,7 +691,7 @@ class IStorage : public std::enable_shared_from_this, public TypePromo /// Memory part should be estimated as a resident memory size. /// In particular, alloctedBytes() is preferable over bytes() /// when considering in-memory blocks. 
- virtual std::optional totalBytes(const Settings &) const { return {}; } + virtual std::optional totalBytes(ContextPtr) const { return {}; } /// If it is possible to quickly determine exact number of uncompressed bytes for the table on storage: /// - disk (uncompressed) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 9e3a1b4566f2..24fbd90e5fc7 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -21,7 +21,6 @@ #include #include -#include #include @@ -86,6 +85,15 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl BaseStorageConfiguration::setPaths(current_metadata->makePartitionPruning(filter_dag)); } + + std::optional totalRows() override + { + if (!current_metadata) + return {}; + + return current_metadata->totalRows(); + } + std::shared_ptr getInitialSchemaByPath(const String & data_path) const override { if (!current_metadata) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index 4160220a4679..f2110c7a7b2e 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -50,6 +50,8 @@ class IDataLakeMetadata : boost::noncopyable /// Whether schema evolution is supported. virtual bool supportsExternalMetadataChange() const { return false; } + virtual std::optional totalRows() const { return {}; } + virtual std::optional totalBytes() const { return {}; } protected: [[noreturn]] void throwNotImplemented(std::string_view method) const { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 895583dcc24f..284491f624c0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -24,10 +24,10 @@ #include #include - namespace ProfileEvents { extern const Event IcebergPartitionPrunnedFiles; + extern const Event IcebergTrivialCountOptimizationApplied; } namespace DB @@ -321,10 +321,24 @@ void IcebergMetadata::updateSnapshot() "No manifest list found for snapshot id `{}` for iceberg table `{}`", relevant_snapshot_id, configuration_ptr->getPath()); + std::optional total_rows; + std::optional total_bytes; + + if (snapshot->has("summary")) + { + auto summary_object = snapshot->get("summary").extract(); + if (summary_object->has("total-records")) + total_rows = summary_object->getValue("total-records"); + + if (summary_object->has("total-files-size")) + total_bytes = summary_object->getValue("total-files-size"); + } + relevant_snapshot = IcebergSnapshot{ getManifestList(getProperFilePathFromMetadataInfo( snapshot->getValue(MANIFEST_LIST_PATH_FIELD), configuration_ptr->getPath(), table_location)), - relevant_snapshot_id}; + relevant_snapshot_id, total_rows, total_bytes}; + if (!snapshot->has("schema-id")) throw Exception( ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, @@ -574,6 +588,69 @@ Strings IcebergMetadata::makePartitionPruning(const ActionsDAG & filter_dag) } return getDataFilesImpl(&filter_dag); } + +std::optional IcebergMetadata::totalRows() const +{ + auto configuration_ptr = configuration.lock(); + if (!configuration_ptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); + + if (!relevant_snapshot) + { + 
ProfileEvents::increment(ProfileEvents::IcebergTrivialCountOptimizationApplied); + return 0; + } + + /// All these "hints" with total rows or bytes are optional both in + /// metadata files and in manifest files, so we try all of them one by one + if (relevant_snapshot->total_rows.has_value()) + { + ProfileEvents::increment(ProfileEvents::IcebergTrivialCountOptimizationApplied); + return relevant_snapshot->total_rows; + } + + Int64 result = 0; + for (const auto & manifest_list_entry : *(relevant_snapshot->manifest_list)) + { + auto count = manifest_list_entry->getRowsCountInAllDataFilesExcludingDeleted(); + if (!count.has_value()) + return {}; + + result += count.value(); + } + + ProfileEvents::increment(ProfileEvents::IcebergTrivialCountOptimizationApplied); + return result; +} + + +std::optional IcebergMetadata::totalBytes() const +{ + auto configuration_ptr = configuration.lock(); + if (!configuration_ptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); + + if (!relevant_snapshot) + return 0; + + /// All these "hints" with total rows or bytes are optional both in + /// metadata files and in manifest files, so we try all of them one by one + if (relevant_snapshot->total_bytes.has_value()) + return relevant_snapshot->total_bytes; + + Int64 result = 0; + for (const auto & manifest_list_entry : *(relevant_snapshot->manifest_list)) + { + auto count = manifest_list_entry->getBytesCountInAllDataFiles(); + if (!count.has_value()) + return {}; + + result += count.value(); + } + + return result; +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 5790f1d132d9..f4f355bb6ba7 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -64,7 +64,6 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext const ContextPtr & local_context, bool allow_experimental_delta_kernel_rs); - std::shared_ptr getInitialSchemaByPath(const String & data_path) const override { auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); @@ -92,6 +91,9 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext bool supportsPartitionPruning() override { return true; } + std::optional totalRows() const override; + std::optional totalBytes() const override; + private: using ManifestEntryByDataFile = std::unordered_map; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 19e6cbb49f0d..b9a968f06db6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB::ErrorCodes { @@ -30,6 +31,12 @@ constexpr const char * SUBCOLUMN_FILE_PATH_NAME = "data_file.file_path"; constexpr const char * SUBCOLUMN_CONTENT_NAME = "data_file.content"; constexpr const char * SUBCOLUMN_PARTITION_NAME = "data_file.partition"; +constexpr const char * SUBCOLUMN_VALUES_COUNT_NAME = "data_file.value_counts"; +constexpr const char * SUBCOLUMN_COLUMN_SIZES_NAME = "data_file.column_sizes"; +constexpr const char * SUBCOLUMN_NULL_VALUE_COUNTS_NAME = "data_file.null_value_counts"; +constexpr const char * SUBCOLUMN_LOWER_BOUNDS_NAME = "data_file.lower_bounds"; +constexpr const char * SUBCOLUMN_UPPER_BOUNDS_NAME = "data_file.upper_bounds"; + const std::vector & 
ManifestFileContent::getFiles() const { @@ -136,6 +143,46 @@ ManifestFileContent::ManifestFileContent( for (const auto & value : tuple) partition_key_value.emplace_back(value); + std::unordered_map columns_infos; + + for (const auto & path : {SUBCOLUMN_VALUES_COUNT_NAME, SUBCOLUMN_COLUMN_SIZES_NAME, SUBCOLUMN_NULL_VALUE_COUNTS_NAME}) + { + if (manifest_file_deserializer.hasPath(path)) + { + Field values_count = manifest_file_deserializer.getValueFromRowByName(i, path); + for (const auto & column_stats : values_count.safeGet()) + { + const auto & column_number_and_count = column_stats.safeGet(); + Int32 number = column_number_and_count[0].safeGet(); + Int64 count = column_number_and_count[1].safeGet(); + if (path == SUBCOLUMN_VALUES_COUNT_NAME) + columns_infos[number].rows_count = count; + else if (path == SUBCOLUMN_COLUMN_SIZES_NAME) + columns_infos[number].bytes_size = count; + else + columns_infos[number].nulls_count = count; + } + } + } + + for (const auto & path : {SUBCOLUMN_LOWER_BOUNDS_NAME, SUBCOLUMN_UPPER_BOUNDS_NAME}) + { + if (manifest_file_deserializer.hasPath(path)) + { + Field bounds = manifest_file_deserializer.getValueFromRowByName(i, path); + for (const auto & column_stats : bounds.safeGet()) + { + const auto & column_number_and_bound = column_stats.safeGet(); + Int32 number = column_number_and_bound[0].safeGet(); + const Field & bound_value = column_number_and_bound[1]; + if (path == SUBCOLUMN_LOWER_BOUNDS_NAME) + columns_infos[number].lower_bound = bound_value; + else + columns_infos[number].upper_bound = bound_value; + } + } + } + FileEntry file = FileEntry{DataFileEntry{file_path}}; Int64 added_sequence_number = 0; @@ -162,7 +209,7 @@ ManifestFileContent::ManifestFileContent( break; } } - this->files.emplace_back(status, added_sequence_number, file, partition_key_value); + this->files.emplace_back(status, added_sequence_number, file, partition_key_value, columns_infos); } } @@ -183,6 +230,54 @@ const std::vector & ManifestFileContent::getPartitionKeyColumnIDs() const return partition_column_ids; } +std::optional ManifestFileContent::getRowsCountInAllDataFilesExcludingDeleted() const +{ + Int64 result = 0; + for (const auto & file : files) + { + /// Have at least one column with rows count + bool found = false; + for (const auto & [column, column_info] : file.columns_infos) + { + if (column_info.rows_count.has_value()) + { + if (file.status != ManifestEntryStatus::DELETED) + result += *column_info.rows_count; + found = true; + break; + } + } + + if (!found) + return std::nullopt; + } + return result; +} + +std::optional ManifestFileContent::getBytesCountInAllDataFiles() const +{ + Int64 result = 0; + for (const auto & file : files) + { + /// Have at least one column with bytes count + bool found = false; + for (const auto & [column, column_info] : file.columns_infos) + { + if (column_info.bytes_size.has_value()) + { + result += *column_info.bytes_size; + found = true; + break; + } + } + + if (!found) + return std::nullopt; + } + return result; + +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 8d0c1482c4d9..8cae2f2deecd 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -35,6 +35,15 @@ struct DataFileEntry String file_name; }; +struct ColumnInfo +{ + std::optional rows_count; + std::optional bytes_size; + std::optional nulls_count; + std::optional lower_bound; + std::optional 
upper_bound; +}; + using FileEntry = std::variant; // In the future we will add PositionalDeleteFileEntry and EqualityDeleteFileEntry here /// Description of Data file in manifest file @@ -45,6 +54,7 @@ struct ManifestFileEntry FileEntry file; DB::Row partition_key_value; + std::unordered_map columns_infos; }; /** @@ -92,6 +102,11 @@ class ManifestFileContent bool hasPartitionKey() const; const DB::KeyDescription & getPartitionKeyDescription() const; const std::vector & getPartitionKeyColumnIDs() const; + + /// Fields with rows count in manifest files are optional + /// they can be absent. + std::optional getRowsCountInAllDataFilesExcludingDeleted() const; + std::optional getBytesCountInAllDataFiles() const; private: Int32 schema_id; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index e6efb1eb7c00..ed5ba39b2e7a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -23,6 +23,8 @@ struct IcebergSnapshot { ManifestListIterator manifest_list_iterator; Int64 snapshot_id; + std::optional total_rows; + std::optional total_bytes; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 97de56e5e2cf..37be3b3c2760 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -214,6 +214,18 @@ void StorageObjectStorage::updateExternalDynamicMetadata(ContextPtr context_ptr) setInMemoryMetadata(metadata); } +std::optional StorageObjectStorage::totalRows(ContextPtr query_context) const +{ + configuration->update(object_storage, query_context); + return configuration->totalRows(); +} + +std::optional StorageObjectStorage::totalBytes(ContextPtr query_context) const +{ + configuration->update(object_storage, query_context); + return configuration->totalBytes(); +} + namespace { class ReadFromObjectStorageStep : public SourceStepWithFilter @@ -686,4 +698,5 @@ void StorageObjectStorage::Configuration::assertInitialized() const throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration was not initialized before usage"); } } + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index ad98b8bfc97d..f04a63021afb 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -138,6 +138,8 @@ class StorageObjectStorage : public IStorage void updateExternalDynamicMetadata(ContextPtr) override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; protected: String getPathSample(ContextPtr context); @@ -220,6 +222,9 @@ class StorageObjectStorage::Configuration virtual void implementPartitionPruning(const ActionsDAG &) { } + virtual std::optional totalRows() { return {}; } + virtual std::optional totalBytes() { return {}; } + virtual bool hasExternalDynamicMetadata() { return false; } virtual std::shared_ptr getInitialSchemaByPath(const String&) const { return {}; } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 0651efcccfb2..d2fc9139970f 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -819,9 +819,9 @@ Chunk StorageEmbeddedRocksDB::getBySerializedKeys( return Chunk(std::move(columns), 
num_rows); } -std::optional StorageEmbeddedRocksDB::totalRows(const Settings & query_settings) const +std::optional StorageEmbeddedRocksDB::totalRows(ContextPtr query_context) const { - if (!query_settings[Setting::optimize_trivial_approximate_count_query]) + if (!query_context->getSettingsRef()[Setting::optimize_trivial_approximate_count_query]) return {}; std::shared_lock lock(rocksdb_ptr_mx); if (!rocksdb_ptr) @@ -832,7 +832,7 @@ std::optional StorageEmbeddedRocksDB::totalRows(const Settings & query_s return estimated_rows; } -std::optional StorageEmbeddedRocksDB::totalBytes(const Settings & /*settings*/) const +std::optional StorageEmbeddedRocksDB::totalBytes(ContextPtr) const { std::shared_lock lock(rocksdb_ptr_mx); if (!rocksdb_ptr) diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 5b2cef648e40..51a6e907b7d6 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -102,9 +102,9 @@ class StorageEmbeddedRocksDB final : public IStorage, public IKeyValueEntity, Wi /// To turn on the optimization optimize_trivial_approximate_count_query=1 should be set for a query. bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } - std::optional totalRows(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalBytes(ContextPtr query_context) const override; void checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const override; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 4345ad8aaf69..3c77fe41b21b 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -1167,16 +1167,16 @@ void StorageBuffer::checkAlterIsPossible(const AlterCommands & commands, Context } } -std::optional StorageBuffer::totalRows(const Settings & settings) const +std::optional StorageBuffer::totalRows(ContextPtr query_context) const { std::optional underlying_rows; if (auto destination = getDestinationTable()) - underlying_rows = destination->totalRows(settings); + underlying_rows = destination->totalRows(query_context); return total_writes.rows + underlying_rows.value_or(0); } -std::optional StorageBuffer::totalBytes(const Settings & /*settings*/) const +std::optional StorageBuffer::totalBytes(ContextPtr) const { return total_writes.bytes; } diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 3341271db4a5..d68fb1ee1a95 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -116,8 +116,8 @@ friend class BufferSink; /// The structure of the subordinate table is not checked and does not change. 
void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & table_lock_holder) override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; std::optional lifetimeRows() const override { return lifetime_writes.rows; } std::optional lifetimeBytes() const override { return lifetime_writes.bytes; } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 0627053bc177..e3b7d6e9d3b9 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -1513,7 +1513,7 @@ Cluster::Addresses StorageDistributed::parseAddresses(const std::string & name) return addresses; } -std::optional StorageDistributed::totalBytes(const Settings &) const +std::optional StorageDistributed::totalBytes(ContextPtr) const { UInt64 total_bytes = 0; for (const auto & status : getDirectoryQueueStatuses()) @@ -1837,7 +1837,7 @@ void StorageDistributed::delayInsertOrThrowIfNeeded() const !(*distributed_settings)[DistributedSetting::bytes_to_delay_insert]) return; - UInt64 total_bytes = *totalBytes(getContext()->getSettingsRef()); + UInt64 total_bytes = *totalBytes(getContext()); if ((*distributed_settings)[DistributedSetting::bytes_to_throw_insert] && total_bytes > (*distributed_settings)[DistributedSetting::bytes_to_throw_insert]) { @@ -1858,12 +1858,12 @@ void StorageDistributed::delayInsertOrThrowIfNeeded() const do { delayed_ms += step_ms; std::this_thread::sleep_for(std::chrono::milliseconds(step_ms)); - } while (*totalBytes(getContext()->getSettingsRef()) > (*distributed_settings)[DistributedSetting::bytes_to_delay_insert] && delayed_ms < (*distributed_settings)[DistributedSetting::max_delay_to_insert]*1000); + } while (*totalBytes(getContext()) > (*distributed_settings)[DistributedSetting::bytes_to_delay_insert] && delayed_ms < (*distributed_settings)[DistributedSetting::max_delay_to_insert]*1000); ProfileEvents::increment(ProfileEvents::DistributedDelayedInserts); ProfileEvents::increment(ProfileEvents::DistributedDelayedInsertsMilliseconds, delayed_ms); - UInt64 new_total_bytes = *totalBytes(getContext()->getSettingsRef()); + UInt64 new_total_bytes = *totalBytes(getContext()); LOG_INFO(log, "Too many bytes pending for async INSERT: was {}, now {}, INSERT was delayed to {} ms", formatReadableSizeWithBinarySuffix(total_bytes), formatReadableSizeWithBinarySuffix(new_total_bytes), diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 0fd6d8fdcc3f..0ca830e2d7ea 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -108,7 +108,7 @@ class StorageDistributed final : public IStorage, WithContext size_t /*num_streams*/) override; bool supportsParallelInsert() const override { return true; } - std::optional totalBytes(const Settings &) const override; + std::optional totalBytes(ContextPtr) const override; SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override; diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index d5fbf4cdb016..c88611faf499 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -328,14 +328,16 @@ size_t StorageJoin::getSize(ContextPtr context) const return join->getTotalRowCount(); } -std::optional 
StorageJoin::totalRows(const Settings &settings) const +std::optional StorageJoin::totalRows(ContextPtr query_context) const { + const auto & settings = query_context->getSettingsRef(); TableLockHolder holder = tryLockTimed(rwlock, RWLockImpl::Read, RWLockImpl::NO_QUERY, settings[Setting::lock_acquire_timeout]); return join->getTotalRowCount(); } -std::optional StorageJoin::totalBytes(const Settings &settings) const +std::optional StorageJoin::totalBytes(ContextPtr query_context) const { + const auto & settings = query_context->getSettingsRef(); TableLockHolder holder = tryLockTimed(rwlock, RWLockImpl::Read, RWLockImpl::NO_QUERY, settings[Setting::lock_acquire_timeout]); return join->getTotalByteCount(); } diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index 10a551b40636..90a982d6484a 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -82,8 +82,8 @@ class StorageJoin final : public StorageSetOrJoinBase size_t max_block_size, size_t num_streams) override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; Block getRightSampleBlock() const { diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 468856938423..5a3247a63c1e 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -987,7 +987,7 @@ void StorageLog::updateTotalRows(const WriteLock &) total_rows = 0; } -std::optional StorageLog::totalRows(const Settings &) const +std::optional StorageLog::totalRows(ContextPtr) const { if (use_marks_file && marks_loaded) return total_rows; @@ -998,7 +998,7 @@ std::optional StorageLog::totalRows(const Settings &) const return {}; } -std::optional StorageLog::totalBytes(const Settings &) const +std::optional StorageLog::totalBytes(ContextPtr) const { return total_bytes; } diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index 882e9cfaa75b..71dd78543d20 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -69,8 +69,8 @@ class StorageLog final : public IStorage, public WithMutableContext bool supportsSubcolumns() const override { return true; } ColumnSizeByName getColumnSizes() const override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr) const override; + std::optional totalBytes(ContextPtr) const override; void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index b955e0320c20..55d06c9d23ab 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -821,22 +821,22 @@ bool StorageMaterializedView::supportsBackupPartition() const return false; } -std::optional StorageMaterializedView::totalRows(const Settings & settings) const +std::optional StorageMaterializedView::totalRows(ContextPtr query_context) const { if (hasInnerTable()) { if (auto table = tryGetTargetTable()) - return table->totalRows(settings); + return table->totalRows(query_context); } return {}; 
} -std::optional StorageMaterializedView::totalBytes(const Settings & settings) const +std::optional StorageMaterializedView::totalBytes(ContextPtr query_context) const { if (hasInnerTable()) { if (auto table = tryGetTargetTable()) - return table->totalBytes(settings); + return table->totalBytes(query_context); } return {}; } diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index b995731b5dae..2eff219fc155 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -98,8 +98,8 @@ class StorageMaterializedView final : public IStorage, WithMutableContext void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; bool supportsBackupPartition() const override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; std::optional totalBytesUncompressed(const Settings & settings) const override; private: diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index da7fb483ef24..2980c7a82156 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -612,14 +612,14 @@ void StorageMemory::checkAlterIsPossible(const AlterCommands & commands, Context } } -std::optional StorageMemory::totalRows(const Settings &) const +std::optional StorageMemory::totalRows(ContextPtr) const { /// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency /// When run concurrently we are fine with any value: "before" or "after" return total_size_rows.load(std::memory_order_relaxed); } -std::optional StorageMemory::totalBytes(const Settings &) const +std::optional StorageMemory::totalBytes(ContextPtr) const { return total_size_bytes.load(std::memory_order_relaxed); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 07cf86a29737..6b2ae22c8728 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -86,8 +86,8 @@ friend class MemorySink; void checkAlterIsPossible(const AlterCommands & commands, ContextPtr local_context) const override; void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & alter_lock_holder) override; - std::optional totalRows(const Settings &) const override; - std::optional totalBytes(const Settings &) const override; + std::optional totalRows(ContextPtr) const override; + std::optional totalBytes(ContextPtr) const override; /** Delays initialization of StorageMemory::read() until the first read is actually happen. 
* Usually, fore code like this: diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 160ea530a935..d485e3283637 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1656,14 +1656,14 @@ bool StorageMerge::supportsTrivialCountOptimization(const StorageSnapshotPtr &, return getFirstTable([&](const auto & table) { return !table->supportsTrivialCountOptimization(nullptr, ctx); }) == nullptr; } -std::optional StorageMerge::totalRows(const Settings & settings) const +std::optional StorageMerge::totalRows(ContextPtr query_context) const { - return totalRowsOrBytes([&](const auto & table) { return table->totalRows(settings); }); + return totalRowsOrBytes([&](const auto & table) { return table->totalRows(query_context); }); } -std::optional StorageMerge::totalBytes(const Settings & settings) const +std::optional StorageMerge::totalBytes(ContextPtr query_context) const { - return totalRowsOrBytes([&](const auto & table) { return table->totalBytes(settings); }); + return totalRowsOrBytes([&](const auto & table) { return table->totalBytes(query_context); }); } template diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 7a8f11204e22..562b3e981e2b 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -81,8 +81,8 @@ class StorageMerge final : public IStorage, WithContext bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; using DatabaseTablesIterators = std::vector; DatabaseTablesIterators getDatabaseIterators(ContextPtr context) const; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 39ce4f0ebafd..6a2e320fbf08 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -308,7 +308,7 @@ void StorageMergeTree::read( query_plan = std::move(*plan); } -std::optional StorageMergeTree::totalRows(const Settings &) const +std::optional StorageMergeTree::totalRows(ContextPtr) const { return getTotalActiveSizeInRows(); } @@ -319,7 +319,7 @@ std::optional StorageMergeTree::totalRowsByPartitionPredicate(const Acti return totalRowsByPartitionPredicateImpl(filter_actions_dag, local_context, parts); } -std::optional StorageMergeTree::totalBytes(const Settings &) const +std::optional StorageMergeTree::totalBytes(ContextPtr) const { return getTotalActiveSizeInBytes(); } diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index eed6d1945acd..e0f512b54f9d 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -64,9 +64,9 @@ class StorageMergeTree final : public MergeTreeData size_t max_block_size, size_t num_streams) override; - std::optional totalRows(const Settings &) const override; + std::optional totalRows(ContextPtr) const override; std::optional totalRowsByPartitionPredicate(const ActionsDAG & filter_actions_dag, ContextPtr) const override; - std::optional totalBytes(const Settings &) const override; + std::optional totalBytes(ContextPtr) const override; std::optional totalBytesUncompressed(const Settings &) const override; UInt64 getNumberOnFlyDataMutations() const override; diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index 
74abf931f8f1..5507b183a834 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -59,11 +59,11 @@ class StorageNull final : public IStorage void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & table_lock_holder) override; - std::optional totalRows(const Settings &) const override + std::optional totalRows(ContextPtr) const override { return {0}; } - std::optional totalBytes(const Settings &) const override + std::optional totalBytes(ContextPtr) const override { return {0}; } diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index 5cd86f7ad2c1..ee68b4d4be05 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -155,8 +155,8 @@ class StorageProxy : public IStorage bool storesDataOnDisk() const override { return getNested()->storesDataOnDisk(); } Strings getDataPaths() const override { return getNested()->getDataPaths(); } StoragePolicyPtr getStoragePolicy() const override { return getNested()->getStoragePolicy(); } - std::optional totalRows(const Settings & settings) const override { return getNested()->totalRows(settings); } - std::optional totalBytes(const Settings & settings) const override { return getNested()->totalBytes(settings); } + std::optional totalRows(ContextPtr query_context) const override { return getNested()->totalRows(query_context); } + std::optional totalBytes(ContextPtr query_context) const override { return getNested()->totalBytes(query_context); } std::optional lifetimeRows() const override { return getNested()->lifetimeRows(); } std::optional lifetimeBytes() const override { return getNested()->lifetimeBytes(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f3717d907843..d4897f2532fc 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5895,8 +5895,9 @@ void StorageReplicatedMergeTree::foreachActiveParts(Func && func, bool select_se } } -std::optional StorageReplicatedMergeTree::totalRows(const Settings & settings) const +std::optional StorageReplicatedMergeTree::totalRows(ContextPtr query_context) const { + const auto & settings = query_context->getSettingsRef(); UInt64 res = 0; foreachActiveParts([&res](auto & part) { res += part->rows_count; }, settings[Setting::select_sequential_consistency]); return res; @@ -5909,8 +5910,9 @@ std::optional StorageReplicatedMergeTree::totalRowsByPartitionPredicate( return totalRowsByPartitionPredicateImpl(filter_actions_dag, local_context, parts); } -std::optional StorageReplicatedMergeTree::totalBytes(const Settings & settings) const +std::optional StorageReplicatedMergeTree::totalBytes(ContextPtr query_context) const { + const auto & settings = query_context->getSettingsRef(); UInt64 res = 0; foreachActiveParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings[Setting::select_sequential_consistency]); return res; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index e09f0cd373f2..14f3f4127d07 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -163,9 +163,9 @@ class StorageReplicatedMergeTree final : public MergeTreeData size_t max_block_size, size_t num_streams) override; - std::optional totalRows(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; std::optional totalRowsByPartitionPredicate(const ActionsDAG & filter_actions_dag, ContextPtr 
context) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalBytes(ContextPtr query_context) const override; std::optional totalBytesUncompressed(const Settings & settings) const override; UInt64 getNumberOnFlyDataMutations() const override; diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index 498b09fd490c..e10bd2b5717c 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -213,7 +213,7 @@ size_t StorageSet::getSize(ContextPtr) const return current_set->getTotalRowCount(); } -std::optional StorageSet::totalRows(const Settings &) const +std::optional StorageSet::totalRows(ContextPtr) const { SetPtr current_set; { @@ -223,7 +223,7 @@ std::optional StorageSet::totalRows(const Settings &) const return current_set->getTotalRowCount(); } -std::optional StorageSet::totalBytes(const Settings &) const +std::optional StorageSet::totalBytes(ContextPtr) const { SetPtr current_set; { diff --git a/src/Storages/StorageSet.h b/src/Storages/StorageSet.h index 5b6d899b48db..fb4cc81ffb5e 100644 --- a/src/Storages/StorageSet.h +++ b/src/Storages/StorageSet.h @@ -82,8 +82,8 @@ class StorageSet final : public StorageSetOrJoinBase void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; private: /// Allows to concurrently truncate the set and work (read/fill) the existing set. diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 0db0f67084fc..522833f1f848 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -538,7 +538,7 @@ void StorageStripeLog::updateTotalRows(const WriteLock &) total_rows = new_total_rows; } -std::optional StorageStripeLog::totalRows(const Settings &) const +std::optional StorageStripeLog::totalRows(ContextPtr) const { if (indices_loaded) return total_rows; @@ -549,7 +549,7 @@ std::optional StorageStripeLog::totalRows(const Settings &) const return {}; } -std::optional StorageStripeLog::totalBytes(const Settings &) const +std::optional StorageStripeLog::totalBytes(ContextPtr) const { return total_bytes; } diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index dc2f8d8be4a1..66a686670187 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -61,8 +61,8 @@ friend class StripeLogSink; void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder&) override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; diff --git a/src/Storages/StorageTimeSeries.cpp b/src/Storages/StorageTimeSeries.cpp index fb4b833a1f42..f3175c78ef46 100644 --- a/src/Storages/StorageTimeSeries.cpp +++ 
b/src/Storages/StorageTimeSeries.cpp @@ -254,7 +254,7 @@ StoragePtr StorageTimeSeries::tryGetTargetTable(ViewTarget::Kind target_kind, co } -std::optional StorageTimeSeries::totalRows(const Settings & settings) const +std::optional StorageTimeSeries::totalRows(ContextPtr query_context) const { UInt64 total_rows = 0; if (has_inner_tables) @@ -267,7 +267,7 @@ std::optional StorageTimeSeries::totalRows(const Settings & settings) co if (!inner_table) return std::nullopt; - auto total_rows_in_inner_table = inner_table->totalRows(settings); + auto total_rows_in_inner_table = inner_table->totalRows(query_context); if (!total_rows_in_inner_table) return std::nullopt; @@ -278,7 +278,7 @@ std::optional StorageTimeSeries::totalRows(const Settings & settings) co return total_rows; } -std::optional StorageTimeSeries::totalBytes(const Settings & settings) const +std::optional StorageTimeSeries::totalBytes(ContextPtr query_context) const { UInt64 total_bytes = 0; if (has_inner_tables) @@ -291,7 +291,7 @@ std::optional StorageTimeSeries::totalBytes(const Settings & settings) c if (!inner_table) return std::nullopt; - auto total_bytes_in_inner_table = inner_table->totalBytes(settings); + auto total_bytes_in_inner_table = inner_table->totalBytes(query_context); if (!total_bytes_in_inner_table) return std::nullopt; diff --git a/src/Storages/StorageTimeSeries.h b/src/Storages/StorageTimeSeries.h index 6be2e9184b70..997c5296cc39 100644 --- a/src/Storages/StorageTimeSeries.h +++ b/src/Storages/StorageTimeSeries.h @@ -85,8 +85,8 @@ class StorageTimeSeries final : public IStorage, WithContext void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; - std::optional totalRows(const Settings & settings) const override; - std::optional totalBytes(const Settings & settings) const override; + std::optional totalRows(ContextPtr query_context) const override; + std::optional totalBytes(ContextPtr query_context) const override; std::optional totalBytesUncompressed(const Settings & settings) const override; Strings getDataPaths() const override; diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index e4f5dc490706..18c90c3e21a5 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -341,7 +341,6 @@ class TablesBlockSource : public ISource if (columns_mask[src_index++]) res_columns[res_index++]->insert(table.second->getName()); - const auto & settings = context->getSettingsRef(); while (src_index < columns_mask.size()) { // total_rows @@ -349,7 +348,7 @@ class TablesBlockSource : public ISource { try { - if (auto total_rows = table.second->totalRows(settings)) + if (auto total_rows = table.second->totalRows(context)) res_columns[res_index++]->insert(*total_rows); else res_columns[res_index++]->insertDefault(); @@ -366,7 +365,7 @@ class TablesBlockSource : public ISource { try { - if (auto total_bytes = table.second->totalBytes(settings)) + if (auto total_bytes = table.second->totalBytes(context)) res_columns[res_index++]->insert(*total_bytes); else res_columns[res_index++]->insertDefault(); @@ -588,13 +587,16 @@ class TablesBlockSource : public ISource res_columns[res_index++]->insertDefault(); } - auto settings = context->getSettingsRef(); - settings[Setting::select_sequential_consistency] = 0; 
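The changes in this patch move totalRows()/totalBytes() from taking a Settings reference to taking the query context. The sketch below is a standalone toy model, not the real ClickHouse classes (ToyReplicatedStorage, the two row counters and the trivial Context/Settings structs are illustrative stand-ins): it only shows the shape of the new interface, an implementation reading settings from the passed context, and a caller handing in a copied context with select_sequential_consistency disabled, as system.tables does below.

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <optional>

    struct Settings { bool select_sequential_consistency = true; };

    struct Context
    {
        Settings settings;
        const Settings & getSettingsRef() const { return settings; }
        static std::shared_ptr<Context> createCopy(const std::shared_ptr<const Context> & other)
        {
            return std::make_shared<Context>(*other);
        }
    };
    using ContextPtr = std::shared_ptr<const Context>;

    struct ToyReplicatedStorage
    {
        uint64_t sequentially_consistent_rows = 100;
        uint64_t locally_visible_rows = 120;

        std::optional<uint64_t> totalRows(ContextPtr query_context) const
        {
            const auto & settings = query_context->getSettingsRef();
            /// Toy behaviour only: pretend the setting narrows the visible set of parts.
            return settings.select_sequential_consistency ? sequentially_consistent_rows
                                                          : locally_visible_rows;
        }
    };

    int main()
    {
        auto ctx = std::make_shared<const Context>();
        ToyReplicatedStorage storage;
        std::cout << *storage.totalRows(ctx) << '\n';              // 100

        auto relaxed = Context::createCopy(ctx);                   // copied context, as in system.tables
        relaxed->settings.select_sequential_consistency = false;
        std::cout << *storage.totalRows(relaxed) << '\n';          // 120
    }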
+ ContextMutablePtr context_copy = Context::createCopy(context); + Settings settings_copy = context_copy->getSettingsCopy(); + settings_copy[Setting::select_sequential_consistency] = 0; + context_copy->setSettings(settings_copy); + if (columns_mask[src_index++]) { try { - auto total_rows = table ? table->totalRows(settings) : std::nullopt; + auto total_rows = table ? table->totalRows(context) : std::nullopt; if (total_rows) res_columns[res_index++]->insert(*total_rows); else @@ -612,7 +614,7 @@ class TablesBlockSource : public ISource { try { - auto total_bytes = table->totalBytes(settings); + auto total_bytes = table->totalBytes(context_copy); if (total_bytes) res_columns[res_index++]->insert(*total_bytes); else @@ -630,7 +632,7 @@ class TablesBlockSource : public ISource { try { - auto total_bytes_uncompressed = table->totalBytesUncompressed(settings); + auto total_bytes_uncompressed = table->totalBytesUncompressed(context_copy->getSettingsRef()); if (total_bytes_uncompressed) res_columns[res_index++]->insert(*total_bytes_uncompressed); else diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 953ec42ce214..1a1a6219dd08 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -851,7 +851,11 @@ def test_delete_files(started_cluster, format_version, storage_type): ) create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster) - assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100 + # Test trivial count with deleted files + query_id = "test_trivial_count_" + get_uuid_str() + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}", query_id=query_id)) == 100 + instance.query("SYSTEM FLUSH LOGS") + assert instance.query(f"SELECT ProfileEvents['IcebergTrivialCountOptimizationApplied'] FROM system.query_log where query_id = '{query_id}' and type = 'QueryFinish'") == "1\n" spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 0") default_upload_directory( @@ -861,7 +865,11 @@ def test_delete_files(started_cluster, format_version, storage_type): "", ) - assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 0 + query_id = "test_trivial_count_" + get_uuid_str() + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}", query_id=query_id)) == 0 + + instance.query("SYSTEM FLUSH LOGS") + assert instance.query(f"SELECT ProfileEvents['IcebergTrivialCountOptimizationApplied'] FROM system.query_log where query_id = '{query_id}' and type = 'QueryFinish'") == "1\n" write_iceberg_from_df( spark, @@ -878,7 +886,11 @@ def test_delete_files(started_cluster, format_version, storage_type): "", ) - assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100 + query_id = "test_trivial_count_" + get_uuid_str() + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}", query_id=query_id)) == 100 + + instance.query("SYSTEM FLUSH LOGS") + assert instance.query(f"SELECT ProfileEvents['IcebergTrivialCountOptimizationApplied'] FROM system.query_log where query_id = '{query_id}' and type = 'QueryFinish'") == "1\n" spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 150") default_upload_directory( @@ -888,7 +900,11 @@ def test_delete_files(started_cluster, format_version, storage_type): "", ) - assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 50 + query_id = "test_trivial_count_" + get_uuid_str() + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}", query_id=query_id)) == 50 + + instance.query("SYSTEM FLUSH LOGS") 
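    # system.query_log is filled asynchronously, so the SYSTEM FLUSH LOGS above forces the entry
    # out before it is read back; filtering on type = 'QueryFinish' selects the final record for
    # the query, the one that carries the accumulated ProfileEvents counters
    # (here IcebergTrivialCountOptimizationApplied).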
+ assert instance.query(f"SELECT ProfileEvents['IcebergTrivialCountOptimizationApplied'] FROM system.query_log where query_id = '{query_id}' and type = 'QueryFinish'") == "1\n" @pytest.mark.parametrize("format_version", ["1", "2"]) From f84753ff87749e1ada1f03f452810db2d1cf1f4c Mon Sep 17 00:00:00 2001 From: Han Fei Date: Fri, 21 Mar 2025 18:39:12 +0000 Subject: [PATCH 07/14] Merge pull request #78021 from hanfei1991/hanfei/refactor-iceberg Refactor some code in Iceberg Storage --- .../DataLakes/Iceberg/IcebergMetadata.cpp | 41 ++++++++----------- .../DataLakes/Iceberg/IcebergMetadata.h | 16 ++++---- .../DataLakes/Iceberg/IteratorWrapper.h | 29 ------------- .../DataLakes/Iceberg/ManifestFile.h | 6 +-- .../DataLakes/Iceberg/Snapshot.h | 18 +++----- 5 files changed, 33 insertions(+), 77 deletions(-) delete mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 284491f624c0..380a3a35030a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -481,20 +481,19 @@ ManifestList IcebergMetadata::initializeManifestList(const String & filename) co /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file, but also an inherited sequence number which is known only during the parsing of ManifestList auto manifest_file_content = initializeManifestFile(manifest_file_name, added_sequence_number); - auto [iterator, _inserted] = manifest_files_by_name.emplace(manifest_file_name, std::move(manifest_file_content)); - auto manifest_file_iterator = ManifestFileIterator{iterator}; - for (const auto & data_file_path : manifest_file_iterator->getFiles()) + manifest_files_by_name.emplace(manifest_file_name, manifest_file_content); + for (const auto & data_file_path : manifest_file_content->getFiles()) { if (std::holds_alternative(data_file_path.file)) - manifest_file_by_data_file.emplace(std::get(data_file_path.file).file_name, manifest_file_iterator); + manifest_file_by_data_file.emplace(std::get(data_file_path.file).file_name, manifest_file_content); } - manifest_list.push_back(ManifestListFileEntry{manifest_file_iterator, added_sequence_number}); + manifest_list.push_back(manifest_file_content); } return manifest_list; } -ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const +ManifestFilePtr IcebergMetadata::initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const { auto configuration_ptr = configuration.lock(); @@ -503,7 +502,7 @@ ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filen AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), filename, getFormatSettings(getContext())); auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(manifest_file_deserializer, filename); schema_processor.addIcebergTableSchema(schema_object); - return ManifestFileContent( + return std::make_shared( manifest_file_deserializer, format_version, configuration_ptr->getPath(), @@ -512,32 +511,26 @@ ManifestFileContent IcebergMetadata::initializeManifestFile(const String & filen inherited_sequence_number, table_location, getContext()); -} -ManifestFileIterator IcebergMetadata::getManifestFile(const String & filename) const -{ - auto manifest_file_it = 
manifest_files_by_name.find(filename); - if (manifest_file_it != manifest_files_by_name.end()) - return ManifestFileIterator{manifest_file_it}; - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find manifest file: {}", filename); } -std::optional IcebergMetadata::tryGetManifestFile(const String & filename) const +ManifestFilePtr IcebergMetadata::tryGetManifestFile(const String & filename) const { auto manifest_file_it = manifest_files_by_name.find(filename); if (manifest_file_it != manifest_files_by_name.end()) - return ManifestFileIterator{manifest_file_it}; - return std::nullopt; + return manifest_file_it->second; + return nullptr; } -ManifestListIterator IcebergMetadata::getManifestList(const String & filename) const +ManifestListPtr IcebergMetadata::getManifestList(const String & filename) const { auto manifest_file_it = manifest_lists_by_name.find(filename); if (manifest_file_it != manifest_lists_by_name.end()) - return ManifestListIterator{manifest_file_it}; + return manifest_file_it->second; auto configuration_ptr = configuration.lock(); - auto [manifest_file_iterator, _inserted] = manifest_lists_by_name.emplace(filename, initializeManifestList(filename)); - return ManifestListIterator{manifest_file_iterator}; + auto manifest_list_ptr = std::make_shared(initializeManifestList(filename)); + manifest_lists_by_name.emplace(filename, manifest_list_ptr); + return manifest_list_ptr; } Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const @@ -549,10 +542,10 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const return cached_unprunned_files_for_last_processed_snapshot.value(); Strings data_files; - for (const auto & manifest_list_entry : *(relevant_snapshot->manifest_list_iterator)) + for (const auto & manifest_file_ptr : *(relevant_snapshot->manifest_list)) { - PartitionPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_list_entry.manifest_file, getContext()); - const auto & data_files_in_manifest = manifest_list_entry.manifest_file->getFiles(); + PartitionPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_file_ptr, getContext()); + const auto & data_files_in_manifest = manifest_file_ptr->getFiles(); for (const auto & manifest_file_entry : data_files_in_manifest) { if (manifest_file_entry.status != ManifestEntryStatus::DELETED) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index f4f355bb6ba7..f5ac4a33cb6a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -95,15 +95,17 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext std::optional totalBytes() const override; private: - using ManifestEntryByDataFile = std::unordered_map; + using ManifestEntryByDataFile = std::unordered_map; + using ManifestFilesStorage = std::unordered_map; + using ManifestListsStorage = std::unordered_map; const ObjectStoragePtr object_storage; const ConfigurationObserverPtr configuration; mutable IcebergSchemaProcessor schema_processor; LoggerPtr log; - mutable Iceberg::ManifestFilesStorage manifest_files_by_name; - mutable Iceberg::ManifestListsStorage manifest_lists_by_name; + mutable ManifestFilesStorage manifest_files_by_name; + mutable ManifestListsStorage manifest_lists_by_name; mutable ManifestEntryByDataFile manifest_file_by_data_file; std::tuple getVersion() const { return 
std::make_tuple(relevant_snapshot_id, relevant_snapshot_schema_id); } @@ -129,13 +131,11 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext void addTableSchemaById(Int32 schema_id); - Iceberg::ManifestListIterator getManifestList(const String & filename) const; + Iceberg::ManifestListPtr getManifestList(const String & filename) const; std::optional getSchemaVersionByFileIfOutdated(String data_path) const; - Iceberg::ManifestFileContent initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const; - - Iceberg::ManifestFileIterator getManifestFile(const String & filename) const; + Iceberg::ManifestFilePtr initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const; std::optional getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata); @@ -143,7 +143,7 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext Strings getDataFilesImpl(const ActionsDAG * filter_dag) const; - std::optional tryGetManifestFile(const String & filename) const; + Iceberg::ManifestFilePtr tryGetManifestFile(const String & filename) const; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h deleted file mode 100644 index e64a6e95fb81..000000000000 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IteratorWrapper.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include -#include -namespace Iceberg -{ - -template -class IteratorWrapper -{ -private: - using StorageType = std::map; - using StorageConstIterator = StorageType::const_iterator; - using StorageIterator = StorageType::iterator; - -public: - explicit IteratorWrapper(StorageConstIterator iterator_) : iterator(iterator_) { } - explicit IteratorWrapper(StorageIterator iterator_) : iterator(iterator_) { } - - String getName() const { return iterator->first; } - - const T * operator->() const { return &iterator->second; } - const T & operator*() const { return iterator->second; } - -private: - StorageIterator iterator; -}; - -} diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 8cae2f2deecd..c623c49c4f63 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -5,7 +5,6 @@ #include #if USE_AVRO -#include #include #include #include @@ -118,8 +117,9 @@ class ManifestFileContent }; -using ManifestFilesStorage = std::map; -using ManifestFileIterator = IteratorWrapper; +/// Once manifest file is constructed. It's unchangeable. 
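A minimal standalone sketch of the ownership model this refactoring moves to (ManifestCache, getOrParse and the trivial ManifestFileContent are illustrative stand-ins, not the real API): parsed manifests are cached as shared_ptr values, a cache miss is reported as nullptr instead of an optional map iterator, and every consumer keeps a cheap handle that stays valid independently of the cache.

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct ManifestFileContent { std::string path; };            /// stand-in for the parsed Avro file
    using ManifestFilePtr = std::shared_ptr<const ManifestFileContent>;
    using ManifestList = std::vector<ManifestFilePtr>;
    using ManifestListPtr = std::shared_ptr<const ManifestList>;

    struct ManifestCache
    {
        std::map<std::string, ManifestFilePtr> files_by_name;

        ManifestFilePtr tryGet(const std::string & name) const
        {
            auto it = files_by_name.find(name);
            if (it != files_by_name.end())
                return it->second;
            return nullptr;                                       /// miss reported as nullptr
        }

        ManifestFilePtr getOrParse(const std::string & name)
        {
            if (auto cached = tryGet(name))
                return cached;
            auto parsed = std::make_shared<ManifestFileContent>(ManifestFileContent{name});
            files_by_name.emplace(name, parsed);
            return parsed;
        }
    };

    int main()
    {
        ManifestCache cache;
        auto a = cache.getOrParse("m1.avro");
        auto b = cache.getOrParse("m1.avro");                     /// second call is served from the cache
        std::cout << (a == b) << ' ' << (cache.tryGet("m2.avro") == nullptr) << '\n';   // prints: 1 1
    }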
+using ManifestFilePtr = std::shared_ptr; + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index ed5ba39b2e7a..21fc25063bc0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -3,25 +3,17 @@ #if USE_AVRO -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" -namespace Iceberg -{ +#include -struct ManifestListFileEntry +namespace Iceberg { - ManifestFileIterator manifest_file; - Int64 added_sequence_number; -}; - -using ManifestList = std::vector; - -using ManifestListsStorage = std::map; -using ManifestListIterator = IteratorWrapper; +using ManifestList = std::vector; +using ManifestListPtr = std::shared_ptr; struct IcebergSnapshot { - ManifestListIterator manifest_list_iterator; + ManifestListPtr manifest_list; Int64 snapshot_id; std::optional total_rows; std::optional total_bytes; From 4f30300daf957f2e519ef2be3e62025b11c1a9ae Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 2 Apr 2025 16:51:41 +0000 Subject: [PATCH 08/14] Merge pull request #78242 from ClickHouse/minmax_iceberg Minmax iceberg --- src/Common/ProfileEvents.cpp | 1 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 11 +- .../DataLakes/Iceberg/ManifestFile.cpp | 119 ++++++++- .../DataLakes/Iceberg/ManifestFile.h | 11 +- .../Iceberg/ManifestFilesPruning.cpp | 217 +++++++++++++++ ...titionPruning.h => ManifestFilesPruning.h} | 15 +- .../DataLakes/Iceberg/PartitionPruning.cpp | 144 ---------- .../DataLakes/Iceberg/SchemaProcessor.cpp | 11 + .../DataLakes/Iceberg/SchemaProcessor.h | 2 + .../integration/test_storage_iceberg/test.py | 248 ++++++++++++++++++ 10 files changed, 608 insertions(+), 171 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.cpp rename src/Storages/ObjectStorage/DataLakes/Iceberg/{PartitionPruning.h => ManifestFilesPruning.h} (72%) delete mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 2214fb8dc92a..580e26fbdc40 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -220,6 +220,7 @@ \ M(IcebergPartitionPrunnedFiles, "Number of skipped files during Iceberg partition pruning", ValueType::Number) \ M(IcebergTrivialCountOptimizationApplied, "Trivial count optimization applied while reading from Iceberg", ValueType::Number) \ + M(IcebergMinMaxIndexPrunnedFiles, "Number of skipped files by using MinMax index in Iceberg", ValueType::Number) \ M(JoinBuildTableRowCount, "Total number of rows in the build table for a JOIN operation.", ValueType::Number) \ M(JoinProbeTableRowCount, "Total number of rows in the probe table for a JOIN operation.", ValueType::Number) \ M(JoinResultRowCount, "Total number of rows in the result of a JOIN operation.", ValueType::Number) \ diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 380a3a35030a..c360d5c2f9ca 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -26,7 +26,6 @@ namespace ProfileEvents { - extern const Event IcebergPartitionPrunnedFiles; extern const Event IcebergTrivialCountOptimizationApplied; } @@ -544,17 +543,13 @@ Strings 
IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const Strings data_files; for (const auto & manifest_file_ptr : *(relevant_snapshot->manifest_list)) { - PartitionPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_file_ptr, getContext()); + ManifestFilesPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_file_ptr, getContext()); const auto & data_files_in_manifest = manifest_file_ptr->getFiles(); for (const auto & manifest_file_entry : data_files_in_manifest) { if (manifest_file_entry.status != ManifestEntryStatus::DELETED) { - if (pruner.canBePruned(manifest_file_entry)) - { - ProfileEvents::increment(ProfileEvents::IcebergPartitionPrunnedFiles); - } - else + if (!pruner.canBePruned(manifest_file_entry)) { if (std::holds_alternative(manifest_file_entry.file)) data_files.push_back(std::get(manifest_file_entry.file).file_name); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index b9a968f06db6..650a5c4af033 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -4,14 +4,16 @@ #include #include -#include +#include +#include #include #include #include #include #include -#include +#include +#include namespace DB::ErrorCodes { @@ -23,6 +25,86 @@ namespace DB::ErrorCodes namespace Iceberg { +namespace +{ + /// Iceberg stores lower_bounds and upper_bounds serialized with some custom deserialization as bytes array + /// https://iceberg.apache.org/spec/#appendix-d-single-value-serialization + std::optional deserializeFieldFromBinaryRepr(std::string str, DB::DataTypePtr expected_type, bool lower_bound) + { + auto non_nullable_type = DB::removeNullable(expected_type); + auto column = non_nullable_type->createColumn(); + if (DB::WhichDataType(non_nullable_type).isDecimal()) + { + /// Iceberg store decimal values as unscaled value with two’s-complement big-endian binary + /// using the minimum number of bytes for the value + /// Our decimal binary representation is little endian + /// so we cannot reuse our default code for parsing it. + int64_t unscaled_value = 0; + + // Convert from big-endian to signed int + for (const auto byte : str) + unscaled_value = (unscaled_value << 8) | static_cast(byte); + + /// Add sign + if (str[0] & 0x80) + { + int64_t sign_extension = -1; + sign_extension <<= (str.size() * 8); + unscaled_value |= sign_extension; + } + + /// NOTE: It's very weird, but Decimal values for lower bound and upper bound + /// are stored rounded, without fractional part. What is more strange + /// the integer part is rounded mathematically correctly according to fractional part. + /// Example: 17.22 -> 17, 8888.999 -> 8889, 1423.77 -> 1424. + /// I've checked two implementations: Spark and Amazon Athena and both of them + /// do this. + /// + /// The problem is -- we cannot use rounded values for lower bounds and upper bounds. + /// Example: upper_bound(x) = 17.22, but it's rounded 17.00, now condition WHERE x >= 17.21 will + /// check rounded value and say: "Oh largest value is 17, so values bigger than 17.21 cannot be in this file, + /// let's skip it". But it will produce incorrect result since actual value (17.22 >= 17.21) is stored in this file. + /// + /// To handle this issue we subtract 1 from the integral part for lower_bound and add 1 to integral + /// part of upper_bound. This produces: 17.22 -> [16.0, 18.0]. 
So this is more rough boundary, + /// but at least it doesn't lead to incorrect results. + { + int64_t scaler = lower_bound ? -10 : 10; + int32_t scale = DB::getDecimalScale(*non_nullable_type); + while (--scale) + scaler *= 10; + + unscaled_value += scaler; + } + + if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) + { + DB::DecimalField result(unscaled_value, decimal_type->getScale()); + return result; + } + if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) + { + DB::DecimalField result(unscaled_value, decimal_type->getScale()); + return result; + } + else + { + return std::nullopt; + } + } + else + { + /// For all other types except decimal binary representation + /// matches our internal representation + column->insertData(str.data(), str.length()); + DB::Field result; + column->get(0, result); + return result; + } + } + +} + constexpr const char * COLUMN_STATUS_NAME = "status"; constexpr const char * COLUMN_TUPLE_DATA_FILE_NAME = "data_file"; constexpr const char * COLUMN_SEQ_NUMBER_NAME = "sequence_number"; @@ -104,10 +186,9 @@ ManifestFileContent::ManifestFileContent( partition_key_ast->arguments->children.emplace_back(std::move(partition_ast)); partition_columns_description.emplace_back(numeric_column_name, removeNullable(manifest_file_column_characteristics.type)); - this->partition_column_ids.push_back(source_id); } - if (!partition_column_ids.empty()) + if (!partition_columns_description.empty()) this->partition_key_description.emplace(DB::KeyDescription::getKeyFromAST(std::move(partition_key_ast), ColumnsDescription(partition_columns_description), context)); for (size_t i = 0; i < manifest_file_deserializer.rows(); ++i) @@ -165,6 +246,7 @@ ManifestFileContent::ManifestFileContent( } } + std::unordered_map> value_for_bounds; for (const auto & path : {SUBCOLUMN_LOWER_BOUNDS_NAME, SUBCOLUMN_UPPER_BOUNDS_NAME}) { if (manifest_file_deserializer.hasPath(path)) @@ -175,14 +257,28 @@ ManifestFileContent::ManifestFileContent( const auto & column_number_and_bound = column_stats.safeGet(); Int32 number = column_number_and_bound[0].safeGet(); const Field & bound_value = column_number_and_bound[1]; + if (path == SUBCOLUMN_LOWER_BOUNDS_NAME) - columns_infos[number].lower_bound = bound_value; + value_for_bounds[number].first = bound_value; else - columns_infos[number].upper_bound = bound_value; + value_for_bounds[number].second = bound_value; + + column_ids_which_have_bounds.insert(number); } } } + for (const auto & [column_id, bounds] : value_for_bounds) + { + DB::NameAndTypePair name_and_type = schema_processor.getFieldCharacteristics(schema_id, column_id); + auto left = deserializeFieldFromBinaryRepr(bounds.first.safeGet(), name_and_type.type, true); + auto right = deserializeFieldFromBinaryRepr(bounds.second.safeGet(), name_and_type.type, false); + if (!left || !right) + continue; + + columns_infos[column_id].hyperrectangle.emplace(*left, true, *right, true); + } + FileEntry file = FileEntry{DataFileEntry{file_path}}; Int64 added_sequence_number = 0; @@ -215,7 +311,7 @@ ManifestFileContent::ManifestFileContent( bool ManifestFileContent::hasPartitionKey() const { - return !partition_column_ids.empty(); + return partition_key_description.has_value(); } const DB::KeyDescription & ManifestFileContent::getPartitionKeyDescription() const @@ -225,9 +321,14 @@ const DB::KeyDescription & ManifestFileContent::getPartitionKeyDescription() con return *(partition_key_description); } -const std::vector & ManifestFileContent::getPartitionKeyColumnIDs() const +bool 
ManifestFileContent::hasBoundsInfoInManifests() const +{ + return !column_ids_which_have_bounds.empty(); +} + +const std::set & ManifestFileContent::getColumnsIDsWithBounds() const { - return partition_column_ids; + return column_ids_which_have_bounds; } std::optional ManifestFileContent::getRowsCountInAllDataFilesExcludingDeleted() const diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index c623c49c4f63..18c9ab2527c8 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace Iceberg @@ -39,8 +40,7 @@ struct ColumnInfo std::optional rows_count; std::optional bytes_size; std::optional nulls_count; - std::optional lower_bound; - std::optional upper_bound; + std::optional hyperrectangle; }; using FileEntry = std::variant; // In the future we will add PositionalDeleteFileEntry and EqualityDeleteFileEntry here @@ -100,21 +100,24 @@ class ManifestFileContent bool hasPartitionKey() const; const DB::KeyDescription & getPartitionKeyDescription() const; - const std::vector & getPartitionKeyColumnIDs() const; /// Fields with rows count in manifest files are optional /// they can be absent. std::optional getRowsCountInAllDataFilesExcludingDeleted() const; std::optional getBytesCountInAllDataFiles() const; + + bool hasBoundsInfoInManifests() const; + const std::set & getColumnsIDsWithBounds() const; private: Int32 schema_id; std::optional partition_key_description; - std::vector partition_column_ids; // Size - number of files std::vector files; + std::set column_ids_which_have_bounds; + }; /// Once manifest file is constructed. It's unchangeable. 
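To make the bound parsing added to ManifestFile.cpp above more concrete, here is a small self-contained sketch (decodeBigEndianTwosComplement and widenBound are illustrative names, not functions from the patch) of the two steps the long comment describes: decode Iceberg's big-endian two's-complement unscaled decimal, then widen the already-rounded bound by one integral unit so the pruning range stays conservative.

    #include <cstdint>
    #include <iostream>
    #include <string>

    /// Assumes at most 7 stored bytes for simplicity; 17.22 at scale 3 is used as the example.
    int64_t decodeBigEndianTwosComplement(const std::string & bytes)
    {
        int64_t value = 0;
        for (unsigned char b : bytes)
            value = (value << 8) | b;
        if (!bytes.empty() && (static_cast<unsigned char>(bytes[0]) & 0x80))
            value -= int64_t(1) << (8 * bytes.size());            /// sign extension for negative values
        return value;
    }

    int64_t widenBound(int64_t unscaled, uint32_t scale, bool lower_bound)
    {
        int64_t one_integral_unit = 1;
        for (uint32_t i = 0; i < scale; ++i)
            one_integral_unit *= 10;                              /// 10^scale is 1 in the integral part
        return lower_bound ? unscaled - one_integral_unit : unscaled + one_integral_unit;
    }

    int main()
    {
        const std::string bytes{'\x43', '\x44'};                  /// 0x4344 == 17220 == 17.220 at scale 3
        const int64_t unscaled = decodeBigEndianTwosComplement(bytes);
        std::cout << unscaled << '\n';                            /// 17220
        std::cout << widenBound(unscaled, 3, true) << '\n';       /// 16220, lower bound becomes 16.220
        std::cout << widenBound(unscaled, 3, false) << '\n';      /// 18220, upper bound becomes 18.220
    }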
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.cpp new file mode 100644 index 000000000000..246af3f2d457 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.cpp @@ -0,0 +1,217 @@ +#include "config.h" + +#if USE_AVRO + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace DB; + +namespace ProfileEvents +{ + extern const Event IcebergPartitionPrunnedFiles; + extern const Event IcebergMinMaxIndexPrunnedFiles; +} + + +namespace Iceberg +{ + +DB::ASTPtr getASTFromTransform(const String & transform_name_src, const String & column_name) +{ + std::string transform_name = Poco::toLower(transform_name_src); + + if (transform_name == "year" || transform_name == "years") + return makeASTFunction("toYearNumSinceEpoch", std::make_shared(column_name)); + + if (transform_name == "month" || transform_name == "months") + return makeASTFunction("toMonthNumSinceEpoch", std::make_shared(column_name)); + + if (transform_name == "day" || transform_name == "date" || transform_name == "days" || transform_name == "dates") + return makeASTFunction("toRelativeDayNum", std::make_shared(column_name)); + + if (transform_name == "hour" || transform_name == "hours") + return makeASTFunction("toRelativeHourNum", std::make_shared(column_name)); + + if (transform_name == "identity") + return std::make_shared(column_name); + + if (transform_name == "void") + return makeASTFunction("tuple"); + + if (transform_name.starts_with("truncate")) + { + /// should look like transform[N] + + if (transform_name.back() != ']') + return nullptr; + + auto argument_start = transform_name.find('['); + + if (argument_start == std::string::npos) + return nullptr; + + auto argument_width = transform_name.length() - 2 - argument_start; + std::string width = transform_name.substr(argument_start + 1, argument_width); + size_t truncate_width; + bool parsed = DB::tryParse(truncate_width, width); + + if (!parsed) + return nullptr; + + return makeASTFunction("icebergTruncate", std::make_shared(truncate_width), std::make_shared(column_name)); + } + else + { + return nullptr; + } +} + +std::unique_ptr ManifestFilesPruner::transformFilterDagForManifest(const DB::ActionsDAG * source_dag, std::vector & used_columns_in_filter) const +{ + if (source_dag == nullptr) + return nullptr; + + const auto & inputs = source_dag->getInputs(); + + for (const auto & input : inputs) + { + if (input->type == ActionsDAG::ActionType::INPUT) + { + std::string input_name = input->result_name; + std::optional input_id = schema_processor.tryGetColumnIDByName(current_schema_id, input_name); + if (input_id) + used_columns_in_filter.push_back(*input_id); + } + } + + ActionsDAG dag_with_renames; + for (const auto column_id : used_columns_in_filter) + { + auto column = schema_processor.tryGetFieldCharacteristics(current_schema_id, column_id); + + /// Columns which we dropped and don't exist in current schema + /// cannot be queried in WHERE expression. 
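        /// The rest of this function implements the RENAME-safe rewrite mentioned in the header:
        /// every usable column id from the WHERE clause is added as a DAG input named after its
        /// backquoted numeric field id, immediately aliased to the column's current name, and the
        /// resulting rename DAG is merged in front of the user's filter, so the KeyCondition built
        /// from it can be evaluated against partition values and min/max bounds that are keyed by
        /// Iceberg field id rather than by the (possibly renamed) column name.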
+ if (!column.has_value()) + continue; + + /// We take data type from manifest schema, not latest type + auto column_from_manifest = schema_processor.tryGetFieldCharacteristics(manifest_schema_id, column_id); + if (!column_from_manifest.has_value()) + continue; + + auto numeric_column_name = DB::backQuote(DB::toString(column_id)); + const auto * node = &dag_with_renames.addInput(numeric_column_name, column_from_manifest->type); + node = &dag_with_renames.addAlias(*node, column->name); + dag_with_renames.getOutputs().push_back(node); + } + auto result = std::make_unique(DB::ActionsDAG::merge(std::move(dag_with_renames), source_dag->clone())); + result->removeUnusedActions(); + return result; + +} + + +ManifestFilesPruner::ManifestFilesPruner( + const DB::IcebergSchemaProcessor & schema_processor_, + Int32 current_schema_id_, + const DB::ActionsDAG * filter_dag, + const ManifestFileContent & manifest_file, + DB::ContextPtr context) + : schema_processor(schema_processor_) + , current_schema_id(current_schema_id_) + , manifest_schema_id(manifest_file.getSchemaId()) +{ + std::unique_ptr transformed_dag; + std::vector used_columns_in_filter; + if (manifest_file.hasPartitionKey() || manifest_file.hasBoundsInfoInManifests()) + transformed_dag = transformFilterDagForManifest(filter_dag, used_columns_in_filter); + + if (manifest_file.hasPartitionKey()) + { + partition_key = &manifest_file.getPartitionKeyDescription(); + if (transformed_dag != nullptr) + partition_key_condition.emplace(transformed_dag.get(), context, partition_key->column_names, partition_key->expression, true /* single_point */); + } + + if (manifest_file.hasBoundsInfoInManifests() && transformed_dag != nullptr) + { + { + const auto & bounded_colums = manifest_file.getColumnsIDsWithBounds(); + for (Int32 used_column_id : used_columns_in_filter) + { + if (!bounded_colums.contains(used_column_id)) + continue; + + NameAndTypePair name_and_type = schema_processor.getFieldCharacteristics(manifest_schema_id, used_column_id); + name_and_type.name = DB::backQuote(DB::toString(used_column_id)); + + ExpressionActionsPtr expression = std::make_shared( + ActionsDAG({name_and_type}), ExpressionActionsSettings(context)); + + min_max_key_conditions.emplace(used_column_id, KeyCondition(transformed_dag.get(), context, {name_and_type.name}, expression)); + } + } + } +} + +bool ManifestFilesPruner::canBePruned(const ManifestFileEntry & entry) const +{ + if (partition_key_condition.has_value()) + { + const auto & partition_value = entry.partition_key_value; + std::vector index_value(partition_value.begin(), partition_value.end()); + for (auto & field : index_value) + { + // NULL_LAST + if (field.isNull()) + field = POSITIVE_INFINITY; + } + + bool can_be_true = partition_key_condition->mayBeTrueInRange( + partition_value.size(), index_value.data(), index_value.data(), partition_key->data_types); + + if (!can_be_true) + { + ProfileEvents::increment(ProfileEvents::IcebergPartitionPrunnedFiles); + return true; + } + } + + for (const auto & [column_id, key_condition] : min_max_key_conditions) + { + std::optional name_and_type = schema_processor.tryGetFieldCharacteristics(manifest_schema_id, column_id); + + /// There is no such column in this manifest file + if (!name_and_type.has_value()) + continue; + + auto hyperrectangle = entry.columns_infos.at(column_id).hyperrectangle; + if (hyperrectangle.has_value() && !key_condition.mayBeTrueInRange(1, &hyperrectangle->left, &hyperrectangle->right, {name_and_type->type})) + { + 
ProfileEvents::increment(ProfileEvents::IcebergMinMaxIndexPrunnedFiles); + return true; + } + } + + return false; +} + + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h similarity index 72% rename from src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h rename to src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h index 8b9fd80c612f..89a761265985 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h @@ -20,20 +20,23 @@ class ManifestFileContent; DB::ASTPtr getASTFromTransform(const String & transform_name_src, const String & column_name); /// Prune specific data files based on manifest content -class PartitionPruner +class ManifestFilesPruner { private: const DB::IcebergSchemaProcessor & schema_processor; Int32 current_schema_id; + Int32 manifest_schema_id; const DB::KeyDescription * partition_key; + std::optional partition_key_condition; - std::optional key_condition; - /// NOTE: tricky part to support RENAME column in partition key. + std::unordered_map min_max_key_conditions; + /// NOTE: tricky part to support RENAME column. /// Takes ActionDAG representation of user's WHERE expression and - /// rename columns to the their origina numeric id's in iceberg - std::unique_ptr transformFilterDagForManifest(const DB::ActionsDAG * source_dag, Int32 manifest_schema_id, const std::vector & partition_column_ids) const; + /// rename columns to the their origina numeric ID's in iceberg + std::unique_ptr transformFilterDagForManifest(const DB::ActionsDAG * source_dag, std::vector & used_columns_in_filter) const; public: - PartitionPruner( + + ManifestFilesPruner( const DB::IcebergSchemaProcessor & schema_processor_, Int32 current_schema_id_, const DB::ActionsDAG * filter_dag, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp deleted file mode 100644 index 9e24494e071e..000000000000 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PartitionPruning.cpp +++ /dev/null @@ -1,144 +0,0 @@ -#include "config.h" - -#if USE_AVRO - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -using namespace DB; - -namespace Iceberg -{ - -DB::ASTPtr getASTFromTransform(const String & transform_name_src, const String & column_name) -{ - std::string transform_name = Poco::toLower(transform_name_src); - - if (transform_name == "year" || transform_name == "years") - return makeASTFunction("toYearNumSinceEpoch", std::make_shared(column_name)); - - if (transform_name == "month" || transform_name == "months") - return makeASTFunction("toMonthNumSinceEpoch", std::make_shared(column_name)); - - if (transform_name == "day" || transform_name == "date" || transform_name == "days" || transform_name == "dates") - return makeASTFunction("toRelativeDayNum", std::make_shared(column_name)); - - if (transform_name == "hour" || transform_name == "hours") - return makeASTFunction("toRelativeHourNum", std::make_shared(column_name)); - - if (transform_name == "identity") - return std::make_shared(column_name); - - if (transform_name == "void") - return makeASTFunction("tuple"); - - if (transform_name.starts_with("truncate")) - { - /// should look like transform[N] - - if (transform_name.back() != ']') - return nullptr; - - auto 
argument_start = transform_name.find('['); - - if (argument_start == std::string::npos) - return nullptr; - - auto argument_width = transform_name.length() - 2 - argument_start; - std::string width = transform_name.substr(argument_start + 1, argument_width); - size_t truncate_width; - bool parsed = DB::tryParse(truncate_width, width); - - if (!parsed) - return nullptr; - - return makeASTFunction("icebergTruncate", std::make_shared(truncate_width), std::make_shared(column_name)); - } - else - { - return nullptr; - } -} - -std::unique_ptr PartitionPruner::transformFilterDagForManifest(const DB::ActionsDAG * source_dag, Int32 manifest_schema_id, const std::vector & partition_column_ids) const -{ - if (source_dag == nullptr) - return nullptr; - - ActionsDAG dag_with_renames; - for (const auto column_id : partition_column_ids) - { - auto column = schema_processor.tryGetFieldCharacteristics(current_schema_id, column_id); - - /// Columns which we dropped and doesn't exist in current schema - /// cannot be queried in WHERE expression. - if (!column.has_value()) - continue; - - /// We take data type from manifest schema, not latest type - auto column_type = schema_processor.getFieldCharacteristics(manifest_schema_id, column_id).type; - auto numeric_column_name = DB::backQuote(DB::toString(column_id)); - const auto * node = &dag_with_renames.addInput(numeric_column_name, column_type); - node = &dag_with_renames.addAlias(*node, column->name); - dag_with_renames.getOutputs().push_back(node); - } - auto result = std::make_unique(DB::ActionsDAG::merge(std::move(dag_with_renames), source_dag->clone())); - result->removeUnusedActions(); - return result; - -} - -PartitionPruner::PartitionPruner( - const DB::IcebergSchemaProcessor & schema_processor_, - Int32 current_schema_id_, - const DB::ActionsDAG * filter_dag, - const ManifestFileContent & manifest_file, - DB::ContextPtr context) - : schema_processor(schema_processor_) - , current_schema_id(current_schema_id_) -{ - if (manifest_file.hasPartitionKey()) - { - partition_key = &manifest_file.getPartitionKeyDescription(); - auto transformed_dag = transformFilterDagForManifest(filter_dag, manifest_file.getSchemaId(), manifest_file.getPartitionKeyColumnIDs()); - if (transformed_dag != nullptr) - key_condition.emplace(transformed_dag.get(), context, partition_key->column_names, partition_key->expression, true /* single_point */); - } -} - -bool PartitionPruner::canBePruned(const ManifestFileEntry & entry) const -{ - if (!key_condition.has_value()) - return false; - - const auto & partition_value = entry.partition_key_value; - std::vector index_value(partition_value.begin(), partition_value.end()); - for (auto & field : index_value) - { - // NULL_LAST - if (field.isNull()) - field = POSITIVE_INFINITY; - } - - bool can_be_true = key_condition->mayBeTrueInRange( - partition_value.size(), index_value.data(), index_value.data(), partition_key->data_types); - - return !can_be_true; -} - - -} - -#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp index 625feaa4cb42..44df87b8fbe6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -117,6 +117,7 @@ void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schem auto type = getFieldType(field, "type", required, current_full_name, true); clickhouse_schema->push_back(NameAndTypePair{name, type}); 
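        /// The reverse map filled just below (clickhouse_ids_by_source_names) is what lets
        /// ManifestFilesPruner::transformFilterDagForManifest translate the column names that
        /// appear in a WHERE clause back to Iceberg field ids via tryGetColumnIDByName, the key
        /// under which partition values and min/max bounds are stored.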
clickhouse_types_by_source_ids[{schema_id, field->getValue("id")}] = NameAndTypePair{current_full_name, type}; + clickhouse_ids_by_source_names[{schema_id, current_full_name}] = field->getValue("id"); } clickhouse_table_schemas_by_ids[schema_id] = clickhouse_schema; } @@ -139,6 +140,14 @@ std::optional IcebergSchemaProcessor::tryGetFieldCharacteristic return it->second; } +std::optional IcebergSchemaProcessor::tryGetColumnIDByName(Int32 schema_id, const std::string & name) const +{ + auto it = clickhouse_ids_by_source_names.find({schema_id, name}); + if (it == clickhouse_ids_by_source_names.end()) + return {}; + return it->second; +} + NamesAndTypesList IcebergSchemaProcessor::tryGetFieldsCharacteristics(Int32 schema_id, const std::vector & source_ids) const { NamesAndTypesList fields; @@ -232,6 +241,8 @@ IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & element_types.push_back(getFieldType(field, "type", required, current_full_name, true)); clickhouse_types_by_source_ids[{current_schema_id.value(), field->getValue("id")}] = NameAndTypePair{current_full_name, element_types.back()}; + + clickhouse_ids_by_source_names[{current_schema_id.value(), current_full_name}] = field->getValue("id"); } else { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h index 469d7438e2af..311d723c83b2 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h @@ -82,6 +82,7 @@ class IcebergSchemaProcessor NameAndTypePair getFieldCharacteristics(Int32 schema_version, Int32 source_id) const; std::optional tryGetFieldCharacteristics(Int32 schema_version, Int32 source_id) const; NamesAndTypesList tryGetFieldsCharacteristics(Int32 schema_id, const std::vector & source_ids) const; + std::optional tryGetColumnIDByName(Int32 schema_id, const std::string & name) const; bool hasClickhouseTableSchemaById(Int32 id) const; @@ -90,6 +91,7 @@ class IcebergSchemaProcessor std::unordered_map> clickhouse_table_schemas_by_ids; std::map, std::shared_ptr> transform_dags_by_ids; mutable std::map, NameAndTypePair> clickhouse_types_by_source_ids; + mutable std::map, Int32> clickhouse_ids_by_source_names; NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema); DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type, String & current_full_name, bool is_subfield_of_root); diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 1a1a6219dd08..457f41906f22 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -2627,3 +2627,251 @@ def test_iceberg_snapshot_reads(started_cluster, format_version, storage_type): == instance.query("SELECT number, toString(number + 1) FROM numbers(300)") ) +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_minmax_pruning(started_cluster, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = "test_minmax_pruning_" + storage_type + "_" + get_uuid_str() + + def execute_spark_query(query: str): + spark.sql(query) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + return + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + tag INT, + date DATE, + ts TIMESTAMP, + 
time_struct struct, + name VARCHAR(50), + number BIGINT + ) + USING iceberg + OPTIONS('format-version'='2') + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (1, DATE '2024-01-20', + TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-01-20', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'vasya', 5) + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (2, DATE '2024-02-20', + TIMESTAMP '2024-03-20 15:00:00', named_struct('a', DATE '2024-02-20', 'b', TIMESTAMP '2024-03-20 14:00:00'), 'vasilisa', 6) + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (3, DATE '2025-03-20', + TIMESTAMP '2024-04-30 14:00:00', named_struct('a', DATE '2024-03-20', 'b', TIMESTAMP '2024-04-30 14:00:00'), 'icebreaker', 7) + """ + ) + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (4, DATE '2025-04-20', + TIMESTAMP '2024-05-30 14:00:00', named_struct('a', DATE '2024-04-20', 'b', TIMESTAMP '2024-05-30 14:00:00'), 'iceberg', 8) + """ + ) + + creation_expression = get_creation_expression( + storage_type, TABLE_NAME, started_cluster, table_function=True + ) + + def check_validity_and_get_prunned_files(select_expression): + query_id1 = f"{TABLE_NAME}-{uuid.uuid4()}" + query_id2 = f"{TABLE_NAME}-{uuid.uuid4()}" + + data1 = instance.query( + select_expression, + query_id=query_id1, + settings={"use_iceberg_partition_pruning": 0, "input_format_parquet_bloom_filter_push_down": 0, "input_format_parquet_filter_push_down": 0}, + ) + data1 = list( + map( + lambda x: x.split("\t"), + filter(lambda x: len(x) > 0, data1.strip().split("\n")), + ) + ) + + data2 = instance.query( + select_expression, + query_id=query_id2, + settings={"use_iceberg_partition_pruning": 1, "input_format_parquet_bloom_filter_push_down": 0, "input_format_parquet_filter_push_down": 0}, + ) + data2 = list( + map( + lambda x: x.split("\t"), + filter(lambda x: len(x) > 0, data2.strip().split("\n")), + ) + ) + + assert data1 == data2 + + instance.query("SYSTEM FLUSH LOGS") + + print( + "Unprunned: ", + instance.query( + f"SELECT ProfileEvents['IcebergMinMaxIndexPrunnedFiles'] FROM system.query_log WHERE query_id = '{query_id1}' AND type = 'QueryFinish'" + ), + ) + print( + "Prunned: ", + instance.query( + f"SELECT ProfileEvents['IcebergMinMaxIndexPrunnedFiles'] FROM system.query_log WHERE query_id = '{query_id2}' AND type = 'QueryFinish'" + ), + ) + + assert 0 == int( + instance.query( + f"SELECT ProfileEvents['IcebergMinMaxIndexPrunnedFiles'] FROM system.query_log WHERE query_id = '{query_id1}' AND type = 'QueryFinish'" + ) + ) + return int( + instance.query( + f"SELECT ProfileEvents['IcebergMinMaxIndexPrunnedFiles'] FROM system.query_log WHERE query_id = '{query_id2}' AND type = 'QueryFinish'" + ) + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} ORDER BY ALL" + ) + == 0 + ) + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE date <= '2024-01-25' ORDER BY ALL" + ) + == 3 + ) + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE ts <= timestamp('2024-03-20 14:00:00.000000') ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE tag == 1 ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE tag <= 1 ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + 
f"SELECT * FROM {creation_expression} WHERE name == 'vasilisa' ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE name < 'kek' ORDER BY ALL" + ) + == 2 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE number == 8 ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE number <= 5 ORDER BY ALL" + ) + == 3 + ) + + execute_spark_query(f"ALTER TABLE {TABLE_NAME} RENAME COLUMN date TO date3") + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE date3 <= '2024-01-25' ORDER BY ALL" + ) + == 3 + ) + + execute_spark_query(f"ALTER TABLE {TABLE_NAME} ALTER COLUMN tag TYPE BIGINT") + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE tag <= 1 ORDER BY ALL" + ) + == 3 + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE time_struct.a <= '2024-02-01' ORDER BY ALL" + ) + == 3 + ) + + execute_spark_query( + f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'kek', 10)" + ) + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE time_struct.a <= '2024-02-01' ORDER BY ALL" + ) + == 4 + ) + + execute_spark_query(f"ALTER TABLE {TABLE_NAME} ADD COLUMNS (ddd decimal(10, 3))") + + execute_spark_query( + f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'kek', 30, decimal(17.22))" + ) + + execute_spark_query( + f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'kek', 10, decimal(14311.772))" + ) + + execute_spark_query( + f"INSERT INTO {TABLE_NAME} VALUES (1, DATE '2024-01-20', TIMESTAMP '2024-02-20 10:00:00', named_struct('a', DATE '2024-03-15', 'b', TIMESTAMP '2024-02-20 10:00:00'), 'kek', 10, decimal(-8888.999))" + ) + + + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE ddd >= 100 ORDER BY ALL" + ) + == 2 + ) + # Spark store rounded values of decimals, this query checks that we work it around. 
+ # Please check the code where we parse lower bounds and upper bounds + assert ( + check_validity_and_get_prunned_files( + f"SELECT * FROM {creation_expression} WHERE ddd >= toDecimal64('17.21', 3) ORDER BY ALL" + ) + == 1 + ) From 8deec9f360164ae6bda5437972ab4f386fea4f94 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 10 Mar 2025 13:19:00 +0000 Subject: [PATCH 09/14] Merge pull request #77318 from ClickHouse/add_iceberg_setting_for_metadata Allow to specify metadata file for Iceberg --- src/Databases/Iceberg/DatabaseIceberg.cpp | 4 +- .../DataLakes/DataLakeConfiguration.h | 20 ++++-- .../DataLakes/DeltaLakeMetadata.h | 6 +- .../ObjectStorage/DataLakes/HudiMetadata.h | 3 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 67 +++++++++++++------ .../DataLakes/Iceberg/IcebergMetadata.h | 3 +- .../ObjectStorage/StorageObjectStorage.cpp | 21 ++---- .../ObjectStorage/StorageObjectStorage.h | 9 ++- .../StorageObjectStorageSettings.cpp | 3 + .../StorageObjectStorageSettings.h | 2 + .../registerStorageObjectStorage.cpp | 8 +-- .../TableFunctionObjectStorage.cpp | 4 +- .../TableFunctionObjectStorage.h | 4 +- .../integration/test_storage_iceberg/test.py | 45 ++++++++++++- 14 files changed, 137 insertions(+), 62 deletions(-) diff --git a/src/Databases/Iceberg/DatabaseIceberg.cpp b/src/Databases/Iceberg/DatabaseIceberg.cpp index c42e10d9bd2d..ba77fae25acb 100644 --- a/src/Databases/Iceberg/DatabaseIceberg.cpp +++ b/src/Databases/Iceberg/DatabaseIceberg.cpp @@ -226,11 +226,11 @@ StoragePtr DatabaseIceberg::tryGetTable(const String & name, ContextPtr context_ storage_type = table_metadata.getStorageType(); const auto configuration = getConfiguration(storage_type); - auto storage_settings = std::make_unique(); + auto storage_settings = std::make_shared(); /// with_table_structure = false: because there will be /// no table structure in table definition AST. 
- configuration->initialize(args, context_, /* with_table_structure */false, storage_settings.get()); + configuration->initialize(args, context_, /* with_table_structure */false, storage_settings); auto cluster_name = settings[DatabaseIcebergSetting::object_storage_cluster].value; diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 24fbd90e5fc7..c268dcce93e7 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,12 @@ extern const int FORMAT_VERSION_TOO_OLD; extern const int LOGICAL_ERROR; } +namespace StorageObjectStorageSetting +{ +extern const StorageObjectStorageSettingsBool allow_dynamic_metadata_for_data_lakes; +} + + template concept StorageConfiguration = std::derived_from; @@ -110,7 +117,8 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl bool hasExternalDynamicMetadata() override { - return StorageObjectStorage::Configuration::allow_dynamic_metadata_for_data_lakes && current_metadata + return BaseStorageConfiguration::getSettingsRef()[StorageObjectStorageSetting::allow_dynamic_metadata_for_data_lakes] + && current_metadata && current_metadata->supportsExternalMetadataChange(); } @@ -172,7 +180,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl current_metadata = DataLakeMetadata::create( object_storage, weak_from_this(), - local_context, BaseStorageConfiguration::allow_experimental_delta_kernel_rs); + local_context); } auto read_schema = current_metadata->getReadSchema(); if (!read_schema.empty()) @@ -231,8 +239,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl current_metadata = DataLakeMetadata::create( object_storage, weak_from_this(), - context, - BaseStorageConfiguration::allow_experimental_delta_kernel_rs); + context); return true; } @@ -244,8 +251,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl auto new_metadata = DataLakeMetadata::create( object_storage, weak_from_this(), - context, - BaseStorageConfiguration::allow_experimental_delta_kernel_rs); + context); if (*current_metadata != *new_metadata) { @@ -355,7 +361,7 @@ class StorageIcebergConfiguration : public StorageObjectStorage::Configuration, ASTs & engine_args, ContextPtr local_context, bool with_table_structure, - StorageObjectStorageSettings * settings) override + StorageObjectStorageSettingsPtr settings) override { createDynamicConfiguration(engine_args, local_context); getImpl().initialize(engine_args, local_context, with_table_structure, settings); diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h index e9e7f06effc5..ce0d35d08541 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -55,11 +55,11 @@ class DeltaLakeMetadata final : public IDataLakeMetadata static DataLakeMetadataPtr create( ObjectStoragePtr object_storage, ConfigurationObserverPtr configuration, - ContextPtr local_context, - [[maybe_unused]] bool allow_experimental_delta_kernel_rs) + ContextPtr local_context) { #if USE_DELTA_KERNEL_RS - if (allow_experimental_delta_kernel_rs) + auto configuration_ptr = configuration.lock(); + if 
(configuration_ptr->getSettingsRef()[StorageObjectStorageSetting::allow_experimental_delta_kernel_rs]) return std::make_unique(object_storage, configuration, local_context); else return std::make_unique(object_storage, configuration, local_context); diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h index c5c81b583026..46291d9e6d96 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -34,8 +34,7 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext static DataLakeMetadataPtr create( ObjectStoragePtr object_storage, ConfigurationObserverPtr configuration, - ContextPtr local_context, - bool) + ContextPtr local_context) { return std::make_unique(object_storage, configuration, local_context); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index c360d5c2f9ca..9f0e3237cf33 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace DB namespace StorageObjectStorageSetting { - extern const StorageObjectStorageSettingsString iceberg_metadata_file_path; +extern const StorageObjectStorageSettingsString iceberg_metadata_file_path; } namespace ErrorCodes @@ -222,14 +223,32 @@ Int32 IcebergMetadata::parseTableSchema( } } +static std::pair getMetadataFileAndVersion(const std::string & path) +{ + String file_name(path.begin() + path.find_last_of('/') + 1, path.end()); + String version_str; + /// v.metadata.json + if (file_name.starts_with('v')) + version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of('.')); + /// -.metadata.json + else + version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-')); + + if (!std::all_of(version_str.begin(), version_str.end(), isdigit)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: {}. Expected vN.metadata.json where N is a number", file_name); + + return std::make_pair(std::stoi(version_str), path); +} + /** * Each version of table metadata is stored in a `metadata` directory and * has one of 2 formats: * 1) v.metadata.json, where V - metadata version. 
* 2) -.metadata.json, where V - metadata version */ -std::pair -getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const StorageObjectStorage::Configuration & configuration) +static std::pair +getLatestMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const StorageObjectStorage::Configuration & configuration) { const auto metadata_files = listFiles(*object_storage, configuration, "metadata", ".metadata.json"); if (metadata_files.empty()) @@ -237,30 +256,37 @@ getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const Storage throw Exception( ErrorCodes::FILE_DOESNT_EXIST, "The metadata file for Iceberg table with path {} doesn't exist", configuration.getPath()); } - std::vector> metadata_files_with_versions; metadata_files_with_versions.reserve(metadata_files.size()); for (const auto & path : metadata_files) { - String file_name(path.begin() + path.find_last_of('/') + 1, path.end()); - String version_str; - /// v.metadata.json - if (file_name.starts_with('v')) - version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of('.')); - /// -.metadata.json - else - version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-')); - - if (!std::all_of(version_str.begin(), version_str.end(), isdigit)) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: {}. Expected vN.metadata.json where N is a number", file_name); - metadata_files_with_versions.emplace_back(std::stoi(version_str), path); + metadata_files_with_versions.emplace_back(getMetadataFileAndVersion(path)); } /// Get the latest version of metadata file: v.metadata.json return *std::max_element(metadata_files_with_versions.begin(), metadata_files_with_versions.end()); } +static std::pair getLatestOrExplicitMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const StorageObjectStorage::Configuration & configuration) +{ + auto explicit_metadata_path = configuration.getSettingsRef()[StorageObjectStorageSetting::iceberg_metadata_file_path].value; + std::pair result; + if (!explicit_metadata_path.empty()) + { + auto prefix_storage_path = configuration.getPath(); + if (!explicit_metadata_path.starts_with(prefix_storage_path)) + explicit_metadata_path = std::filesystem::path(prefix_storage_path) / explicit_metadata_path; + result = getMetadataFileAndVersion(explicit_metadata_path); + } + else + { + result = getLatestMetadataFileAndVersion(object_storage, configuration); + } + + return result; +} + + Poco::JSON::Object::Ptr IcebergMetadata::readJSON(const String & metadata_file_path, const ContextPtr & local_context) const { ObjectInfo object_info(metadata_file_path); @@ -278,7 +304,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context) { auto configuration_ptr = configuration.lock(); - const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); + const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion(object_storage, *configuration_ptr); last_metadata_version = metadata_version; @@ -429,12 +455,11 @@ std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String da DataLakeMetadataPtr IcebergMetadata::create( const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, - const ContextPtr & local_context, - bool) + const ContextPtr & local_context) { auto configuration_ptr = configuration.lock(); - const auto [metadata_version, metadata_file_path] = 
getMetadataFileAndVersion(object_storage, *configuration_ptr); + const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion(object_storage, *configuration_ptr); auto log = getLogger("IcebergMetadata"); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index f5ac4a33cb6a..0826b86ca035 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -61,8 +61,7 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext static DataLakeMetadataPtr create( const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, - const ContextPtr & local_context, - bool allow_experimental_delta_kernel_rs); + const ContextPtr & local_context); std::shared_ptr getInitialSchemaByPath(const String & data_path) const override { diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 37be3b3c2760..c7b0fb20c827 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -46,12 +46,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -namespace StorageObjectStorageSetting -{ -extern const StorageObjectStorageSettingsBool allow_dynamic_metadata_for_data_lakes; -extern const StorageObjectStorageSettingsBool allow_experimental_delta_kernel_rs; -} - String StorageObjectStorage::getPathSample(ContextPtr context) { auto query_settings = configuration->getQuerySettings(context); @@ -606,7 +600,7 @@ void StorageObjectStorage::Configuration::initialize( ASTs & engine_args, ContextPtr local_context, bool with_table_structure, - StorageObjectStorageSettings * settings) + StorageObjectStorageSettingsPtr settings) { if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) fromNamedCollection(*named_collection, local_context); @@ -630,16 +624,15 @@ void StorageObjectStorage::Configuration::initialize( else FormatFactory::instance().checkFormatName(format); - if (settings) - { - allow_dynamic_metadata_for_data_lakes - = (*settings)[StorageObjectStorageSetting::allow_dynamic_metadata_for_data_lakes]; - allow_experimental_delta_kernel_rs - = (*settings)[StorageObjectStorageSetting::allow_experimental_delta_kernel_rs]; - } + storage_settings = settings; initialized = true; } +const StorageObjectStorageSettings & StorageObjectStorage::Configuration::getSettingsRef() const +{ + return *storage_settings; +} + void StorageObjectStorage::Configuration::check(ContextPtr) const { FormatFactory::instance().checkFormatName(format); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index f04a63021afb..c4ef7e7cdcc7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -18,6 +18,8 @@ namespace DB class ReadBufferIterator; class SchemaCache; class NamedCollection; +struct StorageObjectStorageSettings; +using StorageObjectStorageSettingsPtr = std::shared_ptr; namespace ErrorCodes { @@ -173,7 +175,7 @@ class StorageObjectStorage::Configuration ASTs & engine_args, ContextPtr local_context, bool with_table_structure, - StorageObjectStorageSettings * settings); + StorageObjectStorageSettingsPtr settings); /// Storage type: s3, hdfs, azure, local. 
virtual ObjectStorageType getType() const = 0; @@ -264,6 +266,8 @@ class StorageObjectStorage::Configuration throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method createArgsWithAccessData is not supported by storage {}", getEngineName()); } + const StorageObjectStorageSettings & getSettingsRef() const; + virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; @@ -275,8 +279,7 @@ class StorageObjectStorage::Configuration bool initialized = false; std::atomic updated = false; - bool allow_dynamic_metadata_for_data_lakes = false; - bool allow_experimental_delta_kernel_rs = false; + StorageObjectStorageSettingsPtr storage_settings; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp index 0d1aa6b04779..b220303c43e2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp @@ -22,6 +22,9 @@ If enabled, the engine would use delta-kernel-rs for DeltaLake metadata parsing )", 0) \ DECLARE(String, object_storage_cluster, "", R"( Cluster for distributed requests +)", 0) \ + DECLARE(String, iceberg_metadata_file_path, "", R"( +Explicit path to desired Iceberg metadata file, should be relative to path in object storage. Make sense for table function use case only. )", 0) \ // clang-format on diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSettings.h b/src/Storages/ObjectStorage/StorageObjectStorageSettings.h index 1f58402d3e98..68b3d62e8ff2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSettings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSettings.h @@ -64,4 +64,6 @@ struct StorageObjectStorageSettings std::unique_ptr impl; }; +using StorageObjectStorageSettingsPtr = std::shared_ptr; + } diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 7e3bca85ef2f..7c2c58940221 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -39,14 +39,14 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); const auto context = args.getLocalContext(); - auto queue_settings = std::make_unique(); + auto storage_settings = std::make_shared(); if (args.storage_def->settings) - queue_settings->loadFromQuery(*args.storage_def->settings); + storage_settings->loadFromQuery(*args.storage_def->settings); - auto cluster_name = (*queue_settings)[StorageObjectStorageSetting::object_storage_cluster].value; + auto cluster_name = (*storage_settings)[StorageObjectStorageSetting::object_storage_cluster].value; - configuration->initialize(args.engine_args, context, false, queue_settings.get()); + configuration->initialize(args.engine_args, context, false, storage_settings); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. 
Settings from current diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 564f4fa34248..aa73b55baea4 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -72,13 +72,15 @@ void TableFunctionObjectStorage::parseArguments(const if (args_func.size() != 1) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments.", getName()); + settings = std::make_shared(); + auto & args = args_func.at(0)->children; for (auto * it = args.begin(); it != args.end(); ++it) { ASTSetQuery * settings_ast = (*it)->as(); if (settings_ast) { - settings.loadFromQuery(*settings_ast); + settings->loadFromQuery(*settings_ast); args.erase(it); break; } diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index 2a0e6ce35748..9bed13109c95 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -130,7 +130,7 @@ class TableFunctionObjectStorage : public ITableFunction virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context) { - getConfiguration()->initialize(args, context, true, &settings); + getConfiguration()->initialize(args, context, true, settings); } static void updateStructureAndFormatArgumentsIfNeeded( @@ -163,7 +163,7 @@ class TableFunctionObjectStorage : public ITableFunction mutable ConfigurationPtr configuration; mutable ObjectStoragePtr object_storage; ColumnsDescription structure_hint; - StorageObjectStorageSettings settings; + std::shared_ptr settings; std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; }; diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 457f41906f22..01f6e650f36f 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -192,16 +192,19 @@ def get_creation_expression( table_function=False, allow_dynamic_metadata_for_data_lakes=False, run_on_cluster=False, + explicit_metadata_path="", object_storage_cluster=False, storage_type_as_arg=False, storage_type_in_named_collection=False, **kwargs, ): settings_suffix = "" - if allow_dynamic_metadata_for_data_lakes or object_storage_cluster: + if allow_dynamic_metadata_for_data_lakes or object_storage_cluster or explicit_metadata_path: settings = [] if allow_dynamic_metadata_for_data_lakes: settings.append("allow_dynamic_metadata_for_data_lakes = 1") + if explicit_metadata_path: + settings.append(f"iceberg_metadata_file_path = '{explicit_metadata_path}'") if object_storage_cluster: settings.append(f"object_storage_cluster = 'cluster_simple'") settings_suffix = " SETTINGS " + ", ".join(settings) @@ -2875,3 +2878,43 @@ def check_validity_and_get_prunned_files(select_expression): ) == 1 ) + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_explicit_metadata_file(started_cluster, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = ( + "test_explicit_metadata_file_" + + storage_type + + "_" + + get_uuid_str() + ) + + spark.sql( + f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')" + ) + + for i in 
range(50): + spark.sql( + f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)" + ) + + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, explicit_metadata_path="") + + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 500 + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, explicit_metadata_path="metadata/v31.metadata.json") + + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 300 + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, explicit_metadata_path="metadata/v11.metadata.json") + + assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100 From ab7294f93d693d18cc8046a745b77124787c7580 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 1 Apr 2025 20:54:54 +0000 Subject: [PATCH 10/14] Merge pull request #78368 from ClickHouse/delta-kernel-perf-issue delta-kernel: fix progress bar, fix performance --- .../DataLakes/DataLakeConfiguration.h | 4 +- .../DataLakes/DeltaLake/TableSnapshot.cpp | 167 +++++++++++++++--- .../DataLakes/DeltaLake/TableSnapshot.h | 5 +- .../DeltaLake/getSchemaFromSnapshot.cpp | 7 + .../DeltaLake/getSchemaFromSnapshot.h | 2 + .../DataLakes/DeltaLakeMetadata.h | 10 +- .../DeltaLakeMetadataDeltaKernel.cpp | 13 +- .../DataLakes/DeltaLakeMetadataDeltaKernel.h | 22 ++- .../DataLakes/IDataLakeMetadata.h | 3 +- .../ObjectStorage/StorageObjectStorage.cpp | 8 +- .../ObjectStorage/StorageObjectStorage.h | 4 +- .../StorageObjectStorageSettings.cpp | 3 + .../StorageObjectStorageSource.cpp | 2 +- .../TableFunctionObjectStorage.cpp | 3 +- 14 files changed, 209 insertions(+), 44 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index c268dcce93e7..9f439a170d91 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -142,10 +142,10 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl return current_metadata->supportsFileIterator(); } - ObjectIterator iterate() override + ObjectIterator iterate(IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size) override { chassert(current_metadata); - return current_metadata->iterate(); + return current_metadata->iterate(callback, list_batch_size); } /// This is an awful temporary crutch, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp index 92743edbcd67..b3234ad3d9eb 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include "getSchemaFromSnapshot.h" #include "KernelUtils.h" @@ -58,19 +60,72 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator const DB::NamesAndTypesList & schema_, const DB::Names & partition_columns_, DB::ObjectStoragePtr object_storage_, + DB::IDataLakeMetadata::FileProgressCallback callback_, + size_t list_batch_size_, LoggerPtr log_) - : scan(KernelUtils::unwrapResult(ffi::scan(snapshot_.get(), engine_.get(), /* predicate */{}), "scan")) - , 
scan_data_iterator(KernelUtils::unwrapResult( - ffi::kernel_scan_data_init(engine_.get(), scan.get()), - "kernel_scan_data_init")) + : engine(engine_) + , snapshot(snapshot_) , data_prefix(data_prefix_) , schema(schema_) , partition_columns(partition_columns_) , object_storage(object_storage_) + , callback(callback_) + , list_batch_size(list_batch_size_) , log(log_) + , thread([&, thread_group = DB::CurrentThread::getGroup()] { + /// Attach to current query thread group, to be able to + /// have query id in logs and metrics from scanDataFunc. + DB::ThreadGroupSwitcher switcher(thread_group, "TableSnapshot"); + scanDataFunc(); + }) { } + ~Iterator() override + { + shutdown.store(true); + schedule_next_batch_cv.notify_one(); + if (thread.joinable()) + thread.join(); + } + + void initScanState() + { + scan = KernelUtils::unwrapResult(ffi::scan(snapshot.get(), engine.get(), /* predicate */{}), "scan"); + scan_data_iterator = KernelUtils::unwrapResult( + ffi::kernel_scan_data_init(engine.get(), scan.get()), + "kernel_scan_data_init"); + } + + void scanDataFunc() + { + initScanState(); + while (!shutdown.load()) + { + bool have_scan_data_res = KernelUtils::unwrapResult( + ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visitData), + "kernel_scan_data_next"); + + if (have_scan_data_res) + { + std::unique_lock lock(next_mutex); + if (!shutdown.load() && data_files.size() >= list_batch_size) + { + schedule_next_batch_cv.wait(lock, [&]() { return (data_files.size() < list_batch_size) || shutdown.load(); }); + } + } + else + { + { + std::lock_guard lock(next_mutex); + iterator_finished = true; + } + data_files_cv.notify_all(); + return; + } + } + } + size_t estimatedKeysCount() override { /// For now do the same as StorageObjectStorageSource::GlobIterator. 
@@ -80,23 +135,35 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator DB::ObjectInfoPtr next(size_t) override { - std::lock_guard lock(next_mutex); - while (data_files.empty()) + DB::ObjectInfoPtr object; { - bool have_scan_data_res = KernelUtils::unwrapResult( - ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visitData), - "kernel_scan_data_next"); + std::unique_lock lock(next_mutex); + if (!iterator_finished && data_files.empty()) + { + LOG_TEST(log, "Waiting for next data file"); + schedule_next_batch_cv.notify_one(); + data_files_cv.wait(lock, [&]() { return !data_files.empty() || iterator_finished; }); + } - if (!have_scan_data_res) + if (data_files.empty()) return nullptr; - } - chassert(!data_files.empty()); + LOG_TEST(log, "Current data files: {}", data_files.size()); - auto object = data_files.front(); - data_files.pop_front(); + object = data_files.front(); + data_files.pop_front(); + if (data_files.empty()) + schedule_next_batch_cv.notify_one(); + } chassert(object); + object->metadata = object_storage->getObjectMetadata(object->getPath()); + + if (callback) + { + chassert(object->metadata); + callback(DB::FileProgress(0, object->metadata->size_bytes)); + } return object; } @@ -158,39 +225,66 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator "Scanned file: {}, size: {}, num records: {}, partition columns: {}", full_path, size, stats->num_records, partitions_info.size()); - auto metadata = context->object_storage->getObjectMetadata(full_path); DB::ObjectInfoPtr object; if (partitions_info.empty()) - object = std::make_shared(std::move(full_path), metadata); + object = std::make_shared(std::move(full_path)); else - object = std::make_shared(std::move(partitions_info), std::move(full_path), metadata); + object = std::make_shared(std::move(partitions_info), std::move(full_path)); - context->data_files.push_back(std::move(object)); + { + std::lock_guard lock(context->next_mutex); + context->data_files.push_back(std::move(object)); + } + context->data_files_cv.notify_one(); } private: using KernelScan = KernelPointerWrapper; using KernelScanDataIterator = KernelPointerWrapper; - const KernelScan scan; - const KernelScanDataIterator scan_data_iterator; + + const KernelExternEngine & engine; + const KernelSnapshot & snapshot; + KernelScan scan; + KernelScanDataIterator scan_data_iterator; + const std::string data_prefix; const DB::NamesAndTypesList & schema; const DB::Names & partition_columns; const DB::ObjectStoragePtr object_storage; + const DB::IDataLakeMetadata::FileProgressCallback callback; + const size_t list_batch_size; const LoggerPtr log; + /// Whether scanDataFunc should stop scanning. + /// Set in destructor. + std::atomic shutdown = false; + /// A CV to notify that new data_files are available. + std::condition_variable data_files_cv; + /// A flag meaning that all data files were scanned + /// and data scanning thread is finished. + bool iterator_finished = false; + + /// A CV to notify data scanning thread to continue, + /// as current data batch is fully read. + std::condition_variable schedule_next_batch_cv; + std::deque data_files; std::mutex next_mutex; + + /// A thread for async data scanning. 
+ ThreadFromGlobalPool thread; }; TableSnapshot::TableSnapshot( KernelHelperPtr helper_, DB::ObjectStoragePtr object_storage_, + bool read_schema_same_as_table_schema_, LoggerPtr log_) : helper(helper_) , object_storage(object_storage_) + , read_schema_same_as_table_schema(read_schema_same_as_table_schema_) , log(log_) { } @@ -222,13 +316,15 @@ void TableSnapshot::initSnapshot() const void TableSnapshot::initSnapshotImpl() const { + LOG_TEST(log, "Initializing snapshot"); + auto * engine_builder = helper->createBuilder(); engine = KernelUtils::unwrapResult(ffi::builder_build(engine_builder), "builder_build"); snapshot = KernelUtils::unwrapResult( ffi::snapshot(KernelUtils::toDeltaString(helper->getTableLocation()), engine.get()), "snapshot"); snapshot_version = ffi::version(snapshot.get()); - LOG_TEST(log, "Snapshot version: {}", snapshot_version); + LOG_TRACE(log, "Snapshot version: {}", snapshot_version); } ffi::SharedSnapshot * TableSnapshot::getSnapshot() @@ -238,7 +334,7 @@ ffi::SharedSnapshot * TableSnapshot::getSnapshot() return snapshot.get(); } -DB::ObjectIterator TableSnapshot::iterate() +DB::ObjectIterator TableSnapshot::iterate(DB::IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size) { initSnapshot(); return std::make_shared( @@ -248,6 +344,8 @@ DB::ObjectIterator TableSnapshot::iterate() getTableSchema(), getPartitionColumns(), object_storage, + callback, + list_batch_size, log); } @@ -256,13 +354,16 @@ const DB::NamesAndTypesList & TableSnapshot::getTableSchema() if (!table_schema.has_value()) { table_schema = getTableSchemaFromSnapshot(getSnapshot()); - LOG_TEST(log, "Fetched table schema: {}", table_schema->toString()); + LOG_TRACE(log, "Fetched table schema"); + LOG_TEST(log, "Table schema: {}", table_schema->toString()); } return table_schema.value(); } const DB::NamesAndTypesList & TableSnapshot::getReadSchema() { + if (read_schema_same_as_table_schema) + return getTableSchema(); if (!read_schema.has_value()) loadReadSchemaAndPartitionColumns(); return read_schema.value(); @@ -279,11 +380,23 @@ void TableSnapshot::loadReadSchemaAndPartitionColumns() { auto * current_snapshot = getSnapshot(); chassert(engine.get()); - std::tie(read_schema, partition_columns) = getReadSchemaAndPartitionColumnsFromSnapshot(current_snapshot, engine.get()); + if (read_schema_same_as_table_schema) + { + partition_columns = getPartitionColumnsFromSnapshot(current_snapshot, engine.get()); + LOG_TRACE( + log, "Fetched partition columns: {}", + fmt::join(partition_columns.value(), ", ")); + } + else + { + std::tie(read_schema, partition_columns) = getReadSchemaAndPartitionColumnsFromSnapshot(current_snapshot, engine.get()); + LOG_TRACE( + log, "Fetched read schema and partition columns: {}", + fmt::join(partition_columns.value(), ", ")); + + LOG_TEST(log, "Read schema: {}", read_schema->toString()); + } - LOG_TEST( - log, "Fetched read schema: {}, partition columns: {}", - read_schema->toString(), fmt::join(partition_columns.value(), ", ")); } } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h index dde7159a9e48..b7f4d7831d55 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "KernelPointerWrapper.h" #include "KernelHelper.h" #include @@ -29,6 +30,7 @@ class TableSnapshot explicit TableSnapshot( KernelHelperPtr helper_, 
DB::ObjectStoragePtr object_storage_, + bool read_schema_same_as_table_schema_, LoggerPtr log_); /// Get snapshot version. @@ -38,7 +40,7 @@ class TableSnapshot bool update(); /// Iterate over DeltaLake data files. - DB::ObjectIterator iterate(); + DB::ObjectIterator iterate(DB::IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size); /// Get schema from DeltaLake table metadata. const DB::NamesAndTypesList & getTableSchema(); @@ -59,6 +61,7 @@ class TableSnapshot const KernelHelperPtr helper; const DB::ObjectStoragePtr object_storage; + const bool read_schema_same_as_table_schema; const LoggerPtr log; mutable KernelExternEngine engine; diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.cpp index d85c48df15e4..48cf027793c6 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.cpp @@ -463,6 +463,13 @@ getReadSchemaAndPartitionColumnsFromSnapshot(ffi::SharedSnapshot * snapshot, ffi return {data.getSchemaResult(), data.getPartitionColumns()}; } +DB::Names getPartitionColumnsFromSnapshot(ffi::SharedSnapshot * snapshot, ffi::SharedExternEngine * engine) +{ + SchemaVisitorData data; + SchemaVisitor::visitPartitionColumns(snapshot, engine, data); + return data.getPartitionColumns(); +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.h b/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.h index d9c7f081657b..c7d511baa951 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/getSchemaFromSnapshot.h @@ -27,6 +27,8 @@ DB::NamesAndTypesList getTableSchemaFromSnapshot(ffi::SharedSnapshot * snapshot) std::pair getReadSchemaAndPartitionColumnsFromSnapshot(ffi::SharedSnapshot * snapshot, ffi::SharedExternEngine * engine); +DB::Names getPartitionColumnsFromSnapshot(ffi::SharedSnapshot * snapshot, ffi::SharedExternEngine * engine); + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h index ce0d35d08541..12792f42f85c 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -16,6 +16,7 @@ namespace DB namespace StorageObjectStorageSetting { extern const StorageObjectStorageSettingsBool allow_experimental_delta_kernel_rs; +extern const StorageObjectStorageSettingsBool delta_lake_read_schema_same_as_table_schema; } struct DeltaLakePartitionColumn @@ -59,8 +60,13 @@ class DeltaLakeMetadata final : public IDataLakeMetadata { #if USE_DELTA_KERNEL_RS auto configuration_ptr = configuration.lock(); - if (configuration_ptr->getSettingsRef()[StorageObjectStorageSetting::allow_experimental_delta_kernel_rs]) - return std::make_unique(object_storage, configuration, local_context); + const auto & settings_ref = configuration_ptr->getSettingsRef(); + if (settings_ref[StorageObjectStorageSetting::allow_experimental_delta_kernel_rs]) + return std::make_unique( + object_storage, + configuration, + local_context, + settings_ref[StorageObjectStorageSetting::delta_lake_read_schema_same_as_table_schema]); else return std::make_unique(object_storage, configuration, local_context); #else diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp 
b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp index 6771e02b9c37..1586eb606803 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp @@ -10,10 +10,15 @@ namespace DB DeltaLakeMetadataDeltaKernel::DeltaLakeMetadataDeltaKernel( ObjectStoragePtr object_storage, ConfigurationObserverPtr configuration_, - ContextPtr) + ContextPtr, + bool read_schema_same_as_table_schema_) : log(getLogger("DeltaLakeMetadata")) , table_snapshot( - std::make_shared(getKernelHelper(configuration_.lock()), object_storage, log)) + std::make_shared( + getKernelHelper(configuration_.lock(), object_storage), + object_storage, + read_schema_same_as_table_schema_, + log)) { } @@ -33,9 +38,9 @@ Strings DeltaLakeMetadataDeltaKernel::getDataFiles() const throwNotImplemented("getDataFiles()"); } -ObjectIterator DeltaLakeMetadataDeltaKernel::iterate() const +ObjectIterator DeltaLakeMetadataDeltaKernel::iterate(FileProgressCallback callback, size_t list_batch_size) const { - return table_snapshot->iterate(); + return table_snapshot->iterate(callback, list_batch_size); } NamesAndTypesList DeltaLakeMetadataDeltaKernel::getTableSchema() const diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h index aa4b6d14323f..351b5ca7c8ea 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,11 @@ class TableSnapshot; namespace DB { +namespace StorageObjectStorageSetting +{ +extern const StorageObjectStorageSettingsBool allow_experimental_delta_kernel_rs; +extern const StorageObjectStorageSettingsBool delta_lake_read_schema_same_as_table_schema; +} class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata { @@ -25,7 +31,11 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata using ConfigurationObserverPtr = StorageObjectStorage::ConfigurationObserverPtr; static constexpr auto name = "DeltaLake"; - DeltaLakeMetadataDeltaKernel(ObjectStoragePtr object_storage_, ConfigurationObserverPtr configuration_, ContextPtr context_); + DeltaLakeMetadataDeltaKernel( + ObjectStoragePtr object_storage_, + ConfigurationObserverPtr configuration_, + ContextPtr context_, + bool read_schema_same_as_table_schema_); bool supportsUpdate() const override { return true; } @@ -45,12 +55,18 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata ContextPtr local_context, bool) { - return std::make_unique(object_storage, configuration, local_context); + auto configuration_ptr = configuration.lock(); + const auto & settings_ref = configuration_ptr->getSettingsRef(); + return std::make_unique( + object_storage, + configuration, + local_context, + settings_ref[StorageObjectStorageSetting::delta_lake_read_schema_same_as_table_schema]); } bool supportsFileIterator() const override { return true; } - ObjectIterator iterate() const override; + ObjectIterator iterate(FileProgressCallback callback, size_t list_batch_size) const override; private: const LoggerPtr log; diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index f2110c7a7b2e..c9cdd7ea5967 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ 
b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -26,7 +26,8 @@ class IDataLakeMetadata : boost::noncopyable /// Whether `iterate()` method is supported for the data lake. virtual bool supportsFileIterator() const { return false; } /// Return iterator to `data files`. - virtual ObjectIterator iterate() const { throwNotImplemented("iterate()"); } + using FileProgressCallback = std::function; + virtual ObjectIterator iterate(FileProgressCallback /* callback */, size_t /* list_batch_size */) const { throwNotImplemented("iterate()"); } /// Table schema from data lake metadata. virtual NamesAndTypesList getTableSchema() const = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c7b0fb20c827..191f7a686d2a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -89,6 +89,7 @@ StorageObjectStorage::StorageObjectStorage( LoadingStrictnessLevel mode, bool distributed_processing_, ASTPtr partition_by_, + bool is_table_function_, bool lazy_init) : IStorage(table_id_) , configuration(configuration_) @@ -99,6 +100,7 @@ StorageObjectStorage::StorageObjectStorage( , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { bool do_lazy_init = lazy_init && !columns_.empty() && !configuration->format.empty(); + update_configuration_on_read = !is_table_function_ || do_lazy_init; bool failed_init = false; auto do_init = [&]() { @@ -364,7 +366,11 @@ void StorageObjectStorage::read( size_t max_block_size, size_t num_streams) { - configuration->update(object_storage, local_context); + /// We did configuration->update() in constructor, + /// so in case of table function there is no need to do the same here again. 
+ if (update_configuration_on_read) + configuration->update(object_storage, local_context); + if (partition_by && configuration->withPartitionWildcard()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index c4ef7e7cdcc7..997eeb958952 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -71,6 +71,7 @@ class StorageObjectStorage : public IStorage LoadingStrictnessLevel mode, bool distributed_processing_ = false, ASTPtr partition_by_ = nullptr, + bool is_table_function_ = false, bool lazy_init = false); String getName() const override; @@ -157,6 +158,7 @@ class StorageObjectStorage : public IStorage const std::optional format_settings; const ASTPtr partition_by; const bool distributed_processing; + bool update_configuration_on_read; LoggerPtr log; }; @@ -248,7 +250,7 @@ class StorageObjectStorage::Configuration virtual std::optional tryGetTableStructureFromMetadata() const; virtual bool supportsFileIterator() const { return false; } - virtual ObjectIterator iterate() + virtual ObjectIterator iterate(std::function /* callback */, size_t /* list_batch_size */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method iterate() is not implemented for configuration type {}", getTypeName()); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp index b220303c43e2..7cc22fbdb335 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSettings.cpp @@ -22,6 +22,9 @@ If enabled, the engine would use delta-kernel-rs for DeltaLake metadata parsing )", 0) \ DECLARE(String, object_storage_cluster, "", R"( Cluster for distributed requests +)", 0) \ + DECLARE(Bool, delta_lake_read_schema_same_as_table_schema, false, R"( +Whether delta-lake read schema is the same as table schema. )", 0) \ DECLARE(String, iceberg_metadata_file_path, "", R"( Explicit path to desired Iceberg metadata file, should be relative to path in object storage. Make sense for table function use case only. 
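A minimal usage sketch for the `iceberg_metadata_file_path` setting described above (table name, endpoint, and credentials are placeholders, not values from this patch; the path is resolved relative to the table's storage path, and the integration test test_explicit_metadata_file exercises the same flow through create_iceberg_table):

    CREATE TABLE iceberg_pinned
    ENGINE = IcebergS3('http://minio:9000/bucket/iceberg_data/default/my_table/', '<access_key>', '<secret_key>')
    SETTINGS iceberg_metadata_file_path = 'metadata/v11.metadata.json';

With the setting present, reads go through metadata/v11.metadata.json instead of the latest vN.metadata.json discovered by listing.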
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 55a7c8e214e1..225d5e17452b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -163,7 +163,7 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( } else if (configuration->supportsFileIterator()) { - return configuration->iterate(); + return configuration->iterate(file_progress_callback, query_settings.list_object_keys_size); } else { diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index aa73b55baea4..2456473ef7f7 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -132,7 +132,8 @@ StoragePtr TableFunctionObjectStorage::executeImpl( /* format_settings */ std::nullopt, /* mode */ LoadingStrictnessLevel::CREATE, /* distributed_processing */ false, - nullptr); + /* partition_by */ nullptr, + /* is_table_function */ true); storage->startup(); return storage; From 4751016b5c9cc2fb921a7fe0982671e94c7bacb0 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 3 Apr 2025 16:20:43 +0000 Subject: [PATCH 11/14] Merge pull request #78486 from ClickHouse/support-partition-pruning-in-delta-kernel Support partition pruning in DeltaLake engine --- src/Common/ProfileEvents.cpp | 2 + .../DataLakes/DataLakeConfiguration.h | 7 +- .../DataLakes/DeltaLake/PartitionPruner.cpp | 109 ++++++++++++++++ .../DataLakes/DeltaLake/PartitionPruner.h | 36 ++++++ .../DataLakes/DeltaLake/TableSnapshot.cpp | 85 ++++++++---- .../DataLakes/DeltaLake/TableSnapshot.h | 21 +-- .../DeltaLakeMetadataDeltaKernel.cpp | 7 +- .../DataLakes/DeltaLakeMetadataDeltaKernel.h | 5 +- .../DataLakes/IDataLakeMetadata.h | 5 +- .../ObjectStorage/StorageObjectStorage.cpp | 19 +-- .../ObjectStorage/StorageObjectStorage.h | 5 +- .../StorageObjectStorageCluster.cpp | 3 +- .../StorageObjectStorageSource.cpp | 6 +- .../StorageObjectStorageSource.h | 1 + tests/integration/test_storage_delta/test.py | 121 +++++++++++++++++- 15 files changed, 377 insertions(+), 55 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.h diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 580e26fbdc40..81d01cc14ec7 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -225,6 +225,8 @@ M(JoinProbeTableRowCount, "Total number of rows in the probe table for a JOIN operation.", ValueType::Number) \ M(JoinResultRowCount, "Total number of rows in the result of a JOIN operation.", ValueType::Number) \ \ + M(DeltaLakePartitionPrunedFiles, "Number of skipped files during DeltaLake partition pruning", ValueType::Number) \ + \ M(SlowRead, "Number of reads from a file that were slow. This indicate system overload. 
Thresholds are controlled by read_backoff_* settings.", ValueType::Number) \ M(ReadBackoff, "Number of times the number of query processing threads was lowered due to slow reads.", ValueType::Number) \ \ diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 9f439a170d91..918ac44eb1d9 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -142,10 +142,13 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl return current_metadata->supportsFileIterator(); } - ObjectIterator iterate(IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size) override + ObjectIterator iterate( + const ActionsDAG * filter_dag, + IDataLakeMetadata::FileProgressCallback callback, + size_t list_batch_size) override { chassert(current_metadata); - return current_metadata->iterate(callback, list_batch_size); + return current_metadata->iterate(filter_dag, callback, list_batch_size); } /// This is an awful temporary crutch, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.cpp new file mode 100644 index 000000000000..9a487b6cd617 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.cpp @@ -0,0 +1,109 @@ +#include "PartitionPruner.h" + +#if USE_DELTA_KERNEL_RS +#include + +#include +#include + +#include +#include + +#include +#include + + +namespace DB::ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace DeltaLake +{ + +namespace +{ + DB::ASTPtr createPartitionKeyAST(const DB::Names & partition_columns) + { + /// DeltaLake supports only plain partition keys, + /// e.g. by column names without any functions. 
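+        /// For example, a table partitioned by (country, event_date) becomes the key expression tuple(country, event_date); partition expressions involving functions are not expected here.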
+ + std::shared_ptr partition_key_ast = std::make_shared(); + partition_key_ast->name = "tuple"; + partition_key_ast->arguments = std::make_shared(); + partition_key_ast->children.push_back(partition_key_ast->arguments); + + for (const auto & column_name : partition_columns) + { + auto partition_ast = std::make_shared(column_name); + partition_key_ast->arguments->children.emplace_back(std::move(partition_ast)); + } + return partition_key_ast; + } + + DB::ColumnsDescription getPartitionColumnsDescription( + const DB::Names & partition_columns, + const DB::NamesAndTypesList & table_schema) + { + DB::NamesAndTypesList names_and_types; + for (const auto & column_name : partition_columns) + { + auto column = table_schema.tryGetByName(column_name); + if (!column.has_value()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Not found partition column in schema: {}", column_name); + names_and_types.emplace_back(column_name, removeNullable(column->type)); + } + return DB::ColumnsDescription(names_and_types); + } +} + +PartitionPruner::PartitionPruner( + const DB::ActionsDAG & filter_dag, + const DB::NamesAndTypesList & table_schema_, + const DB::Names & partition_columns_, + DB::ContextPtr context) +{ + if (!partition_columns_.empty()) + { + const auto partition_columns_description = getPartitionColumnsDescription(partition_columns_, table_schema_); + const auto partition_key_ast = createPartitionKeyAST(partition_columns_); + + partition_key = DB::KeyDescription::getKeyFromAST( + partition_key_ast, + partition_columns_description, + context); + + key_condition.emplace( + &filter_dag, context, partition_key.column_names, partition_key.expression, true /* single_point */); + } +} + +bool PartitionPruner::canBePruned(const DB::ObjectInfoWithPartitionColumns & object_info) const +{ + if (!key_condition.has_value()) + return false; + + DB::Row partition_key_values; + partition_key_values.reserve(object_info.partitions_info.size()); + + for (const auto & [name_and_type, value] : object_info.partitions_info) + { + if (value.isNull()) + partition_key_values.push_back(DB::POSITIVE_INFINITY); /// NULL_LAST + else + partition_key_values.push_back(value); + } + + std::vector partition_key_values_ref(partition_key_values.begin(), partition_key_values.end()); + bool can_be_true = key_condition->mayBeTrueInRange( + partition_key_values_ref.size(), + partition_key_values_ref.data(), + partition_key_values_ref.data(), + partition_key.data_types); + + return !can_be_true; +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.h b/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.h new file mode 100644 index 000000000000..bb64aa64aebe --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/PartitionPruner.h @@ -0,0 +1,36 @@ +#pragma once +#include "config.h" +#include +#include + +#if USE_DELTA_KERNEL_RS + +namespace DB +{ +class ActionsDAG; +class NamesAndTypesList; +using Names = std::vector; +} + +namespace DeltaLake +{ + +class PartitionPruner +{ +public: + PartitionPruner( + const DB::ActionsDAG & filter_dag, + const DB::NamesAndTypesList & table_schema_, + const DB::Names & partition_columns_, + DB::ContextPtr context); + + bool canBePruned(const DB::ObjectInfoWithPartitionColumns & object_info) const; + +private: + std::optional key_condition; + DB::KeyDescription partition_key; +}; + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp 
index b3234ad3d9eb..43c1844a1f46 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.cpp @@ -13,6 +13,7 @@ #include #include #include "getSchemaFromSnapshot.h" +#include "PartitionPruner.h" #include "KernelUtils.h" namespace fs = std::filesystem; @@ -23,6 +24,11 @@ namespace DB::ErrorCodes extern const int NOT_IMPLEMENTED; } +namespace ProfileEvents +{ + extern const Event DeltaLakePartitionPrunedFiles; +} + namespace DB { @@ -60,6 +66,7 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator const DB::NamesAndTypesList & schema_, const DB::Names & partition_columns_, DB::ObjectStoragePtr object_storage_, + const DB::ActionsDAG * filter_dag_, DB::IDataLakeMetadata::FileProgressCallback callback_, size_t list_batch_size_, LoggerPtr log_) @@ -79,6 +86,8 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator scanDataFunc(); }) { + if (filter_dag_) + pruner.emplace(*filter_dag_, schema_, partition_columns_, DB::Context::getGlobalContextInstance()); } ~Iterator() override @@ -135,36 +144,51 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator DB::ObjectInfoPtr next(size_t) override { - DB::ObjectInfoPtr object; + while (true) { - std::unique_lock lock(next_mutex); - if (!iterator_finished && data_files.empty()) + DB::ObjectInfoPtr object; { - LOG_TEST(log, "Waiting for next data file"); - schedule_next_batch_cv.notify_one(); - data_files_cv.wait(lock, [&]() { return !data_files.empty() || iterator_finished; }); - } + std::unique_lock lock(next_mutex); + if (!iterator_finished && data_files.empty()) + { + LOG_TEST(log, "Waiting for next data file"); + schedule_next_batch_cv.notify_one(); + data_files_cv.wait(lock, [&]() { return !data_files.empty() || iterator_finished; }); + } - if (data_files.empty()) - return nullptr; + if (data_files.empty()) + return nullptr; - LOG_TEST(log, "Current data files: {}", data_files.size()); + LOG_TEST(log, "Current data files: {}", data_files.size()); - object = data_files.front(); - data_files.pop_front(); - if (data_files.empty()) - schedule_next_batch_cv.notify_one(); - } + object = data_files.front(); + data_files.pop_front(); + if (data_files.empty()) + schedule_next_batch_cv.notify_one(); + } - chassert(object); - object->metadata = object_storage->getObjectMetadata(object->getPath()); + chassert(object); + if (pruner.has_value()) + { + const auto * object_with_partition_info = dynamic_cast(object.get()); + if (object_with_partition_info && pruner->canBePruned(*object_with_partition_info)) + { + ProfileEvents::increment(ProfileEvents::DeltaLakePartitionPrunedFiles); - if (callback) - { - chassert(object->metadata); - callback(DB::FileProgress(0, object->metadata->size_bytes)); + LOG_TEST(log, "Skipping file {} according to partition pruning", object->getPath()); + continue; + } + } + + object->metadata = object_storage->getObjectMetadata(object->getPath()); + + if (callback) + { + chassert(object->metadata); + callback(DB::FileProgress(0, object->metadata->size_bytes)); + } + return object; } - return object; } static void visitData( @@ -247,6 +271,7 @@ class TableSnapshot::Iterator final : public DB::IObjectIterator const KernelSnapshot & snapshot; KernelScan scan; KernelScanDataIterator scan_data_iterator; + std::optional pruner; const std::string data_prefix; const DB::NamesAndTypesList & schema; @@ -327,14 +352,17 @@ void TableSnapshot::initSnapshotImpl() const LOG_TRACE(log, "Snapshot version: {}", snapshot_version); } 
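As a sketch of how the new pruning can be observed from SQL (bucket, credentials, and the partition column are hypothetical; the counter name matches the ProfileEvents entry added in this commit, and the delta-kernel path is assumed to be enabled via allow_experimental_delta_kernel_rs):

    SELECT count()
    FROM deltaLake('http://minio:9000/bucket/delta_table/', '<access_key>', '<secret_key>')
    WHERE event_date = '2024-01-01';

    SYSTEM FLUSH LOGS;
    SELECT ProfileEvents['DeltaLakePartitionPrunedFiles']
    FROM system.query_log
    WHERE type = 'QueryFinish' AND query LIKE '%delta_table%'
    ORDER BY event_time DESC
    LIMIT 1;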
-ffi::SharedSnapshot * TableSnapshot::getSnapshot() +ffi::SharedSnapshot * TableSnapshot::getSnapshot() const { if (!snapshot.get()) initSnapshot(); return snapshot.get(); } -DB::ObjectIterator TableSnapshot::iterate(DB::IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size) +DB::ObjectIterator TableSnapshot::iterate( + const DB::ActionsDAG * filter_dag, + DB::IDataLakeMetadata::FileProgressCallback callback, + size_t list_batch_size) { initSnapshot(); return std::make_shared( @@ -344,12 +372,13 @@ DB::ObjectIterator TableSnapshot::iterate(DB::IDataLakeMetadata::FileProgressCal getTableSchema(), getPartitionColumns(), object_storage, + filter_dag, callback, list_batch_size, log); } -const DB::NamesAndTypesList & TableSnapshot::getTableSchema() +const DB::NamesAndTypesList & TableSnapshot::getTableSchema() const { if (!table_schema.has_value()) { @@ -360,7 +389,7 @@ const DB::NamesAndTypesList & TableSnapshot::getTableSchema() return table_schema.value(); } -const DB::NamesAndTypesList & TableSnapshot::getReadSchema() +const DB::NamesAndTypesList & TableSnapshot::getReadSchema() const { if (read_schema_same_as_table_schema) return getTableSchema(); @@ -369,14 +398,14 @@ const DB::NamesAndTypesList & TableSnapshot::getReadSchema() return read_schema.value(); } -const DB::Names & TableSnapshot::getPartitionColumns() +const DB::Names & TableSnapshot::getPartitionColumns() const { if (!partition_columns.has_value()) loadReadSchemaAndPartitionColumns(); return partition_columns.value(); } -void TableSnapshot::loadReadSchemaAndPartitionColumns() +void TableSnapshot::loadReadSchemaAndPartitionColumns() const { auto * current_snapshot = getSnapshot(); chassert(engine.get()); diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h index b7f4d7831d55..7b83c993be5d 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/TableSnapshot.h @@ -40,18 +40,21 @@ class TableSnapshot bool update(); /// Iterate over DeltaLake data files. - DB::ObjectIterator iterate(DB::IDataLakeMetadata::FileProgressCallback callback, size_t list_batch_size); + DB::ObjectIterator iterate( + const DB::ActionsDAG * filter_dag, + DB::IDataLakeMetadata::FileProgressCallback callback, + size_t list_batch_size); /// Get schema from DeltaLake table metadata. - const DB::NamesAndTypesList & getTableSchema(); + const DB::NamesAndTypesList & getTableSchema() const; /// Get read schema derived from data files. /// (In most cases it would be the same as table schema). - const DB::NamesAndTypesList & getReadSchema(); + const DB::NamesAndTypesList & getReadSchema() const; /// DeltaLake stores partition columns values not in the data files, /// but in data file path directory names. /// Therefore "table schema" would contain partition columns, /// but "read schema" would not. 
- const DB::Names & getPartitionColumns(); + const DB::Names & getPartitionColumns() const; private: class Iterator; @@ -69,16 +72,16 @@ class TableSnapshot mutable KernelScan scan; mutable size_t snapshot_version; - std::optional table_schema; - std::optional read_schema; - std::optional partition_columns; + mutable std::optional table_schema; + mutable std::optional read_schema; + mutable std::optional partition_columns; void initSnapshot() const; void initSnapshotImpl() const; /// Both read schema and partition columns are loaded with the same data scan object, /// therefore we load them together. - void loadReadSchemaAndPartitionColumns(); - ffi::SharedSnapshot * getSnapshot(); + void loadReadSchemaAndPartitionColumns() const; + ffi::SharedSnapshot * getSnapshot() const; }; /// TODO; Enable event tracing in DeltaKernel. diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp index 1586eb606803..830264f3d88f 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp @@ -38,9 +38,12 @@ Strings DeltaLakeMetadataDeltaKernel::getDataFiles() const throwNotImplemented("getDataFiles()"); } -ObjectIterator DeltaLakeMetadataDeltaKernel::iterate(FileProgressCallback callback, size_t list_batch_size) const +ObjectIterator DeltaLakeMetadataDeltaKernel::iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t list_batch_size) const { - return table_snapshot->iterate(callback, list_batch_size); + return table_snapshot->iterate(filter_dag, callback, list_batch_size); } NamesAndTypesList DeltaLakeMetadataDeltaKernel::getTableSchema() const diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h index 351b5ca7c8ea..1b0ddc7a23c4 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h @@ -66,7 +66,10 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata bool supportsFileIterator() const override { return true; } - ObjectIterator iterate(FileProgressCallback callback, size_t list_batch_size) const override; + ObjectIterator iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t list_batch_size) const override; private: const LoggerPtr log; diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index c9cdd7ea5967..f8e6661d7e56 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -27,7 +27,10 @@ class IDataLakeMetadata : boost::noncopyable virtual bool supportsFileIterator() const { return false; } /// Return iterator to `data files`. using FileProgressCallback = std::function; - virtual ObjectIterator iterate(FileProgressCallback /* callback */, size_t /* list_batch_size */) const { throwNotImplemented("iterate()"); } + virtual ObjectIterator iterate( + const ActionsDAG * /* filter_dag */, + FileProgressCallback /* callback */, + size_t /* list_batch_size */) const { throwNotImplemented("iterate()"); } /// Table schema from data lake metadata. 
virtual NamesAndTypesList getTableSchema() const = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 191f7a686d2a..9c235b6fbbb8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -67,6 +67,7 @@ String StorageObjectStorage::getPathSample(ContextPtr context) local_distributed_processing, context, {}, // predicate + {}, {}, // virtual_columns nullptr, // read_keys {} // file_progress_callback @@ -263,21 +264,17 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter void applyFilters(ActionDAGNodes added_filter_nodes) override { SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); - const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag.has_value()) { - predicate = filter_actions_dag->getOutputs().at(0); if (getContext()->getSettingsRef()[Setting::use_iceberg_partition_pruning]) - { configuration->implementPartitionPruning(*filter_actions_dag); - } } - createIterator(predicate); + createIterator(); } void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override { - createIterator(nullptr); + createIterator(); Pipes pipes; auto context = getContext(); @@ -329,14 +326,19 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter size_t num_streams; const bool distributed_processing; - void createIterator(const ActionsDAG::Node * predicate) + void createIterator() { if (iterator_wrapper) return; + + const ActionsDAG::Node * predicate = nullptr; + if (filter_actions_dag.has_value()) + predicate = filter_actions_dag->getOutputs().at(0); + auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(context), object_storage, distributed_processing, - context, predicate, virtual_columns, nullptr, context->getFileProgressCallback()); + context, predicate, filter_actions_dag, virtual_columns, nullptr, context->getFileProgressCallback()); } }; } @@ -501,6 +503,7 @@ std::unique_ptr StorageObjectStorage::createReadBufferIterat false/* distributed_processing */, context, {}/* predicate */, + {}, {}/* virtual_columns */, &read_keys); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 997eeb958952..2156e6b32eb3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -250,7 +250,10 @@ class StorageObjectStorage::Configuration virtual std::optional tryGetTableStructureFromMetadata() const; virtual bool supportsFileIterator() const { return false; } - virtual ObjectIterator iterate(std::function /* callback */, size_t /* list_batch_size */) + virtual ObjectIterator iterate( + const ActionsDAG * /* filter_dag */, + std::function /* callback */, + size_t /* list_batch_size */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method iterate() is not implemented for configuration type {}", getTypeName()); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index d152e86ed5ec..0e67d3c2f80e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -52,6 +52,7 @@ String StorageObjectStorageCluster::getPathSample(StorageInMemoryMetadata metada false, // distributed_processing context, {}, // predicate + {}, 
metadata.getColumns().getAll(), // virtual_columns nullptr, // read_keys {} // file_progress_callback @@ -343,7 +344,7 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten { auto iterator = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(local_context), object_storage, /* distributed_processing */false, - local_context, predicate, getVirtualsList(), nullptr, local_context->getFileProgressCallback()); + local_context, predicate, {}, getVirtualsList(), nullptr, local_context->getFileProgressCallback()); auto task_distributor = std::make_shared(iterator, ids_of_replicas); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 225d5e17452b..5622a2605535 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -125,6 +125,7 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, + const std::optional & filter_actions_dag, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, std::function file_progress_callback) @@ -163,7 +164,10 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( } else if (configuration->supportsFileIterator()) { - return configuration->iterate(file_progress_callback, query_settings.list_object_keys_size); + return configuration->iterate( + filter_actions_dag.has_value() ? &filter_actions_dag.value() : nullptr, + file_progress_callback, + query_settings.list_object_keys_size); } else { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index a197240a0734..1301c089e1e0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -54,6 +54,7 @@ class StorageObjectStorageSource : public SourceWithKeyCondition bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, + const std::optional & filter_actions_dag, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, std::function file_progress_callback = {}); diff --git a/tests/integration/test_storage_delta/test.py b/tests/integration/test_storage_delta/test.py index 093a9c546214..eb05a34bc321 100644 --- a/tests/integration/test_storage_delta/test.py +++ b/tests/integration/test_storage_delta/test.py @@ -642,16 +642,30 @@ def test_partition_columns(started_cluster, use_delta_kernel): ) ) assert result == num_rows + + query_id = f"query_with_filter_{TABLE_NAME}" result = int( instance.query( f"""SELECT count() FROM deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', 'minio123', SETTINGS allow_experimental_delta_kernel_rs={use_delta_kernel}) WHERE c == toDateTime('2000/01/05') - """ + """, + query_id=query_id, ) ) assert result == 1 + if use_delta_kernel == 1: + instance.query("SYSTEM FLUSH LOGS") + assert num_rows - 1 == int( + instance.query( + f""" + SELECT ProfileEvents['DeltaLakePartitionPrunedFiles'] + FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish' + """ + ) + ) + instance.query( f""" DROP TABLE IF EXISTS {TABLE_NAME}; @@ -1023,3 +1037,108 @@ def test_replicated_database_and_unavailable_s3(started_cluster, use_delta_kerne assert "123456" not in node2.query( f"SELECT * 
FROM system.zookeeper WHERE path = '{replica_path}'" ) + + +@pytest.mark.parametrize("use_delta_kernel", ["1"]) +def test_partition_columns_2(started_cluster, use_delta_kernel): + node = started_cluster.instances["node1"] + table_name = randomize_table_name("test_partition_columns_2") + + schema = pa.schema( + [ + ("a", pa.int32()), + ("b", pa.int32()), + ("c", pa.int32()), + ("d", pa.string()), + ("e", pa.string()), + ] + ) + data = [ + pa.array([1, 2, 3, 4, 5], type=pa.int32()), + pa.array([4, 5, 6, 7, 8], type=pa.int32()), + pa.array([7, 7, 8, 9, 10], type=pa.int32()), + pa.array(["aa", "bb", "cc", "aa", "bb"], type=pa.string()), + pa.array(["aa", "bb", "cc", "aa", "cc"], type=pa.string()), + ] + + storage_options = { + "AWS_ENDPOINT_URL": f"http://{started_cluster.minio_ip}:{started_cluster.minio_port}", + "AWS_ACCESS_KEY_ID": minio_access_key, + "AWS_SECRET_ACCESS_KEY": minio_secret_key, + "AWS_ALLOW_HTTP": "true", + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + path = f"s3://root/{table_name}" + table = pa.Table.from_arrays(data, schema=schema) + + write_deltalake( + path, table, storage_options=storage_options, partition_by=["c", "d"] + ) + + delta_function = f""" +deltaLake( + 'http://{started_cluster.minio_ip}:{started_cluster.minio_port}/root/{table_name}' , + '{minio_access_key}', + '{minio_secret_key}', + SETTINGS allow_experimental_delta_kernel_rs={use_delta_kernel}) + """ + + num_files = int(node.query(f"SELECT uniqExact(_path) FROM {delta_function}")) + assert num_files == 5 + + new_data = [ + pa.array([2], type=pa.int32()), + pa.array([3], type=pa.int32()), + pa.array([7], type=pa.int32()), + pa.array(["aa"], type=pa.string()), + pa.array(["cc"], type=pa.string()), + ] + new_table_data = pa.Table.from_arrays(new_data, schema=schema) + + write_deltalake( + path, new_table_data, storage_options=storage_options, mode="append" + ) + + assert ( + "a\tNullable(Int32)\t\t\t\t\t\n" + "b\tNullable(Int32)\t\t\t\t\t\n" + "c\tNullable(Int32)\t\t\t\t\t\n" + "d\tNullable(String)\t\t\t\t\t\n" + "e\tNullable(String)" == node.query(f"DESCRIBE TABLE {delta_function}").strip() + ) + + num_files = int(node.query(f"SELECT uniqExact(_path) FROM {delta_function}")) + assert num_files == 6 + + query_id = f"{table_name}-{uuid.uuid4()}" + assert ( + "1" + in node.query( + f" SELECT a FROM {delta_function} WHERE c = 7 and d = 'aa'", + query_id=query_id, + ).strip() + ) + + def check_pruned(count, query_id): + node.query("SYSTEM FLUSH LOGS") + assert count == int( + node.query( + f""" + SELECT ProfileEvents['DeltaLakePartitionPrunedFiles'] + FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish' + """ + ) + ) + + check_pruned(num_files - 2, query_id) + + query_id = f"{table_name}-{uuid.uuid4()}" + assert ( + "2" + in node.query( + f"SELECT a FROM {delta_function} WHERE c = 7 and d = 'bb'", + query_id=query_id, + ).strip() + ) + + check_pruned(num_files - 1, query_id) From 38e5605fcc29fd6c31c3640497014344001bc7e2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 7 Apr 2025 15:22:38 +0000 Subject: [PATCH 12/14] Merge pull request #78775 from ClickHouse/refactor-code-around-data-lakes Small refactoring around data lakes --- .../DataLakes/DataLakeConfiguration.h | 23 +------ .../DataLakes/DeltaLakeMetadata.cpp | 9 +++ .../DataLakes/DeltaLakeMetadata.h | 11 +++- .../DeltaLakeMetadataDeltaKernel.cpp | 5 -- .../DataLakes/DeltaLakeMetadataDeltaKernel.h | 4 -- .../ObjectStorage/DataLakes/HudiMetadata.cpp | 10 ++- 
.../ObjectStorage/DataLakes/HudiMetadata.h | 9 ++- .../DataLakes/IDataLakeMetadata.cpp | 63 +++++++++++++++++++ .../DataLakes/IDataLakeMetadata.h | 16 +++-- .../DataLakes/Iceberg/IcebergMetadata.cpp | 32 +++++----- .../DataLakes/Iceberg/IcebergMetadata.h | 20 +++--- .../ObjectStorage/StorageObjectStorage.cpp | 6 -- .../ObjectStorage/StorageObjectStorage.h | 2 - 13 files changed, 132 insertions(+), 78 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 918ac44eb1d9..6a3406668bfb 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -68,8 +68,6 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl ErrorCodes::FORMAT_VERSION_TOO_OLD, "Metadata is not consinsent with the one which was used to infer table schema. Please, retry the query."); } - if (!supportsFileIterator()) - BaseStorageConfiguration::setPaths(current_metadata->getDataFiles()); } } @@ -85,14 +83,6 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl return std::nullopt; } - void implementPartitionPruning(const ActionsDAG & filter_dag) override - { - if (!current_metadata || !current_metadata->supportsPartitionPruning()) - return; - BaseStorageConfiguration::setPaths(current_metadata->makePartitionPruning(filter_dag)); - } - - std::optional totalRows() override { if (!current_metadata) @@ -127,20 +117,11 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl ContextPtr context) override { BaseStorageConfiguration::update(object_storage, context); - if (updateMetadataObjectIfNeeded(object_storage, context)) - { - if (!supportsFileIterator()) - BaseStorageConfiguration::setPaths(current_metadata->getDataFiles()); - } - + updateMetadataObjectIfNeeded(object_storage, context); return ColumnsDescription{current_metadata->getTableSchema()}; } - bool supportsFileIterator() const override - { - chassert(current_metadata); - return current_metadata->supportsFileIterator(); - } + bool supportsFileIterator() const override { return true; } ObjectIterator iterate( const ActionsDAG * filter_dag, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 314f28b030a9..48398febe0bb 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -695,11 +695,20 @@ DeltaLakeMetadata::DeltaLakeMetadata(ObjectStoragePtr object_storage_, Configura data_files = result.data_files; schema = result.schema; partition_columns = result.partition_columns; + object_storage = object_storage_; LOG_TRACE(impl.log, "Found {} data files, {} partition files, schema: {}", data_files.size(), partition_columns.size(), schema.toString()); } +ObjectIterator DeltaLakeMetadata::iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t /* list_batch_size */) const +{ + return createKeysIterator(getDataFiles(filter_dag), object_storage, callback); +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h index 12792f42f85c..c9d4e865b059 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h +++ 
b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -39,8 +39,6 @@ class DeltaLakeMetadata final : public IDataLakeMetadata DeltaLakeMetadata(ObjectStoragePtr object_storage_, ConfigurationObserverPtr configuration_, ContextPtr context_); - Strings getDataFiles() const override { return data_files; } - NamesAndTypesList getTableSchema() const override { return schema; } DeltaLakePartitionColumns getPartitionColumns() const { return partition_columns; } @@ -74,10 +72,19 @@ class DeltaLakeMetadata final : public IDataLakeMetadata #endif } +protected: + ObjectIterator iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t list_batch_size) const override; + private: mutable Strings data_files; NamesAndTypesList schema; DeltaLakePartitionColumns partition_columns; + ObjectStoragePtr object_storage; + + Strings getDataFiles(const ActionsDAG *) const { return data_files; } }; } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp index 830264f3d88f..bc8a3e52e84f 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp @@ -33,11 +33,6 @@ bool DeltaLakeMetadataDeltaKernel::update(const ContextPtr &) return table_snapshot->update(); } -Strings DeltaLakeMetadataDeltaKernel::getDataFiles() const -{ - throwNotImplemented("getDataFiles()"); -} - ObjectIterator DeltaLakeMetadataDeltaKernel::iterate( const ActionsDAG * filter_dag, FileProgressCallback callback, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h index 1b0ddc7a23c4..f7f1f333cb6a 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h @@ -41,8 +41,6 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata bool update(const ContextPtr & context) override; - Strings getDataFiles() const override; - NamesAndTypesList getTableSchema() const override; NamesAndTypesList getReadSchema() const override; @@ -64,8 +62,6 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata settings_ref[StorageObjectStorageSetting::delta_lake_read_schema_same_as_table_schema]); } - bool supportsFileIterator() const override { return true; } - ObjectIterator iterate( const ActionsDAG * filter_dag, FileProgressCallback callback, diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp index 77ef769ed0e9..d8deea67cb12 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp @@ -91,11 +91,19 @@ HudiMetadata::HudiMetadata(ObjectStoragePtr object_storage_, ConfigurationObserv { } -Strings HudiMetadata::getDataFiles() const +Strings HudiMetadata::getDataFiles(const ActionsDAG *) const { if (data_files.empty()) data_files = getDataFilesImpl(); return data_files; } +ObjectIterator HudiMetadata::iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t /* list_batch_size */) const +{ + return createKeysIterator(getDataFiles(filter_dag), object_storage, callback); +} + } diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h index 46291d9e6d96..a64ecfeb55dd 100644 --- 
a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -19,8 +19,6 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext HudiMetadata(ObjectStoragePtr object_storage_, ConfigurationObserverPtr configuration_, ContextPtr context_); - Strings getDataFiles() const override; - NamesAndTypesList getTableSchema() const override { return {}; } bool operator ==(const IDataLakeMetadata & other) const override @@ -39,12 +37,19 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext return std::make_unique(object_storage, configuration, local_context); } +protected: + ObjectIterator iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t list_batch_size) const override; + private: const ObjectStoragePtr object_storage; const ConfigurationObserverPtr configuration; mutable Strings data_files; Strings getDataFilesImpl() const; + Strings getDataFiles(const ActionsDAG * filter_dag) const; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp new file mode 100644 index 000000000000..61f31766a455 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -0,0 +1,63 @@ +#include "IDataLakeMetadata.h" +#include + +namespace DB +{ + +namespace +{ + +class KeysIterator : public IObjectIterator +{ +public: + KeysIterator( + Strings && data_files_, + ObjectStoragePtr object_storage_, + IDataLakeMetadata::FileProgressCallback callback_) + : data_files(data_files_) + , object_storage(object_storage_) + , callback(callback_) + { + } + + size_t estimatedKeysCount() override + { + return data_files.size(); + } + + ObjectInfoPtr next(size_t) override + { + while (true) + { + size_t current_index = index.fetch_add(1, std::memory_order_relaxed); + if (current_index >= data_files.size()) + return nullptr; + + auto key = data_files[current_index]; + auto object_metadata = object_storage->getObjectMetadata(key); + + if (callback) + callback(FileProgress(0, object_metadata.size_bytes)); + + return std::make_shared(key, std::move(object_metadata)); + } + } + +private: + Strings data_files; + ObjectStoragePtr object_storage; + std::atomic index = 0; + IDataLakeMetadata::FileProgressCallback callback; +}; + +} + +ObjectIterator IDataLakeMetadata::createKeysIterator( + Strings && data_files_, + ObjectStoragePtr object_storage_, + IDataLakeMetadata::FileProgressCallback callback_) const +{ + return std::make_shared(std::move(data_files_), object_storage_, callback_); +} + +} diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index f8e6661d7e56..fe1fa151b9a0 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -20,17 +20,12 @@ class IDataLakeMetadata : boost::noncopyable virtual bool operator==(const IDataLakeMetadata & other) const = 0; - /// List all data files. - /// For better parallelization, iterate() method should be used. - virtual Strings getDataFiles() const = 0; - /// Whether `iterate()` method is supported for the data lake. - virtual bool supportsFileIterator() const { return false; } /// Return iterator to `data files`. 
using FileProgressCallback = std::function; virtual ObjectIterator iterate( const ActionsDAG * /* filter_dag */, FileProgressCallback /* callback */, - size_t /* list_batch_size */) const { throwNotImplemented("iterate()"); } + size_t /* list_batch_size */) const = 0; /// Table schema from data lake metadata. virtual NamesAndTypesList getTableSchema() const = 0; @@ -39,9 +34,6 @@ class IDataLakeMetadata : boost::noncopyable /// Return nothing if read schema is the same as table schema. virtual NamesAndTypesList getReadSchema() const { return {}; } - virtual bool supportsPartitionPruning() { return false; } - virtual Strings makePartitionPruning(const ActionsDAG &) { throwNotImplemented("makePartitionPrunning()"); } - virtual std::shared_ptr getInitialSchemaByPath(const String &) const { return {}; } virtual std::shared_ptr getSchemaTransformer(const String &) const { return {}; } @@ -56,7 +48,13 @@ class IDataLakeMetadata : boost::noncopyable virtual std::optional totalRows() const { return {}; } virtual std::optional totalBytes() const { return {}; } + protected: + ObjectIterator createKeysIterator( + Strings && data_files_, + ObjectStoragePtr object_storage_, + IDataLakeMetadata::FileProgressCallback callback_) const; + [[noreturn]] void throwNotImplemented(std::string_view method) const { throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Method `{}` is not implemented", method); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 9f0e3237cf33..0fa0c24e778b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -50,6 +50,7 @@ namespace Setting { extern const SettingsInt64 iceberg_timestamp_ms; extern const SettingsInt64 iceberg_snapshot_id; +extern const SettingsBool use_iceberg_partition_pruning; } @@ -557,18 +558,23 @@ ManifestListPtr IcebergMetadata::getManifestList(const String & filename) const return manifest_list_ptr; } -Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const +Strings IcebergMetadata::getDataFiles(const ActionsDAG * filter_dag) const { if (!relevant_snapshot) return {}; - if (!filter_dag && cached_unprunned_files_for_last_processed_snapshot.has_value()) + bool use_partition_pruning = filter_dag && getContext()->getSettingsRef()[Setting::use_iceberg_partition_pruning]; + + if (!use_partition_pruning && cached_unprunned_files_for_last_processed_snapshot.has_value()) return cached_unprunned_files_for_last_processed_snapshot.value(); Strings data_files; for (const auto & manifest_file_ptr : *(relevant_snapshot->manifest_list)) { - ManifestFilesPruner pruner(schema_processor, relevant_snapshot_schema_id, filter_dag, *manifest_file_ptr, getContext()); + ManifestFilesPruner pruner( + schema_processor, relevant_snapshot_schema_id, + use_partition_pruning ? 
filter_dag : nullptr, + *manifest_file_ptr, getContext()); const auto & data_files_in_manifest = manifest_file_ptr->getFiles(); for (const auto & manifest_file_entry : data_files_in_manifest) { @@ -583,7 +589,7 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const } } - if (!filter_dag) + if (!use_partition_pruning) { cached_unprunned_files_for_last_processed_snapshot = data_files; return cached_unprunned_files_for_last_processed_snapshot.value(); @@ -592,16 +598,6 @@ Strings IcebergMetadata::getDataFilesImpl(const ActionsDAG * filter_dag) const return data_files; } -Strings IcebergMetadata::makePartitionPruning(const ActionsDAG & filter_dag) -{ - auto configuration_ptr = configuration.lock(); - if (!configuration_ptr) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); - } - return getDataFilesImpl(&filter_dag); -} - std::optional IcebergMetadata::totalRows() const { auto configuration_ptr = configuration.lock(); @@ -664,6 +660,14 @@ std::optional IcebergMetadata::totalBytes() const return result; } +ObjectIterator IcebergMetadata::iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t /* list_batch_size */) const +{ + return createKeysIterator(getDataFiles(filter_dag), object_storage, callback); +} + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 0826b86ca035..744a215bdbdc 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -40,12 +40,6 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext Int32 format_version_, const Poco::JSON::Object::Ptr & metadata_object); - - /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. - /// All subsequent calls when the same data snapshot is relevant will return saved list of files (because it cannot be changed - /// without changing metadata file). Drops on every snapshot update. - Strings getDataFiles() const override { return getDataFilesImpl(nullptr); } - /// Get table schema parsed from metadata. 
NamesAndTypesList getTableSchema() const override { @@ -86,13 +80,15 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext bool update(const ContextPtr & local_context) override; - Strings makePartitionPruning(const ActionsDAG & filter_dag) override; - - bool supportsPartitionPruning() override { return true; } - std::optional totalRows() const override; std::optional totalBytes() const override; +protected: + ObjectIterator iterate( + const ActionsDAG * filter_dag, + FileProgressCallback callback, + size_t list_batch_size) const override; + private: using ManifestEntryByDataFile = std::unordered_map; using ManifestFilesStorage = std::unordered_map; @@ -121,6 +117,8 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext mutable std::optional cached_unprunned_files_for_last_processed_snapshot; + Strings getDataFiles(const ActionsDAG * filter_dag) const; + void updateState(const ContextPtr & local_context); void updateSnapshot(); @@ -140,8 +138,6 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext Poco::JSON::Object::Ptr readJSON(const String & metadata_file_path, const ContextPtr & local_context) const; - Strings getDataFilesImpl(const ActionsDAG * filter_dag) const; - Iceberg::ManifestFilePtr tryGetManifestFile(const String & filename) const; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 9c235b6fbbb8..e1b1fc3d7c11 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -36,7 +36,6 @@ namespace Setting extern const SettingsMaxThreads max_threads; extern const SettingsBool optimize_count_from_files; extern const SettingsBool use_hive_partitioning; - extern const SettingsBool use_iceberg_partition_pruning; } namespace ErrorCodes @@ -264,11 +263,6 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter void applyFilters(ActionDAGNodes added_filter_nodes) override { SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); - if (filter_actions_dag.has_value()) - { - if (getContext()->getSettingsRef()[Setting::use_iceberg_partition_pruning]) - configuration->implementPartitionPruning(*filter_actions_dag); - } createIterator(); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 2156e6b32eb3..a2f232972c8d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -224,8 +224,6 @@ class StorageObjectStorage::Configuration virtual bool isDataLakeConfiguration() const { return false; } - virtual void implementPartitionPruning(const ActionsDAG &) { } - virtual std::optional totalRows() { return {}; } virtual std::optional totalBytes() { return {}; } From d62992318fabee60774d18ea51b3cabd59e35b85 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 9 Apr 2025 14:12:28 +0000 Subject: [PATCH 13/14] Merge pull request #77156 from ClickHouse/hanfei/datalake_meatadata_cache Support Iceberg Metadata Files Cache --- .../table-engines/integrations/iceberg.md | 6 +- docs/en/sql-reference/statements/system.md | 6 +- .../sql-reference/table-functions/iceberg.md | 4 + programs/local/LocalServer.cpp | 17 ++ programs/server/Server.cpp | 17 ++ src/Access/Common/AccessType.h | 1 + src/Common/CurrentMetrics.cpp | 1 + src/Common/ProfileEvents.cpp | 3 + src/Core/Defines.h | 4 + src/Core/ServerSettings.cpp | 4 + src/Core/Settings.cpp | 4 + 
src/Core/SettingsChangesHistory.cpp | 1 + src/Interpreters/Context.cpp | 44 ++++ src/Interpreters/Context.h | 9 + src/Interpreters/InterpreterSystemQuery.cpp | 9 + src/Parsers/ASTSystemQuery.cpp | 1 + src/Parsers/ASTSystemQuery.h | 1 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 203 +++++++++++------- .../DataLakes/Iceberg/IcebergMetadata.h | 23 +- .../Iceberg/IcebergMetadataFilesCache.h | 164 ++++++++++++++ .../DataLakes/Iceberg/ManifestFile.cpp | 13 +- .../DataLakes/Iceberg/ManifestFile.h | 7 +- .../integration/test_storage_iceberg/test.py | 98 +++++++++ .../01271_show_privileges.reference | 1 + 24 files changed, 548 insertions(+), 93 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h diff --git a/docs/en/engines/table-engines/integrations/iceberg.md b/docs/en/engines/table-engines/integrations/iceberg.md index 00c2e4626d1c..e0d65d90b2cf 100644 --- a/docs/en/engines/table-engines/integrations/iceberg.md +++ b/docs/en/engines/table-engines/integrations/iceberg.md @@ -249,6 +249,10 @@ In Clickhouse the behavior is consistent with Spark. You can mentally replace Sp `Iceberg` table engine and table function support data caching same as `S3`, `AzureBlobStorage`, `HDFS` storages. See [here](../../../engines/table-engines/integrations/s3.md#data-cache). -## See also +## Metadata cache {#metadata-cache} + +`Iceberg` table engine and table function support metadata cache storing the information of manifest files, manifest list and metadata json. The cache is stored in memory. This feature is controlled by setting `use_iceberg_metadata_files_cache`, which is enabled by default. + +## See also {#see-also} - [iceberg table function](/docs/sql-reference/table-functions/iceberg.md) diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 73b2922662b9..e73ffd486d67 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -95,7 +95,11 @@ For more convenient (automatic) cache management, see disable_internal_dns_cache Clears the mark cache. -## DROP REPLICA +## DROP ICEBERG METADATA CACHE {#drop-iceberg-metadata-cache} + +Clears the iceberg metadata cache. + +## DROP REPLICA {#drop-replica} Dead replicas of `ReplicatedMergeTree` tables can be dropped using following syntax: diff --git a/docs/en/sql-reference/table-functions/iceberg.md b/docs/en/sql-reference/table-functions/iceberg.md index 4e561feafbc3..cab8cd1d364f 100644 --- a/docs/en/sql-reference/table-functions/iceberg.md +++ b/docs/en/sql-reference/table-functions/iceberg.md @@ -236,6 +236,10 @@ The second one is that while doing time travel you can't get state of table befo In Clickhouse the behavior is consistent with Spark. You can mentally replace Spark Select queries with Clickhouse Select queries and it will work the same way. +## Metadata cache {#metadata-cache} + +`Iceberg` table engine and table function support metadata cache storing the information of manifest files, manifest list and metadata json. The cache is stored in memory. This feature is controlled by setting `use_iceberg_metadata_files_cache`, which is enabled by default. + ## Aliases {#aliases} Table function `iceberg` is an alias to `icebergS3` now. 
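
A minimal sketch of how the cache behavior described in the docs above could be exercised from an integration test, in the style of the existing tests in this patch. The `started_cluster` fixture, the `node1` instance name and the `iceberg_query` string (any valid SELECT over an `icebergS3(...)` source, built the same way as the table-function strings in the tests above) are assumptions; the setting name `use_iceberg_metadata_files_cache`, the `SYSTEM DROP ICEBERG METADATA CACHE` statement (spelled as in the docs heading added here) and the `IcebergMetadataFilesCacheHits`/`IcebergMetadataFilesCacheMisses` profile events come from this patch.

```python
import uuid


def check_iceberg_metadata_cache(started_cluster, iceberg_query):
    # `iceberg_query` is assumed to be a full SELECT over an icebergS3(...) source,
    # analogous to the deltaLake(...) strings used in the tests above.
    node = started_cluster.instances["node1"]

    def profile_event(query_id, name):
        # Read a ProfileEvents counter for a finished query from query_log,
        # the same way the partition-pruning tests above do.
        node.query("SYSTEM FLUSH LOGS")
        return int(
            node.query(
                f"""
                SELECT ProfileEvents['{name}']
                FROM system.query_log
                WHERE query_id = '{query_id}' AND type = 'QueryFinish'
                """
            )
        )

    # Start from a cold cache (statement spelling follows the docs added in this patch).
    node.query("SYSTEM DROP ICEBERG METADATA CACHE")

    cold_id = f"iceberg-cache-cold-{uuid.uuid4()}"
    node.query(iceberg_query, query_id=cold_id)
    assert profile_event(cold_id, "IcebergMetadataFilesCacheMisses") > 0

    # A second run of the same query is expected to be served from the cache.
    warm_id = f"iceberg-cache-warm-{uuid.uuid4()}"
    node.query(iceberg_query, query_id=warm_id)
    assert profile_event(warm_id, "IcebergMetadataFilesCacheHits") > 0

    # The per-query setting bypasses the cache entirely.
    off_id = f"iceberg-cache-off-{uuid.uuid4()}"
    node.query(
        f"{iceberg_query} SETTINGS use_iceberg_metadata_files_cache = 0",
        query_id=off_id,
    )
    assert profile_event(off_id, "IcebergMetadataFilesCacheHits") == 0
```
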
diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 1d77c8a26088..c360ac267cdc 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -99,6 +99,10 @@ namespace ServerSetting extern const ServerSettingsString mark_cache_policy; extern const ServerSettingsUInt64 mark_cache_size; extern const ServerSettingsDouble mark_cache_size_ratio; + extern const ServerSettingsString iceberg_metadata_files_cache_policy; + extern const ServerSettingsUInt64 iceberg_metadata_files_cache_size; + extern const ServerSettingsUInt64 iceberg_metadata_files_cache_max_entries; + extern const ServerSettingsDouble iceberg_metadata_files_cache_size_ratio; extern const ServerSettingsUInt64 max_active_parts_loading_thread_pool_size; extern const ServerSettingsUInt64 max_io_thread_pool_free_size; extern const ServerSettingsUInt64 max_io_thread_pool_size; @@ -827,6 +831,19 @@ void LocalServer::processConfig() } global_context->setMMappedFileCache(mmap_cache_size); +#if USE_AVRO + String iceberg_metadata_files_cache_policy = server_settings[ServerSetting::iceberg_metadata_files_cache_policy]; + size_t iceberg_metadata_files_cache_size = server_settings[ServerSetting::iceberg_metadata_files_cache_size]; + size_t iceberg_metadata_files_cache_max_entries = server_settings[ServerSetting::iceberg_metadata_files_cache_max_entries]; + double iceberg_metadata_files_cache_size_ratio = server_settings[ServerSetting::iceberg_metadata_files_cache_size_ratio]; + if (iceberg_metadata_files_cache_size > max_cache_size) + { + iceberg_metadata_files_cache_size = max_cache_size; + LOG_INFO(log, "Lowered Iceberg metadata cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(iceberg_metadata_files_cache_size)); + } + global_context->setIcebergMetadataFilesCache(iceberg_metadata_files_cache_policy, iceberg_metadata_files_cache_size, iceberg_metadata_files_cache_max_entries, iceberg_metadata_files_cache_size_ratio); +#endif + /// Initialize a dummy query cache. 
global_context->setQueryCache(0, 0, 0, 0); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c41a8d9a3ff6..59a8598b2137 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -236,6 +236,10 @@ namespace ServerSetting extern const ServerSettingsString index_uncompressed_cache_policy; extern const ServerSettingsUInt64 index_uncompressed_cache_size; extern const ServerSettingsDouble index_uncompressed_cache_size_ratio; + extern const ServerSettingsString iceberg_metadata_files_cache_policy; + extern const ServerSettingsUInt64 iceberg_metadata_files_cache_size; + extern const ServerSettingsUInt64 iceberg_metadata_files_cache_max_entries; + extern const ServerSettingsDouble iceberg_metadata_files_cache_size_ratio; extern const ServerSettingsUInt64 io_thread_pool_queue_size; extern const ServerSettingsSeconds keep_alive_timeout; extern const ServerSettingsString mark_cache_policy; @@ -1707,6 +1711,19 @@ try } global_context->setMMappedFileCache(mmap_cache_size); +#if USE_AVRO + String iceberg_metadata_files_cache_policy = server_settings[ServerSetting::iceberg_metadata_files_cache_policy]; + size_t iceberg_metadata_files_cache_size = server_settings[ServerSetting::iceberg_metadata_files_cache_size]; + size_t iceberg_metadata_files_cache_max_entries = server_settings[ServerSetting::iceberg_metadata_files_cache_max_entries]; + double iceberg_metadata_files_cache_size_ratio = server_settings[ServerSetting::iceberg_metadata_files_cache_size_ratio]; + if (iceberg_metadata_files_cache_size > max_cache_size) + { + iceberg_metadata_files_cache_size = max_cache_size; + LOG_INFO(log, "Lowered Iceberg metadata cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(iceberg_metadata_files_cache_size)); + } + global_context->setIcebergMetadataFilesCache(iceberg_metadata_files_cache_policy, iceberg_metadata_files_cache_size, iceberg_metadata_files_cache_max_entries, iceberg_metadata_files_cache_size_ratio); +#endif + size_t query_cache_max_size_in_bytes = config().getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE); size_t query_cache_max_entries = config().getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES); size_t query_cache_query_cache_max_entry_size_in_bytes = config().getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES); diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 2e377689ccfc..d170d56500d3 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -167,6 +167,7 @@ enum class AccessType : uint8_t M(SYSTEM_DROP_CONNECTIONS_CACHE, "SYSTEM DROP CONNECTIONS CACHE, DROP CONNECTIONS CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_PREWARM_MARK_CACHE, "SYSTEM PREWARM MARK, PREWARM MARK CACHE, PREWARM MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_MARK_CACHE, "SYSTEM DROP MARK, DROP MARK CACHE, DROP MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_DROP_ICEBERG_METADATA_CACHE, "SYSTEM DROP ICEBERG_METADATA_CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_PREWARM_PRIMARY_INDEX_CACHE, "SYSTEM PREWARM PRIMARY INDEX, PREWARM PRIMARY INDEX CACHE, PREWARM PRIMARY INDEX", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_PRIMARY_INDEX_CACHE, "SYSTEM DROP PRIMARY INDEX, DROP PRIMARY INDEX CACHE, DROP PRIMARY INDEX", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_UNCOMPRESSED_CACHE, "SYSTEM DROP UNCOMPRESSED, DROP UNCOMPRESSED CACHE, DROP UNCOMPRESSED", GLOBAL, SYSTEM_DROP_CACHE) \ diff --git 
a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 87e6af3a8b0d..07339453c4b9 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -307,6 +307,7 @@ M(FilesystemCacheDelayedCleanupElements, "Filesystem cache elements in background cleanup queue") \ M(FilesystemCacheHoldFileSegments, "Filesystem cache file segment which are currently hold as unreleasable") \ M(AsyncInsertCacheSize, "Number of async insert hash id in cache") \ + M(IcebergMetadataFilesCacheSize, "Size of the iceberg metadata cache in bytes") \ M(SkippingIndexCacheSize, "Size of the skipping index cache in bytes") \ M(S3Requests, "S3 requests count") \ M(KeeperAliveConnections, "Number of alive connections") \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 81d01cc14ec7..ec7fa7fef0a7 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -70,6 +70,9 @@ M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.", ValueType::Number) \ M(PrimaryIndexCacheHits, "Number of times an entry has been found in the primary index cache, so we didn't have to load a index file.", ValueType::Number) \ M(PrimaryIndexCacheMisses, "Number of times an entry has not been found in the primary index cache, so we had to load a index file in memory, which is a costly operation, adding to query latency.", ValueType::Number) \ + M(IcebergMetadataFilesCacheHits, "Number of times iceberg metadata files have been found in the cache.", ValueType::Number) \ + M(IcebergMetadataFilesCacheMisses, "Number of times iceberg metadata files have not been found in the iceberg metadata cache and had to be read from (remote) disk.", ValueType::Number) \ + M(IcebergMetadataFilesCacheWeightLost, "Approximate number of bytes evicted from the iceberg metadata cache.", ValueType::Number) \ M(SkippingIndexCacheHits, "Number of times an index granule has been found in the skipping index cache.", ValueType::Number) \ M(SkippingIndexCacheMisses, "Number of times an index granule has not been found in the skipping index cache and had to be read from disk.", ValueType::Number) \ M(SkippingIndexCacheWeightLost, "Approximate number of bytes evicted from the secondary index cache.", ValueType::Number) \ diff --git a/src/Core/Defines.h b/src/Core/Defines.h index 140463c12126..3755bcdf3160 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -111,6 +111,10 @@ static constexpr auto DEFAULT_SKIPPING_INDEX_CACHE_MAX_ENTRIES = 10'000'000; static constexpr auto DEFAULT_MMAP_CACHE_MAX_SIZE = 1_KiB; /// chosen by rolling dice static constexpr auto DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE = 128_MiB; static constexpr auto DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES = 10'000; +static constexpr auto DEFAULT_ICEBERG_METADATA_CACHE_POLICY = "SLRU"; +static constexpr auto DEFAULT_ICEBERG_METADATA_CACHE_MAX_SIZE = 1_GiB; +static constexpr auto DEFAULT_ICEBERG_METADATA_CACHE_SIZE_RATIO = 0.5; +static constexpr auto DEFAULT_ICEBERG_METADATA_CACHE_MAX_ENTRIES = 1000; static constexpr auto DEFAULT_QUERY_CACHE_MAX_SIZE = 1_GiB; static constexpr auto DEFAULT_QUERY_CACHE_MAX_ENTRIES = 1024uz; static constexpr auto DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES = 1_MiB; diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 2b09caed76cb..32de4c70d8e9 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -459,6 +459,10 @@ namespace DB 
DECLARE(UInt64, primary_index_cache_size, DEFAULT_PRIMARY_INDEX_CACHE_MAX_SIZE, R"(Maximum size of cache for primary index (index of MergeTree family of tables).)", 0) \ DECLARE(Double, primary_index_cache_size_ratio, DEFAULT_PRIMARY_INDEX_CACHE_SIZE_RATIO, R"(The size of the protected queue (in case of SLRU policy) in the primary index cache relative to the cache's total size.)", 0) \ DECLARE(Double, primary_index_cache_prewarm_ratio, 0.95, R"(The ratio of total size of mark cache to fill during prewarm.)", 0) \ + DECLARE(String, iceberg_metadata_files_cache_policy, DEFAULT_ICEBERG_METADATA_CACHE_POLICY, "Iceberg metadata cache policy name.", 0) \ + DECLARE(UInt64, iceberg_metadata_files_cache_size, DEFAULT_ICEBERG_METADATA_CACHE_MAX_SIZE, "Maximum size of iceberg metadata cache in bytes. Zero means disabled.", 0) \ + DECLARE(UInt64, iceberg_metadata_files_cache_max_entries, DEFAULT_ICEBERG_METADATA_CACHE_MAX_ENTRIES, "Maximum size of iceberg metadata files cache in entries. Zero means disabled.", 0) \ + DECLARE(Double, iceberg_metadata_files_cache_size_ratio, DEFAULT_ICEBERG_METADATA_CACHE_SIZE_RATIO, "The size of the protected queue (in case of SLRU policy) in the iceberg metadata cache relative to the cache's total size.", 0) \ DECLARE(String, skipping_index_cache_policy, DEFAULT_SKIPPING_INDEX_CACHE_POLICY, "Skipping index cache policy name.", 0) \ DECLARE(UInt64, skipping_index_cache_size, DEFAULT_SKIPPING_INDEX_CACHE_MAX_SIZE, "Size of cache for secondary index in bytes. Zero means disabled.", 0) \ DECLARE(UInt64, skipping_index_cache_max_entries, DEFAULT_SKIPPING_INDEX_CACHE_MAX_ENTRIES, "Size of cache for secondary index in entries. Zero means disabled.", 0) \ diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 7982534482be..404442216317 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -4256,6 +4256,10 @@ The maximum size of serialized literal in bytes to replace in `UPDATE` and `DELE \ DECLARE(Float, create_replicated_merge_tree_fault_injection_probability, 0.0f, R"( The probability of a fault injection during table creation after creating metadata in ZooKeeper +)", 0) \ + \ + DECLARE(Bool, use_iceberg_metadata_files_cache, true, R"( +If turned on, iceberg table function and iceberg storage may utilize the iceberg metadata files cache. 
)", 0) \ \ DECLARE(Bool, use_query_cache, false, R"( diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index e3daf1c01107..40d50b264a1d 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -78,6 +78,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"allow_experimental_database_unity_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'unity'"}, {"allow_experimental_database_glue_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'glue'"}, {"use_page_cache_with_distributed_cache", false, false, "New setting"}, + {"use_iceberg_metadata_files_cache", true, true, "New setting"}, {"use_query_condition_cache", false, false, "New setting."}, {"iceberg_timestamp_ms", 0, 0, "New setting."}, {"iceberg_snapshot_id", 0, 0, "New setting."}, diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index db5731edf574..cfddc940d182 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -443,6 +444,9 @@ struct ContextSharedPart : boost::noncopyable mutable QueryCachePtr query_cache TSA_GUARDED_BY(mutex); /// Cache of query results. mutable MarkCachePtr index_mark_cache TSA_GUARDED_BY(mutex); /// Cache of marks in compressed files of MergeTree indices. mutable MMappedFileCachePtr mmap_cache TSA_GUARDED_BY(mutex); /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads. +#if USE_AVRO + mutable IcebergMetadataFilesCachePtr iceberg_metadata_files_cache TSA_GUARDED_BY(mutex); /// Cache of deserialized iceberg metadata files. +#endif AsynchronousMetrics * asynchronous_metrics TSA_GUARDED_BY(mutex) = nullptr; /// Points to asynchronous metrics mutable PageCachePtr page_cache TSA_GUARDED_BY(mutex); /// Userspace page cache. ProcessList process_list; /// Executing queries at the moment. 
@@ -3598,6 +3602,46 @@ void Context::clearMMappedFileCache() const cache->clear(); } +#if USE_AVRO +void Context::setIcebergMetadataFilesCache(const String & cache_policy, size_t max_size_in_bytes, size_t max_entries, double size_ratio) +{ + std::lock_guard lock(shared->mutex); + + if (shared->iceberg_metadata_files_cache) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Iceberg metadata cache has been already created."); + + shared->iceberg_metadata_files_cache = std::make_shared(cache_policy, max_size_in_bytes, max_entries, size_ratio); +} + +void Context::updateIcebergMetadataFilesCacheConfiguration(const Poco::Util::AbstractConfiguration & config) +{ + std::lock_guard lock(shared->mutex); + + if (!shared->iceberg_metadata_files_cache) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Iceberg metadata cache was not created yet."); + + size_t max_size_in_bytes = config.getUInt64("iceberg_metadata_files_cache_size", DEFAULT_ICEBERG_METADATA_CACHE_MAX_SIZE); + size_t max_entries = config.getUInt64("iceberg_metadata_files_cache_max_entries", DEFAULT_ICEBERG_METADATA_CACHE_MAX_ENTRIES); + shared->iceberg_metadata_files_cache->setMaxSizeInBytes(max_size_in_bytes); + shared->iceberg_metadata_files_cache->setMaxCount(max_entries); +} + +std::shared_ptr Context::getIcebergMetadataFilesCache() const +{ + std::lock_guard lock(shared->mutex); + return shared->iceberg_metadata_files_cache; +} + +void Context::clearIcebergMetadataFilesCache() const +{ + auto cache = getIcebergMetadataFilesCache(); + + /// Clear the cache without holding context mutex to avoid blocking context for a long time + if (cache) + cache->clear(); +} +#endif + void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_rows) { std::lock_guard lock(shared->mutex); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 603f8a85d34c..0e2f6851d9c1 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -97,6 +97,7 @@ class PrimaryIndexCache; class PageCache; class MMappedFileCache; class UncompressedCache; +class IcebergMetadataFilesCache; class SkippingIndexCache; class ProcessList; class QueryStatus; @@ -1127,6 +1128,14 @@ class Context: public ContextData, public std::enable_shared_from_this std::shared_ptr getQueryCache() const; void clearQueryCache(const std::optional & tag) const; +#if USE_AVRO + void setIcebergMetadataFilesCache(const String & cache_policy, size_t max_size_in_bytes, size_t max_entries, double size_ratio); + void updateIcebergMetadataFilesCacheConfiguration(const Poco::Util::AbstractConfiguration & config); + std::shared_ptr getIcebergMetadataFilesCache() const; + void clearIcebergMetadataFilesCache() const; +#endif + + /** Clear the caches of the uncompressed blocks and marks. * This is usually done when renaming tables, changing the type of columns, deleting a table. * - since caches are linked to file names, and become incorrect. 
diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 955cf59d6e8b..9a3b1e7d6ff8 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -388,6 +388,14 @@ BlockIO InterpreterSystemQuery::execute() getContext()->checkAccess(AccessType::SYSTEM_DROP_MARK_CACHE); system_context->clearMarkCache(); break; + case Type::DROP_ICEBERG_METADATA_CACHE: +#if USE_AVRO + getContext()->checkAccess(AccessType::SYSTEM_DROP_ICEBERG_METADATA_CACHE); + system_context->clearIcebergMetadataFilesCache(); + break; +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "The server was compiled without the support for AVRO"); +#endif case Type::DROP_PRIMARY_INDEX_CACHE: getContext()->checkAccess(AccessType::SYSTEM_DROP_PRIMARY_INDEX_CACHE); system_context->clearPrimaryIndexCache(); @@ -1446,6 +1454,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() case Type::DROP_DNS_CACHE: case Type::DROP_CONNECTIONS_CACHE: case Type::DROP_MARK_CACHE: + case Type::DROP_ICEBERG_METADATA_CACHE: case Type::DROP_PRIMARY_INDEX_CACHE: case Type::DROP_MMAP_CACHE: case Type::DROP_QUERY_CACHE: diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 9335766e13b6..811321975aa4 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -432,6 +432,7 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::DROP_COMPILED_EXPRESSION_CACHE: case Type::DROP_S3_CLIENT_CACHE: case Type::DROP_PARQUET_METADATA_CACHE: + case Type::DROP_ICEBERG_METADATA_CACHE: case Type::RESET_COVERAGE: case Type::RESTART_REPLICAS: case Type::JEMALLOC_PURGE: diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 521bb703f467..4a906ea457af 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -34,6 +34,7 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster DROP_MMAP_CACHE, DROP_QUERY_CACHE, DROP_COMPILED_EXPRESSION_CACHE, + DROP_ICEBERG_METADATA_CACHE, DROP_FILESYSTEM_CACHE, DROP_DISK_METADATA_CACHE, DROP_PAGE_CACHE, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 0fa0c24e778b..be7093338fb1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -50,6 +50,7 @@ namespace Setting { extern const SettingsInt64 iceberg_timestamp_ms; extern const SettingsInt64 iceberg_snapshot_id; +extern const SettingsBool use_iceberg_metadata_files_cache; extern const SettingsBool use_iceberg_partition_pruning; } @@ -92,19 +93,21 @@ IcebergMetadata::IcebergMetadata( const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const Poco::JSON::Object::Ptr & metadata_object_) + const Poco::JSON::Object::Ptr & metadata_object_, + IcebergMetadataFilesCachePtr cache_ptr) : WithContext(context_) , object_storage(std::move(object_storage_)) , configuration(std::move(configuration_)) , schema_processor(IcebergSchemaProcessor()) , log(getLogger("IcebergMetadata")) + , manifest_cache(cache_ptr) , last_metadata_version(metadata_version_) , last_metadata_object(metadata_object_) , format_version(format_version_) , relevant_snapshot_schema_id(-1) , table_location(last_metadata_object->getValue(TABLE_LOCATION_FIELD)) { - updateState(context_); + updateState(context_, true); } std::pair 
parseTableSchemaV2Method(const Poco::JSON::Object::Ptr & metadata_object) @@ -290,15 +293,24 @@ static std::pair getLatestOrExplicitMetadataFileAndVersion(const Poco::JSON::Object::Ptr IcebergMetadata::readJSON(const String & metadata_file_path, const ContextPtr & local_context) const { - ObjectInfo object_info(metadata_file_path); - auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); + auto configuration_ptr = configuration.lock(); + auto create_fn = [&]() + { + ObjectInfo object_info(metadata_file_path); + auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); - String json_str; - readJSONObjectPossiblyInvalid(json_str, *buf); + String json_str; + readJSONObjectPossiblyInvalid(json_str, *buf); - Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file - Poco::Dynamic::Var json = parser.parse(json_str); - return json.extract(); + Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file + Poco::Dynamic::Var json = parser.parse(json_str); + return std::make_pair(json.extract(), json.size()); + }; + if (manifest_cache) + { + return manifest_cache->getOrSetTableMetadata(IcebergMetadataFilesCache::getKey(configuration_ptr, metadata_file_path), create_fn); + } + return create_fn().first; } bool IcebergMetadata::update(const ContextPtr & local_context) @@ -307,16 +319,20 @@ bool IcebergMetadata::update(const ContextPtr & local_context) const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion(object_storage, *configuration_ptr); - last_metadata_version = metadata_version; - - last_metadata_object = readJSON(metadata_file_path, local_context); + bool metadata_file_changed = false; + if (last_metadata_version != metadata_version) + { + last_metadata_version = metadata_version; + last_metadata_object = readJSON(metadata_file_path, local_context); + metadata_file_changed = true; + } chassert(format_version == last_metadata_object->getValue(FORMAT_VERSION_FIELD)); auto previous_snapshot_id = relevant_snapshot_id; auto previous_snapshot_schema_id = relevant_snapshot_schema_id; - updateState(local_context); + updateState(local_context, metadata_file_changed); if (previous_snapshot_id != relevant_snapshot_id) { @@ -383,7 +399,7 @@ void IcebergMetadata::updateSnapshot() configuration_ptr->getPath()); } -void IcebergMetadata::updateState(const ContextPtr & local_context) +void IcebergMetadata::updateState(const ContextPtr & local_context, bool metadata_file_changed) { auto configuration_ptr = configuration.lock(); std::optional manifest_list_file; @@ -424,7 +440,7 @@ void IcebergMetadata::updateState(const ContextPtr & local_context) relevant_snapshot_id = local_context->getSettingsRef()[Setting::iceberg_snapshot_id]; updateSnapshot(); } - else + else if (metadata_file_changed) { if (!last_metadata_object->has(CURRENT_SNAPSHOT_ID_FIELD_IN_METADATA_FILE)) relevant_snapshot_id = -1; @@ -464,98 +480,133 @@ DataLakeMetadataPtr IcebergMetadata::create( auto log = getLogger("IcebergMetadata"); - ObjectInfo object_info(metadata_file_path); - auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); + Poco::JSON::Object::Ptr object = nullptr; + IcebergMetadataFilesCachePtr cache_ptr = nullptr; + if (local_context->getSettingsRef()[Setting::use_iceberg_metadata_files_cache]) + cache_ptr = local_context->getIcebergMetadataFilesCache(); + else + LOG_TRACE(log, 
"Not using in-memory cache for iceberg metadata files, because the setting use_iceberg_metadata_files_cache is false."); + + auto create_fn = [&]() + { + ObjectInfo object_info(metadata_file_path); // NOLINT + auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); + + String json_str; + readJSONObjectPossiblyInvalid(json_str, *buf); - String json_str; - readJSONObjectPossiblyInvalid(json_str, *buf); + Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file + Poco::Dynamic::Var json = parser.parse(json_str); + return std::make_pair(json.extract(), json_str.size()); + }; - Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file - Poco::Dynamic::Var json = parser.parse(json_str); - const Poco::JSON::Object::Ptr & object = json.extract(); + if (cache_ptr) + object = cache_ptr->getOrSetTableMetadata(IcebergMetadataFilesCache::getKey(configuration_ptr, metadata_file_path), create_fn); + else + object = create_fn().first; IcebergSchemaProcessor schema_processor; auto format_version = object->getValue(FORMAT_VERSION_FIELD); auto ptr - = std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object); + = std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object, cache_ptr); return ptr; } -ManifestList IcebergMetadata::initializeManifestList(const String & filename) const +void IcebergMetadata::initializeDataFiles(ManifestListPtr manifest_list_ptr) const { - auto configuration_ptr = configuration.lock(); - if (configuration_ptr == nullptr) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); - - StorageObjectStorage::ObjectInfo object_info(filename); - auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, getContext(), log); - AvroForIcebergDeserializer manifest_list_deserializer(std::move(manifest_list_buf), filename, getFormatSettings(getContext())); - - ManifestList manifest_list; - - for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) + for (const auto & manifest_file_content : *manifest_list_ptr) { - const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, MANIFEST_FILE_PATH_COLUMN, TypeIndex::String).safeGet(); - const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location); - Int64 added_sequence_number = 0; - if (format_version > 1) - added_sequence_number = manifest_list_deserializer.getValueFromRowByName(i, SEQUENCE_NUMBER_COLUMN, TypeIndex::Int64).safeGet(); - - /// We can't encapsulate this logic in getManifestFile because we need not only the name of the file, but also an inherited sequence number which is known only during the parsing of ManifestList - auto manifest_file_content = initializeManifestFile(manifest_file_name, added_sequence_number); - manifest_files_by_name.emplace(manifest_file_name, manifest_file_content); for (const auto & data_file_path : manifest_file_content->getFiles()) { if (std::holds_alternative(data_file_path.file)) manifest_file_by_data_file.emplace(std::get(data_file_path.file).file_name, manifest_file_content); } - manifest_list.push_back(manifest_file_content); } - - return manifest_list; } -ManifestFilePtr IcebergMetadata::initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const +ManifestListPtr IcebergMetadata::getManifestList(const String & 
filename) const { auto configuration_ptr = configuration.lock(); + if (configuration_ptr == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); - ObjectInfo manifest_object_info(filename); - auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, getContext(), log); - AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), filename, getFormatSettings(getContext())); - auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(manifest_file_deserializer, filename); - schema_processor.addIcebergTableSchema(schema_object); - return std::make_shared( - manifest_file_deserializer, - format_version, - configuration_ptr->getPath(), - schema_id, - schema_processor, - inherited_sequence_number, - table_location, - getContext()); + auto create_fn = [&]() + { + ManifestList manifest_list; + StorageObjectStorage::ObjectInfo object_info(filename); + auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, getContext(), log); + AvroForIcebergDeserializer manifest_list_deserializer(std::move(manifest_list_buf), filename, getFormatSettings(getContext())); -} + ManifestFileCacheKeys manifest_file_cache_keys; -ManifestFilePtr IcebergMetadata::tryGetManifestFile(const String & filename) const -{ - auto manifest_file_it = manifest_files_by_name.find(filename); - if (manifest_file_it != manifest_files_by_name.end()) - return manifest_file_it->second; - return nullptr; + for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) + { + const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, MANIFEST_FILE_PATH_COLUMN, TypeIndex::String).safeGet(); + const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location); + Int64 added_sequence_number = 0; + if (format_version > 1) + added_sequence_number = manifest_list_deserializer.getValueFromRowByName(i, SEQUENCE_NUMBER_COLUMN, TypeIndex::Int64).safeGet(); + manifest_file_cache_keys.emplace_back(manifest_file_name, added_sequence_number); + } + /// We only return the list of {file name, seq number} for cache. + /// Because ManifestList holds a list of ManifestFilePtr which consume much memory space. + /// ManifestFilePtr is shared pointers can be held for too much time, so we cache ManifestFile separately. 
+ return manifest_file_cache_keys; + }; + + ManifestFileCacheKeys manifest_file_cache_keys; + ManifestList manifest_list; + if (manifest_cache) + { + manifest_file_cache_keys = manifest_cache->getOrSetManifestFileCacheKeys(IcebergMetadataFilesCache::getKey(configuration_ptr, filename), create_fn); + } + else + { + manifest_file_cache_keys = create_fn(); + } + for (const auto & entry : manifest_file_cache_keys) + { + auto manifest_file_ptr = getManifestFile(entry.manifest_file_path, entry.added_sequence_number); + manifest_list.push_back(manifest_file_ptr); + } + ManifestListPtr manifest_list_ptr = std::make_shared(std::move(manifest_list)); + initializeDataFiles(manifest_list_ptr); + return manifest_list_ptr; } -ManifestListPtr IcebergMetadata::getManifestList(const String & filename) const +ManifestFilePtr IcebergMetadata::getManifestFile(const String & filename, Int64 inherited_sequence_number) const { - auto manifest_file_it = manifest_lists_by_name.find(filename); - if (manifest_file_it != manifest_lists_by_name.end()) - return manifest_file_it->second; auto configuration_ptr = configuration.lock(); - auto manifest_list_ptr = std::make_shared(initializeManifestList(filename)); - manifest_lists_by_name.emplace(filename, manifest_list_ptr); - return manifest_list_ptr; + + auto create_fn = [&]() + { + ObjectInfo manifest_object_info(filename); + auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, getContext(), log); + AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), filename, getFormatSettings(getContext())); + auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(manifest_file_deserializer, filename); + schema_processor.addIcebergTableSchema(schema_object); + return std::make_shared( + manifest_file_deserializer, + format_version, + configuration_ptr->getPath(), + schema_id, + schema_object, + schema_processor, + inherited_sequence_number, + table_location, + getContext()); + }; + + if (manifest_cache) + { + auto manifest_file = manifest_cache->getOrSetManifestFile(IcebergMetadataFilesCache::getKey(configuration_ptr, filename), create_fn); + schema_processor.addIcebergTableSchema(manifest_file->getSchemaObject()); + return manifest_file; + } + return create_fn(); } Strings IcebergMetadata::getDataFiles(const ActionsDAG * filter_dag) const diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 744a215bdbdc..3f4005d153c4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,8 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const Poco::JSON::Object::Ptr & metadata_object); + const Poco::JSON::Object::Ptr & metadata_object, + IcebergMetadataFilesCachePtr cache_ptr); /// Get table schema parsed from metadata. 
NamesAndTypesList getTableSchema() const override @@ -91,16 +93,13 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext private: using ManifestEntryByDataFile = std::unordered_map; - using ManifestFilesStorage = std::unordered_map; - using ManifestListsStorage = std::unordered_map; const ObjectStoragePtr object_storage; const ConfigurationObserverPtr configuration; mutable IcebergSchemaProcessor schema_processor; LoggerPtr log; - mutable ManifestFilesStorage manifest_files_by_name; - mutable ManifestListsStorage manifest_lists_by_name; + IcebergMetadataFilesCachePtr manifest_cache; mutable ManifestEntryByDataFile manifest_file_by_data_file; std::tuple getVersion() const { return std::make_tuple(relevant_snapshot_id, relevant_snapshot_schema_id); } @@ -117,28 +116,26 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext mutable std::optional cached_unprunned_files_for_last_processed_snapshot; - Strings getDataFiles(const ActionsDAG * filter_dag) const; + void updateState(const ContextPtr & local_context, bool metadata_file_changed); - void updateState(const ContextPtr & local_context); + Strings getDataFiles(const ActionsDAG * filter_dag) const; void updateSnapshot(); - Iceberg::ManifestList initializeManifestList(const String & filename) const; + Iceberg::ManifestListPtr getManifestList(const String & filename) const; mutable std::vector positional_delete_files_for_current_query; void addTableSchemaById(Int32 schema_id); - Iceberg::ManifestListPtr getManifestList(const String & filename) const; - std::optional getSchemaVersionByFileIfOutdated(String data_path) const; - Iceberg::ManifestFilePtr initializeManifestFile(const String & filename, Int64 inherited_sequence_number) const; + void initializeDataFiles(Iceberg::ManifestListPtr manifest_list_ptr) const; + + Iceberg::ManifestFilePtr getManifestFile(const String & filename, Int64 inherited_sequence_number) const; std::optional getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata); Poco::JSON::Object::Ptr readJSON(const String & metadata_file_path, const ContextPtr & local_context) const; - - Iceberg::ManifestFilePtr tryGetManifestFile(const String & filename) const; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h new file mode 100644 index 000000000000..48390f9946c5 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h @@ -0,0 +1,164 @@ +#pragma once +#include "config.h" + +#if USE_AVRO + +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event IcebergMetadataFilesCacheMisses; + extern const Event IcebergMetadataFilesCacheHits; + extern const Event IcebergMetadataFilesCacheWeightLost; +} + +namespace CurrentMetrics +{ + extern const Metric IcebergMetadataFilesCacheSize; +} + +namespace DB +{ + +/// The structure that can identify a manifest file. We store it in cache. +/// And we can get `ManifestFileContent` from cache by ManifestFileEntry. +struct ManifestFileCacheKey +{ + String manifest_file_path; + Int64 added_sequence_number; +}; + +using ManifestFileCacheKeys = std::vector; + +/// We have three kinds of metadata files in iceberg: metadata object, manifest list and manifest files. +/// For simplicity, we keep them in the same cache. 
+struct IcebergMetadataFilesCacheCell : private boost::noncopyable +{ + /// The cached element could be + /// - metadata.json deserialized as Poco::JSON::Object::Ptr + /// - manifest list consists of cache keys which will retrieve the manifest file from cache + /// - manifest file + std::variant cached_element; + Int64 memory_bytes; + + explicit IcebergMetadataFilesCacheCell(Poco::JSON::Object::Ptr metadata_object_, size_t memory_bytes_) + : cached_element(metadata_object_) + , memory_bytes(memory_bytes_ + SIZE_IN_MEMORY_OVERHEAD) + , metric_increment{CurrentMetrics::IcebergMetadataFilesCacheSize, memory_bytes} + { + } + explicit IcebergMetadataFilesCacheCell(ManifestFileCacheKeys && manifest_file_cache_keys_) + : cached_element(std::move(manifest_file_cache_keys_)) + , memory_bytes(getMemorySizeOfManifestCacheKeys(std::get(cached_element)) + SIZE_IN_MEMORY_OVERHEAD) + , metric_increment{CurrentMetrics::IcebergMetadataFilesCacheSize, memory_bytes} + { + } + explicit IcebergMetadataFilesCacheCell(Iceberg::ManifestFilePtr manifest_file_) + : cached_element(manifest_file_) + , memory_bytes(std::get(cached_element)->getSizeInMemory() + SIZE_IN_MEMORY_OVERHEAD) + , metric_increment{CurrentMetrics::IcebergMetadataFilesCacheSize, memory_bytes} + { + } +private: + CurrentMetrics::Increment metric_increment; + static constexpr size_t SIZE_IN_MEMORY_OVERHEAD = 200; /// we always underestimate the size of an object; + + static size_t getMemorySizeOfManifestCacheKeys(const ManifestFileCacheKeys & manifest_file_cache_keys) + { + size_t total_size = 0; + for (const auto & entry: manifest_file_cache_keys) + { + total_size += sizeof(ManifestFileCacheKey) + entry.manifest_file_path.capacity(); + } + return total_size; + } + +}; + +struct IcebergMetadataFilesCacheWeightFunction +{ + size_t operator()(const IcebergMetadataFilesCacheCell & cell) const + { + return cell.memory_bytes; + } +}; + +class IcebergMetadataFilesCache : public CacheBase, IcebergMetadataFilesCacheWeightFunction> +{ +public: + using Base = CacheBase, IcebergMetadataFilesCacheWeightFunction>; + + IcebergMetadataFilesCache(const String & cache_policy, size_t max_size_in_bytes, size_t max_count, double size_ratio) + : Base(cache_policy, max_size_in_bytes, max_count, size_ratio) + {} + + static String getKey(StorageObjectStorage::ConfigurationPtr config, const String & data_path) + { + return std::filesystem::path(config->getDataSourceDescription()) / data_path; + } + + template + Poco::JSON::Object::Ptr getOrSetTableMetadata(const String & data_path, LoadFunc && load_fn) + { + auto load_fn_wrapper = [&]() + { + const auto & [json_ptr, json_size] = load_fn(); + return std::make_shared(json_ptr, json_size); + }; + auto result = Base::getOrSet(data_path, load_fn_wrapper); + if (result.second) + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheMisses); + else + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheHits); + return std::get(result.first->cached_element); + } + + template + ManifestFileCacheKeys getOrSetManifestFileCacheKeys(const String & data_path, LoadFunc && load_fn) + { + auto load_fn_wrapper = [&]() + { + auto && manifest_file_cache_keys = load_fn(); + return std::make_shared(std::move(manifest_file_cache_keys)); + }; + auto result = Base::getOrSet(data_path, load_fn_wrapper); + if (result.second) + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheMisses); + else + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheHits); + return 
std::get(result.first->cached_element); + } + + template + Iceberg::ManifestFilePtr getOrSetManifestFile(const String & data_path, LoadFunc && load_fn) + { + auto load_fn_wrapper = [&]() + { + Iceberg::ManifestFilePtr manifest_file = load_fn(); + return std::make_shared(manifest_file); + }; + auto result = Base::getOrSet(data_path, load_fn_wrapper); + if (result.second) + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheMisses); + else + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheHits); + return std::get(result.first->cached_element); + } + +private: + void onRemoveOverflowWeightLoss(size_t weight_loss) override + { + ProfileEvents::increment(ProfileEvents::IcebergMetadataFilesCacheWeightLost, weight_loss); + } +}; + +using IcebergMetadataFilesCachePtr = std::shared_ptr; + +} +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 650a5c4af033..4d9a4c817597 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -137,12 +137,14 @@ ManifestFileContent::ManifestFileContent( Int32 format_version_, const String & common_path, Int32 schema_id_, + Poco::JSON::Object::Ptr schema_object_, const IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, const String & table_location, DB::ContextPtr context) { this->schema_id = schema_id_; + this->schema_object = schema_object_; for (const auto & column_name : {COLUMN_STATUS_NAME, COLUMN_TUPLE_DATA_FILE_NAME}) { @@ -331,6 +333,16 @@ const std::set & ManifestFileContent::getColumnsIDsWithBounds() const return column_ids_which_have_bounds; } +size_t ManifestFileContent::getSizeInMemory() const +{ + size_t total_size = sizeof(ManifestFileContent); + if (partition_key_description) + total_size += sizeof(DB::KeyDescription); + total_size += column_ids_which_have_bounds.size() * sizeof(Int32); + total_size += files.capacity() * sizeof(ManifestFileEntry); + return total_size; +} + std::optional ManifestFileContent::getRowsCountInAllDataFilesExcludingDeleted() const { Int64 result = 0; @@ -376,7 +388,6 @@ std::optional ManifestFileContent::getBytesCountInAllDataFiles() const return std::nullopt; } return result; - } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 18c9ab2527c8..4ccfdb8eb0ed 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -90,6 +90,7 @@ class ManifestFileContent Int32 format_version_, const String & common_path, Int32 schema_id_, + Poco::JSON::Object::Ptr schema_object_, const DB::IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, const std::string & table_location, @@ -100,6 +101,10 @@ class ManifestFileContent bool hasPartitionKey() const; const DB::KeyDescription & getPartitionKeyDescription() const; + Poco::JSON::Object::Ptr getSchemaObject() const { return schema_object; } + /// Get size in bytes of how much memory one instance of this ManifestFileContent class takes. + /// Used for in-memory caches size accounting. + size_t getSizeInMemory() const; /// Fields with rows count in manifest files are optional /// they can be absent. 
@@ -111,7 +116,7 @@ class ManifestFileContent private: Int32 schema_id; - + Poco::JSON::Object::Ptr schema_object; std::optional partition_key_description; // Size - number of files std::vector files; diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 01f6e650f36f..335addfd7149 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -2630,6 +2630,104 @@ def test_iceberg_snapshot_reads(started_cluster, format_version, storage_type): == instance.query("SELECT number, toString(number + 1) FROM numbers(300)") ) + +@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +def test_metadata_cache(started_cluster, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = "test_metadata_cache_" + storage_type + "_" + get_uuid_str() + + write_iceberg_from_df( + spark, + generate_data(spark, 0, 10), + TABLE_NAME, + mode="overwrite", + format_version="1", + partition_by="a", + ) + + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + table_expr = get_creation_expression(storage_type, TABLE_NAME, started_cluster, table_function=True) + + query_id = f"{TABLE_NAME}-{uuid.uuid4()}" + instance.query( + f"SELECT * FROM {table_expr}", query_id=query_id, + ) + + instance.query("SYSTEM FLUSH LOGS") + + assert 0 < int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheMisses'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + assert 0 == int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheHits'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + query_id = f"{TABLE_NAME}-{uuid.uuid4()}" + instance.query( + f"SELECT * FROM {table_expr}", + query_id=query_id, + ) + + instance.query("SYSTEM FLUSH LOGS") + + assert 0 == int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheMisses'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + assert 0 < int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheHits'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + instance.query("SYSTEM DROP ICEBERG METADATA CACHE") + + query_id = f"{TABLE_NAME}-{uuid.uuid4()}" + instance.query( + f"SELECT * FROM {table_expr}", query_id=query_id, + ) + + instance.query("SYSTEM FLUSH LOGS") + + assert 0 < int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheMisses'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + assert 0 == int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheHits'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + query_id = f"{TABLE_NAME}-{uuid.uuid4()}" + instance.query( + f"SELECT * FROM {table_expr}", + query_id=query_id, + settings={"use_iceberg_metadata_files_cache":"0"}, + ) + + instance.query("SYSTEM FLUSH LOGS") + assert "0\t0\n" == instance.query( + f"SELECT ProfileEvents['IcebergMetadataFilesCacheHits'], ProfileEvents['IcebergMetadataFilesCacheMisses'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'", + ) + + @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_minmax_pruning(started_cluster, storage_type): instance = 
started_cluster.instances["node1"] diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index ba91e23e4ce7..ef5070c81212 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -116,6 +116,7 @@ SYSTEM DROP DNS CACHE ['SYSTEM DROP DNS','DROP DNS CACHE','DROP DNS'] GLOBAL SYS SYSTEM DROP CONNECTIONS CACHE ['SYSTEM DROP CONNECTIONS CACHE','DROP CONNECTIONS CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM PREWARM MARK CACHE ['SYSTEM PREWARM MARK','PREWARM MARK CACHE','PREWARM MARKS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP MARK CACHE ['SYSTEM DROP MARK','DROP MARK CACHE','DROP MARKS'] GLOBAL SYSTEM DROP CACHE +SYSTEM DROP ICEBERG METADATA CACHE ['SYSTEM DROP ICEBERG_METADATA_CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM PREWARM PRIMARY INDEX CACHE ['SYSTEM PREWARM PRIMARY INDEX','PREWARM PRIMARY INDEX CACHE','PREWARM PRIMARY INDEX'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP PRIMARY INDEX CACHE ['SYSTEM DROP PRIMARY INDEX','DROP PRIMARY INDEX CACHE','DROP PRIMARY INDEX'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP UNCOMPRESSED CACHE ['SYSTEM DROP UNCOMPRESSED','DROP UNCOMPRESSED CACHE','DROP UNCOMPRESSED'] GLOBAL SYSTEM DROP CACHE From 5c6d646f2f2a93766cf7b0299680a22ec71ec020 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Fri, 11 Apr 2025 17:05:06 +0200 Subject: [PATCH 14/14] Fix after cherry-picks --- src/Core/SettingsChangesHistory.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 40d50b264a1d..a8c525f37851 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -69,21 +69,14 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // Altinity Antalya modifications atop of 25.2 {"object_storage_cluster", "", "", "New setting"}, {"object_storage_max_nodes", 0, 0, "New setting"}, + {"use_iceberg_metadata_files_cache", true, true, "New setting"}, + {"iceberg_timestamp_ms", 0, 0, "New setting."}, + {"iceberg_snapshot_id", 0, 0, "New setting."}, }); addSettingsChanges(settings_changes_history, "24.12.2.20000", { // Altinity Antalya modifications atop of 24.12 {"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586 - /// Release closed. Please use 25.4 - {"allow_experimental_database_unity_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'unity'"}, - {"allow_experimental_database_glue_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'glue'"}, - {"use_page_cache_with_distributed_cache", false, false, "New setting"}, - {"use_iceberg_metadata_files_cache", true, true, "New setting"}, - {"use_query_condition_cache", false, false, "New setting."}, - {"iceberg_timestamp_ms", 0, 0, "New setting."}, - {"iceberg_snapshot_id", 0, 0, "New setting."}, - {"parallel_replicas_for_cluster_engines", false, true, "New setting."}, - /// Release closed. Please use 25.4 }); addSettingsChanges(settings_changes_history, "25.2", {