From d63fd14f38df3fbaf8c3118110d6befc82767fe8 Mon Sep 17 00:00:00 2001 From: shiyer7474 Date: Fri, 29 Nov 2024 14:55:45 +0000 Subject: [PATCH 01/10] Parquet File Metadata caching implementation --- src/Common/ProfileEvents.cpp | 2 + src/Core/Settings.h | 2 + src/Processors/Formats/IInputFormat.h | 3 + .../Formats/Impl/ParquetBlockInputFormat.cpp | 56 ++++++++++++++++++- .../Formats/Impl/ParquetBlockInputFormat.h | 19 +++++++ .../StorageObjectStorageSource.cpp | 3 + .../03262_parquet_s3_metadata_cache.reference | 3 + .../03262_parquet_s3_metadata_cache.sql | 28 ++++++++++ 8 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03262_parquet_s3_metadata_cache.reference create mode 100644 tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index d43d9fdcea8e..0a13dc3c0136 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -806,6 +806,8 @@ The server successfully detected this situation and will download merged part fr M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan") \ M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. filled pool)") \ M(GWPAsanFree, "Number of free operations done by GWPAsan") \ + M(ParquetMetaDataCacheHits, "Number of times the read from filesystem cache hit the cache.") \ + M(ParquetMetaDataCacheMisses, "Number of times the read from filesystem cache miss the cache.") \ #ifdef APPLY_FOR_EXTERNAL_EVENTS diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4f9abb306274..dc78d3ddc81f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1293,6 +1293,8 @@ class IColumn; M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \ M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'.", 0) \ M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \ + M(Bool, parquet_use_metadata_cache, false, "Enable parquet file metadata caching.", 0) \ + M(UInt64, parquet_metadata_cache_max_entries, 100000, "Maximum number of parquet file metadata to cache.", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 713c1089d289..8981b3c2916b 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -66,6 +66,9 @@ class IInputFormat : public SourceWithKeyCondition void needOnlyCount() { need_only_count = true; } + /// Set additional info/key/id related to underlying storage of the ReadBuffer + virtual void setStorageRelatedUniqueKey(const Settings & /*settings*/, const String & /*key*/) {} + protected: ReadBuffer & getReadBuffer() const { chassert(in); return *in; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 4231ef3c7ab5..2a247a78c27e 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -3,6 +3,8 @@ #if USE_PARQUET +#include +#include #include #include #include @@ -34,6 +36,12 @@ namespace CurrentMetrics extern const Metric ParquetDecoderThreadsScheduled; } +namespace ProfileEvents +{ + extern const Event ParquetMetaDataCacheHits; + extern const Event ParquetMetaDataCacheMisses; +} + namespace DB { @@ -426,6 +434,22 @@ static std::vector getHyperrectangleForRowGroup(const parquet::FileMetaDa return hyperrectangle; } +std::mutex ParquetFileMetaDataCache::mutex; + +ParquetFileMetaDataCache::ParquetFileMetaDataCache(UInt64 max_cache_entries) + : CacheBase(max_cache_entries) {} + +ParquetFileMetaDataCache * ParquetFileMetaDataCache::instance(UInt64 max_cache_entries) +{ + static ParquetFileMetaDataCache * instance = nullptr; + if (!instance) + { + std::lock_guard lock(mutex); + instance = new ParquetFileMetaDataCache(max_cache_entries); + } + return instance; +} + ParquetBlockInputFormat::ParquetBlockInputFormat( ReadBuffer & buf, const Block & header_, @@ -450,6 +474,28 @@ ParquetBlockInputFormat::~ParquetBlockInputFormat() pool->wait(); } +std::shared_ptr ParquetBlockInputFormat::getFileMetaData() +{ + if (!use_metadata_cache || !metadata_cache_key.length()) + { + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); + return parquet::ReadMetaData(arrow_file); + } + + auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance(metadata_cache_max_entries)->getOrSet( + metadata_cache_key, + [this]() + { + return parquet::ReadMetaData(arrow_file); + } + ); + if (loaded) + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); + else + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheHits); + return parquet_file_metadata; +} + void ParquetBlockInputFormat::initializeIfNeeded() { if (std::exchange(is_initialized, true)) @@ -463,7 +509,7 @@ void ParquetBlockInputFormat::initializeIfNeeded() if (is_stopped) return; - metadata = parquet::ReadMetaData(arrow_file); + metadata = getFileMetaData(); std::shared_ptr schema; THROW_ARROW_NOT_OK(parquet::arrow::FromParquetSchema(metadata->schema(), &schema)); @@ -843,6 +889,14 @@ const BlockMissingValues & ParquetBlockInputFormat::getMissingValues() const return previous_block_missing_values; } +void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) +{ + metadata_cache_key = key_; + use_metadata_cache = settings.parquet_use_metadata_cache; + metadata_cache_max_entries = settings.parquet_metadata_cache_max_entries; +} + + ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) { diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index ed528cc077c3..7e34d00fb27a 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -2,6 +2,7 @@ #include "config.h" #if USE_PARQUET +#include #include #include #include @@ -65,6 +66,8 @@ class ParquetBlockInputFormat : public IInputFormat size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; } + void setStorageRelatedUniqueKey(const Settings & settings, const String & key_) override; + private: Chunk read() override; @@ -83,6 +86,8 @@ class ParquetBlockInputFormat : public IInputFormat void threadFunction(size_t row_group_batch_idx); + std::shared_ptr getFileMetaData(); + // Data layout in the file: // // row group 0 @@ -288,6 +293,9 @@ class ParquetBlockInputFormat : public IInputFormat std::exception_ptr background_exception = nullptr; std::atomic is_stopped{0}; bool is_initialized = false; + String metadata_cache_key; + bool use_metadata_cache = false; + UInt64 metadata_cache_max_entries{0}; }; class ParquetSchemaReader : public ISchemaReader @@ -306,6 +314,17 @@ class ParquetSchemaReader : public ISchemaReader std::shared_ptr metadata; }; +class ParquetFileMetaDataCache : public CacheBase +{ +public: + static ParquetFileMetaDataCache * instance(UInt64 max_cache_entries); + void clear() {} + +private: + ParquetFileMetaDataCache(UInt64 max_cache_entries); + static std::mutex mutex; +}; + } #endif diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 04e319cd0b89..a743b9f87a85 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -381,6 +381,9 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (need_only_count) input_format->needOnlyCount(); + if (object_info->getPath().length()) + input_format->setStorageRelatedUniqueKey(context_->getSettingsRef(), object_info->getPath() + ":" + object_info->metadata->etag); + builder.init(Pipe(input_format)); if (read_from_format_info.columns_description.hasDefaults()) diff --git a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.reference b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.reference new file mode 100644 index 000000000000..51fdf048b8ac --- /dev/null +++ b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.reference @@ -0,0 +1,3 @@ +10 +10 +10 diff --git a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql new file mode 100644 index 000000000000..5648043b25e8 --- /dev/null +++ b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql @@ -0,0 +1,28 @@ +-- Tags: no-parallel, no-fasttest + +DROP TABLE IF EXISTS t_parquet_03262; + +CREATE TABLE t_parquet_03262 (a UInt64) +ENGINE = S3(s3_conn, filename = 'test_03262_{_partition_id}', format = Parquet) +PARTITION BY a; + +INSERT INTO t_parquet_03262 SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) +SETTINGS parquet_use_metadata_cache=1; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) +SETTINGS parquet_use_metadata_cache=1,custom_x='test03262'; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['ParquetMetaDataCacheHits'] +FROM system.query_log +where query like '%test03262%' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +DROP TABLE t_parquet_03262; From 415b35158f8417dbb9b3298ceb1dd130b4328e5b Mon Sep 17 00:00:00 2001 From: shiyer7474 Date: Mon, 9 Dec 2024 04:09:08 +0000 Subject: [PATCH 02/10] Code review comments --- .../Formats/Impl/ParquetBlockInputFormat.cpp | 21 ++++++++----------- .../Formats/Impl/ParquetBlockInputFormat.h | 10 +++++---- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 2a247a78c27e..66a5679d57bb 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -434,19 +434,16 @@ static std::vector getHyperrectangleForRowGroup(const parquet::FileMetaDa return hyperrectangle; } -std::mutex ParquetFileMetaDataCache::mutex; - ParquetFileMetaDataCache::ParquetFileMetaDataCache(UInt64 max_cache_entries) : CacheBase(max_cache_entries) {} ParquetFileMetaDataCache * ParquetFileMetaDataCache::instance(UInt64 max_cache_entries) { static ParquetFileMetaDataCache * instance = nullptr; - if (!instance) - { - std::lock_guard lock(mutex); + static std::once_flag once; + std::call_once(once, [&] { instance = new ParquetFileMetaDataCache(max_cache_entries); - } + }); return instance; } @@ -476,14 +473,14 @@ ParquetBlockInputFormat::~ParquetBlockInputFormat() std::shared_ptr ParquetBlockInputFormat::getFileMetaData() { - if (!use_metadata_cache || !metadata_cache_key.length()) + if (!metadata_cache.use_cache || !metadata_cache.key.length()) { ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); return parquet::ReadMetaData(arrow_file); } - auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance(metadata_cache_max_entries)->getOrSet( - metadata_cache_key, + auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance(metadata_cache.max_entries)->getOrSet( + metadata_cache.key, [this]() { return parquet::ReadMetaData(arrow_file); @@ -891,9 +888,9 @@ const BlockMissingValues & ParquetBlockInputFormat::getMissingValues() const void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) { - metadata_cache_key = key_; - use_metadata_cache = settings.parquet_use_metadata_cache; - metadata_cache_max_entries = settings.parquet_metadata_cache_max_entries; + metadata_cache.key = key_; + metadata_cache.use_cache = settings.parquet_use_metadata_cache; + metadata_cache.max_entries = settings.parquet_metadata_cache_max_entries; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 7e34d00fb27a..a4d8d827a7bf 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -293,9 +293,12 @@ class ParquetBlockInputFormat : public IInputFormat std::exception_ptr background_exception = nullptr; std::atomic is_stopped{0}; bool is_initialized = false; - String metadata_cache_key; - bool use_metadata_cache = false; - UInt64 metadata_cache_max_entries{0}; + struct Cache + { + String key; + bool use_cache = false; + UInt64 max_entries{0}; + } metadata_cache; }; class ParquetSchemaReader : public ISchemaReader @@ -322,7 +325,6 @@ class ParquetFileMetaDataCache : public CacheBase private: ParquetFileMetaDataCache(UInt64 max_cache_entries); - static std::mutex mutex; }; } From b928250f79ab2ea0a83607e7d06fc21b17093fa3 Mon Sep 17 00:00:00 2001 From: shiyer7474 Date: Wed, 18 Dec 2024 03:12:47 +0000 Subject: [PATCH 03/10] More code review --- src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 66a5679d57bb..b27936b75336 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -439,12 +439,8 @@ ParquetFileMetaDataCache::ParquetFileMetaDataCache(UInt64 max_cache_entries) ParquetFileMetaDataCache * ParquetFileMetaDataCache::instance(UInt64 max_cache_entries) { - static ParquetFileMetaDataCache * instance = nullptr; - static std::once_flag once; - std::call_once(once, [&] { - instance = new ParquetFileMetaDataCache(max_cache_entries); - }); - return instance; + static ParquetFileMetaDataCache instance(max_cache_entries); + return &instance; } ParquetBlockInputFormat::ParquetBlockInputFormat( @@ -475,7 +471,6 @@ std::shared_ptr ParquetBlockInputFormat::getFileMetaData( { if (!metadata_cache.use_cache || !metadata_cache.key.length()) { - ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); return parquet::ReadMetaData(arrow_file); } From f8a2ad9474b82348adc6a6e8ebb2925410e91030 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 09:30:14 -0300 Subject: [PATCH 04/10] Rename setting on code --- src/Core/Settings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index dc78d3ddc81f..63137265fce8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1293,8 +1293,8 @@ class IColumn; M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \ M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'.", 0) \ M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \ - M(Bool, parquet_use_metadata_cache, false, "Enable parquet file metadata caching.", 0) \ - M(UInt64, parquet_metadata_cache_max_entries, 100000, "Maximum number of parquet file metadata to cache.", 0) \ + M(Bool, input_format_parquet_use_metadata_cache, false, "Enable parquet file metadata caching.", 0) \ + M(UInt64, input_format_parquet_metadata_cache_max_entries, 100000, "Maximum number of parquet file metadata to cache.", 0) \ // End of FORMAT_FACTORY_SETTINGS From 3a992b08a02b8a73134f5ede53e0c6caaaa5f332 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 11:24:14 -0300 Subject: [PATCH 05/10] avoid 1 extra s3 call to get file size in case it is not needed & rename setting --- .../Formats/Impl/ParquetBlockInputFormat.cpp | 36 +++++++++++++++---- .../Formats/Impl/ParquetBlockInputFormat.h | 3 ++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index b27936b75336..d32e6c2516fe 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -467,18 +467,24 @@ ParquetBlockInputFormat::~ParquetBlockInputFormat() pool->wait(); } +std::shared_ptr ParquetBlockInputFormat::readMetadataFromFile() +{ + createArrowFileIfNotCreated(); + return parquet::ReadMetaData(arrow_file); +} + std::shared_ptr ParquetBlockInputFormat::getFileMetaData() { if (!metadata_cache.use_cache || !metadata_cache.key.length()) { - return parquet::ReadMetaData(arrow_file); + return readMetadataFromFile(); } auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance(metadata_cache.max_entries)->getOrSet( metadata_cache.key, - [this]() + [&]() { - return parquet::ReadMetaData(arrow_file); + return readMetadataFromFile(); } ); if (loaded) @@ -488,15 +494,23 @@ std::shared_ptr ParquetBlockInputFormat::getFileMetaData( return parquet_file_metadata; } -void ParquetBlockInputFormat::initializeIfNeeded() +void ParquetBlockInputFormat::createArrowFileIfNotCreated() { - if (std::exchange(is_initialized, true)) + if (arrow_file) + { return; + } // Create arrow file adapter. // TODO: Make the adapter do prefetching on IO threads, based on the full set of ranges that // we'll need to read (which we know in advance). Use max_download_threads for that. arrow_file = asArrowFile(*in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); +} + +void ParquetBlockInputFormat::initializeIfNeeded() +{ + if (std::exchange(is_initialized, true)) + return; if (is_stopped) return; @@ -532,6 +546,8 @@ void ParquetBlockInputFormat::initializeIfNeeded() return std::min(std::max(preferred_num_rows, MIN_ROW_NUM), static_cast(format_settings.parquet.max_block_size)); }; + bool has_row_groups_to_read = false; + for (int row_group = 0; row_group < num_row_groups; ++row_group) { if (skip_row_groups.contains(row_group)) @@ -553,6 +569,12 @@ void ParquetBlockInputFormat::initializeIfNeeded() row_group_batches.back().total_bytes_compressed += metadata->RowGroup(row_group)->total_compressed_size(); auto rows = adaptive_chunk_size(row_group); row_group_batches.back().adaptive_chunk_size = rows ? rows : format_settings.parquet.max_block_size; + has_row_groups_to_read = true; + } + + if (has_row_groups_to_read) + { + createArrowFileIfNotCreated(); } } @@ -884,8 +906,8 @@ const BlockMissingValues & ParquetBlockInputFormat::getMissingValues() const void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) { metadata_cache.key = key_; - metadata_cache.use_cache = settings.parquet_use_metadata_cache; - metadata_cache.max_entries = settings.parquet_metadata_cache_max_entries; + metadata_cache.use_cache = settings.input_format_parquet_use_metadata_cache; + metadata_cache.max_entries = settings.input_format_parquet_metadata_cache_max_entries; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index a4d8d827a7bf..3e7ceb329a44 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -86,6 +86,9 @@ class ParquetBlockInputFormat : public IInputFormat void threadFunction(size_t row_group_batch_idx); + void createArrowFileIfNotCreated(); + std::shared_ptr readMetadataFromFile(); + std::shared_ptr getFileMetaData(); // Data layout in the file: From a78f188687353697083a90a67eff176929abcbff Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 11:27:28 -0300 Subject: [PATCH 06/10] add comment --- src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index d32e6c2516fe..f1bc3be2db10 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -475,7 +475,10 @@ std::shared_ptr ParquetBlockInputFormat::readMetadataFrom std::shared_ptr ParquetBlockInputFormat::getFileMetaData() { - if (!metadata_cache.use_cache || !metadata_cache.key.length()) + // in-memory cache is not implemented for local file operations, only for remote files + // there is a chance the user sets `input_format_parquet_use_metadata_cache=1` for a local file operation + // and the cache_key won't be set. Therefore, we also need to check for metadata_cache.key + if (!metadata_cache.use_cache || metadata_cache.key.empty()) { return readMetadataFromFile(); } From 9add7d803d315f44d067d41c42bfed434496a257 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 11:32:49 -0300 Subject: [PATCH 07/10] update test setting name --- tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql index 5648043b25e8..c7d63da354cd 100644 --- a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql +++ b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql @@ -10,11 +10,11 @@ INSERT INTO t_parquet_03262 SELECT number FROM numbers(10) SETTINGS s3_truncate_ SELECT COUNT(*) FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) -SETTINGS parquet_use_metadata_cache=1; +SETTINGS input_format_parquet_use_metadata_cache=1; SELECT COUNT(*) FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) -SETTINGS parquet_use_metadata_cache=1,custom_x='test03262'; +SETTINGS input_format_parquet_use_metadata_cache=1,custom_x='test03262'; SYSTEM FLUSH LOGS; From 465e96e78673f6ce4fd752ff53b4c0e3e5015edc Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 11:33:32 -0300 Subject: [PATCH 08/10] minor update to test --- tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql index c7d63da354cd..f453dcba0c66 100644 --- a/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql +++ b/tests/queries/0_stateless/03262_parquet_s3_metadata_cache.sql @@ -14,13 +14,13 @@ SETTINGS input_format_parquet_use_metadata_cache=1; SELECT COUNT(*) FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) -SETTINGS input_format_parquet_use_metadata_cache=1,custom_x='test03262'; +SETTINGS input_format_parquet_use_metadata_cache=1, log_comment='test_03262_parquet_metadata_cache'; SYSTEM FLUSH LOGS; SELECT ProfileEvents['ParquetMetaDataCacheHits'] FROM system.query_log -where query like '%test03262%' +where log_comment = 'test_03262_parquet_metadata_cache' AND type = 'QueryFinish' ORDER BY event_time desc LIMIT 1; From 861bdf5912efa15b2061ea79a82f3cccb9345324 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 11:49:15 -0300 Subject: [PATCH 09/10] make parquet_metadata_cache_max_entries a server setting --- src/Core/ServerSettings.h | 3 ++- src/Core/Settings.h | 1 - src/Processors/Formats/IInputFormat.h | 2 +- src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp | 5 +++-- src/Processors/Formats/Impl/ParquetBlockInputFormat.h | 2 +- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index a52be0c8ba9c..c3f3b7384761 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -166,7 +166,8 @@ namespace DB M(String, mutation_workload, "default", "Name of workload to be used to access resources for all mutations (may be overridden by a merge tree setting)", 0) \ M(Bool, prepare_system_log_tables_on_startup, false, "If true, ClickHouse creates all configured `system.*_log` tables before the startup. It can be helpful if some startup scripts depend on these tables.", 0) \ M(UInt64, config_reload_interval_ms, 2000, "How often clickhouse will reload config and check for new changes", 0) \ - M(Bool, disable_insertion_and_mutation, false, "Disable all insert/alter/delete queries. This setting will be enabled if someone needs read-only nodes to prevent insertion and mutation affect reading performance.", 0) + M(Bool, disable_insertion_and_mutation, false, "Disable all insert/alter/delete queries. This setting will be enabled if someone needs read-only nodes to prevent insertion and mutation affect reading performance.", 0) \ + M(UInt64, input_format_parquet_metadata_cache_max_entries, 100000, "Maximum number of parquet file metadata to cache.", 0) /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 63137265fce8..1982b56178de 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1294,7 +1294,6 @@ class IColumn; M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'.", 0) \ M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \ M(Bool, input_format_parquet_use_metadata_cache, false, "Enable parquet file metadata caching.", 0) \ - M(UInt64, input_format_parquet_metadata_cache_max_entries, 100000, "Maximum number of parquet file metadata to cache.", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 8981b3c2916b..f0fa3b870e8f 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -67,7 +67,7 @@ class IInputFormat : public SourceWithKeyCondition void needOnlyCount() { need_only_count = true; } /// Set additional info/key/id related to underlying storage of the ReadBuffer - virtual void setStorageRelatedUniqueKey(const Settings & /*settings*/, const String & /*key*/) {} + virtual void setStorageRelatedUniqueKey(const ServerSettings & /* server_settings */, const Settings & /*settings*/, const String & /*key*/) {} protected: ReadBuffer & getReadBuffer() const { chassert(in); return *in; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index f1bc3be2db10..f4bb5cbc76f1 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -4,6 +4,7 @@ #if USE_PARQUET #include +#include #include #include #include @@ -906,11 +907,11 @@ const BlockMissingValues & ParquetBlockInputFormat::getMissingValues() const return previous_block_missing_values; } -void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) +void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const ServerSettings & server_settings, const Settings & settings, const String & key_) { metadata_cache.key = key_; metadata_cache.use_cache = settings.input_format_parquet_use_metadata_cache; - metadata_cache.max_entries = settings.input_format_parquet_metadata_cache_max_entries; + metadata_cache.max_entries = server_settings.input_format_parquet_metadata_cache_max_entries; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 3e7ceb329a44..edcc15e603f0 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -66,7 +66,7 @@ class ParquetBlockInputFormat : public IInputFormat size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; } - void setStorageRelatedUniqueKey(const Settings & settings, const String & key_) override; + void setStorageRelatedUniqueKey(const ServerSettings & server_settings, const Settings & settings, const String & key_) override; private: Chunk read() override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index a743b9f87a85..24dede94f588 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -382,7 +382,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade input_format->needOnlyCount(); if (object_info->getPath().length()) - input_format->setStorageRelatedUniqueKey(context_->getSettingsRef(), object_info->getPath() + ":" + object_info->metadata->etag); + input_format->setStorageRelatedUniqueKey(context_->getServerSettings(), context_->getSettingsRef(), object_info->getPath() + ":" + object_info->metadata->etag); builder.init(Pipe(input_format)); From 5a7a8adb6bc1e60e5c3358fdfec14a40e972baea Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 18 Dec 2024 13:25:09 -0300 Subject: [PATCH 10/10] settings history --- src/Core/SettingsChangesHistory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index b3e7a59c5e2f..85917d33624a 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -92,7 +92,8 @@ static std::initializer_list