From 6e8903999e62d52ac7cbf9b72eef786c6c5072bc Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 30 Sep 2025 20:12:30 +0000 Subject: [PATCH 001/112] Backport #87442 to 25.8: Fix bool decoding in parquet reader v3 --- src/Processors/Formats/Impl/Parquet/Decoding.cpp | 2 +- .../queries/0_stateless/03630_parquet_bool_bug.reference | 2 ++ tests/queries/0_stateless/03630_parquet_bool_bug.sql | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03630_parquet_bool_bug.reference create mode 100644 tests/queries/0_stateless/03630_parquet_bool_bug.sql diff --git a/src/Processors/Formats/Impl/Parquet/Decoding.cpp b/src/Processors/Formats/Impl/Parquet/Decoding.cpp index 40f4efcf68c9..a3cfb47ee8f9 100644 --- a/src/Processors/Formats/Impl/Parquet/Decoding.cpp +++ b/src/Processors/Formats/Impl/Parquet/Decoding.cpp @@ -249,7 +249,7 @@ struct PlainBooleanDecoder : public PageDecoder /// x = 00000000 000000hg 00000000 000000fe 00000000 000000dc 00000000 000000ba x = (x | (x << 7)) & 0x0101010101010101ul; /// x = 0000000h 0000000g 0000000f 0000000e 0000000d 0000000c 0000000b 0000000a - memcpy(to + i * 8, &x, 8); + memcpy(to + i, &x, 8); i += 8; } else diff --git a/tests/queries/0_stateless/03630_parquet_bool_bug.reference b/tests/queries/0_stateless/03630_parquet_bool_bug.reference new file mode 100644 index 000000000000..0404dc20010c --- /dev/null +++ b/tests/queries/0_stateless/03630_parquet_bool_bug.reference @@ -0,0 +1,2 @@ +8 +256 diff --git a/tests/queries/0_stateless/03630_parquet_bool_bug.sql b/tests/queries/0_stateless/03630_parquet_bool_bug.sql new file mode 100644 index 000000000000..e80f1dc30b5a --- /dev/null +++ b/tests/queries/0_stateless/03630_parquet_bool_bug.sql @@ -0,0 +1,8 @@ +-- Tags: no-parallel, no-fasttest + +insert into function file('03630_parquet_bool_bug.parquet', Parquet, 'tags Array(Bool)') settings engine_file_truncate_on_insert=1 values ([false,false,false,false,false,false,false,false]), ([true,true,true,true,true,true,true,true]); +select sum(tags) from file('03630_parquet_bool_bug.parquet') array join tags settings input_format_parquet_use_native_reader_v3=1; + +-- Try all 256 1-byte masks to verify the bit shifting nonsense in PlainBooleanDecoder. 
+insert into function file('03630_parquet_bool_bug.parquet') select number as n, arrayMap(i -> toBool(bitShiftRight(number, i) % 2 = 1), range(8)) as bits from numbers(256) settings engine_file_truncate_on_insert=1; +select sum(n = arraySum(arrayMap(i -> bitShiftLeft(bits[i+1], i), range(8)))) as ok from file('03630_parquet_bool_bug.parquet') settings input_format_parquet_use_native_reader_v3=1, schema_inference_make_columns_nullable=0; From 20570cc516be8c6eef1ecbfce4d299ff3a17f278 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 7 Oct 2025 11:11:32 +0000 Subject: [PATCH 002/112] Backport #88105 to 25.8: Fix uncaught exception due noexcept tryGetCreateTableQuery() --- src/Databases/DataLake/DatabaseDataLake.cpp | 7 ++++--- src/Databases/IDatabase.h | 11 +++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Databases/DataLake/DatabaseDataLake.cpp b/src/Databases/DataLake/DatabaseDataLake.cpp index fd41602a3a40..90260079ac7b 100644 --- a/src/Databases/DataLake/DatabaseDataLake.cpp +++ b/src/Databases/DataLake/DatabaseDataLake.cpp @@ -630,7 +630,7 @@ ASTPtr DatabaseDataLake::getCreateDatabaseQuery() const ASTPtr DatabaseDataLake::getCreateTableQueryImpl( const String & name, ContextPtr /* context_ */, - bool /* throw_on_error */) const + bool throw_on_error) const { auto catalog = getCatalog(); auto table_metadata = DataLake::TableMetadata().withLocation().withSchema(); @@ -639,8 +639,9 @@ ASTPtr DatabaseDataLake::getCreateTableQueryImpl( if (!catalog->tryGetTableMetadata(namespace_name, table_name, table_metadata)) { - throw Exception( - ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, "Table `{}` doesn't exist", name); + if (throw_on_error) + throw Exception(ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, "Table `{}` doesn't exist", name); + return {}; } auto create_table_query = std::make_shared(); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 8a9f412c00a0..eb7d5f2c75ce 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -346,15 +346,18 @@ class IDatabase : public std::enable_shared_from_this return static_cast(0); } - /// Get the CREATE TABLE query for the table. It can also provide information for detached tables for which there is metadata. - ASTPtr tryGetCreateTableQuery(const String & name, ContextPtr context) const noexcept + /// Get the CREATE TABLE query for the table. + /// It can also provide information for detached tables for which there is metadata. + /// + /// Does not throw if the table does not exist, but can throw on other errors. + ASTPtr tryGetCreateTableQuery(const String & name, ContextPtr context) const { - return getCreateTableQueryImpl(name, context, false); + return getCreateTableQueryImpl(name, context, /*throw_on_error=*/ false); } ASTPtr getCreateTableQuery(const String & name, ContextPtr context) const { - return getCreateTableQueryImpl(name, context, true); + return getCreateTableQueryImpl(name, context, /*throw_on_error=*/ true); } /// Get the CREATE DATABASE query for current database. 
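
A note on the backport above (#88105): getCreateTableQueryImpl() in DatabaseDataLake could throw even when reached through tryGetCreateTableQuery(), which was declared noexcept; an exception escaping a noexcept function calls std::terminate() and aborts the whole server instead of surfacing a catchable error, which is why the patch drops noexcept and makes the implementation honor throw_on_error. The following is a minimal standalone sketch of that failure mode only; the names lookupOrThrow, tryLookupBroken and tryLookupFixed are illustrative placeholders, not ClickHouse APIs.

#include <cstdio>
#include <stdexcept>

// Mirrors getCreateTableQueryImpl(..., throw_on_error=true): throws when the
// table is missing.
static int lookupOrThrow(bool exists)
{
    if (!exists)
        throw std::runtime_error("Table doesn't exist");
    return 42;
}

// Old shape of tryGetCreateTableQuery(): marked noexcept, but the callee can
// still throw. The exception cannot propagate out of a noexcept function, so
// std::terminate() is called and the process aborts.
static int tryLookupBroken(bool exists) noexcept
{
    return lookupOrThrow(exists);
}

// Fixed shape: no noexcept, and the "missing" case is handled without an
// exception (throw_on_error=false in the real code), returning a sentinel.
static int tryLookupFixed(bool exists)
{
    return exists ? lookupOrThrow(true) : 0;
}

int main()
{
    std::printf("%d\n", tryLookupFixed(false)); // prints 0 instead of aborting
    // tryLookupBroken(false);                  // would call std::terminate()
    return 0;
}
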
From 40cacd97f842172f2293f7577a4973a9abc75735 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 7 Oct 2025 11:50:53 +0000 Subject: [PATCH 003/112] Update autogenerated version to 25.8.9.20 and contributors --- cmake/autogenerated_versions.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index c55e3f3097ad..f4b0bffe7db2 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54509) +SET(VERSION_REVISION 54510) SET(VERSION_MAJOR 25) SET(VERSION_MINOR 8) -SET(VERSION_PATCH 9) -SET(VERSION_GITHASH 8a2475033080b4a8d57b7771f52140af663dd4e0) -SET(VERSION_DESCRIBE v25.8.9.1-lts) -SET(VERSION_STRING 25.8.9.1) +SET(VERSION_PATCH 10) +SET(VERSION_GITHASH a1f4cd9c23f649b8891e952f973937f40eb9d273) +SET(VERSION_DESCRIBE v25.8.10.1-lts) +SET(VERSION_STRING 25.8.10.1) # end of autochange From 0b375d79653ef924c81efcd59cc28b4fd248dca7 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 8 Oct 2025 07:13:22 +0000 Subject: [PATCH 004/112] Backport #86560 to 25.8: Fix stack overflow in quantileDD merge --- src/AggregateFunctions/DDSketch.h | 64 ++++++------- .../DDSketch/DDSketchEncoding.h | 8 +- src/AggregateFunctions/DDSketch/Mapping.h | 59 ++++++------ src/AggregateFunctions/DDSketch/Store.h | 43 +++++---- .../tests/gtest_ddsketch.cpp | 89 +++++++++++++++++++ .../03568_ddsketch_merge.reference | 1 + .../0_stateless/03568_ddsketch_merge.sql | 8 ++ 7 files changed, 192 insertions(+), 80 deletions(-) create mode 100644 src/AggregateFunctions/tests/gtest_ddsketch.cpp create mode 100644 tests/queries/0_stateless/03568_ddsketch_merge.reference create mode 100644 tests/queries/0_stateless/03568_ddsketch_merge.sql diff --git a/src/AggregateFunctions/DDSketch.h b/src/AggregateFunctions/DDSketch.h index 16b6318181cc..7c2b042699a3 100644 --- a/src/AggregateFunctions/DDSketch.h +++ b/src/AggregateFunctions/DDSketch.h @@ -1,49 +1,49 @@ #pragma once -#include // for std::unique_ptr -#include -#include #include -#include +#include // for std::unique_ptr #include -#include -#include - +#include #include #include -#include +#include +#include +#include +#include +#include namespace DB { namespace ErrorCodes { - extern const int BAD_ARGUMENTS; - extern const int INCORRECT_DATA; +extern const int BAD_ARGUMENTS; +extern const int INCORRECT_DATA; } class DDSketchDenseLogarithmic { public: explicit DDSketchDenseLogarithmic(Float64 relative_accuracy = 0.01) - : mapping(std::make_unique(relative_accuracy)), - store(std::make_unique()), - negative_store(std::make_unique()), - zero_count(0.0), - count(0.0) + : mapping(std::make_unique(relative_accuracy)) + , store(std::make_unique()) + , negative_store(std::make_unique()) + , zero_count(0.0) + , count(0.0) { } - DDSketchDenseLogarithmic(std::unique_ptr mapping_, - std::unique_ptr store_, - std::unique_ptr negative_store_, - Float64 zero_count_) - : mapping(std::move(mapping_)), - store(std::move(store_)), - negative_store(std::move(negative_store_)), - zero_count(zero_count_), - count(store->count + negative_store->count + zero_count_) + DDSketchDenseLogarithmic( + std::unique_ptr mapping_, + std::unique_ptr store_, + std::unique_ptr negative_store_, + Float64 zero_count_) + : mapping(std::move(mapping_)) + , store(std::move(store_)) + , 
negative_store(std::move(negative_store_)) + , zero_count(zero_count_) + , count(store->count + negative_store->count + zero_count_) { } @@ -97,7 +97,11 @@ class DDSketchDenseLogarithmic return quantile_value; } - void copy(const DDSketchDenseLogarithmic& other) + Float64 getGamma() const { return mapping->getGamma(); } + + Float64 getCount() const { return count; } + + void copy(const DDSketchDenseLogarithmic & other) { Float64 rel_acc = (other.mapping->getGamma() - 1) / (other.mapping->getGamma() + 1); mapping = std::make_unique(rel_acc); @@ -109,9 +113,9 @@ class DDSketchDenseLogarithmic count = other.count; } - void merge(const DDSketchDenseLogarithmic& other) + void merge(const DDSketchDenseLogarithmic & other) { - if (mapping->getGamma() != other.mapping->getGamma()) + if (*mapping != *other.mapping) { // modify the one with higher precision to match the one with lower precision if (mapping->getGamma() > other.mapping->getGamma()) @@ -147,7 +151,7 @@ class DDSketchDenseLogarithmic /// NOLINTBEGIN(readability-static-accessed-through-instance) - void serialize(WriteBuffer& buf) const + void serialize(WriteBuffer & buf) const { // Write the mapping writeBinary(enc.FlagIndexMappingBaseLogarithmic.byte, buf); @@ -165,7 +169,7 @@ class DDSketchDenseLogarithmic writeBinary(zero_count, buf); } - void deserialize(ReadBuffer& buf) + void deserialize(ReadBuffer & buf) { // Read the mapping UInt8 flag = 0; @@ -219,7 +223,7 @@ class DDSketchDenseLogarithmic auto new_positive_store = std::make_unique(); auto new_negative_store = std::make_unique(); - auto remap_store = [this, &new_mapping](DDSketchDenseStore& old_store, std::unique_ptr& target_store) + auto remap_store = [this, &new_mapping](DDSketchDenseStore & old_store, std::unique_ptr & target_store) { for (int i = 0; i < old_store.length(); ++i) { diff --git a/src/AggregateFunctions/DDSketch/DDSketchEncoding.h b/src/AggregateFunctions/DDSketch/DDSketchEncoding.h index 477bc3f54495..64dc8c0e55ed 100644 --- a/src/AggregateFunctions/DDSketch/DDSketchEncoding.h +++ b/src/AggregateFunctions/DDSketch/DDSketchEncoding.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include /** * An encoded DDSketch comprises multiple contiguous blocks (sequences of bytes). 
@@ -36,7 +35,10 @@ class DDSketchEncoding { public: UInt8 byte; - Flag(UInt8 t, UInt8 s) : byte(t | s) { } + Flag(UInt8 t, UInt8 s) + : byte(t | s) + { + } [[maybe_unused]] UInt8 Type() const { return byte & flagTypeMask; } [[maybe_unused]] UInt8 SubFlag() const { return byte & subFlagMask; } }; diff --git a/src/AggregateFunctions/DDSketch/Mapping.h b/src/AggregateFunctions/DDSketch/Mapping.h index 0d1ff785d59d..0f4d939f8f56 100644 --- a/src/AggregateFunctions/DDSketch/Mapping.h +++ b/src/AggregateFunctions/DDSketch/Mapping.h @@ -1,26 +1,29 @@ #pragma once -#include #include -#include #include +#include #include +#include #include +#include +#include namespace DB { namespace ErrorCodes { - extern const int BAD_ARGUMENTS; +extern const int BAD_ARGUMENTS; } class DDSketchLogarithmicMapping { public: explicit DDSketchLogarithmicMapping(Float64 relative_accuracy_, Float64 offset_ = 0.0) - : relative_accuracy(relative_accuracy_), offset(offset_) + : relative_accuracy(relative_accuracy_) + , offset(offset_) { if (relative_accuracy <= 0 || relative_accuracy >= 1) { @@ -44,48 +47,40 @@ class DDSketchLogarithmicMapping return static_cast(logGamma(value) + offset); } - Float64 value(int key) const - { - return lowerBound(key) * (1 + relative_accuracy); - } + Float64 value(int key) const { return lowerBound(key) * (1 + relative_accuracy); } - Float64 logGamma(Float64 value) const - { - return std::log(value) * multiplier; - } + Float64 logGamma(Float64 value) const { return std::log(value) * multiplier; } - Float64 powGamma(Float64 value) const - { - return std::exp(value / multiplier); - } + Float64 powGamma(Float64 value) const { return std::exp(value / multiplier); } - Float64 lowerBound(int index) const - { - return powGamma(static_cast(index) - offset); - } + Float64 lowerBound(int index) const { return powGamma(static_cast(index) - offset); } - Float64 getGamma() const - { - return gamma; - } + Float64 getGamma() const { return gamma; } - Float64 getMinPossible() const - { - return min_possible; - } + Float64 getMinPossible() const { return min_possible; } - [[maybe_unused]] Float64 getMaxPossible() const + [[maybe_unused]] Float64 getMaxPossible() const { return max_possible; } + + bool operator==(const DDSketchLogarithmicMapping & other) const { - return max_possible; + if (gamma == other.gamma) + { + return true; + } + + auto [min, max] = std::minmax(gamma, other.gamma); + const Float64 difference = max - min; + const Float64 acceptable = (std::nextafter(min, max) - min) * min; + return difference <= acceptable; } - void serialize(WriteBuffer& buf) const + void serialize(WriteBuffer & buf) const { writeBinary(gamma, buf); writeBinary(offset, buf); } - void deserialize(ReadBuffer& buf) + void deserialize(ReadBuffer & buf) { readBinary(gamma, buf); readBinary(offset, buf); diff --git a/src/AggregateFunctions/DDSketch/Store.h b/src/AggregateFunctions/DDSketch/Store.h index 0e499e445d2a..594746f73e10 100644 --- a/src/AggregateFunctions/DDSketch/Store.h +++ b/src/AggregateFunctions/DDSketch/Store.h @@ -1,13 +1,15 @@ #pragma once -#include -#include #include #include +#include +#include +#include #include +#include #include -#include +#include // We start with 128 bins and grow the number of bins by 128 @@ -18,6 +20,11 @@ constexpr UInt32 CHUNK_SIZE = 128; namespace DB { +namespace ErrorCodes +{ +extern const int INCORRECT_DATA; +} + class DDSketchDenseStore { public: @@ -27,9 +34,12 @@ class DDSketchDenseStore int offset = 0; std::vector bins; - explicit DDSketchDenseStore(UInt32 chunk_size_ = 
CHUNK_SIZE) : chunk_size(chunk_size_) {} + explicit DDSketchDenseStore(UInt32 chunk_size_ = CHUNK_SIZE) + : chunk_size(chunk_size_) + { + } - void copy(DDSketchDenseStore* other) + void copy(DDSketchDenseStore * other) { bins = other->bins; count = other->count; @@ -38,10 +48,7 @@ class DDSketchDenseStore offset = other->offset; } - int length() const - { - return static_cast(bins.size()); - } + int length() const { return static_cast(bins.size()); } void add(int key, Float64 weight) { @@ -64,9 +71,10 @@ class DDSketchDenseStore return max_key; } - void merge(DDSketchDenseStore* other) + void merge(DDSketchDenseStore * other) { - if (other->count == 0) return; + if (other->count == 0) + return; if (count == 0) { @@ -89,9 +97,8 @@ class DDSketchDenseStore /// NOLINTBEGIN(readability-static-accessed-through-instance) - void serialize(WriteBuffer& buf) const + void serialize(WriteBuffer & buf) const { - // Calculate the size of the dense and sparse encodings to choose the smallest one UInt64 num_bins = 0; UInt64 num_non_empty_bins = 0; @@ -144,8 +151,10 @@ class DDSketchDenseStore } } - void deserialize(ReadBuffer& buf) + void deserialize(ReadBuffer & buf) { + count = 0; + UInt8 encoding_mode; readBinary(encoding_mode, buf); if (encoding_mode == enc.BinEncodingContiguousCounts) @@ -165,7 +174,7 @@ class DDSketchDenseStore start_key += index_delta; } } - else + else if (encoding_mode == enc.BinEncodingIndexDeltasAndCounts) { UInt64 num_non_empty_bins; readVarUInt(num_non_empty_bins, buf); @@ -180,6 +189,10 @@ class DDSketchDenseStore add(previous_index, bin_count); } } + else + { + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid flag for encoding mode"); + } } /// NOLINTEND(readability-static-accessed-through-instance) diff --git a/src/AggregateFunctions/tests/gtest_ddsketch.cpp b/src/AggregateFunctions/tests/gtest_ddsketch.cpp new file mode 100644 index 000000000000..69c01d81fcd1 --- /dev/null +++ b/src/AggregateFunctions/tests/gtest_ddsketch.cpp @@ -0,0 +1,89 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +TEST(DDSketch, MergeDifferentGammasWithoutSegfault) +{ + using namespace DB; + + DDSketchDenseLogarithmic lhs{}; + DDSketchDenseLogarithmic rhs{}; + + /* + { + "mapping": { + "gamma": 2.0, + "index_offset": 0.0, + "interpolation": 0 + }, + "positive_values": { + "bin_counts": {}, + "contiguous_bin_counts": [ + 1.0 + ], + "contiguous_bin_index_offset": -8 + }, + "negative_values": { + "bin_counts": {}, + "contiguous_bin_counts": [], + "contiguous_bin_index_offset": 0 + }, + "zero_count": 0.0 + } + */ + std::string lhs_data = base64Decode("AgAAAAAAAABAAAAAAAAAAAABDAEPAgAAAAAAAPA/AwwA/v///w8CBAAAAAAAAAAA"); + ReadBufferFromString lhs_buffer{lhs_data}; + lhs.deserialize(lhs_buffer); + + ASSERT_DOUBLE_EQ(lhs.getCount(), 1); + ASSERT_DOUBLE_EQ(lhs.getGamma(), 2.0); + + /* + { + "mapping": { + "gamma": 1.4142135623730951, + "index_offset": 0.0, + "interpolation": 0 + }, + "positive_values": { + "bin_counts": {}, + "contiguous_bin_counts": [ + 1.0 + ], + "contiguous_bin_index_offset": -18 + }, + "negative_values": { + "bin_counts": {}, + "contiguous_bin_counts": [], + "contiguous_bin_index_offset": 0 + }, + "zero_count": 0.0 + } + */ + std::string rhs_data = base64Decode("As07f2aeoPY/AAAAAAAAAAABDAEjAgAAAAAAAPA/AwwA/v///w8CBAAAAAAAAAAA"); + ReadBufferFromString rhs_buffer{rhs_data}; + rhs.deserialize(rhs_buffer); + + ASSERT_DOUBLE_EQ(rhs.getCount(), 1); + ASSERT_DOUBLE_EQ(rhs.getGamma(), 1.4142135623730951); + + lhs.merge(rhs); + std::vector 
merge_buffer; + + WriteBufferFromVector> writer{merge_buffer}; + lhs.serialize(writer); + + ReadBufferFromMemory reader{merge_buffer.data(), merge_buffer.size()}; + ASSERT_NO_THROW(rhs.deserialize(reader)); + + ASSERT_FLOAT_EQ(rhs.getCount(), 2); + ASSERT_DOUBLE_EQ(rhs.getGamma(), 2.0); +} diff --git a/tests/queries/0_stateless/03568_ddsketch_merge.reference b/tests/queries/0_stateless/03568_ddsketch_merge.reference new file mode 100644 index 000000000000..d00491fd7e5b --- /dev/null +++ b/tests/queries/0_stateless/03568_ddsketch_merge.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/03568_ddsketch_merge.sql b/tests/queries/0_stateless/03568_ddsketch_merge.sql new file mode 100644 index 000000000000..ccb81882809e --- /dev/null +++ b/tests/queries/0_stateless/03568_ddsketch_merge.sql @@ -0,0 +1,8 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: Depends on base64Decode + +SELECT quantileDDMerge(0.01)(state) != 0 FROM format( + RowBinary, + 'state AggregateFunction(quantileDD(0.01), Float64)', + base64Decode('AgAAAAAAAABAAAAAAAAAAAABDAEPAgAAAAAAAPA/AwwA/v///w8CBAAAAAAAAAAAAs07f2aeoPY/AAAAAAAAAAABDAEjAgAAAAAAAPA/AwwA/v///w8CBAAAAAAAAAAA') +) From b25dc060b206964612fbabc6a2a0f7ec9661ce0f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 8 Oct 2025 16:13:00 +0000 Subject: [PATCH 005/112] Backport #88153 to 25.8: Fix redundant host resolution in DDL Worker --- src/Interpreters/DDLWorker.cpp | 37 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index adc8c01a0294..5590029fc66f 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1139,6 +1139,7 @@ bool DDLWorker::initializeMainThread() auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(fs::path(queue_dir) / ""); initializeReplication(); + markReplicasActive(true); initialized = true; return true; } @@ -1211,14 +1212,6 @@ void DDLWorker::runMainThread() } cleanup_event->set(); - try - { - markReplicasActive(reinitialized); - } - catch (...) 
- { - tryLogCurrentException(log, "An error occurred when markReplicasActive: "); - } scheduleTasks(reinitialized); subsequent_errors_count = 0; @@ -1297,24 +1290,21 @@ void DDLWorker::createReplicaDirs(const ZooKeeperPtr & zookeeper, const NameSet zookeeper->createAncestors(fs::path(replicas_dir) / host_id / ""); } -void DDLWorker::markReplicasActive(bool reinitialized) +void DDLWorker::markReplicasActive(bool /*reinitialized*/) { auto zookeeper = getZooKeeper(); - if (reinitialized) + // Reset all active_node_holders + for (auto & it : active_node_holders) { - // Reset all active_node_holders - for (auto & it : active_node_holders) - { - auto & active_node_holder = it.second.second; - if (active_node_holder) - active_node_holder->setAlreadyRemoved(); - active_node_holder.reset(); - } - - active_node_holders.clear(); + auto & active_node_holder = it.second.second; + if (active_node_holder) + active_node_holder->setAlreadyRemoved(); + active_node_holder.reset(); } + active_node_holders.clear(); + for (auto it = active_node_holders.begin(); it != active_node_holders.end();) { auto & zk = it->second.first; @@ -1394,7 +1384,12 @@ void DDLWorker::markReplicasActive(bool reinitialized) { zookeeper->deleteEphemeralNodeIfContentMatches(active_path, active_id); } - zookeeper->create(active_path, active_id, zkutil::CreateMode::Ephemeral); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(active_path, active_id, zkutil::CreateMode::Ephemeral)); + /// To bump node mtime + ops.emplace_back(zkutil::makeSetRequest(fs::path(replicas_dir) / host_id, "", -1)); + zookeeper->multi(ops); + auto active_node_holder_zookeeper = zookeeper; auto active_node_holder = zkutil::EphemeralNodeHolder::existing(active_path, *active_node_holder_zookeeper); active_node_holders[host_id] = {active_node_holder_zookeeper, active_node_holder}; From b7c8d8b779be943de60dfedf3d9df0fcf8108f0e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Oct 2025 11:11:09 +0000 Subject: [PATCH 006/112] Backport #88213 to 25.8: Fix propagation of `is_shared` flag in `ColumnLowCardinality` --- src/Columns/ColumnLowCardinality.cpp | 6 +++--- src/Columns/ColumnLowCardinality.h | 17 +++++++++-------- src/DataTypes/DataTypeLowCardinality.cpp | 2 +- .../SerializationLowCardinality.cpp | 2 +- src/Functions/IFunction.cpp | 4 ++-- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 2 +- .../Formats/Impl/DWARFBlockInputFormat.cpp | 13 ++++++++----- .../Formats/Impl/NativeORCBlockInputFormat.cpp | 2 +- 8 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/Columns/ColumnLowCardinality.cpp b/src/Columns/ColumnLowCardinality.cpp index 1db462151d32..02fe28c0eba6 100644 --- a/src/Columns/ColumnLowCardinality.cpp +++ b/src/Columns/ColumnLowCardinality.cpp @@ -350,7 +350,7 @@ MutableColumnPtr ColumnLowCardinality::cloneResized(size_t size) const if (size == 0) unique_ptr = unique_ptr->cloneEmpty(); - return ColumnLowCardinality::create(IColumn::mutate(std::move(unique_ptr)), getIndexes().cloneResized(size)); + return ColumnLowCardinality::create(IColumn::mutate(std::move(unique_ptr)), getIndexes().cloneResized(size), /*is_shared=*/false); } MutableColumnPtr ColumnLowCardinality::cloneNullable() const @@ -584,7 +584,7 @@ std::vector ColumnLowCardinality::scatter(ColumnIndex num_colu for (auto & column : columns) { auto unique_ptr = dictionary.getColumnUniquePtr(); - column = ColumnLowCardinality::create(IColumn::mutate(std::move(unique_ptr)), std::move(column)); + column = 
ColumnLowCardinality::create(IColumn::mutate(std::move(unique_ptr)), std::move(column), /*is_shared=*/false); } return columns; @@ -603,7 +603,7 @@ ColumnLowCardinality::MutablePtr ColumnLowCardinality::cutAndCompact(size_t star { auto sub_positions = IColumn::mutate(idx.getPositions()->cut(start, length)); auto new_column_unique = Dictionary::compact(getDictionary(), sub_positions); - return ColumnLowCardinality::create(std::move(new_column_unique), std::move(sub_positions)); + return ColumnLowCardinality::create(std::move(new_column_unique), std::move(sub_positions), /*is_shared=*/false); } void ColumnLowCardinality::compactInplace() diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index 54a1ee52a212..dcce5e27d9fc 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -28,7 +28,7 @@ class ColumnLowCardinality final : public COWHelper, ColumnLowCardinality>; - ColumnLowCardinality(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared = false); + ColumnLowCardinality(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared); ColumnLowCardinality(const ColumnLowCardinality & other) = default; public: @@ -36,12 +36,12 @@ class ColumnLowCardinality final : public COWHelper, ColumnLowCardinality>; - static Ptr create(const ColumnPtr & column_unique_, const ColumnPtr & indexes_, bool is_shared = false) + static Ptr create(const ColumnPtr & column_unique_, const ColumnPtr & indexes_, bool is_shared) { return ColumnLowCardinality::create(column_unique_->assumeMutable(), indexes_->assumeMutable(), is_shared); } - static MutablePtr create(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared = false) + static MutablePtr create(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared) { return Base::create(std::move(column_unique), std::move(indexes), is_shared); } @@ -75,7 +75,7 @@ class ColumnLowCardinality final : public COWHelper scatter(ColumnIndex num_columns, const Selector & selector) const override; diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 41aefce682db..210750724c1a 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -133,7 +133,7 @@ MutableColumnPtr DataTypeLowCardinality::createColumn() const { MutableColumnPtr indexes = DataTypeUInt8().createColumn(); MutableColumnPtr dictionary = createColumnUnique(*dictionary_type); - return ColumnLowCardinality::create(std::move(dictionary), std::move(indexes)); + return ColumnLowCardinality::create(std::move(dictionary), std::move(indexes), /*is_shared=*/false); } Field DataTypeLowCardinality::getDefault() const diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index eec367faac1c..c7325aab43a8 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -622,7 +622,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( if (column_is_empty) low_cardinality_column.setSharedDictionary(global_dictionary); - auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); + auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column), /*is_shared=*/true); low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); } 
else diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index 32f6ff57183a..dd4ad34a5366 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -406,9 +406,9 @@ ColumnPtr IExecutableFunction::executeWithoutSparseColumns( ColumnUniquePtr res_dictionary = std::move(res_mut_dictionary); if (indexes && !res_is_constant) - result = ColumnLowCardinality::create(res_dictionary, res_indexes->index(*indexes, 0)); + result = ColumnLowCardinality::create(res_dictionary, res_indexes->index(*indexes, 0), /*is_shared=*/false); else - result = ColumnLowCardinality::create(res_dictionary, res_indexes); + result = ColumnLowCardinality::create(res_dictionary, res_indexes, /*is_shared=*/false); if (res_is_constant) result = ColumnConst::create(std::move(result), input_rows_count); diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 3ed775e5d5c1..4dae97e0d58b 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1314,7 +1314,7 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( auto arrow_indexes_column = std::make_shared(indexes_array); auto indexes_column = readColumnWithIndexesData(arrow_indexes_column, dict_info.default_value_index, dict_info.dictionary_size, is_lc_nullable); - auto lc_column = ColumnLowCardinality::create(dict_info.values->column, indexes_column); + auto lc_column = ColumnLowCardinality::create(dict_info.values->column, indexes_column, /*is_shared=*/true); auto lc_type = std::make_shared(is_lc_nullable ? makeNullable(dict_info.values->type) : dict_info.values->type); return {std::move(lc_column), std::move(lc_type), column_name}; } diff --git a/src/Processors/Formats/Impl/DWARFBlockInputFormat.cpp b/src/Processors/Formats/Impl/DWARFBlockInputFormat.cpp index 2d599dd22dd8..c82984068b80 100644 --- a/src/Processors/Formats/Impl/DWARFBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/DWARFBlockInputFormat.cpp @@ -396,7 +396,10 @@ Chunk DWARFBlockInputFormat::parseEntries(UnitState & unit) auto col_attr_name = ColumnVector::create(); auto col_attr_form = ColumnVector::create(); auto col_attr_int = ColumnVector::create(); - auto col_attr_str = ColumnLowCardinality::create(MutableColumnPtr(ColumnUnique::create(ColumnString::create()->cloneResized(1), /*is_nullable*/ false)), MutableColumnPtr(ColumnVector::create())); + auto col_attr_str = ColumnLowCardinality::create( + MutableColumnPtr(ColumnUnique::create(ColumnString::create()->cloneResized(1), /*is_nullable*/ false)), + MutableColumnPtr(ColumnVector::create()), + /*is_shared=*/false); auto col_attr_offsets = ColumnVector::create(); size_t num_rows = 0; auto err = llvm::Error::success(); @@ -749,8 +752,8 @@ Chunk DWARFBlockInputFormat::parseEntries(UnitState & unit) auto index = ColumnVector::create(); index->insert(1); auto indices = index->replicate({num_rows}); - cols.push_back(ColumnLowCardinality::create(ColumnUnique::create( - std::move(dict), /*is_nullable*/ false), indices)); + cols.push_back(ColumnLowCardinality::create( + ColumnUnique::create(std::move(dict), /*is_nullable*/ false), indices, /*is_shared*/ false)); break; } case COL_UNIT_OFFSET: @@ -761,8 +764,8 @@ Chunk DWARFBlockInputFormat::parseEntries(UnitState & unit) auto index = ColumnVector::create(); index->insert(1); auto indices = index->replicate({num_rows}); - cols.push_back(ColumnLowCardinality::create(ColumnUnique>::create( - std::move(dict), 
/*is_nullable*/ false), indices)); + cols.push_back(ColumnLowCardinality::create( + ColumnUnique>::create(std::move(dict), /*is_nullable*/ false), indices, /*is_shared*/ false)); break; } case COL_ANCESTOR_TAGS: diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp index 1ba10e3d44a0..17e4a9426170 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp @@ -1422,7 +1422,7 @@ static ColumnWithTypeAndName readColumnWithEncodedStringOrFixedStringData( } } - return ColumnLowCardinality::create(std::move(dictionary_column), std::move(new_index_column)); + return ColumnLowCardinality::create(std::move(dictionary_column), std::move(new_index_column), /*is_shared=*/false); }; MutableColumnPtr internal_column; From b22201a5b1a16ed05594b5227f55d55a014b7e3c Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Oct 2025 11:50:52 +0000 Subject: [PATCH 007/112] Update autogenerated version to 25.8.10.7 and contributors --- cmake/autogenerated_versions.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index f4b0bffe7db2..de536b119436 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54510) +SET(VERSION_REVISION 54511) SET(VERSION_MAJOR 25) SET(VERSION_MINOR 8) -SET(VERSION_PATCH 10) -SET(VERSION_GITHASH a1f4cd9c23f649b8891e952f973937f40eb9d273) -SET(VERSION_DESCRIBE v25.8.10.1-lts) -SET(VERSION_STRING 25.8.10.1) +SET(VERSION_PATCH 11) +SET(VERSION_GITHASH 02ec3a1ea1e08fb18d2d9638f00b2b557fa1cc1c) +SET(VERSION_DESCRIBE v25.8.11.1-lts) +SET(VERSION_STRING 25.8.11.1) # end of autochange From 0f2a8450374556a4c1db67454aa898ed5fb678ec Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Oct 2025 14:12:54 +0000 Subject: [PATCH 008/112] Backport #88278 to 25.8: Update Azure sdk with Content-Length fix --- contrib/azure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/azure b/contrib/azure index 9e62bd3c7645..0f7a2013f7d7 160000 --- a/contrib/azure +++ b/contrib/azure @@ -1 +1 @@ -Subproject commit 9e62bd3c7645fbf276d37bcf99d9b90230d8efc9 +Subproject commit 0f7a2013f7d79058047fc4bd35e94d20578c0d2b From f796fe7b2afa77717852613c45450f65af068ccc Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 10 Oct 2025 10:13:12 +0000 Subject: [PATCH 009/112] Backport #88290 to 25.8: Fix object storages with correlated subqueries --- .../QueryPlan/ReadFromObjectStorageStep.cpp | 5 +++++ .../QueryPlan/ReadFromObjectStorageStep.h | 1 + ...bject_storage_correlated_subqueries.reference | 9 +++++++++ ...3644_object_storage_correlated_subqueries.sql | 16 ++++++++++++++++ 4 files changed, 31 insertions(+) create mode 100644 tests/queries/0_stateless/03644_object_storage_correlated_subqueries.reference create mode 100644 tests/queries/0_stateless/03644_object_storage_correlated_subqueries.sql diff --git a/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp b/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp index 49270683dfd9..bb37a5fea25a 100644 --- a/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp +++ b/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp @@ -53,6 +53,11 @@ 
ReadFromObjectStorageStep::ReadFromObjectStorageStep( { } +QueryPlanStepPtr ReadFromObjectStorageStep::clone() const +{ + return std::make_unique(*this); +} + void ReadFromObjectStorageStep::applyFilters(ActionDAGNodes added_filter_nodes) { SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); diff --git a/src/Processors/QueryPlan/ReadFromObjectStorageStep.h b/src/Processors/QueryPlan/ReadFromObjectStorageStep.h index c6adc4a961ef..b0e5397ad688 100644 --- a/src/Processors/QueryPlan/ReadFromObjectStorageStep.h +++ b/src/Processors/QueryPlan/ReadFromObjectStorageStep.h @@ -33,6 +33,7 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) override; void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; + QueryPlanStepPtr clone() const override; private: ObjectStoragePtr object_storage; diff --git a/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.reference b/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.reference new file mode 100644 index 000000000000..07193989308c --- /dev/null +++ b/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.reference @@ -0,0 +1,9 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.sql b/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.sql new file mode 100644 index 000000000000..6ac7e423d185 --- /dev/null +++ b/tests/queries/0_stateless/03644_object_storage_correlated_subqueries.sql @@ -0,0 +1,16 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: needs s3 + +-- Use correlated subqueries which are supported only by the new analyzer. +set enable_analyzer = 1; + +INSERT INTO TABLE FUNCTION s3('http://localhost:11111/test/test-data-03644_object_storage.csv', 'test', 'testtest', 'CSV', 'number UInt64') SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert = 1; + +SELECT n1.c1 +FROM s3('http://localhost:11111/test/test-data-03644_object_storage.csv', 'test', 'testtest') AS n1 +WHERE n1.c1 > ( + SELECT AVG(n2.c1) + FROM s3('http://localhost:11111/test/test-data-03644_object_storage.csv', 'test', 'testtest') AS n2 + WHERE n2.c1 < n1.c1 +) +SETTINGS allow_experimental_correlated_subqueries = 1; From 3a7a768f653a2e893d1b39748a1234d30cccac9e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 10 Oct 2025 16:13:30 +0000 Subject: [PATCH 010/112] Backport #88330 to 25.8: Add missing checks for canContainMergeTreeTables() into system tables --- src/Databases/DataLake/DatabaseDataLake.cpp | 3 +-- src/Storages/System/StorageSystemDataSkippingIndices.cpp | 2 ++ src/Storages/System/StorageSystemProjections.cpp | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Databases/DataLake/DatabaseDataLake.cpp b/src/Databases/DataLake/DatabaseDataLake.cpp index 90260079ac7b..15ca4b9dd3e7 100644 --- a/src/Databases/DataLake/DatabaseDataLake.cpp +++ b/src/Databases/DataLake/DatabaseDataLake.cpp @@ -500,7 +500,6 @@ DatabaseTablesIteratorPtr DatabaseDataLake::getTablesIterator( } catch (...) { - tryLogCurrentException(log, fmt::format("Ignoring table {}", table_name)); promise->set_exception(std::current_exception()); } }); @@ -587,7 +586,7 @@ DatabaseTablesIteratorPtr DatabaseDataLake::getLightweightTablesIterator( } catch (...) 
{ - tryLogCurrentException(log, fmt::format("ignoring table {}", table_name)); + tryLogCurrentException(log, fmt::format("Ignoring table {}", table_name)); } promise->set_value(storage); }); diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.cpp b/src/Storages/System/StorageSystemDataSkippingIndices.cpp index 32ca0781c32f..fa70a42cce0d 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.cpp +++ b/src/Storages/System/StorageSystemDataSkippingIndices.cpp @@ -266,6 +266,8 @@ void ReadFromSystemDataSkippingIndices::initializePipeline(QueryPipelineBuilder { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) continue; + if (!database->canContainMergeTreeTables()) + continue; /// Lazy database can contain only very primitive tables, /// it cannot contain tables with data skipping indices. diff --git a/src/Storages/System/StorageSystemProjections.cpp b/src/Storages/System/StorageSystemProjections.cpp index f3352b510748..e88763f2bf3f 100644 --- a/src/Storages/System/StorageSystemProjections.cpp +++ b/src/Storages/System/StorageSystemProjections.cpp @@ -253,6 +253,8 @@ void ReadFromSystemProjections::initializePipeline(QueryPipelineBuilder & pipeli { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) continue; + if (!database->canContainMergeTreeTables()) + continue; /// Lazy database can contain only very primitive tables, it cannot contain tables with projections. /// Skip it to avoid unnecessary tables loading in the Lazy database. From 5fbf50b47d6e3e7c4e3d025cebdfeb434ceece9e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 12 Oct 2025 09:11:07 +0000 Subject: [PATCH 011/112] Backport #87826 to 25.8: Fix AzureBlobStorage copy --- src/Backups/BackupIO_AzureBlobStorage.cpp | 68 +---- .../AzureBlobStorageCommon.cpp | 257 +++++++++--------- .../AzureBlobStorage/AzureBlobStorageCommon.h | 4 +- .../AzureBlobStorage/AzureObjectStorage.cpp | 32 ++- .../copyAzureBlobStorageFile.cpp | 96 ++++--- .../copyAzureBlobStorageFile.h | 2 +- .../test.py | 71 +++-- 7 files changed, 268 insertions(+), 262 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index cdc6b194f84a..7d95ba062740 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -22,70 +22,12 @@ namespace fs = std::filesystem; namespace DB { + namespace ErrorCodes { extern const int LOGICAL_ERROR; } -/// This function compares the authorization methods used to access AzureBlobStorage -/// It takes 2 variables of variant type as input and checks if they are the same type and value -static bool compareAuthMethod (AzureBlobStorage::AuthMethod auth_method_a, AzureBlobStorage::AuthMethod auth_method_b) -{ - const auto * conn_string_a = std::get_if(&auth_method_a); - const auto * conn_string_b = std::get_if(&auth_method_b); - - if (conn_string_a && conn_string_b) - { - return *conn_string_a == *conn_string_b; - } - - const auto * shared_key_a = std::get_if>(&auth_method_a); - const auto * shared_key_b = std::get_if>(&auth_method_b); - - if (shared_key_a && shared_key_b) - { - return (shared_key_a->get()->AccountName == shared_key_b->get()->AccountName); - } - - try - { - const auto * workload_identity_a = std::get_if>(&auth_method_a); - const auto * workload_identity_b = std::get_if>(&auth_method_b); - - if (workload_identity_a && workload_identity_b) - { - Azure::Core::Credentials::TokenRequestContext tokenRequestContext; - return workload_identity_a->get()->GetToken(tokenRequestContext, {}).Token 
== workload_identity_b->get()->GetToken(tokenRequestContext, {}).Token; - } - - const auto * managed_identity_a = std::get_if>(&auth_method_a); - const auto * managed_identity_b = std::get_if>(&auth_method_b); - - if (managed_identity_a && managed_identity_b) - { - Azure::Core::Credentials::TokenRequestContext tokenRequestContext; - return managed_identity_a->get()->GetToken(tokenRequestContext, {}).Token == managed_identity_b->get()->GetToken(tokenRequestContext, {}).Token; - } - - const auto * static_credential_a = std::get_if>(&auth_method_a); - const auto * static_credential_b = std::get_if>(&auth_method_b); - - if (static_credential_a && static_credential_b) - { - Azure::Core::Credentials::TokenRequestContext tokenRequestContext; - auto az_context = Azure::Core::Context(); - return static_credential_a->get()->GetToken(tokenRequestContext, az_context).Token == static_credential_b->get()->GetToken(tokenRequestContext, az_context).Token; - } - } - catch (const Azure::Core::Credentials::AuthenticationException & e) - { - /// This is added to catch exception from GetToken. We want to log & fail silently i.e return false so that we can fallback to read & copy (i.e not native copy) - LOG_DEBUG(getLogger("compareAuthMethod"), "Exception caught while comparing credentials, error = {}", e.what()); - return false; - } - return false; -} - BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( const AzureBlobStorage::ConnectionParams & connection_params_, const String & blob_path_, @@ -166,7 +108,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, /* dest_path */ dst_blob_path[0], settings, read_settings, - compareAuthMethod(connection_params.auth_method, destination_disk->getObjectStorage()->getAzureBlobStorageAuthMethod()), + std::optional(), threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupRDAzure")); return file_size; @@ -233,7 +175,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk( /// In this case we can't use the native copy. if (auto src_blob_path = src_disk->getBlobPath(src_path); src_blob_path.size() == 2) { - LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); + LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorage", src_path, src_disk->getName()); copyAzureBlobStorageFile( src_disk->getObjectStorage()->getAzureBlobStorageClient(), client, @@ -245,7 +187,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk( fs::path(blob_path) / path_in_backup, settings, read_settings, - compareAuthMethod(src_disk->getObjectStorage()->getAzureBlobStorageAuthMethod(), connection_params.auth_method), + std::optional(), threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWRAzure")); return; /// copied! 
} @@ -269,7 +211,7 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St /* dest_path */ destination, settings, read_settings, - true, + std::optional(), threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWRAzure")); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.cpp index 8be2e4d7e141..8d3b3696be12 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.cpp @@ -93,8 +93,6 @@ namespace ErrorCodes namespace AzureBlobStorage { -#if USE_AZURE_BLOB_STORAGE - static void validateStorageAccountUrl(const String & storage_account_url) { const auto * storage_account_url_pattern_str = R"(http(()|s)://[a-z0-9-.:]+(()|/)[a-z0-9]*(()|/))"; @@ -121,6 +119,8 @@ static void validateContainerName(const String & container_name) container_name_pattern_str, container_name); } +#if USE_AZURE_BLOB_STORAGE + static bool isConnectionString(const std::string & candidate) { return !candidate.starts_with("http"); @@ -223,113 +223,6 @@ std::unique_ptr ConnectionParams::createForContainer() const }, auth_method); } -Endpoint processEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix) -{ - String storage_url; - String account_name; - String container_name; - String prefix; - - auto get_container_name = [&] - { - if (config.has(config_prefix + ".container_name")) - return config.getString(config_prefix + ".container_name"); - - if (config.has(config_prefix + ".container")) - return config.getString(config_prefix + ".container"); - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected either `container` or `container_name` parameter in config"); - }; - - if (config.has(config_prefix + ".endpoint")) - { - String endpoint = config.getString(config_prefix + ".endpoint"); - - /// For some authentication methods account name is not present in the endpoint - /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) - bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); - - size_t pos = endpoint.find("//"); - if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected '//' in endpoint"); - - if (endpoint_contains_account_name) - { - size_t acc_pos_begin = endpoint.find('/', pos + 2); - if (acc_pos_begin == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected account_name in endpoint"); - - storage_url = endpoint.substr(0, acc_pos_begin); - size_t acc_pos_end = endpoint.find('/', acc_pos_begin + 1); - - if (acc_pos_end == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); - - account_name = endpoint.substr(acc_pos_begin + 1, acc_pos_end - acc_pos_begin - 1); - - size_t cont_pos_end = endpoint.find('/', acc_pos_end + 1); - - if (cont_pos_end != std::string::npos) - { - container_name = endpoint.substr(acc_pos_end + 1, cont_pos_end - acc_pos_end - 1); - prefix = endpoint.substr(cont_pos_end + 1); - } - else - { - container_name = endpoint.substr(acc_pos_end + 1); - } - } - else - { - size_t cont_pos_begin = endpoint.find('/', pos + 2); - - if (cont_pos_begin == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); - - storage_url = endpoint.substr(0, cont_pos_begin); - size_t cont_pos_end = 
endpoint.find('/', cont_pos_begin + 1); - - if (cont_pos_end != std::string::npos) - { - container_name = endpoint.substr(cont_pos_begin + 1,cont_pos_end - cont_pos_begin - 1); - prefix = endpoint.substr(cont_pos_end + 1); - } - else - { - container_name = endpoint.substr(cont_pos_begin + 1); - } - } - - if (config.has(config_prefix + ".endpoint_subpath")) - { - String endpoint_subpath = config.getString(config_prefix + ".endpoint_subpath"); - prefix = fs::path(prefix) / endpoint_subpath; - } - } - else if (config.has(config_prefix + ".connection_string")) - { - storage_url = config.getString(config_prefix + ".connection_string"); - container_name = get_container_name(); - } - else if (config.has(config_prefix + ".storage_account_url")) - { - storage_url = config.getString(config_prefix + ".storage_account_url"); - validateStorageAccountUrl(storage_url); - container_name = get_container_name(); - } - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected either `storage_account_url` or `connection_string` or `endpoint` in config"); - - if (!container_name.empty()) - validateContainerName(container_name); - - std::optional container_already_exists {}; - if (config.has(config_prefix + ".container_already_exists")) - container_already_exists = {config.getBool(config_prefix + ".container_already_exists")}; - - return {storage_url, account_name, container_name, prefix, "", container_already_exists}; -} - void processURL(const String & url, const String & container_name, Endpoint & endpoint, AuthMethod & auth_method) { endpoint.container_name = container_name; @@ -505,6 +398,113 @@ BlobClientOptions getClientOptions( #endif +Endpoint processEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + String storage_url; + String account_name; + String container_name; + String prefix; + + auto get_container_name = [&] + { + if (config.has(config_prefix + ".container_name")) + return config.getString(config_prefix + ".container_name"); + + if (config.has(config_prefix + ".container")) + return config.getString(config_prefix + ".container"); + + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected either `container` or `container_name` parameter in config"); + }; + + if (config.has(config_prefix + ".endpoint")) + { + String endpoint = config.getString(config_prefix + ".endpoint"); + + /// For some authentication methods account name is not present in the endpoint + /// 'endpoint_contains_account_name' bool is used to understand how to split the endpoint (default : true) + bool endpoint_contains_account_name = config.getBool(config_prefix + ".endpoint_contains_account_name", true); + + size_t pos = endpoint.find("//"); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected '//' in endpoint"); + + if (endpoint_contains_account_name) + { + size_t acc_pos_begin = endpoint.find('/', pos + 2); + if (acc_pos_begin == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected account_name in endpoint"); + + storage_url = endpoint.substr(0, acc_pos_begin); + size_t acc_pos_end = endpoint.find('/', acc_pos_begin + 1); + + if (acc_pos_end == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); + + account_name = endpoint.substr(acc_pos_begin + 1, acc_pos_end - acc_pos_begin - 1); + + size_t cont_pos_end = endpoint.find('/', acc_pos_end + 1); + + if (cont_pos_end != std::string::npos) + { + container_name = endpoint.substr(acc_pos_end + 1, cont_pos_end - acc_pos_end - 1); + 
prefix = endpoint.substr(cont_pos_end + 1); + } + else + { + container_name = endpoint.substr(acc_pos_end + 1); + } + } + else + { + size_t cont_pos_begin = endpoint.find('/', pos + 2); + + if (cont_pos_begin == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected container_name in endpoint"); + + storage_url = endpoint.substr(0, cont_pos_begin); + size_t cont_pos_end = endpoint.find('/', cont_pos_begin + 1); + + if (cont_pos_end != std::string::npos) + { + container_name = endpoint.substr(cont_pos_begin + 1,cont_pos_end - cont_pos_begin - 1); + prefix = endpoint.substr(cont_pos_end + 1); + } + else + { + container_name = endpoint.substr(cont_pos_begin + 1); + } + } + if (config.has(config_prefix + ".endpoint_subpath")) + { + String endpoint_subpath = config.getString(config_prefix + ".endpoint_subpath"); + prefix = fs::path(prefix) / endpoint_subpath; + } + } + else if (config.has(config_prefix + ".connection_string")) + { + storage_url = config.getString(config_prefix + ".connection_string"); + container_name = get_container_name(); + } + else if (config.has(config_prefix + ".storage_account_url")) + { + storage_url = config.getString(config_prefix + ".storage_account_url"); + validateStorageAccountUrl(storage_url); + container_name = get_container_name(); + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected either `storage_account_url` or `connection_string` or `endpoint` in config"); + + if (!container_name.empty()) + validateContainerName(container_name); + + std::optional container_already_exists {}; + if (config.has(config_prefix + ".container_already_exists")) + container_already_exists = {config.getBool(config_prefix + ".container_already_exists")}; + + return {storage_url, account_name, container_name, prefix, "", container_already_exists}; +} + std::unique_ptr getRequestSettings(const Settings & query_settings) { auto settings = std::make_unique(); @@ -608,31 +608,41 @@ void AzureSettingsByEndpoint::loadFromConfig( for (const String & key : config_keys) { - const auto key_path = config_prefix + "." + key; - String endpoint_path = key_path + ".connection_string"; - - if (!config.has(endpoint_path)) + if (config.has(config_prefix + "." + key + ".object_storage_type")) { - endpoint_path = key_path + ".storage_account_url"; + const auto &object_storage_type = config.getString(config_prefix + "." + key + ".object_storage_type"); + if (object_storage_type != "azure" && object_storage_type != "azure_blob_storage") + { + /// Then its not an azure config + continue; + } + + const auto key_path = config_prefix + "." 
+ key; + String endpoint_path = key_path + ".connection_string"; if (!config.has(endpoint_path)) { - endpoint_path = key_path + ".endpoint"; + endpoint_path = key_path + ".storage_account_url"; if (!config.has(endpoint_path)) { - /// Error, shouldn't hit this todo:: throw error - continue; + endpoint_path = key_path + ".endpoint"; + + if (!config.has(endpoint_path)) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "URL not provided for azure blob storage disk {}", + object_storage_type); + } } } - } - auto request_settings = AzureBlobStorage::getRequestSettings(config, key_path, settings); - - azure_settings.emplace( - config.getString(endpoint_path), - std::move(*request_settings)); + auto endpoint = AzureBlobStorage::processEndpoint(config, key_path); + auto request_settings = AzureBlobStorage::getRequestSettings(config, key_path, settings); + azure_settings.emplace( + endpoint.storage_account_url, + std::move(*request_settings)); + } } } @@ -654,4 +664,5 @@ std::optional AzureSettingsByEndpoint::getSet return {}; } + } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.h index 5980255ac484..e0bd961b2f6f 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageCommon.h @@ -149,8 +149,6 @@ struct ConnectionParams std::unique_ptr createForContainer() const; }; - -Endpoint processEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix); void processURL(const String & url, const String & container_name, Endpoint & endpoint, AuthMethod & auth_method); std::unique_ptr getContainerClient(const ConnectionParams & params, bool readonly); @@ -165,6 +163,8 @@ AuthMethod getAuthMethod(const Poco::Util::AbstractConfiguration & config, const #endif +Endpoint processEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix); + std::unique_ptr getRequestSettings(const Settings & query_settings); std::unique_ptr getRequestSettingsForBackup(ContextPtr context, String endpoint, bool use_native_copy); std::unique_ptr getRequestSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings_ref); diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 4fe9c5c43117..0f4d533c2a1c 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -327,26 +327,34 @@ ObjectMetadata AzureObjectStorage::getObjectMetadata(const std::string & path) c void AzureObjectStorage::copyObject( /// NOLINT const StoredObject & object_from, const StoredObject & object_to, - const ReadSettings &, + const ReadSettings & read_settings, const WriteSettings &, std::optional object_to_attributes) { + auto settings_ptr = settings.get(); auto client_ptr = client.get(); - auto dest_blob_client = client_ptr->GetBlobClient(object_to.remote_path); - auto source_blob_client = client_ptr->GetBlobClient(object_from.remote_path); - - Azure::Storage::Blobs::CopyBlobFromUriOptions copy_options; - if (object_to_attributes.has_value()) - { - for (const auto & [key, value] : *object_to_attributes) - copy_options.Metadata[key] = value; - } + auto object_metadata = getObjectMetadata(object_from.remote_path); ProfileEvents::increment(ProfileEvents::AzureCopyObject); if 
(client_ptr->IsClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - - dest_blob_client.CopyFromUri(source_blob_client.GetUrl(), copy_options); + LOG_TRACE(log, "AzureObjectStorage::copyObject of size {}", object_metadata.size_bytes); + + auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "AzureObjCopy"); + + copyAzureBlobStorageFile( + client_ptr, + client_ptr, + connection_params.getContainer(), + object_from.remote_path, + 0, + object_metadata.size_bytes, + connection_params.getContainer(), + object_to.remote_path, + settings_ptr, + read_settings, + object_to_attributes, + scheduler); } void AzureObjectStorage::applyNewSettings( diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index b49e34dd2d3a..7e584ab59e14 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -353,60 +353,92 @@ void copyAzureBlobStorageFile( const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, - bool same_credentials, + const std::optional & object_to_attributes, ThreadPoolCallbackRunnerUnsafe schedule) { auto log = getLogger("copyAzureBlobStorageFile"); + bool is_native_copy_done = false; - if (settings->use_native_copy && same_credentials) + if (settings->use_native_copy) { + /// Do native copy LOG_TRACE(log, "Copying Blob: {} from Container: {} using native copy", src_blob, src_container_for_logging); ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (dest_client->IsClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); - auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); + try + { + auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); - auto source_uri = block_blob_client_src.GetUrl(); + auto source_uri = block_blob_client_src.GetUrl(); - if (size < settings->max_single_part_copy_size) - { - LOG_TRACE(log, "Copy blob sync {} -> {}", src_blob, dest_blob); - block_blob_client_dest.CopyFromUri(source_uri); - } - else - { - Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri); + if (size < settings->max_single_part_copy_size) + { + Azure::Storage::Blobs::CopyBlobFromUriOptions copy_options; + if (object_to_attributes.has_value()) + { + for (const auto & [key, value] : *object_to_attributes) + copy_options.Metadata[key] = value; + } - auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); - auto properties_model = copy_response.Value; + LOG_TRACE(log, "Copy blob sync {} -> {}", src_blob, dest_blob); + block_blob_client_dest.CopyFromUri(source_uri, copy_options); + } + else + { + Azure::Storage::Blobs::StartBlobCopyFromUriOptions copy_options; + if (object_to_attributes.has_value()) + { + for (const auto & [key, value] : *object_to_attributes) + copy_options.Metadata[key] = value; + } - auto copy_status = properties_model.CopyStatus; - auto copy_status_description = properties_model.CopyStatusDescription; + Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri, copy_options); + auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); + auto properties_model = copy_response.Value; - if (copy_status.HasValue() && 
copy_status.Value() == Azure::Storage::Blobs::Models::CopyStatus::Success) - { - LOG_TRACE(log, "Copy of {} to {} finished", properties_model.CopySource.Value(), dest_blob); + auto copy_status = properties_model.CopyStatus; + auto copy_status_description = properties_model.CopyStatusDescription; + + + if (copy_status.HasValue() && copy_status.Value() == Azure::Storage::Blobs::Models::CopyStatus::Success) + { + LOG_TRACE(log, "Copy of {} to {} finished", properties_model.CopySource.Value(), dest_blob); + } + else + { + if (copy_status.HasValue()) + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy from {} to {} failed with status {} description {} (operation is done {})", + src_blob, dest_blob, copy_status.Value().ToString(), copy_status_description.Value(), operation.IsDone()); + throw Exception( + ErrorCodes::AZURE_BLOB_STORAGE_ERROR, + "Copy from {} to {} didn't complete with success status (operation is done {})", + src_blob, + dest_blob, + operation.IsDone()); + } } - else + is_native_copy_done = true; + } + catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Unauthorized) { - if (copy_status.HasValue()) - throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy from {} to {} failed with status {} description {} (operation is done {})", - src_blob, dest_blob, copy_status.Value().ToString(), copy_status_description.Value(), operation.IsDone()); - throw Exception( - ErrorCodes::AZURE_BLOB_STORAGE_ERROR, - "Copy from {} to {} didn't complete with success status (operation is done {})", - src_blob, - dest_blob, - operation.IsDone()); + LOG_TRACE(log, "Copy operation has thrown unauthorized access error, which indicates that the storage account of the source & destination are not the same. " + "Will attempt to copy using read & write. source container = {} blob = {} and destination container = {} blob = {}", + src_container_for_logging, src_blob, dest_container_for_logging, dest_blob); } + else + throw; } } - else + if (!is_native_copy_done) { - LOG_TRACE(log, "Copying Blob: {} from Container: {} native copy is disabled {}", src_blob, src_container_for_logging, same_credentials ? 
"" : " because of different credentials"); + /// Copy through read and write + LOG_TRACE(log, "Reading and writing Blob: {} from Container: {}", src_blob, src_container_for_logging); auto create_read_buffer = [&] { return std::make_unique( diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 036fbf34b0da..2059e715a7a7 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -30,7 +30,7 @@ void copyAzureBlobStorageFile( const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, - bool same_credentials = true, + const std::optional & object_to_attributes, ThreadPoolCallbackRunnerUnsafe schedule_ = {}); diff --git a/tests/integration/test_azure_blob_storage_native_copy/test.py b/tests/integration/test_azure_blob_storage_native_copy/test.py index f5d24485b9e3..9e4c70da7c98 100644 --- a/tests/integration/test_azure_blob_storage_native_copy/test.py +++ b/tests/integration/test_azure_blob_storage_native_copy/test.py @@ -55,17 +55,16 @@ def generate_config(port): 1000000000 1 - + local object_storage azure_blob_storage - true - http://azurite1:{port}/devstoreaccount1/ - othercontainer + DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:{port}/devstoreaccount1; + cont false - devstoreaccount1 - Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== - + true + 4 + @@ -89,20 +88,12 @@ def generate_config(port): - - -
- disk_azure_different_auth -
-
-
disk_azure disk_azure_cache disk_azure_other_bucket - disk_azure_different_auth """ @@ -295,21 +286,43 @@ def test_backup_restore_native_copy_disabled_in_query(cluster): assert not node4.contains_in_log("using native copy") -def test_backup_restore_native_copy_disabled_due_to_different_auth(cluster): + +def test_clickhouse_disks_azure(cluster): node4 = cluster.instances["node4"] - azure_query( - node4, - f"CREATE TABLE test_simple_merge_tree_native_copy_disabled_due_to_different_auth(key UInt64, data String) Engine = MergeTree() ORDER BY tuple() SETTINGS storage_policy='policy_azure_different_auth'", + disk = "disk_azure_small_native_copy" + node4.exec_in_container( + [ + "bash", + "-c", + f"echo 'meow' | /usr/bin/clickhouse disks --disk {disk} --query 'write im_a_file.txt'", + ] ) - azure_query( - node4, f"INSERT INTO test_simple_merge_tree_native_copy_disabled_due_to_different_auth VALUES (1, 'a')" + out = node4.exec_in_container( + [ + "/usr/bin/clickhouse", + "disks", + "--disk", + disk, + "--query", + "read im_a_file.txt", + ] ) - - backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_merge_tree_native_copy_disabled_due_to_different_auth_backup')" - print("BACKUP DEST", backup_destination) - azure_query( - node4, - f"BACKUP TABLE test_simple_merge_tree_native_copy_disabled_due_to_different_auth TO {backup_destination}", + assert out == "meow\n\n" + node4.exec_in_container( + [ + "bash", + "-c", + f"/usr/bin/clickhouse disks --disk {disk} --log-level trace --query 'copy im_a_file.txt another_file.txt'", + ] ) - - assert not node4.contains_in_log("using native copy") \ No newline at end of file + out = node4.exec_in_container( + [ + "/usr/bin/clickhouse", + "disks", + "--disk", + disk, + "--query", + "read another_file.txt", + ] + ) + assert out == "meow\n\n" From f9125f7bf3bafd6d1e04add18ce5aa7ea95521b3 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 12 Oct 2025 11:49:43 +0200 Subject: [PATCH 012/112] Fix includes in AzureObjectStorage --- src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 0f4d533c2a1c..83ca6950da69 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include From 3f5e3cb5428b784e9dc24f32287b6b6e8803d5ed Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 13 Oct 2025 05:11:59 +0000 Subject: [PATCH 013/112] Backport #88089 to 25.8: Add a URI normalization for the `SOURCE` grants filter. 
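The SOURCE filter is now matched against a normalized form of the table function URI. Poco's `URI::normalize()` collapses dot segments, so different spellings of the same location are checked as one canonical string before the grant filter is applied. A minimal sketch with plain Poco (the URL below is only illustrative, not taken from the tests):

```cpp
#include <Poco/URI.h>
#include <iostream>

int main()
{
    // normalize() removes "." and ".." path segments (RFC 3986), so the
    // filter sees one canonical spelling of the target location.
    Poco::URI uri("https://example.com/data/./reports/../file.csv");
    uri.normalize();
    std::cout << uri.toString() << '\n'; // https://example.com/data/file.csv
    return 0;
}
```
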
--- docs/en/sql-reference/statements/grant.md | 17 ++++++++++++ src/TableFunctions/ITableFunction.cpp | 2 +- src/TableFunctions/ITableFunction.h | 16 ++++++++++++ src/TableFunctions/ITableFunctionXDBC.cpp | 2 -- src/TableFunctions/TableFunctionURL.cpp | 2 +- ...6_normalize_url_in_source_grants.reference | 0 .../03636_normalize_url_in_source_grants.sh | 26 +++++++++++++++++++ 7 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03636_normalize_url_in_source_grants.reference create mode 100755 tests/queries/0_stateless/03636_normalize_url_in_source_grants.sh diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 9b0ea66d21b6..11a8a655cfaa 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -680,6 +680,23 @@ GRANT READ ON S3('s3://foo/.*') TO john GRANT READ ON S3('s3://bar/.*') TO john ``` +:::warning +Source filter takes **regexp** as a parameter, so a grant +`GRANT READ ON URL('http://www.google.com') TO john;` + +will allow queries +```sql +SELECT * FROM url('https://www.google.com'); +SELECT * FROM url('https://www-google.com'); +``` + +because `.` is treated as an `Any Single Character` in the regexps. +This may lead to potential vulnerability. The correct grant should be +```sql +GRANT READ ON URL('https://www\.google\.com') TO john; +``` +::: + **Re-granting with GRANT OPTION:** If the original grant has `WITH GRANT OPTION`, it can be re-granted using `GRANT CURRENT GRANTS`: diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 3f59a8c48a11..8fc0211417bd 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -44,7 +44,7 @@ StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr conte if (is_insert_query) type_to_check = AccessType::WRITE; - context->getAccess()->checkAccessWithFilter(type_to_check, toStringSource(*access_object), getFunctionURI()); + context->getAccess()->checkAccessWithFilter(type_to_check, toStringSource(*access_object), getFunctionURINormalized()); } auto table_function_properties = TableFunctionFactory::instance().tryGetProperties(getName()); diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index b3701915a3b5..c4e7534c1e49 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -106,6 +106,8 @@ class ITableFunction : public std::enable_shared_from_this /// For example for s3Cluster the database storage name is S3Cluster, and we need to check /// privileges as if it was S3. virtual const char * getNonClusteredStorageEngineName() const; + +protected: /// The URI of function for permission checking. Can be empty string if not applicable. /// For example for url('https://foo.bar') URI would be 'https://foo.bar'. virtual const String & getFunctionURI() const @@ -113,6 +115,20 @@ class ITableFunction : public std::enable_shared_from_this static const String empty; return empty; } + + String getFunctionURINormalized() const + { + try + { + Poco::URI uri(getFunctionURI()); + uri.normalize(); + return uri.toString(); + } + catch (const Poco::Exception &) + { + return ""; + } + } }; /// Properties of table function that are independent of argument types and parameters. 
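The documentation change above warns that the SOURCE filter argument is a regular expression, so an unescaped `.` matches any single character and the filter accepts more hosts than intended. A small standalone illustration with std::regex (not the regexp engine ClickHouse uses internally; the host names are only examples):

```cpp
#include <iostream>
#include <regex>

int main()
{
    // An unescaped '.' matches any character, so the first pattern also
    // accepts host names that merely look similar.
    const std::regex unescaped("https://www.google.com");
    const std::regex escaped("https://www\\.google\\.com");

    std::cout << std::regex_match("https://www-google.com", unescaped) << '\n'; // 1: accepted
    std::cout << std::regex_match("https://www-google.com", escaped) << '\n';   // 0: rejected
    return 0;
}
```
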
diff --git a/src/TableFunctions/ITableFunctionXDBC.cpp b/src/TableFunctions/ITableFunctionXDBC.cpp index 6904f26b2878..810ba5c49a3b 100644 --- a/src/TableFunctions/ITableFunctionXDBC.cpp +++ b/src/TableFunctions/ITableFunctionXDBC.cpp @@ -62,8 +62,6 @@ class ITableFunctionXDBC : public ITableFunction void startBridgeIfNot(ContextPtr context) const; - const String & getFunctionURI() const override { return connection_string; } - String connection_string; String schema_name; String remote_table_name; diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index 0cbacdf42a16..e5833e1414a4 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -143,7 +143,7 @@ ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context, ColumnsDescription columns; if (const auto access_object = getSourceAccessObject()) - context->getAccess()->checkAccessWithFilter(AccessType::READ, toStringSource(*access_object), getFunctionURI()); + context->getAccess()->checkAccessWithFilter(AccessType::READ, toStringSource(*access_object), getFunctionURINormalized()); if (format == "auto") { columns = StorageURL::getTableStructureAndFormatFromData( diff --git a/tests/queries/0_stateless/03636_normalize_url_in_source_grants.reference b/tests/queries/0_stateless/03636_normalize_url_in_source_grants.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/03636_normalize_url_in_source_grants.sh b/tests/queries/0_stateless/03636_normalize_url_in_source_grants.sh new file mode 100755 index 000000000000..f1a55f14f28e --- /dev/null +++ b/tests/queries/0_stateless/03636_normalize_url_in_source_grants.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +user="user03636_${CLICKHOUSE_DATABASE}_$RANDOM" + +${CLICKHOUSE_CLIENT} < Date: Mon, 13 Oct 2025 09:14:05 +0000 Subject: [PATCH 014/112] Backport #88401 to 25.8: Fix quadratic complexity in `countMatches` --- src/Functions/countMatches.cpp | 162 +++++++++++++++++- src/Functions/countMatches.h | 156 ----------------- .../03666_count_matches_complexity.reference | 2 + .../03666_count_matches_complexity.sql | 2 + 4 files changed, 162 insertions(+), 160 deletions(-) delete mode 100644 src/Functions/countMatches.h create mode 100644 tests/queries/0_stateless/03666_count_matches_complexity.reference create mode 100644 tests/queries/0_stateless/03666_count_matches_complexity.sql diff --git a/src/Functions/countMatches.cpp b/src/Functions/countMatches.cpp index 078823744dd0..79eaadeaa393 100644 --- a/src/Functions/countMatches.cpp +++ b/src/Functions/countMatches.cpp @@ -1,9 +1,166 @@ +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + +namespace Setting +{ + extern const SettingsBool count_matches_stop_at_empty_match; +} namespace { +using Pos = const char *; + +template +class FunctionCountMatches : public IFunction +{ + const bool count_matches_stop_at_empty_match; + +public: + static constexpr auto name = CountMatchesBase::name; + static FunctionPtr create(ContextPtr context) { return std::make_shared>(context); } + + explicit FunctionCountMatches(ContextPtr context) + : count_matches_stop_at_empty_match(context->getSettingsRef()[Setting::count_matches_stop_at_empty_match]) + { + } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 2; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors args + { + {"haystack", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"}, + {"pattern", static_cast(&isString), isColumnConst, "constant String"} + }; + validateFunctionArguments(*this, arguments, args); + + return std::make_shared(); + } + + DataTypePtr getReturnTypeForDefaultImplementationForDynamic() const override + { + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + const IColumn * col_pattern = arguments[1].column.get(); + const ColumnConst * col_pattern_const = checkAndGetColumnConst(col_pattern); + if (col_pattern_const == nullptr) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const"); + + const OptimizedRegularExpression re = Regexps::createRegexp(col_pattern_const->getValue()); + + const IColumn * col_haystack = arguments[0].column.get(); + OptimizedRegularExpression::MatchVec matches; + + if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack)) + { + std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView(); + uint64_t matches_count = countMatches(str, re, matches); + return result_type->createColumnConst(input_rows_count, matches_count); + } + if (const ColumnString * col_haystack_string = checkAndGetColumn(col_haystack)) + { + auto col_res = ColumnUInt64::create(); + + const ColumnString::Chars & src_chars = 
col_haystack_string->getChars(); + const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets(); + + ColumnUInt64::Container & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + + ColumnString::Offset current_src_offset = 0; + + for (size_t i = 0; i < input_rows_count; ++i) + { + Pos pos = reinterpret_cast(&src_chars[current_src_offset]); + current_src_offset = src_offsets[i]; + Pos end = reinterpret_cast(&src_chars[current_src_offset]); + + std::string_view str(pos, end - pos); + vec_res[i] = countMatches(str, re, matches); + } + + return col_res; + } + if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn(col_haystack)) + { + auto col_res = ColumnUInt64::create(); + + ColumnUInt64::Container & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + + for (size_t i = 0; i < input_rows_count; ++i) + { + std::string_view str = col_haystack_fixedstring->getDataAt(i).toView(); + vec_res[i] = countMatches(str, re, matches); + } + + return col_res; + } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString"); + } + + uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) const + { + /// Only one match is required, no need to copy more. + static const unsigned matches_limit = 1; + + Pos pos = reinterpret_cast(src.data()); + Pos end = reinterpret_cast(src.data() + src.size()); + + uint64_t match_count = 0; + while (pos < end) + { + if (re.match(pos, end - pos, matches, matches_limit)) + { + if (matches[0].length > 0) + { + pos += matches[0].offset + matches[0].length; + ++match_count; + } + else + { + if (count_matches_stop_at_empty_match) + /// Progress should be made, but with empty match the progress will not be done. + break; + + /// Progress is made by a single character in case the pattern does not match or have zero-byte match. + /// The reason is simply because the pattern could match another part of input when forwarded. 
+ ++pos; + } + } + else + break; + } + + return match_count; + } +}; + struct FunctionCountMatchesCaseSensitive { static constexpr auto name = "countMatches"; @@ -17,9 +174,6 @@ struct FunctionCountMatchesCaseInsensitive } -namespace DB -{ - REGISTER_FUNCTION(CountMatches) { factory.registerFunction>(); diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h deleted file mode 100644 index 7a77ad4ec3ab..000000000000 --- a/src/Functions/countMatches.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; -} - -namespace Setting -{ - extern const SettingsBool count_matches_stop_at_empty_match; -} - -using Pos = const char *; - -template -class FunctionCountMatches : public IFunction -{ - const bool count_matches_stop_at_empty_match; - -public: - static constexpr auto name = CountMatchesBase::name; - static FunctionPtr create(ContextPtr context) { return std::make_shared>(context); } - - explicit FunctionCountMatches(ContextPtr context) - : count_matches_stop_at_empty_match(context->getSettingsRef()[Setting::count_matches_stop_at_empty_match]) - { - } - - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 2; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"haystack", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"}, - {"pattern", static_cast(&isString), isColumnConst, "constant String"} - }; - validateFunctionArguments(*this, arguments, args); - - return std::make_shared(); - } - - DataTypePtr getReturnTypeForDefaultImplementationForDynamic() const override - { - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - const IColumn * col_pattern = arguments[1].column.get(); - const ColumnConst * col_pattern_const = checkAndGetColumnConst(col_pattern); - if (col_pattern_const == nullptr) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const"); - - const OptimizedRegularExpression re = Regexps::createRegexp(col_pattern_const->getValue()); - - const IColumn * col_haystack = arguments[0].column.get(); - OptimizedRegularExpression::MatchVec matches; - - if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack)) - { - std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView(); - uint64_t matches_count = countMatches(str, re, matches); - return result_type->createColumnConst(input_rows_count, matches_count); - } - if (const ColumnString * col_haystack_string = checkAndGetColumn(col_haystack)) - { - auto col_res = ColumnUInt64::create(); - - const ColumnString::Chars & src_chars = col_haystack_string->getChars(); - const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets(); - - ColumnUInt64::Container & vec_res = col_res->getData(); - vec_res.resize(input_rows_count); - - ColumnString::Offset current_src_offset = 0; - - for (size_t i = 0; i < input_rows_count; ++i) - { - Pos pos = reinterpret_cast(&src_chars[current_src_offset]); - current_src_offset = src_offsets[i]; - Pos end = 
reinterpret_cast(&src_chars[current_src_offset]); - - std::string_view str(pos, end - pos); - vec_res[i] = countMatches(str, re, matches); - } - - return col_res; - } - if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn(col_haystack)) - { - auto col_res = ColumnUInt64::create(); - - ColumnUInt64::Container & vec_res = col_res->getData(); - vec_res.resize(input_rows_count); - - for (size_t i = 0; i < input_rows_count; ++i) - { - std::string_view str = col_haystack_fixedstring->getDataAt(i).toView(); - vec_res[i] = countMatches(str, re, matches); - } - - return col_res; - } - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString"); - } - - uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) const - { - /// Only one match is required, no need to copy more. - static const unsigned matches_limit = 1; - - Pos pos = reinterpret_cast(src.data()); - Pos end = reinterpret_cast(src.data() + src.size()); - - uint64_t match_count = 0; - while (pos < end) - { - if (re.match(pos, end - pos, matches, matches_limit) && matches[0].length > 0) - { - pos += matches[0].offset + matches[0].length; - ++match_count; - } - else - { - if (count_matches_stop_at_empty_match) - /// Progress should be made, but with empty match the progress will not be done. - break; - - /// Progress is made by a single character in case the pattern does not match or have zero-byte match. - /// The reason is simply because the pattern could match another part of input when forwarded. - ++pos; - } - } - - return match_count; - } -}; - -} diff --git a/tests/queries/0_stateless/03666_count_matches_complexity.reference b/tests/queries/0_stateless/03666_count_matches_complexity.reference new file mode 100644 index 000000000000..32aa8df235fb --- /dev/null +++ b/tests/queries/0_stateless/03666_count_matches_complexity.reference @@ -0,0 +1,2 @@ +0 +1000000 diff --git a/tests/queries/0_stateless/03666_count_matches_complexity.sql b/tests/queries/0_stateless/03666_count_matches_complexity.sql new file mode 100644 index 000000000000..d866543b5d8e --- /dev/null +++ b/tests/queries/0_stateless/03666_count_matches_complexity.sql @@ -0,0 +1,2 @@ +SELECT countMatches(repeat('\0\0\0\0\0\0\0\0\0\0', 1000000), 'a'); +SELECT countMatches(repeat('\0\0\0\0\0\0\0\0\0\0a', 1000000), 'a'); From e3979ed1ff153e3d5fc26a92dfe3fb1552c01c78 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 13 Oct 2025 09:15:38 +0000 Subject: [PATCH 015/112] Backport #87987 to 25.8: Fix index analysis with session_timezone specified --- src/Storages/MergeTree/KeyCondition.cpp | 54 +++++---- ...3173_set_transformed_partition_pruning.sql | 4 +- ...6_index_analysis_with_session_tz.reference | 41 +++++++ .../03636_index_analysis_with_session_tz.sql | 103 ++++++++++++++++++ 4 files changed, 178 insertions(+), 24 deletions(-) create mode 100644 tests/queries/0_stateless/03636_index_analysis_with_session_tz.reference create mode 100644 tests/queries/0_stateless/03636_index_analysis_with_session_tz.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 3258fbc4dcea..f85d173a94c0 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -53,6 +53,7 @@ namespace Setting { extern const SettingsBool analyze_index_with_space_filling_curves; extern const SettingsDateTimeOverflowBehavior date_time_overflow_behavior; + extern const SettingsTimezone 
session_timezone; } namespace ErrorCodes @@ -1093,7 +1094,7 @@ bool applyFunctionChainToColumn( } // And cast it to the argument type of the first function in the chain - auto in_argument_type = getArgumentTypeOfMonotonicFunction(*functions[0]); + auto in_argument_type = removeLowCardinality(getArgumentTypeOfMonotonicFunction(*functions[0])); if (canBeSafelyCast(result_type, in_argument_type)) { result_column = castColumnAccurate({result_column, result_type, ""}, in_argument_type); @@ -1122,13 +1123,13 @@ bool applyFunctionChainToColumn( if (func->getArgumentTypes().empty()) return false; - auto argument_type = getArgumentTypeOfMonotonicFunction(*func); + auto argument_type = removeLowCardinality(getArgumentTypeOfMonotonicFunction(*func)); if (!canBeSafelyCast(result_type, argument_type)) return false; result_column = castColumnAccurate({result_column, result_type, ""}, argument_type); - result_column = func->execute({{result_column, argument_type, ""}}, func->getResultType(), result_column->size(), /* dry_run = */ false); - result_type = func->getResultType(); + result_type = removeLowCardinality(func->getResultType()); + result_column = func->execute({{result_column, argument_type, ""}}, result_type, result_column->size(), /* dry_run = */ false); // Transforming nullable columns to the nested ones, in case no nulls found if (result_column->isNullable()) @@ -1141,7 +1142,7 @@ bool applyFunctionChainToColumn( return false; } result_column = result_column_nullable.getNestedColumnPtr(); - result_type = removeNullable(func->getResultType()); + result_type = removeNullable(result_type); } } out_column = result_column; @@ -1903,48 +1904,57 @@ bool KeyCondition::extractMonotonicFunctionsChainFromKey( auto func_name = func->function_base->getName(); auto func_base = func->function_base; - ColumnsWithTypeAndName arguments; ColumnWithTypeAndName const_arg; FunctionWithOptionalConstArg::Kind kind = FunctionWithOptionalConstArg::Kind::NO_CONST; if (date_time_parsing_functions.contains(func_name)) { - const auto & arg_types = func_base->getArgumentTypes(); - if (!arg_types.empty() && isStringOrFixedString(arg_types[0])) - func_name = func_name + "OrNull"; - } + const auto & func_arg_types = func_base->getArgumentTypes(); - auto func_builder = FunctionFactory::instance().tryGet(func_name, context); + const bool has_string_argument = !func_arg_types.empty() && isStringOrFixedString(func_arg_types[0]); + const bool has_session_timezone = !context->getSettingsRef()[Setting::session_timezone].value.empty(); - if (func->children.size() == 1) - { - arguments.push_back({nullptr, removeLowCardinality(func->children[0]->result_type), ""}); + // Skipping analysis in case when is requires parsing datetime from string + // with `session_timezone` specified + if (has_string_argument && has_session_timezone) + return false; + + // Otherwise, in case when datetime parsing is required, rebuilding the function, + // to get its "-OrNull" version required for safe parsing, and not failing on + // values with incorrect format + if (has_string_argument) + { + ColumnsWithTypeAndName new_args; + for (const auto & type : func->function_base->getArgumentTypes()) + new_args.push_back({nullptr, type, ""}); + + const auto func_builder = FunctionFactory::instance().tryGet(func_name + "OrNull", context); + func_base = func_builder->build(new_args); + } } - else if (func->children.size() == 2) + + // For single argument functions, the input may be used as-is, for binary functions, + // we'll produce a partially applied version of 
`func` with the reduced arity + if (func->children.size() == 2) { const auto * left = func->children[0]; const auto * right = func->children[1]; if (left->column && isColumnConst(*left->column)) { const_arg = {left->result_type->createColumnConst(0, (*left->column)[0]), left->result_type, ""}; - arguments.push_back(const_arg); - arguments.push_back({nullptr, removeLowCardinality(right->result_type), ""}); kind = FunctionWithOptionalConstArg::Kind::LEFT_CONST; } else { const_arg = {right->result_type->createColumnConst(0, (*right->column)[0]), right->result_type, ""}; - arguments.push_back({nullptr, removeLowCardinality(left->result_type), ""}); - arguments.push_back(const_arg); kind = FunctionWithOptionalConstArg::Kind::RIGHT_CONST; } } - auto out_func = func_builder->build(arguments); if (kind == FunctionWithOptionalConstArg::Kind::NO_CONST) - out_functions_chain.push_back(out_func); + out_functions_chain.push_back(func_base); else - out_functions_chain.push_back(std::make_shared(out_func, const_arg, kind)); + out_functions_chain.push_back(std::make_shared(func_base, const_arg, kind)); } out_key_column_num = it->second; diff --git a/tests/queries/0_stateless/03173_set_transformed_partition_pruning.sql b/tests/queries/0_stateless/03173_set_transformed_partition_pruning.sql index c49aafab237c..4d7b12bf5651 100644 --- a/tests/queries/0_stateless/03173_set_transformed_partition_pruning.sql +++ b/tests/queries/0_stateless/03173_set_transformed_partition_pruning.sql @@ -1,4 +1,4 @@ --- Tags: no-msan +-- Tags: no-msan, long -- msan: too slow SELECT '-- Single partition by function'; @@ -234,7 +234,7 @@ SELECT toString(toDate('2000-01-01') + 10 * number) FROM numbers(50) UNION ALL SELECT toString(toDate('2100-01-01') + 10 * number) FROM numbers(50); -SELECT count() FROM 03173_nested_date_parsing WHERE id IN ('2000-01-21', '2023-05-02') SETTINGS log_comment='03173_nested_date_parsing'; +SELECT count() FROM 03173_nested_date_parsing WHERE id IN ('2000-01-21', '2023-05-02') SETTINGS log_comment='03173_nested_date_parsing', session_timezone = ''; SYSTEM FLUSH LOGS query_log; SELECT ProfileEvents['SelectedParts'] FROM system.query_log WHERE type = 'QueryFinish' AND current_database = currentDatabase() AND log_comment = '03173_nested_date_parsing'; SELECT count() FROM 03173_nested_date_parsing WHERE id IN ('not a date'); diff --git a/tests/queries/0_stateless/03636_index_analysis_with_session_tz.reference b/tests/queries/0_stateless/03636_index_analysis_with_session_tz.reference new file mode 100644 index 000000000000..e3b96109a27d --- /dev/null +++ b/tests/queries/0_stateless/03636_index_analysis_with_session_tz.reference @@ -0,0 +1,41 @@ +-- PK UTC timezone +1 +Condition: (toStartOfDay(ts) in [1756857600, 1756857600]) +Parts: 1/1 +Granules: 1/1 + +-- PK EST timezone +1 +Condition: (toStartOfDay(ts) in [1756857600, 1756857600]) +Parts: 1/1 +Granules: 1/1 + +-- Partitions UTC timezone +1 +Condition: (ts in [1756882680, 1756882680]) +Parts: 1/1 +Granules: 1/1 +Condition: (toStartOfDay(ts) in [1756857600, 1756857600]) +Parts: 1/1 +Granules: 1/1 + +-- Partitions EST timezone +1 +Condition: (ts in [1756882680, 1756882680]) +Parts: 1/1 +Granules: 1/1 +Condition: (toStartOfDay(ts) in [1756857600, 1756857600]) +Parts: 1/1 +Granules: 1/1 + +-- Partitions UTC timezone +1 +Condition: true +Parts: 1/1 +Granules: 1/1 + +-- Partitions EST timezone +1 +Condition: true +Parts: 1/1 +Granules: 1/1 diff --git a/tests/queries/0_stateless/03636_index_analysis_with_session_tz.sql 
b/tests/queries/0_stateless/03636_index_analysis_with_session_tz.sql new file mode 100644 index 000000000000..c897ab523dae --- /dev/null +++ b/tests/queries/0_stateless/03636_index_analysis_with_session_tz.sql @@ -0,0 +1,103 @@ +SET session_timezone = 'UTC'; +-- For explain with indexes and key condition values verification +SET parallel_replicas_local_plan = 1; + +DROP TABLE IF EXISTS 03636_data_pk, 03636_data_partitions, 03636_data_parsed; + +CREATE TABLE 03636_data_pk (ts DateTime) ENGINE = MergeTree ORDER BY toStartOfDay(ts) +AS +SELECT 1756882680; + +SELECT '-- PK UTC timezone'; + +SELECT count() FROM 03636_data_pk WHERE ts = 1756882680; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_pk WHERE ts = 1756882680 +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %'; + +SELECT ''; +SELECT '-- PK EST timezone'; + +SELECT count() FROM 03636_data_pk WHERE ts = 1756882680 SETTINGS session_timezone = 'EST'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_pk WHERE ts = 1756882680 +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %' +SETTINGS session_timezone = 'EST'; + +DROP TABLE 03636_data_pk; + +CREATE TABLE 03636_data_partitions (ts DateTime) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toStartOfDay(ts) +AS +SELECT 1756882680; + +SELECT ''; +SELECT '-- Partitions UTC timezone'; + +SELECT count() FROM 03636_data_partitions WHERE ts = 1756882680; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_partitions WHERE ts = 1756882680 +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %'; + +SELECT ''; +SELECT '-- Partitions EST timezone'; + +SELECT count() FROM 03636_data_partitions WHERE ts = 1756882680 SETTINGS session_timezone = 'EST'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_partitions WHERE ts = 1756882680 +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %' +SETTINGS session_timezone = 'EST'; + +DROP TABLE 03636_data_partitions; + +CREATE TABLE 03636_data_parsed (ts String) ENGINE = MergeTree ORDER BY toStartOfDay(toDateTime(ts)) +AS +SELECT '2025-09-02 19:00:00'; + +SELECT ''; +SELECT '-- Partitions UTC timezone'; + +SELECT count() FROM 03636_data_parsed WHERE ts = '2025-09-02 19:00:00'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_parsed WHERE ts = '2025-09-02 19:00:00' +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %'; + +SELECT ''; +SELECT '-- Partitions EST timezone'; + +SELECT count() FROM 03636_data_parsed WHERE ts = '2025-09-02 19:00:00' SETTINGS session_timezone = 'EST'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT count() FROM 03636_data_parsed WHERE ts = '2025-09-02 19:00:00' +) +WHERE trim(explain) ilike 'condition: %' + OR trim(explain) ilike 'parts: %' + OR trim(explain) ilike 'granules: %' +SETTINGS session_timezone = 'EST'; + +DROP TABLE 03636_data_parsed; From 82c2b31e7e5ac6a92745ea595a8c84d414afa29e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 13 Oct 2025 11:11:24 +0000 Subject: [PATCH 016/112] Backport #88339 to 25.8: fix threads count for inserts --- src/Interpreters/InterpreterInsertQuery.cpp | 2 +- 
.../03652_threads_count_insert.reference | 8 +++ .../0_stateless/03652_threads_count_insert.sh | 56 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03652_threads_count_insert.reference create mode 100755 tests/queries/0_stateless/03652_threads_count_insert.sh diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 5b52cdbb9920..a0e9c1b6cccb 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -726,7 +726,7 @@ QueryPipeline InterpreterInsertQuery::buildInsertPipeline(ASTInsertQuery & query QueryPipeline pipeline = QueryPipeline(std::move(chain)); - pipeline.setNumThreads(max_insert_threads); + pipeline.setNumThreads(max_threads); pipeline.setConcurrencyControl(settings[Setting::use_concurrency_control]); if (query.hasInlinedData() && !async_insert) diff --git a/tests/queries/0_stateless/03652_threads_count_insert.reference b/tests/queries/0_stateless/03652_threads_count_insert.reference new file mode 100644 index 000000000000..2681d6258e98 --- /dev/null +++ b/tests/queries/0_stateless/03652_threads_count_insert.reference @@ -0,0 +1,8 @@ +max_threads: 1 max_insert_threads: 1 +1 +max_threads: 1 max_insert_threads: 5 +1 +max_threads: 10 max_insert_threads: 1 +10 +max_threads: 10 max_insert_threads: 5 +10 diff --git a/tests/queries/0_stateless/03652_threads_count_insert.sh b/tests/queries/0_stateless/03652_threads_count_insert.sh new file mode 100755 index 000000000000..7a95b4ea833a --- /dev/null +++ b/tests/queries/0_stateless/03652_threads_count_insert.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Tags: no-object-storage, no-parallel, no-fasttest + +# no-object-storage: s3 has 20 more threads +# no-parallel: it checks the number of threads, which can be lowered in presence of other queries + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +cat <= 10, 10, peak_threads_usage), +from system.query_log where + current_database = currentDatabase() and + type != 'QueryStart' and + query_id = '$QUERY_ID' +order by ALL; +EOF + + done +done From 02e8bae240d9da87bbe262aa1f7238e3acad1f0a Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 13 Oct 2025 14:13:24 +0000 Subject: [PATCH 017/112] Backport #88217 to 25.8: max_cpu_share alone should determine the hard cap in a workload setting, even if max_cpus is unset --- src/Common/Scheduler/IResourceManager.h | 3 ++ .../Scheduler/Nodes/CustomResourceManager.cpp | 6 ++++ .../Scheduler/Nodes/CustomResourceManager.h | 1 + .../Nodes/WorkloadResourceManager.cpp | 28 +++++++++++++++++-- .../Scheduler/Nodes/WorkloadResourceManager.h | 6 ++-- .../tests/gtest_workload_resource_manager.cpp | 18 ++++++++++++ src/Common/Scheduler/WorkloadSettings.cpp | 2 +- .../Scheduler/createResourceManager.cpp | 10 +++++++ 8 files changed, 68 insertions(+), 6 deletions(-) diff --git a/src/Common/Scheduler/IResourceManager.h b/src/Common/Scheduler/IResourceManager.h index 5cca911cdbb6..4b5537f73f00 100644 --- a/src/Common/Scheduler/IResourceManager.h +++ b/src/Common/Scheduler/IResourceManager.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -37,6 +38,8 @@ class IClassifier : private boost::noncopyable /// Returns ResourceLink that should be used to access resource. /// Returned link is valid until classifier destruction. 
virtual ResourceLink get(const String & resource_name) = 0; + /// Returns settings that should be used to limit workload on given resource. + virtual WorkloadSettings getWorkloadSettings(const String & resource_name) const = 0; }; using ClassifierPtr = std::shared_ptr; diff --git a/src/Common/Scheduler/Nodes/CustomResourceManager.cpp b/src/Common/Scheduler/Nodes/CustomResourceManager.cpp index 244a83374f8b..33f5d43de9e6 100644 --- a/src/Common/Scheduler/Nodes/CustomResourceManager.cpp +++ b/src/Common/Scheduler/Nodes/CustomResourceManager.cpp @@ -177,6 +177,12 @@ ResourceLink CustomResourceManager::Classifier::get(const String & resource_name return ResourceLink{}; // unlimited access } +WorkloadSettings CustomResourceManager::Classifier::getWorkloadSettings(const String & resource_name) const +{ + UNUSED(resource_name); + return {}; +} + CustomResourceManager::CustomResourceManager() : state(new State()) { diff --git a/src/Common/Scheduler/Nodes/CustomResourceManager.h b/src/Common/Scheduler/Nodes/CustomResourceManager.h index e7559aaa77f5..a232aa32c60d 100644 --- a/src/Common/Scheduler/Nodes/CustomResourceManager.h +++ b/src/Common/Scheduler/Nodes/CustomResourceManager.h @@ -84,6 +84,7 @@ class CustomResourceManager : public IResourceManager Classifier(const ClassifierSettings & settings_, const StatePtr & state_, const String & classifier_name); bool has(const String & resource_name) override; ResourceLink get(const String & resource_name) override; + WorkloadSettings getWorkloadSettings(const String & resource_name) const override; private: const ClassifierSettings settings; std::unordered_map resources; // accessible resources by names diff --git a/src/Common/Scheduler/Nodes/WorkloadResourceManager.cpp b/src/Common/Scheduler/Nodes/WorkloadResourceManager.cpp index 986125503ef8..43aa4bb8720f 100644 --- a/src/Common/Scheduler/Nodes/WorkloadResourceManager.cpp +++ b/src/Common/Scheduler/Nodes/WorkloadResourceManager.cpp @@ -423,11 +423,32 @@ ResourceLink WorkloadResourceManager::Classifier::get(const String & resource_na } } -void WorkloadResourceManager::Classifier::attach(const ResourcePtr & resource, const VersionPtr & version, ResourceLink link) +WorkloadSettings WorkloadResourceManager::Classifier::getWorkloadSettings(const String & resource_name) const +{ + std::unique_lock lock{mutex}; + auto iter = attachments.find(resource_name); + if (iter != attachments.end()) + { + // Extract settings from the attached resource + return iter->second.settings; + } + return {}; +} + +void WorkloadResourceManager::Classifier::attach(const ResourcePtr & resource, const VersionPtr & version, UnifiedSchedulerNode * node) { std::unique_lock lock{mutex}; chassert(!attachments.contains(resource->getName())); - attachments[resource->getName()] = Attachment{.resource = resource, .version = version, .link = link}; + ResourceLink link; + WorkloadSettings wl_settings{}; + if (node) + { + auto queue = node->getQueue(); + if (queue) + link = ResourceLink{.queue = queue.get()}; + wl_settings = node->getSettings(); + } + attachments[resource->getName()] = Attachment{.resource = resource, .version = version, .link = link, .settings = wl_settings}; } void WorkloadResourceManager::Resource::updateResource(const ASTPtr & new_resource_entity) @@ -447,11 +468,12 @@ std::future WorkloadResourceManager::Resource::attachClassifier(Classifier { if (auto iter = node_for_workload.find(workload_name); iter != node_for_workload.end()) { + auto nodePtr = iter->second; auto queue = iter->second->getQueue(); if (!queue) throw 
Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Unable to use workload '{}' that have children for resource '{}'", workload_name, resource_name); - classifier.attach(shared_from_this(), current_version, ResourceLink{.queue = queue.get()}); + classifier.attach(shared_from_this(), current_version, nodePtr.get()); } else { diff --git a/src/Common/Scheduler/Nodes/WorkloadResourceManager.h b/src/Common/Scheduler/Nodes/WorkloadResourceManager.h index def320eb37af..aea9d435c896 100644 --- a/src/Common/Scheduler/Nodes/WorkloadResourceManager.h +++ b/src/Common/Scheduler/Nodes/WorkloadResourceManager.h @@ -241,21 +241,23 @@ class WorkloadResourceManager : public IResourceManager /// NOTE: It is called from query threads (possibly multiple) bool has(const String & resource_name) override; ResourceLink get(const String & resource_name) override; + WorkloadSettings getWorkloadSettings(const String & resource_name) const override; /// Attaches/detaches a specific resource /// NOTE: It is called from scheduler threads (possibly multiple) - void attach(const ResourcePtr & resource, const VersionPtr & version, ResourceLink link); + void attach(const ResourcePtr & resource, const VersionPtr & version, UnifiedSchedulerNode * node); void detach(const ResourcePtr & resource); private: const ClassifierSettings settings; WorkloadResourceManager * resource_manager; - std::mutex mutex; + mutable std::mutex mutex; struct Attachment { ResourcePtr resource; VersionPtr version; ResourceLink link; + WorkloadSettings settings; }; std::unordered_map attachments; // TSA_GUARDED_BY(mutex); }; diff --git a/src/Common/Scheduler/Nodes/tests/gtest_workload_resource_manager.cpp b/src/Common/Scheduler/Nodes/tests/gtest_workload_resource_manager.cpp index e4d6164d34d5..3b3969340fe6 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_workload_resource_manager.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_workload_resource_manager.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -1268,6 +1269,23 @@ TEST(SchedulerWorkloadResourceManager, CPUSchedulingIndependentPools) t.wait(); } +TEST(SchedulerWorkloadResourceManager, MaxCPUsDerivedFromShare) +{ + ResourceTest t; + + t.query("CREATE RESOURCE cpu (MASTER THREAD, WORKER THREAD)"); + // Only max_cpu_share is set, max_cpus is unset + t.query("CREATE WORKLOAD all SETTINGS max_cpu_share = 0.5"); + ClassifierPtr c = t.manager->acquire("all"); + + // The expected hard cap is max_cpu_share * getNumberOfCPUCoresToUse() + WorkloadSettings settings = c->getWorkloadSettings("cpu"); + double expected_cap = 0.5 * getNumberOfCPUCoresToUse(); + double actual_cap = settings.max_cpus; + + EXPECT_DOUBLE_EQ(actual_cap, expected_cap); +} + auto getAcquired() { return CurrentMetrics::get(CurrentMetrics::ConcurrencyControlAcquired); diff --git a/src/Common/Scheduler/WorkloadSettings.cpp b/src/Common/Scheduler/WorkloadSettings.cpp index f5b9d98786d6..bba7975f078c 100644 --- a/src/Common/Scheduler/WorkloadSettings.cpp +++ b/src/Common/Scheduler/WorkloadSettings.cpp @@ -232,7 +232,7 @@ void WorkloadSettings::initFromChanges(CostUnit unit_, const ASTCreateWorkloadQu if (share_limit > 0) { Float64 value = share_limit * getNumberOfCPUCoresToUse(); - if (value > 0 && value < limit) + if (value > 0 && (limit == 0 || value < limit)) limit = value; } max_cpus = limit; diff --git a/src/Common/Scheduler/createResourceManager.cpp b/src/Common/Scheduler/createResourceManager.cpp index c5a430024b6d..500aed901f80 100644 --- a/src/Common/Scheduler/createResourceManager.cpp +++ 
b/src/Common/Scheduler/createResourceManager.cpp @@ -55,6 +55,16 @@ class ResourceManagerDispatcher : public IResourceManager return ResourceLink{}; } + WorkloadSettings getWorkloadSettings(const String & resource_name) const override + { + for (const auto & classifier : classifiers) + { + if (classifier->has(resource_name)) + return classifier->getWorkloadSettings(resource_name); + } + return {}; + } + private: const ClassifierSettings settings; std::vector classifiers; // should be constant after initialization to avoid races From 876b9615234cedc0a2245cf4329c151a7f7b7c2b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 14 Oct 2025 06:15:04 +0000 Subject: [PATCH 018/112] Backport #87660 to 25.8: backups: use cloned storage client with overridden retry policy settings for native copy --- src/Backups/BackupIO_S3.cpp | 64 +++++++++++++++- src/Backups/BackupIO_S3.h | 25 +++++++ src/Core/Settings.cpp | 7 +- src/Databases/DataLake/GlueCatalog.cpp | 3 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 7 +- src/IO/S3/Client.cpp | 56 ++++++++------ src/IO/S3/Client.h | 3 + src/IO/S3/copyS3File.cpp | 2 +- src/IO/S3/copyS3File.h | 2 +- .../test_backup_restore_s3/test.py | 75 +++++++++++++++++-- 10 files changed, 199 insertions(+), 45 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 1738ca4607f4..51cf22a12eee 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -37,7 +37,7 @@ namespace Setting extern const SettingsUInt64 s3_max_connections; extern const SettingsUInt64 s3_max_redirects; extern const SettingsBool s3_slow_all_threads_after_network_error; - extern const SettingsBool s3_slow_all_threads_after_retryable_error; + extern const SettingsBool backup_slow_all_threads_after_retryable_s3_error; } namespace S3AuthSetting @@ -76,6 +76,36 @@ namespace ErrorCodes namespace { +class S3BackupClientCreator +{ +public: + explicit S3BackupClientCreator(const ContextPtr & context) + { + const Settings & local_settings = context->getSettingsRef(); + retry_strategy = S3::PocoHTTPClientConfiguration::RetryStrategy{ + .max_retries = static_cast(local_settings[Setting::backup_restore_s3_retry_attempts]), + .initial_delay_ms = static_cast(local_settings[Setting::backup_restore_s3_retry_initial_backoff_ms]), + .max_delay_ms = static_cast(local_settings[Setting::backup_restore_s3_retry_max_backoff_ms]), + .jitter_factor = local_settings[Setting::backup_restore_s3_retry_jitter_factor]}; + slow_all_threads_after_retryable_error = local_settings[Setting::backup_slow_all_threads_after_retryable_s3_error]; + } + + S3BackupDiskClientFactory::Entry operator()(DiskPtr disk) const + { + auto disk_client = disk->getS3StorageClient(); + + auto config = disk_client->getClientConfiguration(); + config.retry_strategy = retry_strategy; + config.s3_slow_all_threads_after_retryable_error = slow_all_threads_after_retryable_error; + + return {disk_client->cloneWithConfigurationOverride(config), disk_client}; + } + +private: + S3::PocoHTTPClientConfiguration::RetryStrategy retry_strategy; + bool slow_all_threads_after_retryable_error = false; +}; + std::shared_ptr makeS3Client( const S3::URI & s3_uri, const String & access_key_id, @@ -114,7 +144,7 @@ namespace .jitter_factor = local_settings[Setting::backup_restore_s3_retry_jitter_factor]}, local_settings[Setting::s3_slow_all_threads_after_network_error], - local_settings[Setting::s3_slow_all_threads_after_retryable_error], + local_settings[Setting::backup_slow_all_threads_after_retryable_s3_error], 
local_settings[Setting::enable_s3_requests_logging], /* for_disk_s3 = */ false, /* opt_disk_name = */ {}, @@ -180,6 +210,31 @@ namespace } +S3BackupDiskClientFactory::S3BackupDiskClientFactory(const S3BackupDiskClientFactory::CreateFn & create_fn_) + : create_fn(create_fn_) +{ +} + +std::shared_ptr S3BackupDiskClientFactory::getOrCreate(DiskPtr disk) +{ + std::lock_guard lock(clients_mutex); + + auto [it, inserted] = clients.try_emplace(disk->getName(), Entry{}); + auto log = getLogger("S3BackupDiskClientFactory"); + auto & entry = it->second; + if (inserted) + LOG_TRACE(log, "Creating S3 client for copy from disk '{}' to backup bucket", disk->getName()); + else if (const_pointer_cast(entry.disk_reported_client.lock()) != disk->getS3StorageClient()) + LOG_INFO( + log, "Updating S3 client for copy from disk '{}' to the backup bucket because the disk client was updated", disk->getName()); + + while (const_pointer_cast(entry.disk_reported_client.lock()) != disk->getS3StorageClient()) + entry = create_fn(disk); + + chassert(entry.backup_client); + return entry.backup_client; +} + BackupReaderS3::BackupReaderS3( const S3::URI & s3_uri_, const String & access_key_id_, @@ -278,7 +333,6 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); } - BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, @@ -295,6 +349,7 @@ BackupWriterS3::BackupWriterS3( , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false, ""} , s3_capabilities(getCapabilitiesFromConfig(context_->getConfigRef(), "s3")) + , disk_client_factory(S3BackupClientCreator(context_)) { s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); @@ -331,8 +386,9 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { LOG_TRACE(log, "Copying file {} from disk {} to S3", src_path, src_disk->getName()); + /// Use storage client with overridden retry strategy settings. copyS3File( - src_disk->getS3StorageClient(), + /* src_s3_client */ disk_client_factory.getOrCreate(src_disk), /* src_bucket */ blob_path[1], /* src_key= */ blob_path[0], start_pos, diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 6cedf04a67af..c8a3575243e7 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -6,15 +6,39 @@ #include #include #include +#include #include #include #include #include #include +#include + + namespace DB { +class S3BackupDiskClientFactory +{ +public: + struct Entry + { + std::shared_ptr backup_client; + std::weak_ptr disk_reported_client; + }; + using CreateFn = std::function; + explicit S3BackupDiskClientFactory(const CreateFn & create_fn_); + std::shared_ptr getOrCreate(DiskPtr disk); + +private: + const CreateFn create_fn; + + mutable std::mutex clients_mutex; + /// Disk name to client entry; + std::unordered_map clients TSA_GUARDED_BY(clients_mutex); +}; + /// Represents a backup stored to AWS S3. 
class BackupReaderS3 : public BackupReaderDefault { @@ -87,6 +111,7 @@ class BackupWriterS3 : public BackupWriterDefault S3Settings s3_settings; std::shared_ptr client; S3Capabilities s3_capabilities; + S3BackupDiskClientFactory disk_client_factory; BlobStorageLogWriterPtr blob_storage_log; }; diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 497760a1325d..903d984dee49 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -467,11 +467,11 @@ When set to `true`, all threads executing S3 requests to the same backup endpoin after any single s3 request encounters a retryable network error, such as socket timeout. When set to `false`, each thread handles S3 request backoff independently of the others. )", 0) \ - DECLARE_WITH_ALIAS(Bool, s3_slow_all_threads_after_retryable_error, false, R"( -When set to `true`, all threads executing S3 requests to the same endpoint are slowed down + DECLARE(Bool, backup_slow_all_threads_after_retryable_s3_error, false, R"( +When set to `true`, all threads executing S3 requests to the same backup endpoint are slowed down after any single S3 request encounters a retryable S3 error, such as 'Slow Down'. When set to `false`, each thread handles s3 request backoff independently of the others. -)", 0, backup_slow_all_threads_after_retryable_s3_error) \ +)", 0) \ DECLARE(UInt64, azure_list_object_keys_size, 1000, R"( Maximum number of files that could be returned in batch by ListObject request )", 0) \ @@ -7091,6 +7091,7 @@ Sets the evaluation time to be used with promql dialect. 'auto' means the curren MAKE_OBSOLETE(M, Bool, enable_variant_type, true) \ MAKE_OBSOLETE(M, Bool, enable_dynamic_type, true) \ MAKE_OBSOLETE(M, Bool, enable_json_type, true) \ + MAKE_OBSOLETE(M, Bool, s3_slow_all_threads_after_retryable_error, false) \ \ /* moved to config.xml: see also src/Core/ServerSettings.h */ \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_buffer_flush_schedule_pool_size, 16) \ diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 0c95669f3d19..c0233f362ec7 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -61,7 +61,6 @@ namespace DB::Setting extern const SettingsUInt64 s3_max_redirects; extern const SettingsUInt64 s3_retry_attempts; extern const SettingsBool s3_slow_all_threads_after_network_error; - extern const SettingsBool s3_slow_all_threads_after_retryable_error; extern const SettingsBool enable_s3_requests_logging; extern const SettingsUInt64 s3_connect_timeout_ms; extern const SettingsUInt64 s3_request_timeout_ms; @@ -111,7 +110,7 @@ GlueCatalog::GlueCatalog( int s3_max_redirects = static_cast(global_settings[DB::Setting::s3_max_redirects]); int s3_retry_attempts = static_cast(global_settings[DB::Setting::s3_retry_attempts]); bool s3_slow_all_threads_after_network_error = global_settings[DB::Setting::s3_slow_all_threads_after_network_error]; - bool s3_slow_all_threads_after_retryable_error = global_settings[DB::Setting::s3_slow_all_threads_after_retryable_error]; + bool s3_slow_all_threads_after_retryable_error = false; bool enable_s3_requests_logging = global_settings[DB::Setting::enable_s3_requests_logging]; DB::S3::PocoHTTPClientConfiguration poco_config = DB::S3::ClientFactory::instance().createClientConfiguration( diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index cf653ffe561d..e9c4fe2df80a 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ 
b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -32,7 +32,6 @@ namespace Setting extern const SettingsUInt64 s3_max_redirects; extern const SettingsUInt64 s3_retry_attempts; extern const SettingsBool s3_slow_all_threads_after_network_error; - extern const SettingsBool s3_slow_all_threads_after_retryable_error; } namespace S3AuthSetting @@ -111,10 +110,6 @@ getClient(const S3::URI & url, const S3Settings & settings, ContextPtr context, if (!for_disk_s3 && local_settings.isChanged("s3_slow_all_threads_after_network_error")) s3_slow_all_threads_after_network_error = static_cast(local_settings[Setting::s3_slow_all_threads_after_network_error]); - bool s3_slow_all_threads_after_retryable_error = static_cast(global_settings[Setting::s3_slow_all_threads_after_retryable_error]); - if (!for_disk_s3 && local_settings.isChanged("s3_slow_all_threads_after_retryable_error")) - s3_slow_all_threads_after_retryable_error = static_cast(local_settings[Setting::s3_slow_all_threads_after_retryable_error]); - bool enable_s3_requests_logging = global_settings[Setting::enable_s3_requests_logging]; if (!for_disk_s3 && local_settings.isChanged("enable_s3_requests_logging")) enable_s3_requests_logging = local_settings[Setting::enable_s3_requests_logging]; @@ -125,7 +120,7 @@ getClient(const S3::URI & url, const S3Settings & settings, ContextPtr context, s3_max_redirects, S3::PocoHTTPClientConfiguration::RetryStrategy{.max_retries = static_cast(s3_retry_attempts)}, s3_slow_all_threads_after_network_error, - s3_slow_all_threads_after_retryable_error, + /* s3_slow_all_threads_after_retryable_error = */ false, enable_s3_requests_logging, for_disk_s3, opt_disk_name, diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 28bc4d52f45f..59a8d337f16f 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -225,7 +225,12 @@ std::unique_ptr Client::create( std::unique_ptr Client::clone() const { - return std::unique_ptr(new Client(*this, client_configuration)); + return cloneWithConfigurationOverride(this->client_configuration); +} + +std::unique_ptr Client::cloneWithConfigurationOverride(const PocoHTTPClientConfiguration & client_configuration_override) const +{ + return std::unique_ptr(new Client(*this, client_configuration_override)); } namespace @@ -286,27 +291,7 @@ Client::Client( LOG_TRACE(log, "API mode of the S3 client: {}", api_mode); - if (client_configuration.for_disk_s3) - { - LOG_TRACE( - log, - "S3 client for disk '{}' initialized with s3_retry_attempts: {}", - client_configuration.opt_disk_name.value_or(""), - client_configuration.retry_strategy.max_retries); - LOG_TRACE( - log, - "S3 client for disk '{}': slowing down threads on retryable errors is {}", - client_configuration.opt_disk_name.value_or(""), - client_configuration.s3_slow_all_threads_after_retryable_error ? "enabled" : "disabled"); - } - else - { - LOG_TRACE(log, "S3 client initialized with s3_retry_attempts: {}", client_configuration.retry_strategy.max_retries); - LOG_TRACE( - log, - "S3 client: slowing down threads on retryable errors is {}", - client_configuration.s3_slow_all_threads_after_retryable_error ? 
"enabled" : "disabled"); - } + logConfiguration(); detect_region = provider_type == ProviderType::AWS && explicit_region == Aws::Region::AWS_GLOBAL; @@ -335,6 +320,8 @@ Client::Client( cache = std::make_shared(*other.cache); ClientCacheRegistry::instance().registerClient(cache); + logConfiguration(); + ProfileEvents::increment(ProfileEvents::TinyS3Clients); } @@ -891,6 +878,31 @@ void Client::slowDownAfterRetryableError() const } } +void Client::logConfiguration() const +{ + if (client_configuration.for_disk_s3) + { + LOG_TRACE( + log, + "S3 client for disk '{}' initialized with s3_retry_attempts: {}", + client_configuration.opt_disk_name.value_or(""), + client_configuration.retry_strategy.max_retries); + LOG_TRACE( + log, + "S3 client for disk '{}': slowing down threads on retryable errors is {}", + client_configuration.opt_disk_name.value_or(""), + client_configuration.s3_slow_all_threads_after_retryable_error ? "enabled" : "disabled"); + } + else + { + LOG_TRACE(log, "S3 client initialized with s3_retry_attempts: {}", client_configuration.retry_strategy.max_retries); + LOG_TRACE( + log, + "S3 client: slowing down threads on retryable errors is {}", + client_configuration.s3_slow_all_threads_after_retryable_error ? "enabled" : "disabled"); + } +} + bool Client::supportsMultiPartCopy() const { return provider_type != ProviderType::GCS; diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 48c26124e306..14bf90d854c2 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -132,6 +132,8 @@ class Client : private Aws::S3::S3Client std::unique_ptr clone() const; + std::unique_ptr cloneWithConfigurationOverride(const PocoHTTPClientConfiguration & client_configuration_override) const; + Client & operator=(const Client &) = delete; Client(Client && other) = delete; @@ -301,6 +303,7 @@ class Client : private Aws::S3::S3Client void updateNextTimeToRetryAfterRetryableError(Aws::Client::AWSError error, Int64 attempt_no) const; void slowDownAfterRetryableError() const; + void logConfiguration() const; String initial_endpoint; std::shared_ptr credentials_provider; PocoHTTPClientConfiguration client_configuration; diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index f4c41e179142..25c1c3126e41 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -896,7 +896,7 @@ void copyDataToS3File( void copyS3File( - const std::shared_ptr & src_s3_client, + std::shared_ptr src_s3_client, const String & src_bucket, const String & src_key, size_t src_offset, diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index 760498b6fa7a..994f8a7b1d84 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -31,7 +31,7 @@ using CreateReadBuffer = std::function()>; /// /// read_settings - is used for throttling in case of native copy is not possible void copyS3File( - const std::shared_ptr & src_s3_client, + std::shared_ptr src_s3_client, const String & src_bucket, const String & src_key, size_t src_offset, diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 374d8acd868c..6486fa9ae61d 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -4,6 +4,7 @@ import pytest +from ast import literal_eval from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV from helpers.config_cluster import minio_secret_key @@ -384,8 +385,16 @@ def test_backup_to_s3_multipart(): size=1000000, ) node = cluster.instances["node"] - assert 
node.contains_in_log( - f"copyDataToS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}" + + node.query("SYSTEM FLUSH LOGS") + pattern = f"Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}" + assert ( + int( + node.query( + f"SELECT count() FROM system.text_log WHERE logger_name='copyDataToS3File' AND message like '{pattern}%'", + ) + ) + > 0 ) backup_query_id = backup_events["query_id"] @@ -465,9 +474,56 @@ def test_backup_to_s3_native_copy(storage_policy): # single part upload assert backup_events["S3CopyObject"] > 0 assert restore_events["S3CopyObject"] > 0 + + node.query("SYSTEM FLUSH LOGS") + pattern = f"Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" + assert ( + int( + node.query( + f"SELECT count() FROM system.text_log WHERE logger_name='copyS3File' AND message like '{pattern}%'", + ) + ) + > 0 + ) + + +@pytest.mark.parametrize( + "storage_policy", + [ + "policy_s3", + "policy_s3_other_bucket", + "policy_s3_plain_rewritable", + ], +) +def test_backup_to_s3_native_copy_slow_down_all_threads(storage_policy): + backup_name = new_backup_name() + backup_destination = f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', '{minio_secret_key}')" + (backup_events, restore_events) = check_backup_and_restore( + cluster, + storage_policy, + backup_destination, + backup_settings={"backup_slow_all_threads_after_retryable_s3_error": True}, + ) + # single part upload + assert backup_events["S3CopyObject"] > 0 + assert restore_events["S3CopyObject"] > 0 node = cluster.instances["node"] - assert node.contains_in_log( - f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" + + disks = literal_eval( + node.query( + f"SELECT disks FROM system.storage_policies WHERE policy_name='{storage_policy}'" + ) + ) + assert len(disks) == 1 + node.query("SYSTEM FLUSH LOGS") + pattern = f"S3 client for disk \\'{disks[0]}\\': slowing down threads on retryable errors is enabled" + assert ( + int( + node.query( + f"SELECT count() FROM system.text_log WHERE logger_name='S3Client' AND message LIKE '%{pattern}%'", + ) + ) + > 0 ) @@ -481,9 +537,16 @@ def test_backup_to_s3_native_copy_multipart(): # multi part upload assert backup_events["S3CreateMultipartUpload"] > 0 assert restore_events["S3CreateMultipartUpload"] > 0 + node = cluster.instances["node"] - assert node.contains_in_log( - f"copyS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}/" + pattern = f"Multipart upload has completed. 
Bucket: root, Key: data/backups/multipart/{backup_name}/" + assert ( + int( + node.query( + f"SELECT count() FROM system.text_log WHERE logger_name='copyS3File' AND message like '{pattern}%'", + ) + ) + > 0 ) From 9364aef4a82fef503730bbeaa0ee53c760ee2531 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 14 Oct 2025 11:11:48 +0000 Subject: [PATCH 019/112] Backport #87798 to 25.8: Fix reading null map subcolumn from Variants that cannot be inside Nullable --- src/DataTypes/DataTypeDynamic.cpp | 2 + src/DataTypes/DataTypeVariant.cpp | 2 +- .../Serializations/SerializationVariant.cpp | 104 +-- .../Serializations/SerializationVariant.h | 12 +- ...03201_variant_null_map_subcolumn.reference | 642 +++++++++--------- .../03201_variant_null_map_subcolumn.sh | 12 +- .../03202_dynamic_null_map_subcolumn.sql.j2 | 10 +- ...variant_array_null_map_subcolumn.reference | 1 + ...03640_variant_array_null_map_subcolumn.sql | 6 + 9 files changed, 405 insertions(+), 386 deletions(-) create mode 100644 tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.reference create mode 100644 tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.sql diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp index ea4a362f8d6a..186a806d5171 100644 --- a/src/DataTypes/DataTypeDynamic.cpp +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -215,6 +215,8 @@ std::unique_ptr DataTypeDynamic::getDynamicSubcolumnDa bool is_null_map_subcolumn = subcolumn_nested_name == "null"; if (is_null_map_subcolumn) { + if (!subcolumn_type->canBeInsideNullable()) + return nullptr; res->type = std::make_shared(); } else if (!subcolumn_nested_name.empty()) diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index fe58b5e0f9f1..8c176fb5929b 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -182,7 +182,7 @@ SerializationPtr DataTypeVariant::doGetDefaultSerialization() const variant_names.push_back(variant->getName()); } - return std::make_shared(std::move(serializations), std::move(variant_names), SerializationVariant::getVariantsDeserializeTextOrder(variants), getName()); + return std::make_shared(variants, getName()); } void DataTypeVariant::forEachChild(const DB::IDataType::ChildCallback & callback) const diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 143b7a14fa9d..7471cdf10ca8 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -57,6 +57,19 @@ struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBin } }; +SerializationVariant::SerializationVariant(const DataTypes & variant_types_, const String & variant_name_) : variant_types(variant_types_), deserialize_text_order(getVariantsDeserializeTextOrder(variant_types_)), variant_name(variant_name_) +{ + variant_serializations.reserve(variant_serializations.size()); + variant_names.reserve(variant_serializations.size()); + + for (const auto & variant : variant_types) + { + variant_serializations.push_back(variant->getDefaultSerialization()); + variant_names.push_back(variant->getName()); + } +} + + void SerializationVariant::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, @@ -89,7 +102,7 @@ void SerializationVariant::enumerateStreams( settings.path.push_back(Substream::VariantElements); settings.path.back().data = data; - for (size_t i = 0; i < variants.size(); ++i) 
+ for (size_t i = 0; i < variant_serializations.size(); ++i) { DataTypePtr type = type_variant ? type_variant->getVariant(i) : nullptr; settings.path.back().creator = std::make_shared( @@ -99,7 +112,7 @@ void SerializationVariant::enumerateStreams( column_variant ? column_variant->localDiscriminatorByGlobal(i) : i, !type || type->canBeInsideNullable() || type->lowCardinality()); - auto variant_data = SubstreamData(variants[i]) + auto variant_data = SubstreamData(variant_serializations[i]) .withType(type) .withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) .withSerializationInfo(data.serialization_info) @@ -107,7 +120,7 @@ void SerializationVariant::enumerateStreams( addVariantElementToPath(settings.path, i); settings.path.back().data = variant_data; - variants[i]->enumerateStreams(settings, callback, variant_data); + variant_serializations[i]->enumerateStreams(settings, callback, variant_data); settings.path.pop_back(); } @@ -119,8 +132,11 @@ void SerializationVariant::enumerateStreams( .withType(type_variant ? std::make_shared() : nullptr) .withColumn(column_variant ? ColumnUInt8::create() : nullptr); - for (size_t i = 0; i < variants.size(); ++i) + for (size_t i = 0; i < variant_serializations.size(); ++i) { + if (!variant_types[i]->canBeInsideNullable()) + continue; + settings.path.back().creator = std::make_shared(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i); settings.path.push_back(Substream::VariantElementNullMap); settings.path.back().variant_element_name = variant_names[i]; @@ -149,14 +165,14 @@ void SerializationVariant::serializeBinaryBulkStatePrefix( const ColumnVariant & col = assert_cast(column); auto variant_state = std::make_shared(mode); - variant_state->variant_states.resize(variants.size()); + variant_state->variant_states.resize(variant_serializations.size()); settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i < variants.size(); ++i) + for (size_t i = 0; i < variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->variant_states[i]); settings.path.pop_back(); } @@ -172,10 +188,10 @@ void SerializationVariant::serializeBinaryBulkStateSuffix( auto * variant_state = checkAndGetState(state); settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i < variants.size(); ++i) + for (size_t i = 0; i < variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkStateSuffix(settings, variant_state->variant_states[i]); settings.path.pop_back(); } settings.path.pop_back(); @@ -193,13 +209,13 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix( auto variant_state = std::make_shared(); variant_state->discriminators_state = discriminators_state; - variant_state->variant_states.resize(variants.size()); + variant_state->variant_states.resize(variant_serializations.size()); settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i < variants.size(); ++i) + for (size_t i = 0; i < variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - 
variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->variant_states[i], cache); + variant_serializations[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->variant_states[i], cache); settings.path.pop_back(); } @@ -260,10 +276,10 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian if (limit == 0) { settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); settings.path.pop_back(); } settings.path.pop_back(); @@ -294,14 +310,14 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian } settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); /// We can use the same offset/limit as for whole Variant column if (i == non_empty_global_discr) - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->variant_states[i]); else - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); settings.path.pop_back(); } variants_statistics[variant_names[non_empty_global_discr]] += limit; @@ -326,10 +342,10 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian } settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), col.getVariantByGlobalDiscriminator(i).size(), 0, settings, variant_state->variant_states[i]); settings.path.pop_back(); } settings.path.pop_back(); @@ -360,10 +376,10 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian /// Second, serialize variants in global order. 
settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]); + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]); size_t variant_size = col.getVariantByGlobalDiscriminator(i).size(); variants_statistics[variant_names[i]] += variant_size; total_size_of_variants += variant_size; @@ -376,7 +392,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian /// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant. const auto & local_discriminators = col.getLocalDiscriminators(); const auto & offsets = col.getOffsets(); - std::vector> variant_offsets_and_limits(variants.size(), {0, 0}); + std::vector> variant_offsets_and_limits(variant_serializations.size(), {0, 0}); size_t end = offset + limit; std::bitset non_empty_variants_in_range; ColumnVariant::Discriminator last_non_empty_variant_discr = 0; @@ -424,10 +440,10 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian /// Serialize variants in global order. settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams( + variant_serializations[i]->serializeBinaryBulkWithMultipleStreams( col.getVariantByGlobalDiscriminator(i), variant_offsets_and_limits[i].second ? variant_offsets_and_limits[i].first : col.getVariantByGlobalDiscriminator(i).size(), variant_offsets_and_limits[i].second, @@ -545,7 +561,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( /// if we didn't do it during discriminators deserialization. if (variant_rows_offsets.empty()) { - variant_rows_offsets.resize(variants.size(), 0); + variant_rows_offsets.resize(variant_serializations.size(), 0); if (rows_offset) { @@ -571,7 +587,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (variant_limits.empty()) { - variant_limits.resize(variants.size(), 0); + variant_limits.resize(variant_serializations.size(), 0); auto & discriminators_data = col.getLocalDiscriminators(); for (size_t i = discriminators_offset ; i != discriminators_data.size(); ++i) @@ -584,10 +600,10 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( /// Now we can deserialize variants according to their limits. 
settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->deserializeBinaryBulkWithMultipleStreams( + variant_serializations[i]->deserializeBinaryBulkWithMultipleStreams( col.getVariantPtrByLocalDiscriminator(i), variant_rows_offsets[i], variant_limits[i], settings, variant_state->variant_states[i], cache); settings.path.pop_back(); @@ -608,10 +624,10 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (settings.insert_only_rows_in_current_range_from_substreams_cache || !insertDataFromSubstreamsCacheIfAny(cache, settings, col.getOffsetsPtr())) { std::vector variant_offsets; - variant_offsets.reserve(variants.size()); + variant_offsets.reserve(variant_serializations.size()); size_t num_non_empty_variants = 0; ColumnVariant::Discriminator last_non_empty_discr = 0; - for (size_t i = 0; i != variants.size(); ++i) + for (size_t i = 0; i != variant_serializations.size(); ++i) { if (variant_limits[i]) { @@ -674,8 +690,8 @@ std::pair, std::vector> SerializationVariant::deseri state.remaining_rows_in_granule = 0; /// Calculate limits for variants during discriminators deserialization. - std::vector variant_rows_offsets(variants.size(), 0); - std::vector variant_limits(variants.size(), 0); + std::vector variant_rows_offsets(variant_serializations.size(), 0); + std::vector variant_limits(variant_serializations.size(), 0); limit += rows_offset; while (limit) @@ -776,7 +792,7 @@ void SerializationVariant::serializeBinary(const IColumn & column, size_t row_nu auto global_discr = col.globalDiscriminatorAt(row_num); writeBinaryLittleEndian(global_discr, ostr); if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) - variants[global_discr]->serializeBinary(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeBinary(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -788,10 +804,10 @@ void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr { col.insertDefault(); } - else if (global_discr < variants.size()) + else if (global_discr < variant_serializations.size()) { auto & variant_column = col.getVariantByGlobalDiscriminator(global_discr); - variants[global_discr]->deserializeBinary(variant_column, istr, settings); + variant_serializations[global_discr]->deserializeBinary(variant_column, istr, settings); col.getLocalDiscriminators().push_back(col.localDiscriminatorByGlobal(global_discr)); col.getOffsets().push_back(variant_column.size() - 1); } @@ -992,7 +1008,7 @@ bool SerializationVariant::tryDeserializeImpl( ReadBufferFromString variant_buf(field); auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); size_t prev_size = variant_column.size(); - if (try_deserialize_nested(variant_column, variants[global_discr], variant_buf, modified_settings) && variant_buf.eof()) + if (try_deserialize_nested(variant_column, variant_serializations[global_discr], variant_buf, modified_settings) && variant_buf.eof()) { column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); column_variant.getOffsets().push_back(prev_size); @@ -1014,7 +1030,7 @@ void 
SerializationVariant::serializeTextEscaped(const IColumn & column, size_t r if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullEscaped(ostr, settings); else - variants[global_discr]->serializeTextEscaped(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeTextEscaped(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1053,7 +1069,7 @@ void SerializationVariant::serializeTextRaw(const IColumn & column, size_t row_n if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullRaw(ostr, settings); else - variants[global_discr]->serializeTextRaw(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeTextRaw(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } bool SerializationVariant::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1092,7 +1108,7 @@ void SerializationVariant::serializeTextQuoted(const IColumn & column, size_t ro if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullQuoted(ostr); else - variants[global_discr]->serializeTextQuoted(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeTextQuoted(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } bool SerializationVariant::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1132,7 +1148,7 @@ void SerializationVariant::serializeTextCSV(const IColumn & column, size_t row_n if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullCSV(ostr, settings); else - variants[global_discr]->serializeTextCSV(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeTextCSV(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } bool SerializationVariant::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1171,7 +1187,7 @@ void SerializationVariant::serializeText(const IColumn & column, size_t row_num, if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullText(ostr, settings); else - variants[global_discr]->serializeText(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeText(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } bool SerializationVariant::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1210,7 +1226,7 @@ void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_ if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullJSON(ostr); else - variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + 
variant_serializations[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } void SerializationVariant::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const @@ -1220,7 +1236,7 @@ void SerializationVariant::serializeTextJSONPretty(const IColumn & column, size_ if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullJSON(ostr); else - variants[global_discr]->serializeTextJSONPretty(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings, indent); + variant_serializations[global_discr]->serializeTextJSONPretty(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings, indent); } bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -1260,7 +1276,7 @@ void SerializationVariant::serializeTextXML(const IColumn & column, size_t row_n if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) SerializationNullable::serializeNullXML(ostr); else - variants[global_discr]->serializeTextXML(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); + variant_serializations[global_discr]->serializeTextXML(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); } } diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h index b4eaf70f4201..450b442978ba 100644 --- a/src/DataTypes/Serializations/SerializationVariant.h +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -72,14 +72,7 @@ class SerializationVariant : public ISerialization using VariantSerializations = std::vector; - explicit SerializationVariant( - const VariantSerializations & variants_, - const std::vector & variant_names_, - const std::vector & deserialize_text_order_, - const String & variant_name_) - : variants(variants_), variant_names(variant_names_), deserialize_text_order(deserialize_text_order_), variant_name(variant_name_) - { - } + explicit SerializationVariant(const DataTypes & variant_types_, const String & variant_name_); void enumerateStreams( EnumerateStreamsSettings & settings, @@ -222,8 +215,9 @@ class SerializationVariant : public ISerialization std::function try_deserialize_nested, const FormatSettings & settings) const; - VariantSerializations variants; + VariantSerializations variant_serializations; std::vector variant_names; + DataTypes variant_types; std::vector deserialize_text_order; /// Name of Variant data type for better exception messages. 
String variant_name; diff --git a/tests/queries/0_stateless/03201_variant_null_map_subcolumn.reference b/tests/queries/0_stateless/03201_variant_null_map_subcolumn.reference index 8565fe3d0fa1..629957c24411 100644 --- a/tests/queries/0_stateless/03201_variant_null_map_subcolumn.reference +++ b/tests/queries/0_stateless/03201_variant_null_map_subcolumn.reference @@ -1,113 +1,113 @@ Memory test -[] 1 0 0 [] -1 0 1 0 [] -\N 1 1 0 [] -['str_3','str_3','str_3'] 1 0 3 [1,1,1] -4 0 1 0 [] -\N 1 1 0 [] -[6,6,6,6,6,6] 1 0 6 [0,0,0,0,0,0] -7 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 9 [1,1,1,1,1,1,1,1,1] -10 0 1 0 [] -\N 1 1 0 [] -['str_12','str_12'] 1 0 2 [1,1] -13 0 1 0 [] -\N 1 1 0 [] -[15,15,15,15,15] 1 0 5 [0,0,0,0,0] -16 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 8 [1,1,1,1,1,1,1,1] -19 0 1 0 [] -\N 1 1 0 [] -['str_21'] 1 0 1 [1] -22 0 1 0 [] -\N 1 1 0 [] -[24,24,24,24] 1 0 4 [0,0,0,0] -25 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 7 [1,1,1,1,1,1,1] -28 0 1 0 [] -\N 1 1 0 [] -[] 1 0 0 [] -31 0 1 0 [] -\N 1 1 0 [] -[33,33,33] 1 0 3 [0,0,0] -34 0 1 0 [] -\N 1 1 0 [] +[] 1 0 [] 1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [1,1,1] -0 1 0 [] -1 1 0 [] -1 0 6 [0,0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 9 [1,1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 2 [1,1] -0 1 0 [] -1 1 0 [] -1 0 5 [0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 8 [1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 1 [1] -0 1 0 [] -1 1 0 [] -1 0 4 [0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 7 [1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [0,0,0] -0 1 0 [] -1 1 0 [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [1,1,1] [0,0,0] -1 0 [] [] -1 0 [] [] -0 6 [0,0,0,0,0,0] [1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 2 [1,1] [0,0] -1 0 [] [] -1 0 [] [] -0 5 [0,0,0,0,0] [1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 1 [1] [0] -1 0 [] [] -1 0 [] [] -0 4 [0,0,0,0] [1,1,1,1] -1 0 [] [] -1 0 [] [] -0 7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [0,0,0] [1,1,1] -1 0 [] [] -1 0 [] [] +\N 1 0 [] +['str_3','str_3','str_3'] 1 3 [1,1,1] +4 0 0 [] +\N 1 0 [] +[6,6,6,6,6,6] 1 6 [0,0,0,0,0,0] +7 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 9 [1,1,1,1,1,1,1,1,1] +10 0 0 [] +\N 1 0 [] +['str_12','str_12'] 1 2 [1,1] +13 0 0 [] +\N 1 0 [] +[15,15,15,15,15] 1 5 [0,0,0,0,0] +16 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 8 [1,1,1,1,1,1,1,1] +19 0 0 [] +\N 1 0 [] +['str_21'] 1 1 [1] +22 0 0 [] +\N 1 0 [] +[24,24,24,24] 1 4 [0,0,0,0] +25 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 7 [1,1,1,1,1,1,1] +28 0 0 [] +\N 1 0 [] +[] 1 0 [] +31 0 0 [] +\N 1 0 [] +[33,33,33] 1 3 [0,0,0] +34 0 0 [] +\N 1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [1,1,1] +0 0 [] +1 0 [] +1 6 [0,0,0,0,0,0] +0 0 [] +1 0 [] +1 9 [1,1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 2 [1,1] +0 0 [] +1 0 [] +1 5 [0,0,0,0,0] +0 0 [] +1 0 [] +1 8 [1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 1 [1] +0 0 [] +1 0 [] +1 4 [0,0,0,0] +0 0 [] +1 0 [] +1 7 [1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [0,0,0] +0 0 [] +1 0 [] +0 [] [] +0 [] [] +0 [] [] +3 [1,1,1] [0,0,0] +0 [] [] +0 [] [] +6 [0,0,0,0,0,0] [1,1,1,1,1,1] +0 [] [] +0 [] [] +9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +2 [1,1] [0,0] +0 [] [] +0 [] [] +5 [0,0,0,0,0] [1,1,1,1,1] +0 [] [] +0 [] [] +8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +1 [1] [0] +0 [] [] +0 
[] [] +4 [0,0,0,0] [1,1,1,1] +0 [] [] +0 [] [] +7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +3 [0,0,0] [1,1,1] +0 [] [] +0 [] [] 0 2 3 @@ -134,114 +134,114 @@ test 35 MergeTree compact test -[] 1 0 0 [] -1 0 1 0 [] -\N 1 1 0 [] -['str_3','str_3','str_3'] 1 0 3 [1,1,1] -4 0 1 0 [] -\N 1 1 0 [] -[6,6,6,6,6,6] 1 0 6 [0,0,0,0,0,0] -7 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 9 [1,1,1,1,1,1,1,1,1] -10 0 1 0 [] -\N 1 1 0 [] -['str_12','str_12'] 1 0 2 [1,1] -13 0 1 0 [] -\N 1 1 0 [] -[15,15,15,15,15] 1 0 5 [0,0,0,0,0] -16 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 8 [1,1,1,1,1,1,1,1] -19 0 1 0 [] -\N 1 1 0 [] -['str_21'] 1 0 1 [1] -22 0 1 0 [] -\N 1 1 0 [] -[24,24,24,24] 1 0 4 [0,0,0,0] -25 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 7 [1,1,1,1,1,1,1] -28 0 1 0 [] -\N 1 1 0 [] -[] 1 0 0 [] -31 0 1 0 [] -\N 1 1 0 [] -[33,33,33] 1 0 3 [0,0,0] -34 0 1 0 [] -\N 1 1 0 [] -1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [1,1,1] -0 1 0 [] -1 1 0 [] -1 0 6 [0,0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 9 [1,1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 2 [1,1] -0 1 0 [] -1 1 0 [] -1 0 5 [0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 8 [1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 1 [1] -0 1 0 [] -1 1 0 [] -1 0 4 [0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 7 [1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] +[] 1 0 [] 1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [0,0,0] -0 1 0 [] -1 1 0 [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [1,1,1] [0,0,0] -1 0 [] [] -1 0 [] [] -0 6 [0,0,0,0,0,0] [1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 2 [1,1] [0,0] -1 0 [] [] -1 0 [] [] -0 5 [0,0,0,0,0] [1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 1 [1] [0] -1 0 [] [] -1 0 [] [] -0 4 [0,0,0,0] [1,1,1,1] -1 0 [] [] -1 0 [] [] -0 7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [0,0,0] [1,1,1] -1 0 [] [] -1 0 [] [] +\N 1 0 [] +['str_3','str_3','str_3'] 1 3 [1,1,1] +4 0 0 [] +\N 1 0 [] +[6,6,6,6,6,6] 1 6 [0,0,0,0,0,0] +7 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 9 [1,1,1,1,1,1,1,1,1] +10 0 0 [] +\N 1 0 [] +['str_12','str_12'] 1 2 [1,1] +13 0 0 [] +\N 1 0 [] +[15,15,15,15,15] 1 5 [0,0,0,0,0] +16 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 8 [1,1,1,1,1,1,1,1] +19 0 0 [] +\N 1 0 [] +['str_21'] 1 1 [1] +22 0 0 [] +\N 1 0 [] +[24,24,24,24] 1 4 [0,0,0,0] +25 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 7 [1,1,1,1,1,1,1] +28 0 0 [] +\N 1 0 [] +[] 1 0 [] +31 0 0 [] +\N 1 0 [] +[33,33,33] 1 3 [0,0,0] +34 0 0 [] +\N 1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [1,1,1] +0 0 [] +1 0 [] +1 6 [0,0,0,0,0,0] +0 0 [] +1 0 [] +1 9 [1,1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 2 [1,1] +0 0 [] +1 0 [] +1 5 [0,0,0,0,0] +0 0 [] +1 0 [] +1 8 [1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 1 [1] +0 0 [] +1 0 [] +1 4 [0,0,0,0] +0 0 [] +1 0 [] +1 7 [1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [0,0,0] +0 0 [] +1 0 [] +0 [] [] +0 [] [] +0 [] [] +3 [1,1,1] [0,0,0] +0 [] [] +0 [] [] +6 [0,0,0,0,0,0] [1,1,1,1,1,1] +0 [] [] +0 [] [] +9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +2 [1,1] [0,0] +0 [] [] +0 [] [] +5 [0,0,0,0,0] [1,1,1,1,1] +0 [] [] +0 [] [] +8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +1 [1] [0] +0 [] [] +0 [] [] +4 [0,0,0,0] [1,1,1,1] +0 [] [] +0 [] [] +7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +3 [0,0,0] [1,1,1] +0 [] [] +0 [] [] 0 2 3 @@ 
-268,114 +268,114 @@ test 35 MergeTree wide test -[] 1 0 0 [] -1 0 1 0 [] -\N 1 1 0 [] -['str_3','str_3','str_3'] 1 0 3 [1,1,1] -4 0 1 0 [] -\N 1 1 0 [] -[6,6,6,6,6,6] 1 0 6 [0,0,0,0,0,0] -7 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 9 [1,1,1,1,1,1,1,1,1] -10 0 1 0 [] -\N 1 1 0 [] -['str_12','str_12'] 1 0 2 [1,1] -13 0 1 0 [] -\N 1 1 0 [] -[15,15,15,15,15] 1 0 5 [0,0,0,0,0] -16 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 8 [1,1,1,1,1,1,1,1] -19 0 1 0 [] -\N 1 1 0 [] -['str_21'] 1 0 1 [1] -22 0 1 0 [] -\N 1 1 0 [] -[24,24,24,24] 1 0 4 [0,0,0,0] -25 0 1 0 [] -\N 1 1 0 [] -[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 0 7 [1,1,1,1,1,1,1] -28 0 1 0 [] -\N 1 1 0 [] -[] 1 0 0 [] -31 0 1 0 [] -\N 1 1 0 [] -[33,33,33] 1 0 3 [0,0,0] -34 0 1 0 [] -\N 1 1 0 [] -1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [1,1,1] -0 1 0 [] -1 1 0 [] -1 0 6 [0,0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 9 [1,1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 2 [1,1] -0 1 0 [] -1 1 0 [] -1 0 5 [0,0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 8 [1,1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] -1 0 1 [1] -0 1 0 [] -1 1 0 [] -1 0 4 [0,0,0,0] -0 1 0 [] -1 1 0 [] -1 0 7 [1,1,1,1,1,1,1] -0 1 0 [] -1 1 0 [] +[] 1 0 [] 1 0 0 [] -0 1 0 [] -1 1 0 [] -1 0 3 [0,0,0] -0 1 0 [] -1 1 0 [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [1,1,1] [0,0,0] -1 0 [] [] -1 0 [] [] -0 6 [0,0,0,0,0,0] [1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 2 [1,1] [0,0] -1 0 [] [] -1 0 [] [] -0 5 [0,0,0,0,0] [1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 1 [1] [0] -1 0 [] [] -1 0 [] [] -0 4 [0,0,0,0] [1,1,1,1] -1 0 [] [] -1 0 [] [] -0 7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] -1 0 [] [] -1 0 [] [] -0 0 [] [] -1 0 [] [] -1 0 [] [] -0 3 [0,0,0] [1,1,1] -1 0 [] [] -1 0 [] [] +\N 1 0 [] +['str_3','str_3','str_3'] 1 3 [1,1,1] +4 0 0 [] +\N 1 0 [] +[6,6,6,6,6,6] 1 6 [0,0,0,0,0,0] +7 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 9 [1,1,1,1,1,1,1,1,1] +10 0 0 [] +\N 1 0 [] +['str_12','str_12'] 1 2 [1,1] +13 0 0 [] +\N 1 0 [] +[15,15,15,15,15] 1 5 [0,0,0,0,0] +16 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 8 [1,1,1,1,1,1,1,1] +19 0 0 [] +\N 1 0 [] +['str_21'] 1 1 [1] +22 0 0 [] +\N 1 0 [] +[24,24,24,24] 1 4 [0,0,0,0] +25 0 0 [] +\N 1 0 [] +[NULL,NULL,NULL,NULL,NULL,NULL,NULL] 1 7 [1,1,1,1,1,1,1] +28 0 0 [] +\N 1 0 [] +[] 1 0 [] +31 0 0 [] +\N 1 0 [] +[33,33,33] 1 3 [0,0,0] +34 0 0 [] +\N 1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [1,1,1] +0 0 [] +1 0 [] +1 6 [0,0,0,0,0,0] +0 0 [] +1 0 [] +1 9 [1,1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 2 [1,1] +0 0 [] +1 0 [] +1 5 [0,0,0,0,0] +0 0 [] +1 0 [] +1 8 [1,1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 1 [1] +0 0 [] +1 0 [] +1 4 [0,0,0,0] +0 0 [] +1 0 [] +1 7 [1,1,1,1,1,1,1] +0 0 [] +1 0 [] +1 0 [] +0 0 [] +1 0 [] +1 3 [0,0,0] +0 0 [] +1 0 [] +0 [] [] +0 [] [] +0 [] [] +3 [1,1,1] [0,0,0] +0 [] [] +0 [] [] +6 [0,0,0,0,0,0] [1,1,1,1,1,1] +0 [] [] +0 [] [] +9 [1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +2 [1,1] [0,0] +0 [] [] +0 [] [] +5 [0,0,0,0,0] [1,1,1,1,1] +0 [] [] +0 [] [] +8 [1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1] +0 [] [] +0 [] [] +1 [1] [0] +0 [] [] +0 [] [] +4 [0,0,0,0] [1,1,1,1] +0 [] [] +0 [] [] +7 [1,1,1,1,1,1,1] [1,1,1,1,1,1,1] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +0 [] [] +3 [0,0,0] [1,1,1] +0 [] [] +0 [] [] 0 2 3 diff --git a/tests/queries/0_stateless/03201_variant_null_map_subcolumn.sh b/tests/queries/0_stateless/03201_variant_null_map_subcolumn.sh index c9dca46f41ed..3795d4423204 100755 
--- a/tests/queries/0_stateless/03201_variant_null_map_subcolumn.sh +++ b/tests/queries/0_stateless/03201_variant_null_map_subcolumn.sh @@ -11,14 +11,14 @@ function test() { echo "test" $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(36)" - $CH_CLIENT -q "select v, v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id" - $CH_CLIENT -q "select v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id" - $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null, v.\`Array(Variant(String, UInt64))\`.String.null from test order by id" + $CH_CLIENT -q "select v, v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id" + $CH_CLIENT -q "select v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null, v.\`Array(Variant(String, UInt64))\`.String.null from test order by id" $CH_CLIENT -q "select id from test where v.UInt64 is null order by id" $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(250000) settings min_insert_block_size_rows=100000, min_insert_block_size_bytes=0" - $CH_CLIENT -q "select v, v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id format Null" - $CH_CLIENT -q "select v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id format Null" - $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null, v.\`Array(Variant(String, UInt64))\`.String.null from test order by id format Null" + $CH_CLIENT -q "select v, v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id format Null" + $CH_CLIENT -q "select v.UInt64.null, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64.null, v.\`Array(Variant(String, UInt64))\`.String.null from test order by id format Null" $CH_CLIENT -q "select id from test where v.UInt64 is null order by id format Null" } diff --git a/tests/queries/0_stateless/03202_dynamic_null_map_subcolumn.sql.j2 b/tests/queries/0_stateless/03202_dynamic_null_map_subcolumn.sql.j2 index 21bf738dccb2..323de80588bf 100644 --- a/tests/queries/0_stateless/03202_dynamic_null_map_subcolumn.sql.j2 +++ 
b/tests/queries/0_stateless/03202_dynamic_null_map_subcolumn.sql.j2 @@ -35,11 +35,11 @@ select count() from test where not empty(d.`Array(Array(Dynamic))`); select count() from test where d is NULL; select count() from test where not empty(d.`Tuple(a Array(Dynamic))`.a.String); -select d, d.UInt64.null, d.String.null, d.`Array(Variant(String, UInt64))`.null from test format Null; -select d.UInt64.null, d.String.null, d.`Array(Variant(String, UInt64))`.null from test format Null; -select d.Int8.null, d.Date.null, d.`Array(String)`.null from test format Null; -select d, d.UInt64.null, d.Date.null, d.`Array(Variant(String, UInt64))`.null, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64.null from test format Null; -select d.UInt64.null, d.Date.null, d.`Array(Variant(String, UInt64))`.null, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64.null, d.`Array(Variant(String, UInt64))`.String.null from test format Null; +select d, d.UInt64.null, d.String.null from test format Null; +select d.UInt64.null, d.String.null from test format Null; +select d.Int8.null, d.Date.null from test format Null; +select d, d.UInt64.null, d.Date.null, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64.null from test format Null; +select d.UInt64.null, d.Date.null, d.`Array(Variant(String, UInt64))`.size0, d.`Array(Variant(String, UInt64))`.UInt64.null, d.`Array(Variant(String, UInt64))`.String.null from test format Null; select d, d.`Tuple(a UInt64, b String)`.a, d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64.null, d.`Array(Variant(String, UInt64))`.UInt64.null from test format Null; select d.`Array(Dynamic)`.`Variant(String, UInt64)`.UInt64.null, d.`Array(Dynamic)`.size0, d.`Array(Variant(String, UInt64))`.UInt64.null from test format Null; select d.`Array(Array(Dynamic))`.size1, d.`Array(Array(Dynamic))`.UInt64.null, d.`Array(Array(Dynamic))`.`Map(String, Tuple(a UInt64))`.values.a from test format Null; diff --git a/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.reference b/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.reference new file mode 100644 index 000000000000..c386f99e34af --- /dev/null +++ b/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.reference @@ -0,0 +1 @@ +[0,1,0] diff --git a/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.sql b/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.sql new file mode 100644 index 000000000000..6afb40336005 --- /dev/null +++ b/tests/queries/0_stateless/03640_variant_array_null_map_subcolumn.sql @@ -0,0 +1,6 @@ +drop table if exists test; +create table test (v Variant(Array(Nullable(String)))) engine=MergeTree order by tuple(); +insert into test select ['hello', null, 'world']; +select v.`Array(Nullable(String))`.null from test; +drop table test; + From 9e77704aef45867e5d74fefe641b407b8a757677 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 14 Oct 2025 16:13:43 +0000 Subject: [PATCH 020/112] Backport #87903 to 25.8: Add more information for Iceberg SELECT queries profiling --- src/Common/ProfileEvents.cpp | 4 + src/Interpreters/IcebergMetadataLog.cpp | 16 +- src/Interpreters/IcebergMetadataLog.h | 5 +- .../DataLakes/Iceberg/IcebergIterator.cpp | 19 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 8 +- .../DataLakes/Iceberg/ManifestFile.cpp | 7 +- .../DataLakes/Iceberg/ManifestFile.h | 1 + .../DataLakes/Iceberg/ManifestFilesPruning.h | 8 +- 
.../Iceberg/StatelessMetadataFileGetter.cpp | 5 +-
 .../integration/test_storage_iceberg/test.py | 250 +++++++++++++-----
 10 files changed, 240 insertions(+), 83 deletions(-)
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index e6dee19233e0..fb451601fc2d 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -87,6 +87,10 @@ M(IcebergMetadataFilesCacheHits, "Number of times iceberg metadata files have been found in the cache.", ValueType::Number) \
     M(IcebergMetadataFilesCacheMisses, "Number of times iceberg metadata files have not been found in the iceberg metadata cache and had to be read from (remote) disk.", ValueType::Number) \
     M(IcebergMetadataFilesCacheWeightLost, "Approximate number of bytes evicted from the iceberg metadata cache.", ValueType::Number) \
+    M(IcebergMetadataReadWaitTimeMicroseconds, "Total time data readers spend waiting for iceberg metadata files to be read and parsed, summed across all reader threads.", ValueType::Microseconds) \
+    M(IcebergIteratorInitializationMicroseconds, "Total time spent on synchronous initialization of iceberg data iterators.", ValueType::Microseconds) \
+    M(IcebergMetadataUpdateMicroseconds, "Total time spent on synchronous updates of iceberg table metadata.", ValueType::Microseconds) \
+    M(IcebergMetadataReturnedObjectInfos, "Total number of returned object infos from iceberg iterator.", ValueType::Number) \
     M(VectorSimilarityIndexCacheHits, "Number of times an index granule has been found in the vector index cache.", ValueType::Number) \
     M(VectorSimilarityIndexCacheMisses, "Number of times an index granule has not been found in the vector index cache and had to be read from disk.", ValueType::Number) \
     M(VectorSimilarityIndexCacheWeightLost, "Approximate number of bytes evicted from the vector index cache.", ValueType::Number) \
diff --git a/src/Interpreters/IcebergMetadataLog.cpp b/src/Interpreters/IcebergMetadataLog.cpp
index ab8266dfdc30..7388e55a4fcb 100644
--- a/src/Interpreters/IcebergMetadataLog.cpp
+++ b/src/Interpreters/IcebergMetadataLog.cpp
@@ -44,6 +44,12 @@ namespace
 const DataTypePtr rowType = makeNullable(std::make_shared());
+auto iceberg_pruning_status_datatype = std::make_shared(DataTypeEnum8::Values{
+    {"NotPruned", static_cast(Iceberg::PruningReturnStatus::NOT_PRUNED)},
+    {"PartitionPruned", static_cast(Iceberg::PruningReturnStatus::PARTITION_PRUNED)},
+    {"MinMaxIndexPruned", static_cast(Iceberg::PruningReturnStatus::MIN_MAX_INDEX_PRUNED)}});
+
+const DataTypePtr iceberg_pruning_status_datatype_nullable = makeNullable(iceberg_pruning_status_datatype);
 }
 ColumnsDescription IcebergMetadataLogElement::getColumnsDescription()
@@ -64,7 +70,8 @@ ColumnsDescription IcebergMetadataLogElement::getColumnsDescription()
     {"table_path", std::make_shared(), "Table path."},
     {"file_path", std::make_shared(), "File path."},
     {"content", std::make_shared(), "Content in a JSON format (json file content, avro metadata or avro entry)."},
-    {"row_in_file", rowType, "Row in file."}};
+    {"row_in_file", rowType, "Row in file."},
+    {"pruning_status", iceberg_pruning_status_datatype_nullable, "Status of partition pruning or min-max index pruning for the file."}};
 }
 void IcebergMetadataLogElement::appendToBlock(MutableColumns & columns) const
@@ -79,6 +86,7 @@ void IcebergMetadataLogElement::appendToBlock(MutableColumns & columns) const
     columns[column_index++]->insert(file_path);
     columns[column_index++]->insert(metadata_content);
     columns[column_index++]->insert(row_in_file ?
*row_in_file : rowType->getDefault()); + columns[column_index++]->insert(pruning_status ? *pruning_status : iceberg_pruning_status_datatype_nullable->getDefault()); } void insertRowToLogTable( @@ -87,7 +95,8 @@ void insertRowToLogTable( IcebergMetadataLogLevel row_log_level, const String & table_path, const String & file_path, - std::optional row_in_file) + std::optional row_in_file, + std::optional pruning_status) { IcebergMetadataLogLevel set_log_level = local_context->getSettingsRef()[Setting::iceberg_metadata_log_level].value; if (set_log_level < row_log_level) @@ -104,6 +113,7 @@ void insertRowToLogTable( .table_path = table_path, .file_path = file_path, .metadata_content = row, - .row_in_file = row_in_file}); + .row_in_file = row_in_file, + .pruning_status = pruning_status}); } } diff --git a/src/Interpreters/IcebergMetadataLog.h b/src/Interpreters/IcebergMetadataLog.h index d4c65c4d6346..b43e2cfa47b2 100644 --- a/src/Interpreters/IcebergMetadataLog.h +++ b/src/Interpreters/IcebergMetadataLog.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -15,6 +16,7 @@ struct IcebergMetadataLogElement String file_path; String metadata_content; std::optional row_in_file; + std::optional pruning_status; static std::string name() { return "IcebergMetadataLog"; } @@ -29,7 +31,8 @@ void insertRowToLogTable( IcebergMetadataLogLevel row_log_level, const String & table_path, const String & file_path, - std::optional row_in_file); + std::optional row_in_file, + std::optional pruning_status); class IcebergMetadataLog : public SystemLog { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index c45f082d68a4..fc3afcf3c883 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -1,4 +1,3 @@ - #include "config.h" #if USE_AVRO @@ -45,13 +44,21 @@ #include +#include #include #include +#include +#include +#include + + namespace ProfileEvents { extern const Event IcebergPartitionPrunedFiles; extern const Event IcebergMinMaxIndexPrunedFiles; +extern const Event IcebergMetadataReadWaitTimeMicroseconds; +extern const Event IcebergMetadataReturnedObjectInfos; }; @@ -158,6 +165,14 @@ std::optional SingleThreadIcebergKeysIterator::next() local_context); } auto pruning_status = current_pruner ? 
current_pruner->canBePruned(manifest_file_entry) : PruningReturnStatus::NOT_PRUNED; + insertRowToLogTable( + local_context, + "", + DB::IcebergMetadataLogLevel::ManifestFileEntry, + configuration.lock()->getRawPath().path, + current_manifest_file_content->getPathToManifestFile(), + manifest_file_entry.row_number, + pruning_status); switch (pruning_status) { case PruningReturnStatus::NOT_PRUNED: @@ -321,6 +336,7 @@ IcebergIterator::IcebergIterator( ObjectInfoPtr IcebergIterator::next(size_t) { + ProfileEventTimeIncrement watch(ProfileEvents::IcebergMetadataReadWaitTimeMicroseconds); Iceberg::ManifestFileEntry manifest_file_entry; if (blocking_queue.pop(manifest_file_entry)) { @@ -333,6 +349,7 @@ ObjectInfoPtr IcebergIterator::next(size_t) { object_info->addEqualityDeleteObject(equality_delete); } + ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); return object_info; } { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index a92630e1bb05..8cfc7ed86039 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -68,7 +68,9 @@ namespace ProfileEvents { - extern const Event IcebergTrivialCountOptimizationApplied; +extern const Event IcebergIteratorInitializationMicroseconds; +extern const Event IcebergMetadataUpdateMicroseconds; +extern const Event IcebergTrivialCountOptimizationApplied; } namespace DB @@ -249,6 +251,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context) DB::IcebergMetadataLogLevel::Metadata, configuration_ptr->getRawPath().path, metadata_file_path, + std::nullopt, std::nullopt); if (previous_snapshot_id != relevant_snapshot_id) @@ -587,6 +590,7 @@ DataLakeMetadataPtr IcebergMetadata::create( DB::IcebergMetadataLogLevel::Metadata, configuration_ptr->getRawPath().path, metadata_file_path, + std::nullopt, std::nullopt); return std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object, cache_ptr, compression_method); } @@ -782,6 +786,8 @@ ObjectIterator IcebergMetadata::iterate( { SharedLockGuard lock(mutex); + ProfileEventTimeIncrement watch(ProfileEvents::IcebergIteratorInitializationMicroseconds); + auto table_snapshot = std::make_shared(last_metadata_version, relevant_snapshot_schema_id, relevant_snapshot_id); return std::make_shared( diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 00c9e34d010c..98ffcc794fa1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -160,6 +160,7 @@ ManifestFileContent::ManifestFileContent( DB::IcebergMetadataLogLevel::ManifestFileMetadata, common_path, path_to_manifest_file, + std::nullopt, std::nullopt); for (const auto & column_name : {f_status, f_data_file}) @@ -235,7 +236,8 @@ ManifestFileContent::ManifestFileContent( DB::IcebergMetadataLogLevel::ManifestFileEntry, common_path, path_to_manifest_file, - i); + i, + std::nullopt); FileContentType content_type = FileContentType::DATA; if (format_version_ > 1) content_type = FileContentType(manifest_file_deserializer.getValueFromRowByName(i, c_data_file_content, TypeIndex::Int32).safeGet()); @@ -418,6 +420,7 @@ ManifestFileContent::ManifestFileContent( this->data_files_without_deleted.emplace_back( file_path_key, file_path, + i, status, 
added_sequence_number, snapshot_id, @@ -444,6 +447,7 @@ ManifestFileContent::ManifestFileContent( this->position_deletes_files_without_deleted.emplace_back( file_path_key, file_path, + i, status, added_sequence_number, snapshot_id, @@ -472,6 +476,7 @@ ManifestFileContent::ManifestFileContent( this->equality_deletes_files.emplace_back( file_path_key, file_path, + i, status, added_sequence_number, snapshot_id, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index ac12be343439..a045e976f3c6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -61,6 +61,7 @@ struct ManifestFileEntry String file_path_key; // It's a processed file path to be used by Object Storage String file_path; + Int64 row_number; ManifestEntryStatus status; Int64 added_sequence_number; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h index bbe291f56150..f17a1ee97326 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFilesPruning.h @@ -1,7 +1,6 @@ #pragma once #include "config.h" -#if USE_AVRO #include #include @@ -21,6 +20,13 @@ enum class PruningReturnStatus MIN_MAX_INDEX_PRUNED }; +} + +#if USE_AVRO + +namespace DB::Iceberg +{ + struct ManifestFileEntry; class ManifestFileContent; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp index a0f3c628d774..2f76d0a0bb03 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp @@ -153,6 +153,7 @@ ManifestFileCacheKeys getManifestList( DB::IcebergMetadataLogLevel::ManifestListMetadata, configuration_ptr->getRawPath().path, filename, + std::nullopt, std::nullopt); for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) @@ -187,7 +188,8 @@ ManifestFileCacheKeys getManifestList( DB::IcebergMetadataLogLevel::ManifestListEntry, configuration_ptr->getRawPath().path, filename, - i); + i, + std::nullopt); } /// We only return the list of {file name, seq number} for cache. /// Because ManifestList holds a list of ManifestFilePtr which consume much memory space. 
@@ -248,7 +250,6 @@ std::pair parseTableSchemaV1Method(const Poco::J auto current_schema_id = schema->getValue(f_schema_id); return {schema, current_schema_id}; } - } } diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index b8fa49f8b1aa..4ec8b39dc75d 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -2984,6 +2984,147 @@ def test_writes_mutate_delete(started_cluster, storage_type, partition_type): df = spark.read.format("iceberg").load(f"/iceberg_data/default/{TABLE_NAME}").collect() assert len(df) == 1 +class PrunedInfo: + def __init__(self, not_pruned, partition_pruned, min_max_index_pruned): + self.not_pruned = not_pruned + self.partition_pruned = partition_pruned + self.min_max_index_pruned = min_max_index_pruned + + def __repr__(self): + return "PrunedInfo(not_pruned={}, partition_pruned={}, min_max_index_pruned={})".format(self.not_pruned, self.partition_pruned, self.min_max_index_pruned) + + def __eq__(self, other): + return (self.not_pruned == other.not_pruned and + self.partition_pruned == other.partition_pruned and + self.min_max_index_pruned == other.min_max_index_pruned) + +def get_date_and_time_columns(instance, query_id: str): + result = dict() + for name in ['event_date', 'event_time']: + query_result = instance.query(f"SELECT {name} FROM system.iceberg_metadata_log WHERE query_id = '{query_id}'") + result[name] = query_result.split('\n') + result[name] = list(filter(lambda x: len(x) > 0, result[name])) + return result + +def get_iceberg_metadata_to_dict(instance, query_id: str): + result = dict() + for name in ['content', 'content_type', 'table_path', 'file_path', 'row_in_file', 'pruning_status']: + # We are ok with duplicates in the table itself but for test purposes we want to remove duplicates here + select_distinct_expression = f"SELECT DISTINCT(*) FROM (SELECT content, content_type, table_path, file_path, row_in_file, pruning_status FROM system.iceberg_metadata_log WHERE query_id = '{query_id}') ORDER BY ALL" + query_result = instance.query(f"SELECT {name} FROM ({select_distinct_expression})") + print("Query result for {}: {}".format(name, query_result)) + result[name] = query_result.split('\n')[:-1] + result['row_in_file'] = list(map(lambda x : int(x) if x.isdigit() else None, result['row_in_file'])) + result['pruning_status'] = list(map(lambda x : x if x != '\\N' else None, result['pruning_status'])) + print("Result dictionary: {}".format(result)) + return result + +def verify_result_dictionary(diction : dict, allowed_content_types : set): + prunned_info = PrunedInfo(0, 0, 0) + # Expected content_type and only it is present + if set(diction['content_type']) != allowed_content_types: + raise ValueError("Content type mismatch. Expected: {}, got: {}".format(allowed_content_types, set(diction['content_type']))) + # For all entries we have the same table_path + if not (len(set(diction['table_path'])) == 1 or (len(allowed_content_types) == 0 and len(diction['table_path']) == 0)): + raise ValueError("Unexpected number of table paths are found for one query. Set: {}".format(set(diction['table_path']))) + extensions = list(map(lambda x: x.split('.')[-1], diction['file_path'])) + for i in range(len(diction['content_type'])): + if diction['content_type'][i] == 'Metadata': + # File with content_type 'Metadata' has json extension + if extensions[i] != 'json': + raise ValueError("Unexpected file extension for Metadata. 
Expected: json, got: {}".format(extensions[i])) + else: + # File with content_types except 'Metadata' has avro extension + if extensions[i] != 'avro': + raise ValueError("Unexpected file extension for {}. Expected: avro, got: {}".format(diction['content_type'][i], extensions[i])) + + # All content is json-serializable + for content in diction['content']: + if content == '': + continue + try: + json.loads(content) + except: + raise ValueError("Content is not valid JSON. Content: {}".format(content)) + for file_path in set(diction['file_path']): + row_values = set() + number_of_missing_row_values = 0 + number_of_rows = 0 + partitioned_rows = set() + not_deleted_files = set() + for i in range(len(diction['file_path'])): + if file_path == diction['file_path'][i]: + if diction['row_in_file'][i] is not None: + row_values.add(diction['row_in_file'][i]) + # If row is present the type is entry + if diction['content_type'][i] not in ['ManifestFileEntry', 'ManifestListEntry']: + raise ValueError("Row should not be specified for an entry {}, file_path: {}".format(diction['content_type'][i], file_path)) + if diction['content'][i] != '': + number_of_rows += 1 + + if diction['content_type'][i] == 'ManifestFileEntry': + if diction['content'][i] == '': + if diction['pruning_status'][i] is None: + raise ValueError("Pruning status should be specified for this manifest file entry, file_path: {}".format(file_path)) + partitioned_rows.add(diction['row_in_file'][i]) + if diction['pruning_status'][i] == 'NotPruned': + prunned_info.not_pruned += 1 + elif diction['pruning_status'][i] == 'PartitionPruned': + prunned_info.partition_pruned += 1 + elif diction['pruning_status'][i] == 'MinMaxIndexPruned': + prunned_info.min_max_index_pruned += 1 + else: + raise ValueError("Unexpected pruning status: {}, file_path: {}".format(diction['pruning_status'][i], file_path)) + else: + data_object = json.loads(diction['content'][i]) + print("Data object: {}".format(data_object)) + if data_object['status'] < 2: + not_deleted_files.add(diction['row_in_file'][i]) + else: + # If row is not present that the type is metadata + if diction['content_type'][i] not in ['Metadata', 'ManifestFileMetadata', 'ManifestListMetadata']: + raise ValueError("Row should be specified for an entry {}, file_path: {}".format(diction['content_type'][i], file_path)) + + number_of_missing_row_values += 1 + if partitioned_rows != not_deleted_files: + raise ValueError("Partitioned rows are not consistent with not deleted files for file path: {}, partitioned rows: {}, not deleted files: {}".format(file_path, partitioned_rows, not_deleted_files)) + + # We have exactly one metadata file + if number_of_missing_row_values != 1: + raise ValueError("Not a one row value (corresponding to metadata file) is missing for file path: {}".format(file_path)) + + # Rows in avro files are consistent + if len(row_values) != number_of_rows: + raise ValueError("Unexpected number of row values for file path: {}".format(file_path)) + for i in range(number_of_rows): + if not i in row_values: + raise ValueError("Missing row value for file path: {}, missing row index: {}".format(file_path, i)) + return prunned_info + +def get_prunned_info_from_profile_events(instance, query_id: str): + instance.query("SYSTEM FLUSH LOGS") + + not_pruned = int( + instance.query( + f"SELECT ProfileEvents['IcebergMetadataReturnedObjectInfos'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + partition_pruned = int( + instance.query( + f"SELECT 
ProfileEvents['IcebergPartitionPrunedFiles'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + min_max_index_pruned = int( + instance.query( + f"SELECT ProfileEvents['IcebergMinMaxIndexPrunedFiles'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'" + ) + ) + + return PrunedInfo(not_pruned, partition_pruned, min_max_index_pruned) + + @pytest.mark.parametrize("format_version", ["1", "2"]) @pytest.mark.parametrize("storage_type", ["s3", "local", "azure"]) def test_system_iceberg_metadata(started_cluster, format_version, storage_type): @@ -2998,7 +3139,34 @@ def test_system_iceberg_metadata(started_cluster, format_version, storage_type): + get_uuid_str() ) - write_iceberg_from_df(spark, generate_data(spark, 0, 100), TABLE_NAME) + def execute_spark_query(query: str): + return execute_spark_query_general( + spark, + started_cluster, + storage_type, + TABLE_NAME, + query, + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + a INT, + b STRING + ) + USING iceberg + PARTITIONED BY (identity(a)) + OPTIONS('format-version'='2') + """ + ) + + for i in range(5): + spark.sql( + f""" + INSERT INTO {TABLE_NAME} VALUES + ({i}, '{i}'); + """ + ) default_upload_directory( started_cluster, @@ -3007,91 +3175,27 @@ def test_system_iceberg_metadata(started_cluster, format_version, storage_type): f"/iceberg_data/default/{TABLE_NAME}/", ) - def get_iceberg_metadata_to_dict(query_id: str): - instance = started_cluster.instances["node1"] - result = dict() - for name in ['content', 'content_type', 'table_path', 'file_path', 'row_in_file']: - # We are ok with duplicates in the table itself but for test purposes we want to remove duplicates here - select_distinct_expression = f"SELECT DISTINCT(*) FROM (SELECT content, content_type, table_path, file_path, row_in_file FROM system.iceberg_metadata_log WHERE query_id = '{query_id}') ORDER BY ALL" - query_result = instance.query(f"SELECT {name} FROM ({select_distinct_expression})") - result[name] = query_result.split('\n') - result[name] = list(filter(lambda x: len(x) > 0, result[name])) - result['row_in_file'] = list(map(lambda x : int(x) if x.isdigit() else None, result['row_in_file'])) - return result - - def verify_result_dictionary(diction : dict, allowed_content_types : set): - # Expected content_type and only it is present - if set(diction['content_type']) != allowed_content_types: - raise ValueError("Content type mismatch. Expected: {}, got: {}".format(allowed_content_types, set(diction['content_type']))) - # For all entries we have the same table_path - if not (len(set(diction['table_path'])) == 1 or (len(allowed_content_types) == 0 and len(diction['table_path']) == 0)): - raise ValueError("Unexpected number of table paths are found for one query. Set: {}".format(set(diction['table_path']))) - extensions = list(map(lambda x: x.split('.')[-1], diction['file_path'])) - for i in range(len(diction['content_type'])): - if diction['content_type'][i] == 'Metadata': - # File with content_type 'Metadata' has json extension - if extensions[i] != 'json': - raise ValueError("Unexpected file extension for Metadata. Expected: json, got: {}".format(extensions[i])) - else: - # File with content_types except 'Metadata' has avro extension - if extensions[i] != 'avro': - raise ValueError("Unexpected file extension for {}. 
Expected: avro, got: {}".format(diction['content_type'][i], extensions[i])) - - # All content is json-serializable - for content in diction['content']: - try: - json.loads(content) - except: - raise ValueError("Content is not valid JSON. Content: {}".format(content)) - for file_path in set(diction['file_path']): - row_values = set() - number_of_missing_row_values = 0 - number_of_rows = 0 - for i in range(len(diction['file_path'])): - if file_path == diction['file_path'][i]: - if diction['row_in_file'][i] is not None: - row_values.add(diction['row_in_file'][i]) - # If row is present the type is entry - if diction['content_type'][i] not in ['ManifestFileEntry', 'ManifestListEntry']: - raise ValueError("Row should not be specified for an entry {}, file_path: {}".format(diction['content_type'][i], file_path)) - number_of_rows += 1 - else: - # If row is not present that the type is metadata - if diction['content_type'][i] not in ['Metadata', 'ManifestFileMetadata', 'ManifestListMetadata']: - raise ValueError("Row should be specified for an entry {}, file_path: {}".format(diction['content_type'][i], file_path)) - - number_of_missing_row_values += 1 - - # We have exactly one metadata file - if number_of_missing_row_values != 1: - raise ValueError("Not a one row value (corresponding to metadata file) is missing for file path: {}".format(file_path)) - - # Rows in avro files are consistent - if len(row_values) != number_of_rows: - raise ValueError("Unexpected number of row values for file path: {}".format(file_path)) - for i in range(number_of_rows): - if not i in row_values: - raise ValueError("Missing row value for file path: {}, missing row index: {}".format(file_path, i)) - - create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster) content_types = ["Metadata", "ManifestListMetadata", "ManifestListEntry", "ManifestFileMetadata", "ManifestFileEntry"] settings = ["none", "metadata", "manifest_list_metadata", "manifest_list_entry", "manifest_file_metadata", "manifest_file_entry"] + for i in range(len(settings)): allowed_content_types = set(content_types[:i]) - query_id = TABLE_NAME + "_" + str(i) + "_" + uuid.uuid4().hex + query_id = TABLE_NAME + "_" + str(i) + "_" + get_uuid_str() - assert instance.query(f"SELECT * FROM {TABLE_NAME}", query_id = query_id, settings={"iceberg_metadata_log_level":settings[i]}) + instance.query(f"SELECT * FROM {TABLE_NAME} WHERE a >= 2", query_id = query_id, settings={"iceberg_metadata_log_level":settings[i], "use_iceberg_partition_pruning": 1}) - instance.query("SYSTEM FLUSH LOGS iceberg_metadata_log") + expected_prunned_info = get_prunned_info_from_profile_events(instance, query_id) - diction = get_iceberg_metadata_to_dict(query_id) + diction = get_iceberg_metadata_to_dict(instance, query_id) try: - verify_result_dictionary(diction, allowed_content_types) + if settings[i] == 'manifest_file_entry': + table_prunned_info = verify_result_dictionary(diction, allowed_content_types) + assert table_prunned_info == expected_prunned_info, "Not prunned files count mismatch. 
Table: {}, ProfileEvents: {}".format(table_prunned_info, expected_prunned_info) except: print("Dictionary: {}, Allowed Content Types: {}".format(diction, allowed_content_types)) raise From 7d3738c58436d7193091c5a09d8707a687c5613f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 15 Oct 2025 13:22:09 +0000 Subject: [PATCH 021/112] Backport #88544 to 25.8: Fix `ColumnBLOB should be converted to a regular column before usage` from `CREATE AS SELECT` --- src/Planner/Planner.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index bd33315d138d..65f27253906f 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -1977,15 +1977,18 @@ void Planner::buildPlanForQueryNode() addAdditionalFilterStepIfNeeded(query_plan, query_node, select_query_options, planner_context); } + const auto & client_info = query_context->getClientInfo(); + // Not all cases are supported here yet. E.g. for this query: // select * from remote('127.0.0.{1,2}', numbers_mt(1e6)) group by number // we will have `BlocksMarshallingStep` added to the query plan, but not for // select * from remote('127.0.0.{1,2}', numbers_mt(1e6)) // because `to_stage` for it will be `QueryProcessingStage::Complete`. if (query_context->getSettingsRef()[Setting::enable_parallel_blocks_marshalling] - && query_context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY + && client_info.query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && select_query_options.to_stage != QueryProcessingStage::Complete // Don't do it for INSERT SELECT, for example - && query_context->getClientInfo().distributed_depth <= 1 // Makes sense for higher depths too, just not supported + && client_info.distributed_depth <= 1 // Makes sense for higher depths too, just not supported + && !client_info.is_replicated_database_internal ) query_plan.addStep(std::make_unique(query_plan.getCurrentHeader())); From 438a73f564128592e7cf735722d59a9fa65890a0 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 15 Oct 2025 16:14:09 +0000 Subject: [PATCH 022/112] Backport #87029 to 25.8: Fix auto cluster functions schema handling --- src/TableFunctions/TableFunctionURL.cpp | 8 ++++---- ...cluster_functions_with_parallel_replicas.reference | 1 + ..._auto_cluster_functions_with_parallel_replicas.sql | 11 ++++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index 0cbacdf42a16..2698128f5723 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -111,11 +111,11 @@ StoragePtr TableFunctionURL::getStorage( return std::make_shared( global_context, parallel_replicas_cluster_name, - filename, - format, - compression_method, + source, + format_, + compression_method_, StorageID(getDatabaseName(), table_name), - columns, + getActualTableStructure(global_context, true), ConstraintsDescription{}, configuration); } diff --git a/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.reference b/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.reference index e60ede2aeaab..3fcc0142b38a 100644 --- a/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.reference +++ b/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.reference @@ -33,6 +33,7 @@ Expression ((Project names + Projection)) ReadFromObjectStorage 4 4 +4 Expression ((Project 
names + (Projection + Change column names to column identifiers))) ReadFromURL Expression ((Project names + (Projection + Change column names to column identifiers))) diff --git a/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.sql b/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.sql index 0814052a3811..92e0e112625c 100644 --- a/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.sql +++ b/tests/queries/0_stateless/03275_auto_cluster_functions_with_parallel_replicas.sql @@ -20,10 +20,15 @@ EXPLAIN SELECT number FROM system.numbers n JOIN (SELECT * FROM s3('http://local SELECT count() FROM s3('http://localhost:11111/test/a.tsv', 'TSV'); DROP TABLE IF EXISTS dupe_test_with_auto_functions; -CREATE TABLE dupe_test_with_auto_functions (c1 String, c2 String, c3 String) ENGINE = MergeTree ORDER BY c1; +CREATE TABLE dupe_test_with_auto_functions (n1 String, n2 String, n3 String) ENGINE = MergeTree ORDER BY n1; INSERT INTO dupe_test_with_auto_functions SELECT * FROM s3('http://localhost:11111/test/a.tsv', 'TSV'); SELECT count() FROM dupe_test_with_auto_functions; +DROP TABLE IF EXISTS insert_with_url_function; +CREATE TABLE insert_with_url_function (n1 String, n2 String, n3 String) ENGINE = MergeTree ORDER BY n1; +INSERT INTO insert_with_url_function SELECT * FROM url('http://localhost:11111/test/a.tsv', 'TSV'); +SELECT count() FROM insert_with_url_function; + SET parallel_replicas_for_cluster_engines=false; @@ -33,11 +38,11 @@ EXPLAIN SELECT * FROM s3('http://localhost:11111/test/a.tsv', 'TSV'); SELECT count() FROM s3('http://localhost:11111/test/a.tsv', 'TSV'); DROP TABLE IF EXISTS dupe_test_without_cluster_functions; -CREATE TABLE dupe_test_without_cluster_functions (c1 String, c2 String, c3 String) ENGINE = MergeTree ORDER BY c1; +CREATE TABLE dupe_test_without_cluster_functions (n1 String, n2 String, n3 String) ENGINE = MergeTree ORDER BY n1; INSERT INTO dupe_test_without_cluster_functions SELECT * FROM s3('http://localhost:11111/test/a.tsv', 'TSV'); SELECT count() FROM dupe_test_without_cluster_functions; DROP TABLE IF EXISTS dupe_test_with_cluster_function; -CREATE TABLE dupe_test_with_cluster_function (c1 String, c2 String, c3 String) ENGINE = MergeTree ORDER BY c1; +CREATE TABLE dupe_test_with_cluster_function (n1 String, n2 String, n3 String) ENGINE = MergeTree ORDER BY n1; INSERT INTO dupe_test_with_cluster_function SELECT * FROM s3Cluster('test_cluster_two_shards', 'http://localhost:11111/test/a.tsv', 'TSV'); SELECT count() FROM dupe_test_with_cluster_function; From 4e6df7f240f930c023a914a166783a11ad38f277 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 16 Oct 2025 12:17:45 +0000 Subject: [PATCH 023/112] Backport #88484 to 25.8: Dynamic `backups.max_attempts_after_bad_version` to work on big clusters --- src/Backups/BackupCoordinationStageSync.cpp | 3 +- .../concurrency_helper.py | 59 +++++++++++++ .../test_concurrency.py | 61 +++++--------- .../test_disallow_concurrency.py | 83 ++++--------------- .../test_huge_concurrent_restore.py | 78 +++++++++++++++++ 5 files changed, 173 insertions(+), 111 deletions(-) create mode 100644 tests/integration/test_backup_restore_on_cluster/concurrency_helper.py create mode 100644 tests/integration/test_backup_restore_on_cluster/test_huge_concurrent_restore.py diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 60b06be31226..b86c4309e104 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ 
b/src/Backups/BackupCoordinationStageSync.cpp @@ -115,7 +115,8 @@ BackupCoordinationStageSync::BackupCoordinationStageSync( , failure_after_host_disconnected_for_seconds(with_retries.getKeeperSettings().failure_after_host_disconnected_for_seconds) , finish_timeout_after_error(with_retries.getKeeperSettings().finish_timeout_after_error) , sync_period_ms(with_retries.getKeeperSettings().sync_period_ms) - , max_attempts_after_bad_version(with_retries.getKeeperSettings().max_attempts_after_bad_version) + // all_hosts.size() is added to max_attempts_after_bad_version since each host change the num_hosts node once, and it's a valid case + , max_attempts_after_bad_version(with_retries.getKeeperSettings().max_attempts_after_bad_version + all_hosts.size()) , zookeeper_path(zookeeper_path_) , root_zookeeper_path(zookeeper_path.parent_path().parent_path()) , operation_zookeeper_path(zookeeper_path.parent_path()) diff --git a/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py b/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py new file mode 100644 index 000000000000..0442357c5e62 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py @@ -0,0 +1,59 @@ +from pathlib import Path +from typing import Callable, List + +from helpers.cluster import ClickHouseCluster, ClickHouseInstance + + +def generate_cluster_def(file: str, num_nodes: int) -> str: + path = ( + Path(__file__).parent / f"_gen/cluster_{Path(file).stem}_{num_nodes}_nodes.xml" + ) + path.parent.mkdir(parents=True, exist_ok=True) + replicas = "\n".join( + f""" + node{i} + 9000 + """ + for i in range(num_nodes) + ) + path.write_text( + encoding="utf-8", + data=f""" + + + +{replicas} + + + +""", + ) + return str(path.absolute()) + + +def add_nodes_to_cluster( + cluster: ClickHouseCluster, + num_nodes: int, + main_configs: List[str], + user_configs: List[str], +) -> List[ClickHouseInstance]: + nodes = [ + cluster.add_instance( + f"node{i}", + main_configs=main_configs, + user_configs=user_configs, + external_dirs=["/backups/"], + macros={"replica": f"node{i}", "shard": "shard1"}, + with_zookeeper=True, + ) + for i in range(num_nodes) + ] + return nodes + + +def create_test_table(node: ClickHouseInstance) -> None: + node.query( + """CREATE TABLE tbl ON CLUSTER 'cluster' ( x UInt64 ) +ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}') +ORDER BY tuple()""" + ) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 304ae4af752d..e9354a27fdd2 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -1,50 +1,32 @@ import concurrent -import os.path import time from random import randint, random +from typing import List import pytest -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, ClickHouseInstance from helpers.test_tools import TSV, assert_eq_with_retry +from .concurrency_helper import ( + add_nodes_to_cluster, + create_test_table, + generate_cluster_def, +) + cluster = ClickHouseCluster(__file__) num_nodes = 10 -def generate_cluster_def(): - path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "./_gen/cluster_for_concurrency_test.xml", - ) - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "w") as f: - f.write("\n\t\n\t\t\n\t\t\t\n") - for i in range(num_nodes): - f.write( - 
f"\t\t\t\t\n\t\t\t\t\tnode{i}\n\t\t\t\t\t9000\n\t\t\t\t\n" - ) - f.write("\t\t\t\n\t\t\n\t\n") - return path - - -main_configs = ["configs/backups_disk.xml", generate_cluster_def()] +main_configs = [ + "configs/backups_disk.xml", + generate_cluster_def(__file__, num_nodes), +] # No [Zoo]Keeper retries for tests with concurrency user_configs = ["configs/allow_database_types.xml"] -nodes = [] -for i in range(num_nodes): - nodes.append( - cluster.add_instance( - f"node{i}", - main_configs=main_configs, - user_configs=user_configs, - external_dirs=["/backups/"], - macros={"replica": f"node{i}", "shard": "shard1"}, - with_zookeeper=True, - ) - ) +nodes = add_nodes_to_cluster(cluster, num_nodes, main_configs, user_configs) node0 = nodes[0] @@ -70,23 +52,18 @@ def drop_after_test(): backup_id_counter = 0 +def create_and_fill_table() -> None: + create_test_table(node0) + for i, node in enumerate(nodes): + node.query(f"INSERT INTO tbl VALUES ({i})") + + def new_backup_name(): global backup_id_counter backup_id_counter += 1 return f"Disk('backups', '{backup_id_counter}')" -def create_and_fill_table(): - node0.query( - "CREATE TABLE tbl ON CLUSTER 'cluster' (" - "x Int32" - ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" - "ORDER BY tuple()" - ) - for i in range(num_nodes): - nodes[i].query(f"INSERT INTO tbl VALUES ({i})") - - expected_sum = num_nodes * (num_nodes - 1) // 2 diff --git a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py index 3dea986e3d97..6913a359d8da 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py @@ -1,72 +1,24 @@ import concurrent -import os.path -import re -import time -from random import randint import pytest -from helpers.cluster import ClickHouseCluster -from helpers.test_tools import TSV, assert_eq_with_retry +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.test_tools import assert_eq_with_retry + +from .concurrency_helper import ( + add_nodes_to_cluster, + create_test_table, + generate_cluster_def, +) cluster = ClickHouseCluster(__file__) num_nodes = 2 - -def generate_cluster_def(): - path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "./_gen/cluster_for_test_disallow_concurrency.xml", - ) - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "w") as f: - f.write( - """ - - - - - """ - ) - for i in range(num_nodes): - f.write( - """ - - node""" - + str(i) - + """ - 9000 - - """ - ) - f.write( - """ - - - - - """ - ) - return path - - -main_configs = ["configs/disallow_concurrency.xml", generate_cluster_def()] # No [Zoo]Keeper retries for tests with concurrency user_configs = ["configs/allow_database_types.xml"] -nodes = [] -for i in range(num_nodes): - nodes.append( - cluster.add_instance( - f"node{i}", - main_configs=main_configs, - user_configs=user_configs, - external_dirs=["/backups/"], - macros={"replica": f"node{i}", "shard": "shard1"}, - with_zookeeper=True, - ) - ) +nodes = add_nodes_to_cluster(cluster, num_nodes, main_configs, user_configs) node0 = nodes[0] @@ -96,23 +48,18 @@ def drop_after_test(): backup_id_counter = 0 +def create_and_fill_table() -> None: + create_test_table(node0) + for node in nodes: + node.query("INSERT INTO tbl SELECT number FROM numbers(40000000)") + + def new_backup_name(): global backup_id_counter backup_id_counter += 1 
return f"Disk('backups', '{backup_id_counter}')" -def create_and_fill_table(): - node0.query( - "CREATE TABLE tbl ON CLUSTER 'cluster' (" - "x UInt64" - ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" - "ORDER BY x" - ) - for i in range(num_nodes): - nodes[i].query(f"INSERT INTO tbl SELECT number FROM numbers(40000000)") - - def get_status_and_error(node, backup_or_restore_id): return ( node.query( diff --git a/tests/integration/test_backup_restore_on_cluster/test_huge_concurrent_restore.py b/tests/integration/test_backup_restore_on_cluster/test_huge_concurrent_restore.py new file mode 100644 index 000000000000..a6474c7a93d7 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/test_huge_concurrent_restore.py @@ -0,0 +1,78 @@ +from typing import List + +import pytest + +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.test_tools import TSV + +from .concurrency_helper import ( + add_nodes_to_cluster, + create_test_table, + generate_cluster_def, +) + +cluster = ClickHouseCluster(__file__) + +# Testing backups.max_attempts_after_bad_version is dynamic, and depends on num_nodes +num_nodes = 20 + +main_configs = [ + "configs/backups_disk.xml", + generate_cluster_def(__file__, num_nodes), +] +# No [Zoo]Keeper retries for tests with concurrency +user_configs = ["configs/allow_database_types.xml"] + +nodes = add_nodes_to_cluster(cluster, num_nodes, main_configs, user_configs) + +node0 = nodes[0] + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(autouse=True) +def drop_after_test(): + try: + yield + finally: + node0.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' SYNC") + node0.query("DROP DATABASE IF EXISTS mydb ON CLUSTER 'cluster' SYNC") + + +backup_id_counter = 0 + + +def new_backup_name(): + global backup_id_counter + backup_id_counter += 1 + return f"Disk('backups', '{backup_id_counter}')" + + +def create_and_fill_table() -> None: + create_test_table(node0) + for i, node in enumerate(nodes): + node.query(f"INSERT INTO tbl VALUES ({i})") + + +expected_sum = num_nodes * (num_nodes - 1) // 2 + + +def test_backup_restore_huge_cluster(): + create_and_fill_table() + + backup_name = new_backup_name() + node0.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}") + + node0.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC") + node0.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node0.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") + + for i in range(num_nodes): + assert nodes[i].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) From a39fb565e6f3fe53cc71ce20727088f6a7ad8893 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 16 Oct 2025 12:19:37 +0000 Subject: [PATCH 024/112] Backport #88154 to 25.8: Cleanup stale replicas from DDL Worker replicas set --- src/Interpreters/DDLWorker.cpp | 27 +++++++++++++++++++++++++++ src/Interpreters/DDLWorker.h | 1 + 2 files changed, 28 insertions(+) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 5590029fc66f..b1922a694919 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1396,6 +1396,32 @@ void DDLWorker::markReplicasActive(bool /*reinitialized*/) } } +void DDLWorker::cleanupStaleReplicas(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper) +{ + auto replicas = zookeeper->getChildren(replicas_dir); + static constexpr Int64 REPLICA_MAX_INACTIVE_SECONDS = 
86400; + for (const auto & replica : replicas) + { + auto replica_path = fs::path(replicas_dir) / replica; + auto responses = zookeeper->tryGet({replica_path, fs::path(replica_path) / "active"}); + /// Replica not active + if (responses[1].error == Coordination::Error::ZNONODE) + { + auto stat = responses[0].stat; + /// Replica was not active for too long, let's cleanup to avoid polluting Keeper with + /// removed replicas + if (stat.mtime / 1000 + REPLICA_MAX_INACTIVE_SECONDS < current_time_seconds) + { + LOG_INFO(log, "Replica {} is stale, removing it", replica); + auto code = zookeeper->tryRemove(replica_path, -1); + if (code != Coordination::Error::ZOK) + LOG_WARNING(log, "Cannot remove stale replica {}, code {}", replica, Coordination::errorMessage(code)); + } + } + } + +} + void DDLWorker::runCleanupThread() { setThreadName("DDLWorkerClnr"); @@ -1423,6 +1449,7 @@ void DDLWorker::runCleanupThread() continue; cleanupQueue(current_time_seconds, zookeeper); + cleanupStaleReplicas(current_time_seconds, zookeeper); last_cleanup_time_seconds = current_time_seconds; } catch (...) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 6717040fa033..abf0a9f84098 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -157,6 +157,7 @@ class DDLWorker /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); + void cleanupStaleReplicas(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat); /// Init task node From 5200b9cbac5826a7b0d44dfb8466fc18cf7a9c58 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 16 Oct 2025 18:15:01 +0000 Subject: [PATCH 025/112] Backport #87958 to 25.8: Don't remove injective functions from GROUP BY if arguments types are not allowed in GROUP BY --- .../OptimizeGroupByInjectiveFunctionsPass.cpp | 27 ++++++++++++++++++- src/Interpreters/TreeOptimizer.cpp | 7 +++++ ...injective_functoon_bad_arguments.reference | 23 ++++++++++++++++ ...up_by_injective_functoon_bad_arguments.sql | 8 ++++++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.reference create mode 100644 tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.sql diff --git a/src/Analyzer/Passes/OptimizeGroupByInjectiveFunctionsPass.cpp b/src/Analyzer/Passes/OptimizeGroupByInjectiveFunctionsPass.cpp index 9e1c21375fae..5293ba0bf860 100644 --- a/src/Analyzer/Passes/OptimizeGroupByInjectiveFunctionsPass.cpp +++ b/src/Analyzer/Passes/OptimizeGroupByInjectiveFunctionsPass.cpp @@ -13,6 +13,7 @@ namespace Setting { extern const SettingsBool group_by_use_nulls; extern const SettingsBool optimize_injective_functions_in_group_by; + extern const SettingsBool allow_suspicious_types_in_group_by; } namespace @@ -88,7 +89,8 @@ class OptimizeGroupByInjectiveFunctionsVisitor : public InDepthQueryTreeVisitorW // Aggregate functions are not allowed in GROUP BY clause auto function = function_node->getFunctionOrThrow(); - bool can_be_eliminated = function->isInjective(function_node->getArgumentColumns()); + auto arguments = function_node->getArgumentColumns(); + bool can_be_eliminated = function->isInjective(arguments) && isValidGroupByKeyTypes(arguments); if (can_be_eliminated) { @@ -106,6 +108,29 @@ class OptimizeGroupByInjectiveFunctionsVisitor : public InDepthQueryTreeVisitorW grouping_set = 
std::move(new_group_by_keys); } + + bool isValidGroupByKeyTypes(const ColumnsWithTypeAndName & columns) const + { + if (getContext()->getSettingsRef()[Setting::allow_suspicious_types_in_group_by]) + return true; + + bool is_valid = true; + auto check = [&](const IDataType & type) + { + /// Dynamic and Variant types are not allowed in GROUP BY by default. + is_valid &= !isDynamic(type) && !isVariant(type); + }; + + for (const auto & column : columns) + { + check(*column.type); + column.type->forEachChild(check); + if (!is_valid) + break; + } + + return is_valid; + } }; } diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 7f794c3e90e8..c56b65ba7c69 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -60,6 +60,7 @@ namespace Setting extern const SettingsBool optimize_redundant_functions_in_order_by; extern const SettingsBool optimize_rewrite_array_exists_to_has; extern const SettingsBool optimize_or_like_chain; + extern const SettingsBool optimize_injective_functions_in_group_by; } namespace ErrorCodes @@ -132,6 +133,12 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context) { if (const auto * function = group_exprs[i]->as()) { + if (!settings[Setting::optimize_injective_functions_in_group_by]) + { + ++i; + continue; + } + /// assert function is injective if (possibly_injective_function_names.contains(function->name)) { diff --git a/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.reference b/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.reference new file mode 100644 index 000000000000..95b069a1550e --- /dev/null +++ b/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.reference @@ -0,0 +1,23 @@ +1 str +1 str +1 str +QUERY id: 0 + PROJECTION COLUMNS + count() UInt64 + toString(json.a) String + PROJECTION + LIST id: 1, nodes: 2 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + FUNCTION id: 3, function_name: toString, function_type: ordinary, result_type: String + ARGUMENTS + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: json.a, result_type: Dynamic, source_id: 6 + JOIN TREE + TABLE id: 6, alias: __table1, table_name: default.test + GROUP BY + LIST id: 7, nodes: 1 + FUNCTION id: 8, function_name: toString, function_type: ordinary, result_type: String + ARGUMENTS + LIST id: 9, nodes: 1 + COLUMN id: 10, column_name: json.a, result_type: Dynamic, source_id: 6 + SETTINGS enable_analyzer=1 optimize_injective_functions_in_group_by=1 diff --git a/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.sql b/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.sql new file mode 100644 index 000000000000..0cacbda282bd --- /dev/null +++ b/tests/queries/0_stateless/03641_group_by_injective_functoon_bad_arguments.sql @@ -0,0 +1,8 @@ +create table test (json JSON) engine=MergeTree order by tuple(); +insert into test select '{"a" : "str"}'; +select count(), toString(json.a) from test group by toString(json.a) settings enable_analyzer=0, optimize_injective_functions_in_group_by=0; +select count(), toString(json.a) from test group by toString(json.a) settings enable_analyzer=1, optimize_injective_functions_in_group_by=0; +select count(), toString(json.a) from test group by toString(json.a) settings enable_analyzer=1, optimize_injective_functions_in_group_by=1; +explain query tree select count(), toString(json.a) from test group by toString(json.a) settings 
enable_analyzer=1, optimize_injective_functions_in_group_by=1; +drop table test; + From dcdda50c7e316baa250049db9cc8d7fb5d490413 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 17 Oct 2025 08:00:24 +0200 Subject: [PATCH 026/112] Fix wrong conflict resolution --- .../test_disallow_concurrency.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py index 6913a359d8da..fb985b61ac88 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py @@ -15,6 +15,11 @@ num_nodes = 2 + +main_configs = [ + "configs/disallow_concurrency.xml", + generate_cluster_def(__file__, num_nodes), +] # No [Zoo]Keeper retries for tests with concurrency user_configs = ["configs/allow_database_types.xml"] From 976a55951cf6669e1c51944f513407aa5681ada8 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 17 Oct 2025 00:00:50 +0200 Subject: [PATCH 027/112] Reuse cluster_conf in backup_restore_on_cluster --- .../concurrency_helper.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py b/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py index 0442357c5e62..522e75b53b81 100644 --- a/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py +++ b/tests/integration/test_backup_restore_on_cluster/concurrency_helper.py @@ -1,14 +1,15 @@ from pathlib import Path -from typing import Callable, List +from typing import List from helpers.cluster import ClickHouseCluster, ClickHouseInstance def generate_cluster_def(file: str, num_nodes: int) -> str: + # For multiple workers, it has race and sometimes errors out, + # so we generate it once and reuse path = ( Path(__file__).parent / f"_gen/cluster_{Path(file).stem}_{num_nodes}_nodes.xml" ) - path.parent.mkdir(parents=True, exist_ok=True) replicas = "\n".join( f""" node{i} @@ -16,9 +17,7 @@ def generate_cluster_def(file: str, num_nodes: int) -> str: """ for i in range(num_nodes) ) - path.write_text( - encoding="utf-8", - data=f""" + config = f""" @@ -26,7 +25,15 @@ def generate_cluster_def(file: str, num_nodes: int) -> str: -""", +""" + if path.is_file(): + existing = path.read_text(encoding="utf-8") + if existing == config: + return str(path.absolute()) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + encoding="utf-8", + data=config, ) return str(path.absolute()) From 5426bfcb1eb5635382b3d36259a0b4a99979cc27 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 17 Oct 2025 08:16:09 +0000 Subject: [PATCH 028/112] Backport #88605 to 25.8: Fix potential crash caused by concurrent mutation of underlying const PREWHERE columns --- src/Storages/MergeTree/MergeTreeReadTask.cpp | 7 ++++++- .../03680_mergetree_shrink_const_from_prewhere.reference | 3 +++ .../03680_mergetree_shrink_const_from_prewhere.sql | 9 +++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.reference create mode 100644 tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.sql diff --git a/src/Storages/MergeTree/MergeTreeReadTask.cpp b/src/Storages/MergeTree/MergeTreeReadTask.cpp index 2da717539d77..0e8b05ba7957 100644 --- 
a/src/Storages/MergeTree/MergeTreeReadTask.cpp +++ b/src/Storages/MergeTree/MergeTreeReadTask.cpp @@ -255,7 +255,12 @@ MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read() if (read_result.num_rows != 0) { for (const auto & column : read_result.columns) - column->assumeMutableRef().shrinkToFit(); + { + /// We may have columns that has other references, usually it is a constant column that has been created during analysis + /// (that will not be const here anymore, i.e. after materialize()), and we do not need to shrink it anyway. + if (column->use_count() == 1) + column->assumeMutableRef().shrinkToFit(); + } block = sample_block.cloneWithColumns(read_result.columns); } diff --git a/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.reference b/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.reference new file mode 100644 index 000000000000..01e79c32a8c9 --- /dev/null +++ b/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.reference @@ -0,0 +1,3 @@ +1 +2 +3 diff --git a/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.sql b/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.sql new file mode 100644 index 000000000000..eaf1ee51216a --- /dev/null +++ b/tests/queries/0_stateless/03680_mergetree_shrink_const_from_prewhere.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS const_node; +CREATE TABLE const_node (`v` Nullable(UInt8)) ENGINE = MergeTree ORDER BY tuple(); +SYSTEM STOP MERGES const_node; +INSERT INTO const_node VALUES (1); +INSERT INTO const_node VALUES (2); +INSERT INTO const_node VALUES (3); +-- Here we have condition with a constant "materialize(255)", for which convertToFullColumnIfConst() will return underlying column w/o copying, +-- and later shrinkToFit() will be called from multiple threads on this column, and leads to UB +SELECT v FROM const_node PREWHERE and(materialize(255), *) ORDER BY v; From bfa674c2cac73108a9427a447257aad5c459a193 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 17 Oct 2025 11:11:25 +0000 Subject: [PATCH 029/112] Backport #88668 to 25.8: wrap `~PooledConnection` in try-catch --- .../Net/include/Poco/Net/HTTPChunkedStream.h | 2 +- base/poco/Net/src/HTTPChunkedStream.cpp | 4 +- src/Common/HTTPConnectionPool.cpp | 59 +++++++++++-------- .../tests/gtest_http_chunked_stream.cpp | 16 +++++ ...ed_losing_files_after_exception.reference} | 0 ...buted_losing_files_after_exception.sql.j2} | 0 6 files changed, 52 insertions(+), 29 deletions(-) create mode 100644 src/Common/tests/gtest_http_chunked_stream.cpp rename tests/queries/0_stateless/{02537_distributed_loosing_files_after_exception.reference => 02537_distributed_losing_files_after_exception.reference} (100%) rename tests/queries/0_stateless/{02537_distributed_loosing_files_after_exception.sql.j2 => 02537_distributed_losing_files_after_exception.sql.j2} (100%) diff --git a/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h b/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h index d5263319ed38..604e66da5786 100644 --- a/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h +++ b/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h @@ -45,7 +45,7 @@ namespace Net ~HTTPChunkedStreamBuf(); void close(); - bool isComplete(bool read_from_device_to_check_eof = false); + bool isComplete(bool read_from_device_to_check_eof = false) noexcept; protected: int readFromDevice(char * buffer, std::streamsize length); diff --git a/base/poco/Net/src/HTTPChunkedStream.cpp b/base/poco/Net/src/HTTPChunkedStream.cpp index 
043c38ce9e71..ca0b73128ff4 100644 --- a/base/poco/Net/src/HTTPChunkedStream.cpp +++ b/base/poco/Net/src/HTTPChunkedStream.cpp @@ -140,7 +140,7 @@ int HTTPChunkedStreamBuf::readFromDevice(char* buffer, std::streamsize length) } -bool HTTPChunkedStreamBuf::isComplete(bool read_from_device_to_check_eof) +bool HTTPChunkedStreamBuf::isComplete(bool read_from_device_to_check_eof) noexcept { if (read_from_device_to_check_eof) { @@ -150,7 +150,7 @@ bool HTTPChunkedStreamBuf::isComplete(bool read_from_device_to_check_eof) /// "Unexpected EOF" exception would be thrown readFromDevice(nullptr, 0); } - catch (Poco::Net::MessageException &) + catch (...) { return false; } diff --git a/src/Common/HTTPConnectionPool.cpp b/src/Common/HTTPConnectionPool.cpp index 33a85b3a7c09..a6db50c05455 100644 --- a/src/Common/HTTPConnectionPool.cpp +++ b/src/Common/HTTPConnectionPool.cpp @@ -438,38 +438,45 @@ class EndpointConnectionPool : public std::enable_shared_from_this(response_stream)) + if (bool(response_stream)) { - response_stream_completed = fixed_steam->isComplete(); + if (auto * fixed_steam = dynamic_cast(response_stream)) + { + response_stream_completed = fixed_steam->isComplete(); + } + else if (auto * chunked_steam = dynamic_cast(response_stream)) + { + response_stream_completed = chunked_steam->isComplete(); + } + else if (auto * http_stream = dynamic_cast(response_stream)) + { + response_stream_completed = http_stream->isComplete(); + } + else + { + response_stream_completed = false; + } } - else if (auto * chunked_steam = dynamic_cast(response_stream)) - { - response_stream_completed = chunked_steam->isComplete(); - } - else if (auto * http_stream = dynamic_cast(response_stream)) - { - response_stream_completed = http_stream->isComplete(); - } - else - { - response_stream_completed = false; - } - } - response_stream = nullptr; - Session::setSendDataHooks(); - Session::setReceiveDataHooks(); - Session::setSendThrottler(); - Session::setReceiveThrottler(); + response_stream = nullptr; + Session::setSendDataHooks(); + Session::setReceiveDataHooks(); + Session::setSendThrottler(); + Session::setReceiveThrottler(); - group->atConnectionDestroy(); + group->atConnectionDestroy(); - if (!isExpired) - if (auto lock = pool.lock()) - lock->atConnectionDestroy(*this); + if (!isExpired) + if (auto lock = pool.lock()) + lock->atConnectionDestroy(*this); - CurrentMetrics::sub(metrics.active_count); + CurrentMetrics::sub(metrics.active_count); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } private: diff --git a/src/Common/tests/gtest_http_chunked_stream.cpp b/src/Common/tests/gtest_http_chunked_stream.cpp new file mode 100644 index 000000000000..c737f4794105 --- /dev/null +++ b/src/Common/tests/gtest_http_chunked_stream.cpp @@ -0,0 +1,16 @@ +#include + +#include +#include + + +TEST(HTTPChunkedStreamBuf, IsCompleteHandlesInvalidSocketException) +{ + Poco::Net::HTTPClientSession session; + Poco::Net::HTTPChunkedStreamBuf buf(session, std::ios::in); + + /// Default-initialized socket throws InvalidSocketException in SocketImpl::receiveBytes, + /// which HTTPChunkedStreamBuf::isComplete should swallow and return false. 
+ bool complete = buf.isComplete(true); + ASSERT_FALSE(complete); +} diff --git a/tests/queries/0_stateless/02537_distributed_loosing_files_after_exception.reference b/tests/queries/0_stateless/02537_distributed_losing_files_after_exception.reference similarity index 100% rename from tests/queries/0_stateless/02537_distributed_loosing_files_after_exception.reference rename to tests/queries/0_stateless/02537_distributed_losing_files_after_exception.reference diff --git a/tests/queries/0_stateless/02537_distributed_loosing_files_after_exception.sql.j2 b/tests/queries/0_stateless/02537_distributed_losing_files_after_exception.sql.j2 similarity index 100% rename from tests/queries/0_stateless/02537_distributed_loosing_files_after_exception.sql.j2 rename to tests/queries/0_stateless/02537_distributed_losing_files_after_exception.sql.j2 From 11398954ad12e9f659b3727930307162fffa5de2 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 17 Oct 2025 14:12:12 +0000 Subject: [PATCH 030/112] Backport #88513 to 25.8: Keeper improvement: add config for checking node ACL on removal --- src/Coordination/CoordinationSettings.cpp | 8 +- src/Coordination/KeeperStorage.cpp | 7 ++ .../test_keeper_remove_acl/__init__.py | 0 .../configs/check_node_acl_on_remove.xml | 7 ++ .../configs/enable_keeper1.xml | 25 ++++++ .../test_keeper_remove_acl/test.py | 89 +++++++++++++++++++ 6 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_keeper_remove_acl/__init__.py create mode 100644 tests/integration/test_keeper_remove_acl/configs/check_node_acl_on_remove.xml create mode 100644 tests/integration/test_keeper_remove_acl/configs/enable_keeper1.xml create mode 100644 tests/integration/test_keeper_remove_acl/test.py diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 7eb50ea7ac8b..172844e8ea5b 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -69,7 +69,8 @@ namespace ErrorCodes DECLARE(UInt64, log_slow_total_threshold_ms, 5000, "Requests for which the total latency is larger than this settings will be logged", 0) \ DECLARE(UInt64, log_slow_cpu_threshold_ms, 100, "Requests for which the CPU (preprocessing and processing) latency is larger than this settings will be logged", 0) \ DECLARE(UInt64, log_slow_connection_operation_threshold_ms, 1000, "Log message if a certain operation took too long inside a single connection", 0) \ - DECLARE(Bool, use_xid_64, false, "Enable 64-bit XID. It is disabled by default because of backward compatibility", 0) + DECLARE(Bool, use_xid_64, false, "Enable 64-bit XID. It is disabled by default because of backward compatibility", 0) \ + DECLARE(Bool, check_node_acl_on_remove, false, "When trying to remove a node, check ACLs from both the node itself and the parent node. 
If disabled, default behaviour will be used where only ACL from the parent node is checked", 0) \ DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) @@ -277,6 +278,11 @@ void KeeperConfigurationAndSettings::dump(WriteBufferFromOwnString & buf) const write_int(coordination_settings[CoordinationSetting::log_slow_cpu_threshold_ms]); writeText("log_slow_connection_operation_threshold_ms=", buf); write_int(coordination_settings[CoordinationSetting::log_slow_connection_operation_threshold_ms]); + + writeText("use_xid_64=", buf); + write_bool(coordination_settings[CoordinationSetting::use_xid_64]); + writeText("check_node_acl_on_remove=", buf); + write_bool(coordination_settings[CoordinationSetting::check_node_acl_on_remove]); } KeeperConfigurationAndSettingsPtr diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index a5bd73fed4b5..cdac684740aa 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -52,6 +52,7 @@ namespace DB namespace CoordinationSetting { extern const CoordinationSettingsUInt64 log_slow_cpu_threshold_ms; + extern const CoordinationSettingsBool check_node_acl_on_remove; } namespace ErrorCodes @@ -1828,6 +1829,12 @@ processLocal(const Coordination::ZooKeeperGetRequest & zk_request, Storage & sto template bool checkAuth(const Coordination::ZooKeeperRemoveRequest & zk_request, Storage & storage, int64_t session_id, bool is_local) { + if (auto check_node_acl = storage.keeper_context->getCoordinationSettings()[CoordinationSetting::check_node_acl_on_remove]; + check_node_acl && !storage.checkACL(zk_request.getPath(), Coordination::ACL::Delete, session_id, is_local)) + { + return false; + } + return storage.checkACL(Coordination::parentNodePath(zk_request.getPath()), Coordination::ACL::Delete, session_id, is_local); } diff --git a/tests/integration/test_keeper_remove_acl/__init__.py b/tests/integration/test_keeper_remove_acl/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_keeper_remove_acl/configs/check_node_acl_on_remove.xml b/tests/integration/test_keeper_remove_acl/configs/check_node_acl_on_remove.xml new file mode 100644 index 000000000000..77b2a0bcf65c --- /dev/null +++ b/tests/integration/test_keeper_remove_acl/configs/check_node_acl_on_remove.xml @@ -0,0 +1,7 @@ + + + + 0 + + + diff --git a/tests/integration/test_keeper_remove_acl/configs/enable_keeper1.xml b/tests/integration/test_keeper_remove_acl/configs/enable_keeper1.xml new file mode 100644 index 000000000000..873facafb3c8 --- /dev/null +++ b/tests/integration/test_keeper_remove_acl/configs/enable_keeper1.xml @@ -0,0 +1,25 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node + 9234 + true + 3 + + + + diff --git a/tests/integration/test_keeper_remove_acl/test.py b/tests/integration/test_keeper_remove_acl/test.py new file mode 100644 index 000000000000..24f41c0fb1b1 --- /dev/null +++ b/tests/integration/test_keeper_remove_acl/test.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import pytest +import logging + +import helpers.keeper_utils as keeper_utils +from helpers.cluster import ClickHouseCluster + +from kazoo.security import make_digest_acl +from kazoo.exceptions import NoAuthError + + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance( + "node", + 
main_configs=["configs/enable_keeper1.xml", "configs/check_node_acl_on_remove.xml"], + stay_alive=True, +) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def wait_nodes(): + keeper_utils.wait_nodes(cluster, [node]) + + +def get_fake_zk(nodename, timeout=30.0): + return keeper_utils.get_fake_zk(cluster, nodename, timeout=timeout) + + +def stop_zk_connection(zk_conn): + zk_conn.stop() + zk_conn.close() + + +def test_server_restart(started_cluster): + try: + wait_nodes() + + node.stop_clickhouse() + node.replace_in_config( + "/etc/clickhouse-server/config.d/check_node_acl_on_remove.xml", "1", "0" + ) + node.start_clickhouse() + + def create_node_with_acl(): + node_zk = get_fake_zk("node") + node_zk.add_auth("digest", "clickhouse:password") + + if node_zk.exists("/test_acl_node"): + node_zk.delete("/test_acl_node") + + acl = make_digest_acl("clickhouse", "password", all=True) + node_zk.create("/test_acl_node", b"test_data", acl=[acl]) + stop_zk_connection(node_zk) + + def delete_node(): + node_zk = get_fake_zk("node") + node_zk.delete("/test_acl_node") + stop_zk_connection(node_zk) + + create_node_with_acl() + delete_node() + node.stop_clickhouse() + node.replace_in_config( + "/etc/clickhouse-server/config.d/check_node_acl_on_remove.xml", "0", "1" + ) + node.start_clickhouse() + + create_node_with_acl() + + with pytest.raises(NoAuthError): + delete_node() + finally: + try: + stop_zk_connection( + node_zk, + ) + except: + pass From 1145dbf7cbfd666077401bde5c81e40ff93fb10f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 17 Oct 2025 20:13:28 +0000 Subject: [PATCH 031/112] Backport #88746 to 25.8: Startup clickhouse-keeper on boot --- packages/clickhouse-keeper.postinstall | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/clickhouse-keeper.postinstall b/packages/clickhouse-keeper.postinstall index 2b4f303b6849..aab4596be52f 100644 --- a/packages/clickhouse-keeper.postinstall +++ b/packages/clickhouse-keeper.postinstall @@ -28,6 +28,25 @@ if [ "$1" = configure ] || [ -n "$not_deb_os" ]; then "${KEEPER_USER}" fi + if [ -x "/bin/systemctl" ] && [ -f /lib/systemd/system/clickhouse-keeper.service ] && [ -d /run/systemd/system ]; then + # if old rc.d service present - remove it + if [ -x "/etc/init.d/clickhouse-keeper" ] && [ -x "/usr/sbin/update-rc.d" ]; then + /usr/sbin/update-rc.d clickhouse-keeper remove + fi + + /bin/systemctl daemon-reload + /bin/systemctl enable clickhouse-keeper + else + # If you downgrading to version older than 1.1.54336 run: systemctl disable clickhouse-keeper + if [ -x "/etc/init.d/clickhouse-keeper" ]; then + if [ -x "/usr/sbin/update-rc.d" ]; then + /usr/sbin/update-rc.d clickhouse-keeper defaults 19 19 >/dev/null || exit $? 
+ else + echo # Other OS + fi + fi + fi + chown -R "${KEEPER_USER}:${KEEPER_GROUP}" "${KEEPER_CONFDIR}" chmod 0755 "${KEEPER_CONFDIR}" From 0e23fd010115f6f0f1585694eaf5804acea6aa56 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 18 Oct 2025 12:15:56 +0000 Subject: [PATCH 032/112] Backport #88617 to 25.8: cleanup temporary table entry from snapshot_detached_tables during table drop --- src/Databases/DatabaseMemory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Databases/DatabaseMemory.cpp b/src/Databases/DatabaseMemory.cpp index 266de24116b4..34e1c04491ba 100644 --- a/src/Databases/DatabaseMemory.cpp +++ b/src/Databases/DatabaseMemory.cpp @@ -87,6 +87,7 @@ void DatabaseMemory::dropTable( std::lock_guard lock{mutex}; table->is_dropped = true; create_queries.erase(table_name); + snapshot_detached_tables.erase(table_name); UUID table_uuid = table->getStorageID().uuid; if (table_uuid != UUIDHelpers::Nil) DatabaseCatalog::instance().removeUUIDMappingFinally(table_uuid); From c37f56ce1e72476950b50e5173f885a44baf7eb7 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 21 Oct 2025 10:12:53 +0000 Subject: [PATCH 033/112] Backport #88814 to 25.8: Catch exceptions when async logging fails to prevent program aborts --- src/Loggers/OwnSplitChannel.cpp | 107 ++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/src/Loggers/OwnSplitChannel.cpp b/src/Loggers/OwnSplitChannel.cpp index 749d8c8ac63f..dd2e6e9de74a 100644 --- a/src/Loggers/OwnSplitChannel.cpp +++ b/src/Loggers/OwnSplitChannel.cpp @@ -487,20 +487,40 @@ void OwnAsyncSplitChannel::runChannel(size_t i) while (is_open) { - log_notification(notification); - notification = queues[i]->waitDequeueMessage(); + try + { + log_notification(notification); + notification = queues[i]->waitDequeueMessage(); + } + catch (...) + { + const std::string & exception_message = getCurrentExceptionMessage(true); + writeRetry(STDERR_FILENO, "Cannot log message in OwnAsyncSplitChannel channel: "); + writeRetry(STDERR_FILENO, exception_message.data(), exception_message.size()); + writeRetry(STDERR_FILENO, "\n"); + } } - /// Flush everything before closing - log_notification(notification); - - /// We want to process only what's currently in the queue and not block other logging - auto queue = queues[i]->getCurrentQueueAndClear(); - while (!queue.empty()) + try { - notification = queue.front(); - queue.pop_front(); + /// Flush everything before closing log_notification(notification); + + /// We want to process only what's currently in the queue and not block other logging + auto queue = queues[i]->getCurrentQueueAndClear(); + while (!queue.empty()) + { + notification = queue.front(); + queue.pop_front(); + log_notification(notification); + } + } + catch (...) 
+ { + const std::string & exception_message = getCurrentExceptionMessage(true); + writeRetry(STDERR_FILENO, "Cannot flush messages in OwnAsyncSplitChannel channel: "); + writeRetry(STDERR_FILENO, exception_message.data(), exception_message.size()); + writeRetry(STDERR_FILENO, "\n"); } } @@ -530,40 +550,59 @@ void OwnAsyncSplitChannel::runTextLog() auto notification = text_log_queue.waitDequeueMessage(); while (is_open) { - if (flush_text_logs) + try { - auto text_log_locked = text_log.lock(); - if (!text_log_locked) - return; + if (flush_text_logs) + { + auto text_log_locked = text_log.lock(); + if (!text_log_locked) + return; - if (notification) - log_notification(notification, text_log_locked); + if (notification) + log_notification(notification, text_log_locked); - flush_queue(text_log_locked); + flush_queue(text_log_locked); - flush_text_logs = false; - flush_text_logs.notify_all(); + flush_text_logs = false; + flush_text_logs.notify_all(); + } + else if (notification) + { + auto text_log_locked = text_log.lock(); + if (!text_log_locked) + return; + log_notification(notification, text_log_locked); + } + + notification = text_log_queue.waitDequeueMessage(); } - else if (notification) + catch (...) { - auto text_log_locked = text_log.lock(); - if (!text_log_locked) - return; - log_notification(notification, text_log_locked); + const std::string & exception_message = getCurrentExceptionMessage(true); + writeRetry(STDERR_FILENO, "Cannot log message in OwnAsyncSplitChannel text log: "); + writeRetry(STDERR_FILENO, exception_message.data(), exception_message.size()); + writeRetry(STDERR_FILENO, "\n"); } - - notification = text_log_queue.waitDequeueMessage(); } - /// We want to flush everything already in the queue before closing so all messages are logged - auto text_log_locked = text_log.lock(); - if (!text_log_locked) - return; - - if (notification) - log_notification(notification, text_log_locked); + try + { + /// We want to flush everything already in the queue before closing so all messages are logged + auto text_log_locked = text_log.lock(); + if (!text_log_locked) + return; - flush_queue(text_log_locked); + if (notification) + log_notification(notification, text_log_locked); + flush_queue(text_log_locked); + } + catch (...) 
+ { + const std::string & exception_message = getCurrentExceptionMessage(true); + writeRetry(STDERR_FILENO, "Cannot flush queue in OwnAsyncSplitChannel text log: "); + writeRetry(STDERR_FILENO, exception_message.data(), exception_message.size()); + writeRetry(STDERR_FILENO, "\n"); + } } void OwnAsyncSplitChannel::setChannelProperty(const std::string & channel_name, const std::string & name, const std::string & value) From 5c7cc0157a9fbddaf6edb48382005b0f3e82816f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 21 Oct 2025 12:18:03 +0000 Subject: [PATCH 034/112] Backport #87789 to 25.8: Fix Insert Select with CTE --- src/Interpreters/InterpreterInsertQuery.cpp | 7 +++++++ .../03632_insert_select_cte_bug.reference | 1 + .../0_stateless/03632_insert_select_cte_bug.sql | 17 +++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 tests/queries/0_stateless/03632_insert_select_cte_bug.reference create mode 100644 tests/queries/0_stateless/03632_insert_select_cte_bug.sql diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index a0e9c1b6cccb..619a2799a0bf 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -80,6 +82,7 @@ namespace Setting extern const SettingsBool async_query_sending_for_remote; extern const SettingsBool async_socket_for_remote; extern const SettingsUInt64 max_distributed_depth; + extern const SettingsBool enable_global_with_statement; } namespace MergeTreeSetting @@ -760,6 +763,10 @@ InterpreterInsertQuery::distributedWriteIntoReplicatedMergeTreeFromClusterStorag { if (auto * select_query = select.list_of_selects->children.at(0)->as()) { + if (local_context->getSettingsRef()[Setting::enable_global_with_statement]) + ApplyWithAliasVisitor::visit(select.list_of_selects->children.at(0)); + ApplyWithSubqueryVisitor(local_context).visit(select.list_of_selects->children.at(0)); + JoinedTables joined_tables(Context::createCopy(local_context), *select_query); if (joined_tables.tablesCount() == 1) src_storage = joined_tables.getLeftTableStorage(); diff --git a/tests/queries/0_stateless/03632_insert_select_cte_bug.reference b/tests/queries/0_stateless/03632_insert_select_cte_bug.reference new file mode 100644 index 000000000000..d00491fd7e5b --- /dev/null +++ b/tests/queries/0_stateless/03632_insert_select_cte_bug.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/03632_insert_select_cte_bug.sql b/tests/queries/0_stateless/03632_insert_select_cte_bug.sql new file mode 100644 index 000000000000..0692f5d6b375 --- /dev/null +++ b/tests/queries/0_stateless/03632_insert_select_cte_bug.sql @@ -0,0 +1,17 @@ +SET enable_analyzer=1; -- parallel distributed insert select for replicated tables works only with analyzer +SET parallel_distributed_insert_select=2; +SET enable_global_with_statement=1; + +DROP TABLE IF EXISTS test_insert SYNC; + +CREATE TABLE test_insert (c1 String, c2 UInt8) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_03632/tables/test_insert', '{replica}') +ORDER BY (); + +INSERT INTO test_insert +WITH cte_test AS (SELECT '1234', 1) +SELECT * FROM cte_test; + +SELECT count() FROM test_insert; + +DROP TABLE test_insert; From 27f3f97c73bb7aafb9e1d4ab9f4a973cda784043 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 21 Oct 2025 12:19:39 +0000 Subject: [PATCH 035/112] Backport #86184 to 25.8: Fix inferring 
Date/DateTime/DateTime64 on dates that are out of supported range --- src/Common/DateLUT.cpp | 11 ++++ src/Common/DateLUT.h | 2 + src/Common/DateLUTImpl.h | 40 +++++++++++++ src/IO/ReadHelpers.cpp | 21 ++++++- src/IO/ReadHelpers.h | 52 ++++++++++++++--- src/IO/parseDateTimeBestEffort.cpp | 48 ++++++++++----- .../test_storage_kafka/test_batch_fast.py | 2 +- .../01186_conversion_to_nullable.reference | 6 +- .../01556_accurate_cast_or_null.reference | 6 +- tests/queries/0_stateless/02404_data.CSV | 10 ---- .../0_stateless/02404_data.CSVWithNames | 11 ---- .../0_stateless/02404_data.CustomSeparated | 10 ---- .../0_stateless/02404_data.JSONCompactEachRow | 10 ---- .../0_stateless/02404_data.JSONEachRow | 10 ---- tests/queries/0_stateless/02404_data.TSKV | 10 ---- tests/queries/0_stateless/02404_data.TSV | 10 ---- .../0_stateless/02404_data.TSVWithNames | 11 ---- tests/queries/0_stateless/02404_data.Values | 1 - ...rence_cache_respect_format_settings.sql.j2 | 1 + .../03149_asof_join_ddb_timestamps.reference | 8 +-- ...bad_date_and_datetimes_inference.reference | 49 ++++++++++++++++ ...03599_bad_date_and_datetimes_inference.sql | 58 +++++++++++++++++++ 22 files changed, 266 insertions(+), 121 deletions(-) delete mode 100644 tests/queries/0_stateless/02404_data.CSV delete mode 100644 tests/queries/0_stateless/02404_data.CSVWithNames delete mode 100644 tests/queries/0_stateless/02404_data.CustomSeparated delete mode 100644 tests/queries/0_stateless/02404_data.JSONCompactEachRow delete mode 100644 tests/queries/0_stateless/02404_data.JSONEachRow delete mode 100644 tests/queries/0_stateless/02404_data.TSKV delete mode 100644 tests/queries/0_stateless/02404_data.TSV delete mode 100644 tests/queries/0_stateless/02404_data.TSVWithNames delete mode 100644 tests/queries/0_stateless/02404_data.Values create mode 100644 tests/queries/0_stateless/03599_bad_date_and_datetimes_inference.reference create mode 100644 tests/queries/0_stateless/03599_bad_date_and_datetimes_inference.sql diff --git a/src/Common/DateLUT.cpp b/src/Common/DateLUT.cpp index ba7dd6602e48..39e8044f36d6 100644 --- a/src/Common/DateLUT.cpp +++ b/src/Common/DateLUT.cpp @@ -209,6 +209,11 @@ ExtendedDayNum makeDayNum(const DateLUTImpl & date_lut, Int16 year, UInt8 month, return date_lut.makeDayNum(year, month, day_of_month, default_error_day_num); } +std::optional tryToMakeDayNum(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month) +{ + return date_lut.tryToMakeDayNum(year, month, day_of_month); +} + Int64 makeDate(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month) { static_assert(std::same_as); @@ -221,6 +226,12 @@ Int64 makeDateTime(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 return date_lut.makeDateTime(year, month, day_of_month, hour, minute, second); } +std::optional tryToMakeDateTime(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second) +{ + static_assert(std::same_as); + return date_lut.tryToMakeDateTime(year, month, day_of_month, hour, minute, second); +} + const std::string & getDateLUTTimeZone(const DateLUTImpl & date_lut) { return date_lut.getTimeZone(); diff --git a/src/Common/DateLUT.h b/src/Common/DateLUT.h index 72992e3c70d2..ff025fd8036b 100644 --- a/src/Common/DateLUT.h +++ b/src/Common/DateLUT.h @@ -87,9 +87,11 @@ inline UInt64 timeInNanoseconds(std::chrono::time_point tryToMakeDayNum(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month); Int64 makeDate(const DateLUTImpl & date_lut, 
Int16 year, UInt8 month, UInt8 day_of_month); Int64 makeDateTime(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second); +std::optional tryToMakeDateTime(const DateLUTImpl & date_lut, Int16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second); const std::string & getDateLUTTimeZone(const DateLUTImpl & date_lut); UInt32 getDayNumOffsetEpoch(); diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 5c08916f8095..b9c058b60a85 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -1177,6 +1177,20 @@ class DateLUTImpl return LUTIndex{std::min(index, static_cast(DATE_LUT_SIZE - 1))}; } + std::optional tryToMakeLUTIndex(Int16 year, UInt8 month, UInt8 day_of_month) const + { + if (unlikely(year < DATE_LUT_MIN_YEAR || year > DATE_LUT_MAX_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31)) + return std::nullopt; + + auto year_lut_index = (year - DATE_LUT_MIN_YEAR) * 12 + month - 1; + UInt32 index = years_months_lut[year_lut_index].toUnderType() + day_of_month - 1; + + if (index >= DATE_LUT_SIZE) + return std::nullopt; + + return LUTIndex(index); + } + /// Create DayNum from year, month, day of month. ExtendedDayNum makeDayNum(Int16 year, UInt8 month, UInt8 day_of_month, Int32 default_error_day_num = 0) const { @@ -1186,6 +1200,18 @@ class DateLUTImpl return toDayNum(makeLUTIndex(year, month, day_of_month)); } + std::optional tryToMakeDayNum(Int16 year, UInt8 month, UInt8 day_of_month) const + { + if (unlikely(year < DATE_LUT_MIN_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31)) + return std::nullopt; + + auto index = tryToMakeLUTIndex(year, month, day_of_month); + if (!index) + return std::nullopt; + + return toDayNum(*index); + } + Time makeDate(Int16 year, UInt8 month, UInt8 day_of_month) const { return lut[makeLUTIndex(year, month, day_of_month)].date; @@ -1204,6 +1230,20 @@ class DateLUTImpl return lut[index].date + time_offset; } + std::optional