From 4dfc6a3fe1c1f7899993c5236775a5730e38a511 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 24 Sep 2025 21:24:44 +0000 Subject: [PATCH 1/2] Merge pull request #87528 from arthurpassos/do_not_put_hive_columns_in_format_header Do not put hive partition columns in format header - fix 87515 --- 03631_hive_columns_not_in_format_header.reference | 2 ++ 03631_hive_columns_not_in_format_header.sql | 13 +++++++++++++ src/Storages/prepareReadingFromFormat.cpp | 8 +++++++- 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 03631_hive_columns_not_in_format_header.reference create mode 100644 03631_hive_columns_not_in_format_header.sql diff --git a/03631_hive_columns_not_in_format_header.reference b/03631_hive_columns_not_in_format_header.reference new file mode 100644 index 000000000000..231eebbbb627 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.reference @@ -0,0 +1,2 @@ +1 +raw_blob String diff --git a/03631_hive_columns_not_in_format_header.sql b/03631_hive_columns_not_in_format_header.sql new file mode 100644 index 000000000000..895f7aa4dfc0 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.sql @@ -0,0 +1,13 @@ +-- Tags: no-parallel, no-fasttest, no-random-settings + +INSERT INTO FUNCTION s3( + s3_conn, + filename='03631', + format=Parquet, + partition_strategy='hive', + partition_columns_in_data_file=1) PARTITION BY (year, country) SELECT 'Brazil' as country, 2025 as year, 1 as id; + +-- distinct because minio isn't cleaned up +SELECT count(distinct year) FROM s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS use_hive_partitioning=1; + +DESCRIBE s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS use_hive_partitioning=1; diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 5a99021221af..7b4ae6c2fd24 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -86,7 +86,13 @@ ReadFromFormatInfo 
prepareReadingFromFormat( } /// Create header for InputFormat with columns that will be read from the data. - info.format_header = storage_snapshot->getSampleBlockForColumns(info.columns_description.getNamesOfPhysical()); + for (const auto & column : info.columns_description) + { + /// Never read hive partition columns from the data file. This fixes https://github.com/ClickHouse/ClickHouse/issues/87515 + if (!hive_parameters.hive_partition_columns_to_read_from_file_path_map.contains(column.name)) + info.format_header.insert(ColumnWithTypeAndName{column.type, column.name}); + } + info.serialization_hints = getSerializationHintsForFileLikeStorage(storage_snapshot->metadata, context); return info; } From f5182ca215773a9a58ed901327c29c4a13a50abd Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 26 Sep 2025 12:24:00 +0000 Subject: [PATCH 2/2] Merge pull request #87621 from arthurpassos/do_not_put_hive_columns_in_format_header Move 03631_hive_columns_not_in_format_header.sql to the proper file location --- src/Storages/HivePartitioningUtils.cpp | 3 ++- .../0_stateless/03203_hive_style_partitioning.reference | 2 +- .../03631_hive_columns_not_in_format_header.reference | 0 .../0_stateless/03631_hive_columns_not_in_format_header.sql | 0 4 files changed, 3 insertions(+), 2 deletions(-) rename 03631_hive_columns_not_in_format_header.reference => tests/queries/0_stateless/03631_hive_columns_not_in_format_header.reference (100%) rename 03631_hive_columns_not_in_format_header.sql => tests/queries/0_stateless/03631_hive_columns_not_in_format_header.sql (100%) diff --git a/src/Storages/HivePartitioningUtils.cpp b/src/Storages/HivePartitioningUtils.cpp index 2150fb55efb2..2f301e1f1f7a 100644 --- a/src/Storages/HivePartitioningUtils.cpp +++ b/src/Storages/HivePartitioningUtils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB { @@ -85,7 +86,7 @@ NamesAndTypesList extractHivePartitionColumnsFromPath( { if 
(const auto type = tryInferDataTypeByEscapingRule(value, format_settings ? *format_settings : getFormatSettings(context), FormatSettings::EscapingRule::Raw)) { - if (type->canBeInsideLowCardinality()) + if (type->canBeInsideLowCardinality() && isStringOrFixedString(type)) { hive_partition_columns_to_read_from_file_path.emplace_back(key, std::make_shared<DataTypeLowCardinality>(type)); } diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index a481cf0a28f1..85afdea228d2 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -31,7 +31,7 @@ Elizabeth Delgado Elizabeth Cross 42 2020-01-01 [1,2,3] 42.42 -Array(Int64) LowCardinality(Float64) +Array(Int64) Float64 101 2071 2071 diff --git a/03631_hive_columns_not_in_format_header.reference b/tests/queries/0_stateless/03631_hive_columns_not_in_format_header.reference similarity index 100% rename from 03631_hive_columns_not_in_format_header.reference rename to tests/queries/0_stateless/03631_hive_columns_not_in_format_header.reference diff --git a/03631_hive_columns_not_in_format_header.sql b/tests/queries/0_stateless/03631_hive_columns_not_in_format_header.sql similarity index 100% rename from 03631_hive_columns_not_in_format_header.sql rename to tests/queries/0_stateless/03631_hive_columns_not_in_format_header.sql