From 814efbeafbb32895053a3338c904152d597b52b3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Sep 2025 14:57:22 -0300 Subject: [PATCH 1/2] fix rawblob vs hive columns --- 03631_hive_columns_not_in_format_header.reference | 2 ++ 03631_hive_columns_not_in_format_header.sql | 13 +++++++++++++ src/Storages/prepareReadingFromFormat.cpp | 7 ++++++- 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 03631_hive_columns_not_in_format_header.reference create mode 100644 03631_hive_columns_not_in_format_header.sql diff --git a/03631_hive_columns_not_in_format_header.reference b/03631_hive_columns_not_in_format_header.reference new file mode 100644 index 000000000000..231eebbbb627 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.reference @@ -0,0 +1,2 @@ +1 +raw_blob String diff --git a/03631_hive_columns_not_in_format_header.sql b/03631_hive_columns_not_in_format_header.sql new file mode 100644 index 000000000000..895f7aa4dfc0 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.sql @@ -0,0 +1,13 @@ +-- Tags: no-parallel, no-fasttest, no-random-settings + +INSERT INTO FUNCTION s3( + s3_conn, + filename='03631', + format=Parquet, + partition_strategy='hive', + partition_columns_in_data_file=1) PARTITION BY (year, country) SELECT 'Brazil' as country, 2025 as year, 1 as id; + +-- distinct because minio isn't cleaned up +SELECT count(distinct year) FROM s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS use_hive_partitioning=1; + +DESCRIBE s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS use_hive_partitioning=1; diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 3b31a9b0d2ae..804e19da9e95 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -234,7 +234,12 @@ ReadFromFormatInfo prepareReadingFromFormat( } /// Create header for InputFormat with columns that will be read from the data. 
- info.format_header = storage_snapshot->getSampleBlockForColumns(info.columns_description.getNamesOfPhysical()); + for (const auto & column : columns_in_data_file) + { + /// Never read hive partition columns from the data file. This fixes https://github.com/ClickHouse/ClickHouse/issues/87515 + if (!hive_parameters.hive_partition_columns_to_read_from_file_path_map.contains(column.name)) + info.format_header.insert(ColumnWithTypeAndName{column.type, column.name}); + } info.serialization_hints = getSerializationHintsForFileLikeStorage(storage_snapshot->metadata, context); From 69062431fc0dbfb38fd0fd45defab36b12e3c6b3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Sep 2025 08:40:29 -0300 Subject: [PATCH 2/2] use info.columns_description instead of columns_in_data_file. This should not break formats that support reading a subset of columns --- src/Storages/prepareReadingFromFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 804e19da9e95..e84005511bb4 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -234,7 +234,7 @@ ReadFromFormatInfo prepareReadingFromFormat( } /// Create header for InputFormat with columns that will be read from the data. - for (const auto & column : columns_in_data_file) + for (const auto & column : info.columns_description) { /// Never read hive partition columns from the data file. This fixes https://github.com/ClickHouse/ClickHouse/issues/87515 if (!hive_parameters.hive_partition_columns_to_read_from_file_path_map.contains(column.name))