From d61094c23548cce9502264ee63083b60cf3ae9ab Mon Sep 17 00:00:00 2001 From: Pavel Kruglov <48961922+Avogar@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:13:08 +0000 Subject: [PATCH 1/2] Merge pull request #87863 from ilejn/non_uniq_arrow_dict ArrowStream processing crash if non unique dictionary --- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 10 ++++++++++ .../0_stateless/02904_arrow_dictionary_indexes.sh | 1 + .../data_arrow/non_unique_dict.arrowstream | Bin 0 -> 528 bytes 3 files changed, 11 insertions(+) create mode 100644 tests/queries/0_stateless/data_arrow/non_unique_dict.arrowstream diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 496468277c96..c59a2ab3e8f1 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1024,6 +1024,16 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( auto tmp_lc_column = lc_type->createColumn(); auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); dynamic_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); + size_t expected_dictionary_size = dict_column.column->size() + (dict_info.default_value_index == -1) + make_nullable_if_low_cardinality; + if (tmp_dict_column->size() != expected_dictionary_size) + { + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Expected Dictionary size {}, real Dictionary size is {}. The discrepancy probably caused by duplicated values", + expected_dictionary_size, + tmp_dict_column->size()); + } + dict_column.column = std::move(tmp_dict_column); dict_info.values = std::make_shared(std::move(dict_column)); dict_info.dictionary_size = arrow_dict_column->length(); diff --git a/tests/queries/0_stateless/02904_arrow_dictionary_indexes.sh b/tests/queries/0_stateless/02904_arrow_dictionary_indexes.sh index 3335008c120b..a7530727daeb 100755 --- a/tests/queries/0_stateless/02904_arrow_dictionary_indexes.sh +++ b/tests/queries/0_stateless/02904_arrow_dictionary_indexes.sh @@ -18,3 +18,4 @@ $CLICKHOUSE_LOCAL -q "select uniqExact(a) from file('$CLICKHOUSE_TMP/$CLICKHOUSE $CLICKHOUSE_LOCAL -q "select * from file('$CUR_DIR/data_arrow/different_dicts.arrowstream') order by x" +$CLICKHOUSE_LOCAL -q "select * from file('$CUR_DIR/data_arrow/non_unique_dict.arrowstream') -- { serverError INCORRECT_DATA }" diff --git a/tests/queries/0_stateless/data_arrow/non_unique_dict.arrowstream b/tests/queries/0_stateless/data_arrow/non_unique_dict.arrowstream new file mode 100644 index 0000000000000000000000000000000000000000..60f8f303ae54f964b1cfa2240e88d736f74a2dfd GIT binary patch literal 528 zcmZvZF%H5o3`Jc?X$2jMgiyzh9DOpMg#zx5{pNB#Q&X}F z@3q1!sV~Wp3bK|(H>#E^;{&TRg$gfqR^fv|c*hpZ z(;5CQW>XJf^$!1PbMy&RZg)EM`)zmNODme7xo+6cT=U<#hJn}BJdQE@&m2o+^`;&i U19$#d{|^eO=DSXs*Qzh=U&W&&2><{9 literal 0 HcmV?d00001 From 0f4309476768a34417adf1a7aced17191491f753 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Mon, 6 Oct 2025 22:14:06 +0000 Subject: [PATCH 2/2] Merge collision --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index c59a2ab3e8f1..0f8b9e73f9e6 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1024,7 +1024,7 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( auto tmp_lc_column = lc_type->createColumn(); auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); dynamic_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); - size_t expected_dictionary_size = dict_column.column->size() + (dict_info.default_value_index == -1) + make_nullable_if_low_cardinality; + size_t expected_dictionary_size = dict_column.column->size() + (dict_info.default_value_index == -1) + is_lc_nullable; if (tmp_dict_column->size() != expected_dictionary_size) { throw Exception(