From 84f2986621ec495776a7363af2a09313ed53ed6f Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Mon, 27 May 2024 10:51:27 +0800 Subject: [PATCH] [Fix](multi-catalog) Fix string dict filtering when use null related function in parquet and orc reader. (#35335) When a predicate on a dictionary-encoded string column involves NULL-related functions, as in the following SQL queries, the results are incorrect. ``` select * from ( select IF(o_orderpriority IS NULL, 'null', o_orderpriority) AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; ``` ``` select * from ( select IFNULL(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; ``` ``` select * from ( select COALESCE(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; ``` --- be/src/vec/exec/format/orc/vorc_reader.cpp | 21 ++- .../format/parquet/vparquet_group_reader.cpp | 21 ++- .../scripts/create_preinstalled_table.hql | 42 ++++++ .../test_string_dict_filter.orc | Bin 0 -> 1652 bytes .../test_string_dict_filter.parquet | Bin 0 -> 2292 bytes .../hive/test_string_dict_filter.out | 115 ++++++++++++++++ .../hive/test_string_dict_filter.groovy | 129 ++++++++++++++++++ 7 files changed, 306 insertions(+), 22 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet create mode 100644 regression-test/data/external_table_p0/hive/test_string_dict_filter.out create mode 100644 regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index a188c2e3eb5119..4a7944defee5b0 100644 --- 
a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1830,21 +1830,20 @@ bool OrcReader::_can_filter_by_dict(int slot_id) { return false; } - // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by dict. std::function visit_function_call = [&](const VExpr* expr) { + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we first disable dictionary filtering when predicate contains functions. + // Implementation of NULL value dictionary filtering will be carried out later. if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { - std::string is_null_str; - std::string function_name = expr->fn().name.function_name; - if (function_name.compare("is_null_pred") == 0 || - function_name.compare("is_not_null_pred") == 0) { + return false; + } + for (auto& child : expr->children()) { + if (!visit_function_call(child.get())) { return false; } - } else { - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } } return true; }; diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index f1e5dc42801336..2d8f9cbb48e698 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -197,21 +197,20 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id, return false; } - // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by dict. 
std::function visit_function_call = [&](const VExpr* expr) { + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we first disable dictionary filtering when predicate contains functions. + // Implementation of NULL value dictionary filtering will be carried out later. if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { - std::string is_null_str; - std::string function_name = expr->fn().name.function_name; - if (function_name.compare("is_null_pred") == 0 || - function_name.compare("is_not_null_pred") == 0) { + return false; + } + for (auto& child : expr->children()) { + if (!visit_function_call(child.get())) { return false; } - } else { - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } } return true; }; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql index e88516ffb6f148..765958527aaa8b 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql @@ -1808,6 +1808,48 @@ TBLPROPERTIES ( msck repair table string_col_dict_plain_mixed_orc; +CREATE TABLE `test_string_dict_filter_parquet`( + `o_orderkey` int, + `o_custkey` int, + `o_orderstatus` string, + `o_totalprice` decimal(15,2), + `o_orderdate` date, + `o_orderpriority` string, + `o_clerk` string, + `o_shippriority` int, + `o_comment` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' +OUTPUTFORMAT + 
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' +LOCATION + '/user/doris/preinstalled_data/parquet_table/test_string_dict_filter_parquet'; + +msck repair table test_string_dict_filter_parquet; + +CREATE TABLE `test_string_dict_filter_orc`( + `o_orderkey` int, + `o_custkey` int, + `o_orderstatus` string, + `o_totalprice` decimal(15,2), + `o_orderdate` date, + `o_orderpriority` string, + `o_clerk` string, + `o_shippriority` int, + `o_comment` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +LOCATION + '/user/doris/preinstalled_data/orc_table/test_string_dict_filter_orc'; + +msck repair table test_string_dict_filter_orc; + show tables; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc new file mode 100644 index 0000000000000000000000000000000000000000..30638e4117c92908cee3bc0253c3e176d3a76f1c GIT binary patch literal 1652 zcmZXPc~sK*8pgjrmVStcM5&OWBCcrenMvMEf^tLBv_`W?A=jp|lJ0fMo1sZwZPC=! zGWA-gZi;DElbX_u%&}rfZE@qeT$dyr%>|ROF3tQm^UrhM^Ld{4dCwaZ%v=iqb`24Q zL34qfc3g)=0sx3O44SwGgCJl@CS0&Z%gKLmqL$+@ooy(>m0kj=_PQ$1#E3s@NiU22B7-l4J{8l;F*I8GvU5x1Zr4Fr0Lh=IxX4^ zN1JjP{{7-ne3)gZL;6HnPi3M2>Ez_*vesv<#BFn7aR;dl{PkBr03^T#zX8Cy=!};- z;ec_DpG~wi`yyE|rk9x7Zo$#Pp4iJEra{K8$A(WAK#LuW@-*Xq;r6Ve-X+cLdfCtV zM7Ypzw*a+pBVrGuw+f2*8PJUZOO0>#`M8*I( zZwd|G#p3MUyeS}PpAP^h<-VIb!`uPVKZF@$Q7aNbF7LW})|Vv2^3Rzk@Uh@qvHmv! 
zJ`f1Z-$Rr-VRqI@p|XJ-D>s&Nk2gCd<#qUCQMo!fZaj@2b99Z>PP>;?4fv~ivv=>y zSIqn+!FB_5p4{`gaYd%%sevG#ea}WklWJaiKDxO zq0q(weeZkj-@!y(v`J<(b#I`!TDqb&=tz~1B7YKooHQ+MG2Y%*cbqO>PF4>H|NC^d z1Zm6&)ICx^8aQ00%YPF0u;y0m0xmtReo!MU%DqX?GHm~eql4^Ib2BrjCnb61mk_n* zcdTE2`2GdF=yzan>3J;G^i8Hh;o)BDfAu52!<6>FRxfiVR&rswYuivoYI2f9m^!s} z&*wWQIKNrN-}mlq4$qX<4)rchy;Y+QztO-`dv=;wJ#tT9iKu1SS5Y1_t+xAcRh<#F zseKc1*1jnP5BrlY#j{FkR<3;g6nK}|9uioxLAmg-M|q3L>x@4rn^7Kyj@*L0E6pno7i=;gp;-A*LgXmcYc?7DqF6^kh` ztLB7PSw-$IT}K=V^obC8e+>J3&Qo-khnSKhre#OMu*EV#1Ij#34*e+7%}yhh5sM`Y zyMOtrR?jcj>lvsXj4o9~mD|bn$t{#vB?F{1;rXmLo{@K+t|{-PTfRwnkaXHJD88d~ zA^aj))_mmqeEHTf5664<)ERa{P4T*Si6TjC;ap+AL1fZv#dP;p#;eY=?Cu*5Mb$$$ zaE0za9E6wP@!f_`p1eI9HT|!~{i41Av4-2g;5OkV#sfU3I{JOoLyzk2E)L%wJCJIk z`vWC{J6{2r^bIMTQN6h^SNAd`J4`@-`SRo2%nxTC2;b?`_T|^?u$knWCo5y)qcLa8 zA(h73>;m~v_T5Kv8Sav;r%FaYwHA5O-9j|<}OJ#6AW$7$$ z14}NedB&c;oX5HicIWRXS1CRB-8JGSFUxm1XjR1v8Hh@&-GPwXXYkGr#* zVoq?c5T_oXhpLKtf=dn@O0S^8fxmzY*B;&Du`jfH<(;AM@V4`R0A!%slzd zje`m$v_@;|G!Km&)e1&-h7e+86?p4pZOj6P3QuMgyP_6+v+&@6i!i&o3NXblW0=%R z2Crh*nXQns-xPw~KYR-?S-L`3_TU4+IL|)5ZUXu;UB)D{QIZX^rtBNa&2uazvjQ;( z^dKbyF|*LGyzx`Tzy_%--ae1V4(-pE(S7Ud@6duUwBjK0>-V2sWYj64Fn_!jGfFKO z77Jq*2|+i8{P;Q{OV=Vs*Xi{UBUGxsvA6$`hEr{PcyQ~T_dd8lPF<^v&(tt0UXmt||f`)#-1N%2@<1zn!_ z*DdLKydPLh4ts+@1m`Kj0+YTB{Ll$p(U+F=I3HL?q3?TnX z!en@I(8g>QL`!zvK@t`RVMe0YgN*=TdfqIU4@+7B7}ZHmGshWF=+?hb;4W!9|4>QI z0_<>@5#%gAP*SLr`xw##1)$RUiH9b&$T-xgTA=f0J*Q>ODcK^p6aA!gs_vW@PjemX zEMx(x;0KcVDpQ~fBVH2%M*xo${gDDO2ywBtj%V^To01w|k1<+Lvp_2?qDU*XCTKmM zNoh?lWCZwF(H|?N=QBHvOG@V@EMMdzz-aZ79S{LMQ}t(R=|v6+PE^oKOX@h`Cvy?u zNcyQw6d~VN^!rNb$sBM7oktZ9Ij0u6B-68enrXTXBgE5L{dBf;md9NG52bNT4gHsw z^gJw~v2+g(8~4kB+vDX=MV}j2Z%CKD^?sN2J6+}y