diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index a188c2e3eb5119..4a7944defee5b0 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1830,21 +1830,20 @@ bool OrcReader::_can_filter_by_dict(int slot_id) { return false; } - // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by dict. std::function visit_function_call = [&](const VExpr* expr) { + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we first disable dictionary filtering when predicate contains functions. + // Implementation of NULL value dictionary filtering will be carried out later. if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { - std::string is_null_str; - std::string function_name = expr->fn().name.function_name; - if (function_name.compare("is_null_pred") == 0 || - function_name.compare("is_not_null_pred") == 0) { + return false; + } + for (auto& child : expr->children()) { + if (!visit_function_call(child.get())) { return false; } - } else { - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } } return true; }; diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index f1e5dc42801336..2d8f9cbb48e698 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -197,21 +197,20 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id, return false; } - // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by dict. std::function visit_function_call = [&](const VExpr* expr) { + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we first disable dictionary filtering when predicate contains functions. + // Implementation of NULL value dictionary filtering will be carried out later. if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { - std::string is_null_str; - std::string function_name = expr->fn().name.function_name; - if (function_name.compare("is_null_pred") == 0 || - function_name.compare("is_not_null_pred") == 0) { + return false; + } + for (auto& child : expr->children()) { + if (!visit_function_call(child.get())) { return false; } - } else { - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } } return true; }; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql index e88516ffb6f148..765958527aaa8b 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql @@ -1808,6 +1808,48 @@ TBLPROPERTIES ( msck repair table string_col_dict_plain_mixed_orc; +CREATE TABLE `test_string_dict_filter_parquet`( + `o_orderkey` int, + `o_custkey` int, + `o_orderstatus` string, + `o_totalprice` decimal(15,2), + `o_orderdate` date, + `o_orderpriority` string, + `o_clerk` string, + `o_shippriority` int, + `o_comment` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' +LOCATION + '/user/doris/preinstalled_data/parquet_table/test_string_dict_filter_parquet'; + +msck repair table test_string_dict_filter_parquet; + +CREATE TABLE `test_string_dict_filter_orc`( + `o_orderkey` int, + `o_custkey` int, + `o_orderstatus` string, + `o_totalprice` decimal(15,2), + `o_orderdate` date, + `o_orderpriority` string, + `o_clerk` string, + `o_shippriority` int, + `o_comment` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +LOCATION + '/user/doris/preinstalled_data/orc_table/test_string_dict_filter_orc'; + +msck repair table test_string_dict_filter_orc; + show tables; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc new file mode 100644 index 00000000000000..30638e4117c929 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet new file mode 100644 index 00000000000000..f3e189da7e984d Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet differ diff --git a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out new file mode 100644 index 00000000000000..f8f59b864cc325 --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out @@ -0,0 +1,115 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q01 -- +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q02 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro + +-- !q03 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q04 -- +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro + +-- !q05 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q06 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q07 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q08 -- + +-- !q09 -- +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q10 -- +null + +-- !q11 -- +null + +-- !q12 -- +null + +-- !q13 -- +null + +-- !q14 -- +null + +-- !q01 -- +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q02 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro + +-- !q03 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q04 -- +4 136777 O 32151.78 1995-10-11 \N Clerk#000000124 0 sits. slyly regular warthogs cajole. regular, regular theodolites acro + +-- !q05 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q06 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q07 -- +1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951 0 nstructions sleep furiously among +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q08 -- + +-- !q09 -- +2 78002 O 46929.18 1996-12-01 1-URGENT Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot +3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955 0 sly final accounts boost. carefully regular ideas cajole carefully. depos +5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925 0 quickly. bold deposits sleep slyly. packages use slyly + +-- !q10 -- +null + +-- !q11 -- +null + +-- !q12 -- +null + +-- !q13 -- +null + +-- !q14 -- +null + diff --git a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy new file mode 100644 index 00000000000000..e213aabbdc8dc4 --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_string_dict_filter", "p0,external,hive,external_docker,external_docker_hive") { + def q_parquet = { + qt_q01 """ + select * from test_string_dict_filter_parquet where o_orderstatus = 'F'; + """ + qt_q02 """ + select * from test_string_dict_filter_parquet where o_orderstatus = 'O'; + """ + qt_q03 """ + select * from test_string_dict_filter_parquet where o_orderstatus in ('O', 'F'); + """ + qt_q04 """ + select * from test_string_dict_filter_parquet where o_orderpriority is null; + """ + qt_q05 """ + select * from test_string_dict_filter_parquet where o_orderpriority is not null; + """ + qt_q06 """ + select * from test_string_dict_filter_parquet where o_orderpriority in ('5-LOW', NULL); + """ + qt_q07 """ + select * from test_string_dict_filter_parquet where o_orderpriority in ('5-LOW') and o_orderstatus in ('O', 'F'); + """ + qt_q08 """ + select * from test_string_dict_filter_parquet where o_orderpriority in ('1-URGENT') and o_orderstatus in ('F'); + """ + qt_q09 """ + select * from test_string_dict_filter_parquet where o_orderpriority in ('1-URGENT') or o_orderstatus in ('F'); + """ + qt_q10 """ + select * from ( select IF(o_orderpriority IS NULL, 'null', o_orderpriority) AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; + """ + qt_q11 """ + select * from ( select IF(o_orderpriority IS NOT NULL, o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; + """ + qt_q12 """ + select * from ( select IFNULL(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; + """ + qt_q13 """ + select * from ( select IFNULL(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; + """ + qt_q14 """ + select * from ( select COALESCE(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = 'null'; + """ + } + def q_orc = { + qt_q01 """ + select * from test_string_dict_filter_orc where o_orderstatus = 'F'; + """ + qt_q02 """ + select * from test_string_dict_filter_orc where o_orderstatus = 'O'; + """ + qt_q03 """ + select * from test_string_dict_filter_orc where o_orderstatus in ('O', 'F'); + """ + qt_q04 """ + select * from test_string_dict_filter_orc where o_orderpriority is null; + """ + qt_q05 """ + select * from test_string_dict_filter_orc where o_orderpriority is not null; + """ + qt_q06 """ + select * from test_string_dict_filter_orc where o_orderpriority in ('5-LOW', NULL); + """ + qt_q07 """ + select * from test_string_dict_filter_orc where o_orderpriority in ('5-LOW') and o_orderstatus in ('O', 'F'); + """ + qt_q08 """ + select * from test_string_dict_filter_orc where o_orderpriority in ('1-URGENT') and o_orderstatus in ('F'); + """ + qt_q09 """ + select * from test_string_dict_filter_orc where o_orderpriority in ('1-URGENT') or o_orderstatus in ('F'); + """ + qt_q10 """ + select * from ( select IF(o_orderpriority IS NULL, 'null', o_orderpriority) AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; + """ + qt_q11 """ + select * from ( select IF(o_orderpriority IS NOT NULL, o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; + """ + qt_q12 """ + select * from ( select IFNULL(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; + """ + qt_q13 """ + select * from ( select IFNULL(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; + """ + qt_q14 """ + select * from ( select COALESCE(o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 'null'; + """ + } + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + try { + String hms_port = context.config.otherConfigs.get("hms_port") + String catalog_name = "test_string_dict_filter" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type"="hms", + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' + );""" + sql """use `${catalog_name}`.`default`""" + + q_parquet() + q_orc() + + sql """drop catalog if exists ${catalog_name}""" + } finally { + } + } +} +