From 10da22b7261d58c6326a09206345cff3973e379e Mon Sep 17 00:00:00 2001 From: minghong Date: Thu, 24 Jul 2025 15:56:13 +0800 Subject: [PATCH] fix bug in org.apache.doris.nereids.stats.StatsCalculator#disableJoinReorderIfStatsInvalid --- .../nereids/rules/rewrite/InitJoinOrder.java | 45 ++++++----- .../doris/nereids/stats/StatsCalculator.java | 8 +- .../invalid_stats_join_order.out | 19 +++++ .../agg_skew_rewrite/agg_skew_rewrite.out | 2 +- .../constant_propagation.out | 42 ++++++----- .../invalid_stats_join_order.groovy | 75 +++++++++++++++++++ .../agg_skew_rewrite/agg_skew_rewrite.groovy | 1 + .../constant_propagation.groovy | 3 +- .../mv/ssb/mv_ssb_test.groovy | 8 +- 9 files changed, 161 insertions(+), 42 deletions(-) create mode 100644 regression-test/data/nereids_p0/join/invalid_stats/invalid_stats_join_order.out create mode 100644 regression-test/suites/nereids_p0/join/invalid_stats/invalid_stats_join_order.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InitJoinOrder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InitJoinOrder.java index ef73359cca203d..970dc84c787dd6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InitJoinOrder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InitJoinOrder.java @@ -20,14 +20,19 @@ import org.apache.doris.nereids.rules.Rule; import org.apache.doris.nereids.rules.RuleType; import org.apache.doris.nereids.rules.rewrite.StatsDerive.DeriveContext; +import org.apache.doris.nereids.stats.StatsCalculator; import org.apache.doris.nereids.trees.plans.AbstractPlan; import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.algebra.CatalogRelation; import org.apache.doris.nereids.trees.plans.logical.LogicalJoin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.Optional; + /** * Due to the limitation on the data size in the memo, when optimizing large SQL queries, once this * limitation is triggered, some subtrees of the plan tree may not undergo optimization. Therefore, @@ -62,25 +67,29 @@ private Plan swapJoinChildrenIfNeed(LogicalJoin // if we swap left semi/anti to right semi/anti, we lost the opportunity to optimize join order return null; } - JoinType swapType = join.getJoinType().swap(); - if (swapType == null) { - return null; - } - AbstractPlan left = (AbstractPlan) join.left(); - AbstractPlan right = (AbstractPlan) join.right(); - if (left.getStats() == null) { - left.accept(derive, new DeriveContext()); - } - if (right.getStats() == null) { - right.accept(derive, new DeriveContext()); - } + List scans = join.collectToList(CatalogRelation.class::isInstance); + Optional disableReason = StatsCalculator.disableJoinReorderIfStatsInvalid(scans, null); + if (!disableReason.isPresent()) { + JoinType swapType = join.getJoinType().swap(); + if (swapType == null) { + return null; + } + AbstractPlan left = (AbstractPlan) join.left(); + AbstractPlan right = (AbstractPlan) join.right(); + if (left.getStats() == null) { + left.accept(derive, new DeriveContext()); + } + if (right.getStats() == null) { + right.accept(derive, new DeriveContext()); + } - // requires "left.getStats().getRowCount() > 0" to avoid dead loop when negative row count is estimated. - if (left.getStats().getRowCount() < right.getStats().getRowCount() * SWAP_THRESHOLD - && left.getStats().getRowCount() > 0) { - join = join.withTypeChildren(swapType, right, left, - join.getJoinReorderContext()); - return join; + // requires "left.getStats().getRowCount() > 0" to avoid dead loop when negative row count is estimated. + if (left.getStats().getRowCount() < right.getStats().getRowCount() * SWAP_THRESHOLD + && left.getStats().getRowCount() > 0) { + join = join.withTypeChildren(swapType, right, left, + join.getJoinReorderContext()); + return join; + } } return null; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 7c6632fdb614b5..2c09095332fdf9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -271,6 +271,12 @@ public static Optional disableJoinReorderIfStatsInvalid(List disableJoinReorderIfStatsInvalid(List[x] ----NestedLoopJoin[CROSS_JOIN] -------PhysicalProject[t2.x] ---------filter(((cast(x as BIGINT) * 10) = 10)) -----------PhysicalOlapScan[t2] apply RFs: RF0 ------PhysicalProject[t1.a] --------filter((t1.a = 10)) ----------PhysicalOlapScan[t1] +------PhysicalProject[t2.x] +--------filter(((cast(x as BIGINT) * 10) = 10)) +----------PhysicalOlapScan[t2] apply RFs: RF0 ----PhysicalProject[t3.a] ------filter(((cast(a as BIGINT) * 10) = 10) and (t3.__DORIS_DELETE_SIGN__ = 0)) --------PhysicalOlapScan[t3] @@ -263,29 +263,31 @@ PhysicalResultSink -- !subquery_7_shape -- PhysicalResultSink --NestedLoopJoin[CROSS_JOIN] -----PhysicalProject[t2.x] -------filter(((cast(x as BIGINT) * 10) = 10)) ---------PhysicalOlapScan[t2] ----PhysicalProject[t.a] ------filter((t1.a = 10)) --------PhysicalOlapScan[t1] +----PhysicalProject[t2.x] +------filter(((cast(x as BIGINT) * 10) = 10)) +--------PhysicalOlapScan[t2] -- !subquery_7_result -- 10 1 -- !subquery_8_shape -- PhysicalResultSink ---NestedLoopJoin[CROSS_JOIN] -----PhysicalUnion -------PhysicalProject[(cast(a as BIGINT) * 10) AS `k`, t1.b] ---------filter(((cast(a as BIGINT) * 10) = 10)) -----------PhysicalOlapScan[t1] -------PhysicalProject[cast(x as BIGINT) AS `k`, y AS `b`] ---------filter((t2.x = 10)) -----------PhysicalOlapScan[t2] -----PhysicalProject[t3.a] -------filter((t3.__DORIS_DELETE_SIGN__ = 0) and (t3.a = 2)) ---------PhysicalOlapScan[t3] +--PhysicalQuickSort[MERGE_SORT, orderKeys=(k asc null first, b asc null first)] +----PhysicalQuickSort[LOCAL_SORT, orderKeys=(k asc null first, b asc null first)] +------NestedLoopJoin[CROSS_JOIN] +--------PhysicalUnion +----------PhysicalProject[(cast(a as BIGINT) * 10) AS `k`, t1.b] +------------filter(((cast(a as BIGINT) * 10) = 10)) +--------------PhysicalOlapScan[t1] +----------PhysicalProject[cast(x as BIGINT) AS `k`, y AS `b`] +------------filter((t2.x = 10)) +--------------PhysicalOlapScan[t2] +--------PhysicalProject[t3.a] +----------filter((t3.__DORIS_DELETE_SIGN__ = 0) and (t3.a = 2)) +------------PhysicalOlapScan[t3] -- !subquery_8_result -- 10 2 2 diff --git a/regression-test/suites/nereids_p0/join/invalid_stats/invalid_stats_join_order.groovy b/regression-test/suites/nereids_p0/join/invalid_stats/invalid_stats_join_order.groovy new file mode 100644 index 00000000000000..024f2f8f8b8063 --- /dev/null +++ b/regression-test/suites/nereids_p0/join/invalid_stats/invalid_stats_join_order.groovy @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one + // or more contributor license agreements. See the NOTICE file + // distributed with this work for additional information + // regarding copyright ownership. The ASF licenses this file + // to you under the Apache License, Version 2.0 (the + // "License"); you may not use this file except in compliance + // with the License. You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, + // software distributed under the License is distributed on an + // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + // KIND, either express or implied. See the License for the + // specific language governing permissions and limitations + // under the License. + +suite("invalid_stats_join_order") { + sql """ + drop table if exists t1; + + CREATE TABLE IF NOT EXISTS t1 ( + k int(11) NULL COMMENT "", + v varchar(50) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(k) + DISTRIBUTED BY HASH(k) BUCKETS 7 + PROPERTIES ("replication_num" = "1", "colocate_with" = "group_1"); + + insert into t1 values (1, 'a'); + + alter table t1 modify column k set stats ('row_count'='100', 'ndv'='1', 'num_nulls'='0', 'min_value'='0'); + + drop table if exists t2; + + CREATE TABLE IF NOT EXISTS t2 ( + k int(11) NULL COMMENT "", + v varchar(50) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(k) + DISTRIBUTED BY HASH(k) BUCKETS 7 + PROPERTIES ("replication_num" = "1", "colocate_with" = "group_1"); + + insert into t2 values (1, 'a'); + alter table t2 modify column k set stats ('row_count'='10000000', 'ndv'='10', 'num_nulls'='0', 'min_value'='0'); + + set runtime_filter_mode=off; + """ + + qt_shape """ + explain shape plan + select * from t1 join t2 on t1.k = t2.k; + """ + + sql """ + alter table t1 modify column k set stats ('row_count'='-1', 'ndv'='1', 'num_nulls'='0', 'min_value'='0'); + """ + + // disable join reorder because rowCount = -1 + qt_shape """ + explain shape plan + select * from t1 join t2 on t1.k = t2.k; + """ + + sql """ + alter table t1 modify column k set stats ('row_count'='100', 'ndv'='11', 'num_nulls'='0', 'min_value'='0'); + """ + + // ndv > rowCount * 10 + qt_shape """ + explain shape plan + select * from t1 join t2 on t1.k = t2.k; + """ + +} \ No newline at end of file diff --git a/regression-test/suites/nereids_rules_p0/agg_skew_rewrite/agg_skew_rewrite.groovy b/regression-test/suites/nereids_rules_p0/agg_skew_rewrite/agg_skew_rewrite.groovy index 651709a2a03535..90cf4d77b871be 100644 --- a/regression-test/suites/nereids_rules_p0/agg_skew_rewrite/agg_skew_rewrite.groovy +++ b/regression-test/suites/nereids_rules_p0/agg_skew_rewrite/agg_skew_rewrite.groovy @@ -17,6 +17,7 @@ suite("test_agg_skew_hint") { sql "set runtime_filter_mode=OFF" + sql "set disable_join_reorder=true;" sql "drop table if exists test_skew_hint" sql "create table test_skew_hint (a int, b int, c int) distributed by hash(a) properties('replication_num'='1');" sql "insert into test_skew_hint(a,b,c) values(1,2,3),(1,2,4),(1,3,4),(2,3,5),(2,4,5),(3,4,5),(3,5,6),(3,6,7),(3,7,8),(3,8,9),(3,10,11);" diff --git a/regression-test/suites/nereids_rules_p0/constant_propagation/constant_propagation.groovy b/regression-test/suites/nereids_rules_p0/constant_propagation/constant_propagation.groovy index 902045d1d83c5a..82006f837108b7 100644 --- a/regression-test/suites/nereids_rules_p0/constant_propagation/constant_propagation.groovy +++ b/regression-test/suites/nereids_rules_p0/constant_propagation/constant_propagation.groovy @@ -33,6 +33,7 @@ suite('constant_propagation') { SET detail_shape_nodes='PhysicalProject,PhysicalHashAggregate,PhysicalQuickSort'; SET ignore_shape_nodes='PhysicalDistribute'; SET runtime_filter_type=2; + set disable_join_reorder=true; """ sql 'drop table if exists t1' @@ -245,7 +246,7 @@ suite('constant_propagation') { explain_and_result 'subquery_8', ''' select t.k, t.b, t3.a - from (select a * 10 as k, b from t1 union all select x as k, y as b from t2) t join t3 on t.k = t3.a * 5 where t3.a = 2 + from (select a * 10 as k, b from t1 union all select x as k, y as b from t2) t join t3 on t.k = t3.a * 5 where t3.a = 2 order by k, b, a ''' explain_and_result 'subquery_9', ''' diff --git a/regression-test/suites/nereids_rules_p0/mv/ssb/mv_ssb_test.groovy b/regression-test/suites/nereids_rules_p0/mv/ssb/mv_ssb_test.groovy index a80690a7bbef5a..c0fe409c74f87b 100644 --- a/regression-test/suites/nereids_rules_p0/mv/ssb/mv_ssb_test.groovy +++ b/regression-test/suites/nereids_rules_p0/mv/ssb/mv_ssb_test.groovy @@ -89,7 +89,13 @@ suite("mv_ssb_test") { sql "set runtime_filter_mode=OFF" sql "SET enable_nereids_timeout = false" sql "SET BATCH_SIZE = 4064" - + sql """ + alter table customer modify column c_mktsegment SET STATS ('ndv'='5', 'num_nulls'='0', 'row_count'='3000'); + alter table lineorder modify column lo_revenue SET STATS ('ndv'='453898', 'num_nulls'='0', 'row_count'='30600572'); + alter table part modify column p_mfgr SET STATS ('ndv'='5', 'num_nulls'='0', 'row_count'='20000.0'); + alter table date modify column d_dayofweek SET STATS ('ndv'='7', 'num_nulls'='0', 'row_count'='255'); + alter table supplier modify column s_phone SET STATS ('ndv'='200', 'num_nulls'='0', 'row_count'='200'); + """ def mv1_1 = """ SELECT SUM(lo_extendedprice*lo_discount) AS REVENUE