diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index c969f60fa727c1..82e4b732193e13 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -73,6 +73,7 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(JoinProbeLocalState::open(state)); auto& p = _parent->cast(); + _right_col_idx = p._right_col_idx; std::visit( [&](auto&& join_op_variants, auto have_other_join_conjunct) { using JoinOpType = std::decay_t; @@ -511,7 +512,7 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) const bool probe_dispose_null = _match_all_probe || _build_unique || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN || - _join_op == TJoinOp::LEFT_SEMI_JOIN; + _join_op == TJoinOp::LEFT_SEMI_JOIN || _is_mark_join; const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; std::vector probe_not_ignore_null(eq_join_conjuncts.size()); size_t conjuncts_index = 0; @@ -642,14 +643,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { } } - const int right_col_idx = - (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _left_table_data_types.size(); + _right_col_idx = (_is_right_semi_anti && !_have_other_join_conjunct && + (!_is_mark_join || _mark_join_conjuncts.empty())) + ? 0 + : _left_table_data_types.size(); size_t idx = 0; for (const auto* slot : slots_to_check) { auto data_type = slot->get_data_type_ptr(); - const auto slot_on_left = idx < right_col_idx; + const auto slot_on_left = idx < _right_col_idx; + + if (slot_on_left) { + if (idx >= _left_table_data_types.size()) { + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on left table " + "idx out bound of _left_table_data_types: {} vs {}", + _node_id, _join_op, slot->col_name(), slot->id(), idx, + _left_table_data_types.size()); + } + } else if (idx - _right_col_idx >= _right_table_data_types.size()) { + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on right table " + "idx out bound of _right_table_data_types: {} vs {}(idx = {}, _right_col_idx = " + "{})", + _node_id, _join_op, slot->col_name(), slot->id(), idx - _right_col_idx, + _right_table_data_types.size(), idx, _right_col_idx); + } + auto target_data_type = slot_on_left ? _left_table_data_types[idx] - : _right_table_data_types[idx - right_col_idx]; + : _right_table_data_types[idx - _right_col_idx]; ++idx; if (data_type->equals(*target_data_type)) { continue; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 66d709e6541ad8..ee0bb78ee4f768 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -116,6 +116,9 @@ class HashJoinProbeLocalState final std::unique_ptr _process_hashtable_ctx_variants = std::make_unique(); + // Index of column(slot) from right table in the `_intermediate_row_desc`. + size_t _right_col_idx; + RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr; @@ -185,6 +188,9 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorX _right_output_slot_flags; std::vector _right_table_column_names; const std::vector _partition_exprs; + + // Index of column(slot) from right table in the `_intermediate_row_desc`. + size_t _right_col_idx; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 7a5c34fb845937..4e1a22a9fab517 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -23,6 +23,7 @@ #include "vec/columns/column.h" #include "vec/columns/columns_number.h" #include "vec/common/arena.h" +#include "vec/common/custom_allocator.h" namespace doris { namespace vectorized { @@ -137,8 +138,15 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; - int _right_col_idx; + // See `HashJoinProbeOperatorX::_right_col_idx` + const int _right_col_idx; + int _right_col_len; + + // For right semi with mark join conjunct, we need to store the mark join flags + // in the hash table. + // -1 means null, 0 means false, 1 means true + DorisVector mark_join_flags; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 1f9d127549ad70..b88d1f42541d34 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -56,9 +56,7 @@ ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), _finish_probe_phase_timer(parent->_finish_probe_phase_timer), - _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) - ? 0 - : _parent->left_table_data_types().size()), + _right_col_idx(parent->_right_col_idx), _right_col_len(_parent->right_table_data_types().size()) {} template @@ -66,11 +64,10 @@ void ProcessHashTableProbe::build_side_output_column( vectorized::MutableColumns& mcol, const std::vector& output_slot_flags, int size, bool have_other_join_conjunct, bool is_mark_join) { SCOPED_TIMER(_build_side_output_timer); - constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || - JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || - JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN; + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; constexpr auto probe_all = JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; @@ -209,7 +206,7 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - (is_mark_join && JoinOpType != doris::TJoinOp::RIGHT_SEMI_JOIN))); + (is_mark_join))); } auto& mcol = mutable_block.mutable_columns(); @@ -268,8 +265,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts, is_mark_join); - if constexpr (with_other_conjuncts || (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && - JoinOpType != TJoinOp::RIGHT_ANTI_JOIN)) { + if (with_other_conjuncts || !_parent->_mark_join_conjuncts.empty() || + (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != TJoinOp::RIGHT_ANTI_JOIN)) { auto check_all_match_one = [](const std::vector& vecs, uint32_t probe_idx, int size) { if (!size || vecs[0] != probe_idx || vecs[size - 1] != probe_idx + size - 1) { @@ -291,7 +288,14 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c output_block->swap(mutable_block.to_block()); - if constexpr (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) { + if constexpr (is_mark_join) { + if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || + JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) { + if (mark_join_flags.empty()) { + mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0); + } + } + return do_mark_join_conjuncts( output_block, hash_table_ctx.hash_table->get_bucket_size()); } else if constexpr (with_other_conjuncts) { @@ -363,15 +367,19 @@ template template Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Block* output_block, size_t hash_table_bucket_size) { - DCHECK(JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + if (JoinOpType != TJoinOp::LEFT_ANTI_JOIN && JoinOpType != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && + JoinOpType != TJoinOp::LEFT_SEMI_JOIN && JoinOpType != TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN && + JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != TJoinOp::RIGHT_ANTI_JOIN) { + return Status::InternalError("join type {} is not supported", JoinOpType); + } constexpr bool is_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::RIGHT_ANTI_JOIN; constexpr bool is_null_aware_join = JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + constexpr bool is_right_half_join = + JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType == TJoinOp::RIGHT_ANTI_JOIN; const auto row_count = output_block->rows(); if (!row_count) { @@ -458,37 +466,71 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Blo const bool should_be_null_if_build_side_has_null = *_has_null_in_build_side && is_null_aware_join && !with_other_conjuncts; for (size_t i = 0; i != row_count; ++i) { - bool not_matched_before = _parent->_last_probe_match != _probe_indexs[i]; - if (_build_indexs[i] == 0) { - bool has_null_mark_value = _parent->_last_probe_null_mark == _probe_indexs[i]; - if (not_matched_before) { - filter_map[i] = true; - mark_null_map[i] = has_null_mark_value || should_be_null_if_build_side_has_null; - mark_filter_data[i] = false; + if constexpr (is_right_half_join) { + const auto& build_index = _build_indexs[i]; + if (build_index == 0) { + continue; + } + + if (mark_join_flags[build_index] == 1) { + continue; + } + + if (mark_null_map[i]) { + mark_join_flags[build_index] = -1; + } else if (mark_filter_data[i]) { + mark_join_flags[build_index] = 1; } } else { - if (mark_null_map[i]) { // is null - _parent->_last_probe_null_mark = _probe_indexs[i]; - } else { - if (mark_filter_data[i] && not_matched_before) { - _parent->_last_probe_match = _probe_indexs[i]; + bool not_matched_before = _parent->_last_probe_match != _probe_indexs[i]; + if (_build_indexs[i] == 0) { + bool has_null_mark_value = _parent->_last_probe_null_mark == _probe_indexs[i]; + if (not_matched_before) { filter_map[i] = true; + mark_null_map[i] = has_null_mark_value || should_be_null_if_build_side_has_null; + mark_filter_data[i] = false; + } + } else { + if (mark_null_map[i]) { // is null + _parent->_last_probe_null_mark = _probe_indexs[i]; + } else { + if (mark_filter_data[i] && not_matched_before) { + _parent->_last_probe_match = _probe_indexs[i]; + filter_map[i] = true; + } } } } } - if constexpr (is_anti_join) { - // flip the mark column - for (size_t i = 0; i != row_count; ++i) { - mark_filter_data[i] ^= 1; // not null/ null + if constexpr (is_right_half_join) { + if constexpr (is_anti_join) { + // flip the mark column + for (size_t i = 0; i != row_count; ++i) { + if (mark_join_flags[i] == -1) { + // -1 means null. + continue; + } + + mark_join_flags[i] ^= 1; + } + } + // For right semi/anti join, no rows will be output in probe phase. + output_block->clear(); + return Status::OK(); + } else { + if constexpr (is_anti_join) { + // flip the mark column + for (size_t i = 0; i != row_count; ++i) { + mark_filter_data[i] ^= 1; // not null/ null + } } - } - auto result_column_id = output_block->columns(); - output_block->insert( - {std::move(filter_column), std::make_shared(), ""}); - return vectorized::Block::filter_block(output_block, result_column_id, result_column_id); + auto result_column_id = output_block->columns(); + output_block->insert( + {std::move(filter_column), std::make_shared(), ""}); + return vectorized::Block::filter_block(output_block, result_column_id, result_column_id); + } } template @@ -637,7 +679,8 @@ Status ProcessHashTableProbe::finish_probing(HashTableType& hash_tab if (block_size) { if (mcol.size() < _right_col_len + _right_col_idx) { return Status::InternalError( - "output block invalid, mcol.size()={}, _right_col_len={}, _right_col_idx={}", + "output block invalid, mcol.size()={}, _right_col_len={}, " + "_right_col_idx={}", mcol.size(), _right_col_len, _right_col_idx); } for (size_t j = 0; j < _right_col_len; ++j) { @@ -646,8 +689,31 @@ Status ProcessHashTableProbe::finish_probing(HashTableType& hash_tab _build_indexs.data() + block_size); } + if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { + if (is_mark_join) { + if (mark_join_flags.empty()) { + mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0); + } + + // mark column is nullable + auto* mark_column = assert_cast( + mcol[_parent->_mark_column_id].get()); + mark_column->resize(block_size); + auto* null_map = mark_column->get_null_map_data().data(); + auto* data = assert_cast(mark_column->get_nested_column()) + .get_data() + .data(); + for (size_t i = 0; i != block_size; ++i) { + const auto build_index = _build_indexs[i]; + null_map[i] = mark_join_flags[build_index] == -1; + data[i] = mark_join_flags[build_index] == 1; + } + } + } + // just resize the left table column in case with other conjunct to make block size is not zero - if (_is_right_semi_anti && _have_other_join_conjunct) { + if (_is_right_semi_anti && _right_col_idx != 0) { for (int i = 0; i < _right_col_idx; ++i) { mcol[i]->resize(block_size); } diff --git a/be/src/vec/common/hash_table/join_hash_table.h b/be/src/vec/common/hash_table/join_hash_table.h index 317987541cdbe1..88695ef9b43119 100644 --- a/be/src/vec/common/hash_table/join_hash_table.h +++ b/be/src/vec/common/hash_table/join_hash_table.h @@ -101,24 +101,23 @@ class JoinHashTable { } } - if constexpr (with_other_conjuncts || - (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN)) { - if constexpr (!with_other_conjuncts) { - constexpr bool is_null_aware_join = - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; - constexpr bool is_left_half_join = JoinOpType == TJoinOp::LEFT_SEMI_JOIN || - JoinOpType == TJoinOp::LEFT_ANTI_JOIN; - - /// For null aware join or left half(semi/anti) join without other conjuncts and without - /// mark join conjunct. - /// If one row on probe side has one match in build side, we should stop searching the - /// hash table for this row. - if (is_null_aware_join || (is_left_half_join && !has_mark_join_conjunct)) { - return _find_batch_conjunct( - keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, - build_idxs); - } + if constexpr (with_other_conjuncts) { + return _find_batch_conjunct( + keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, build_idxs); + } else if constexpr (is_mark_join) { + constexpr bool is_null_aware_join = JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; + constexpr bool is_left_half_join = + JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == TJoinOp::LEFT_ANTI_JOIN; + + /// For null aware join or left half(semi/anti) join without other conjuncts and without + /// mark join conjunct. + /// If one row on probe side has one match in build side, we should stop searching the + /// hash table for this row. + if (is_null_aware_join || (is_left_half_join && !has_mark_join_conjunct)) { + return _find_batch_conjunct( + keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, + build_idxs); } return _find_batch_conjunct( @@ -339,14 +338,7 @@ class JoinHashTable { auto do_the_probe = [&]() { while (build_idx && matched_cnt < batch_size) { - if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || - JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { - probe_idxs[matched_cnt] = probe_idx; - build_idxs[matched_cnt] = build_idx; - matched_cnt++; - } - } else if constexpr (need_judge_null) { + if constexpr (need_judge_null) { if (build_idx == bucket_size) { build_idxs[matched_cnt] = build_idx; probe_idxs[matched_cnt] = probe_idx; diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out b/regression-test/data/query_p0/join/mark_join/mark_join.out index ed3575d0e14476..ea3b7ddd6a6d1a 100644 --- a/regression-test/data/query_p0/join/mark_join/mark_join.out +++ b/regression-test/data/query_p0/join/mark_join/mark_join.out @@ -17,3 +17,50 @@ 3 -3 \N c 3 3 \N c +-- !test_right_semi_mark_join -- +1 v1 o1 \N \N +2 v2 o2 \N \N +3 v3 o3 \N \N +4 v4 o4 \N \N +5 v5 o5 \N \N +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_2 -- +1 v1 o1 \N \N +2 v2 o2 \N \N +3 v3 o3 \N \N +4 v4 o4 \N \N +5 v5 o5 \N \N +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_no_null -- +1 v1 o1 false true +2 v2 o2 false true +3 v3 o3 false true +4 v4 o4 false true +5 v5 o5 false true +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_no_null_2 -- +1 v1 o1 false true +2 v2 o2 false true +3 v3 o3 false true +4 v4 o4 false true +5 v5 o5 false true +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N \ No newline at end of file diff --git a/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out new file mode 100644 index 00000000000000..e00e19be11e07e --- /dev/null +++ b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out @@ -0,0 +1,63 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test -- +\N \N +0 18332 +1 \N +1 \N +1 \N +2 -56 +3 72 +4 -5581 +5 -62 +5 -62 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +7 -41 +7 -41 +7 -41 +7 -41 +8 -54 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +11 \N +11 \N +11 \N +12 \N +12 \N +12 \N +12 \N +12 \N +12 \N +12 \N +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 + diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy index 9759a0e9b4cd70..289c9d5b06bad2 100644 --- a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy +++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy @@ -16,10 +16,17 @@ // under the License. suite("mark_join") { - sql "drop table if exists t1;" - sql "drop table if exists t2;" + String suiteName = "mark_join" + String table_t1 = "${suiteName}_table_t1" + String table_t2 = "${suiteName}_table_t2" + String table_tbl1 = "${suiteName}_table_tbl1" + String table_tbl2 = "${suiteName}_table_tbl2" + String table_tbl3 = "${suiteName}_table_tbl3" + + sql "drop table if exists ${table_t1};" + sql "drop table if exists ${table_t2};" sql """ - create table t1 ( + create table ${table_t1} ( k1 int null, k2 int null, k3 bigint null, @@ -31,7 +38,7 @@ suite("mark_join") { """ sql """ - create table t2 ( + create table ${table_t2} ( k1 int null, k2 int null, k3 bigint null, @@ -42,23 +49,237 @@ suite("mark_join") { properties("replication_num" = "1"); """ - sql "insert into t1 select 1,1,1,'a';" - sql "insert into t1 select 2,2,2,'b';" - sql "insert into t1 select 3,-3,null,'c';" - sql "insert into t1 select 3,3,null,'c';" + sql "insert into ${table_t1} select 1,1,1,'a';" + sql "insert into ${table_t1} select 2,2,2,'b';" + sql "insert into ${table_t1} select 3,-3,null,'c';" + sql "insert into ${table_t1} select 3,3,null,'c';" - sql "insert into t2 select 1,1,1,'a';" - sql "insert into t2 select 2,2,2,'b';" - sql "insert into t2 select 3,-3,null,'c';" - sql "insert into t2 select 3,3,null,'c';" + sql "insert into ${table_t2} select 1,1,1,'a';" + sql "insert into ${table_t2} select 2,2,2,'b';" + sql "insert into ${table_t2} select 3,-3,null,'c';" + sql "insert into ${table_t2} select 3,3,null,'c';" qt_test """ - select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where exists (select ${table_t2}.k3 from ${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2; """ qt_test """ - select * from t1 where not exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where not exists (select ${table_t2}.k3 from ${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2; """ qt_test """ - select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = t1.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where ${table_t1}.k1 not in (select ${table_t2}.k3 from ${table_t2} where ${table_t2}.k2 = ${table_t1}.k2) or k1 < 10 order by k1, k2; + """ + + sql "drop table if exists ${table_tbl1};" + sql "drop table if exists ${table_tbl2};" + sql "drop table if exists ${table_tbl3};" + + sql """ + CREATE TABLE `${table_tbl1}` ( + `unit_name` varchar(1080) NULL, + `cur_unit_name` varchar(1080) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`unit_name`) + DISTRIBUTED BY RANDOM BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE `${table_tbl2}` ( + `org_code` varchar(150) NOT NULL , + `org_name` varchar(300) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`org_code`) + DISTRIBUTED BY HASH(`org_code`) BUCKETS 4 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE `${table_tbl3}` ( + `id` bigint NOT NULL, + `acntm_name` varchar(500) NULL , + `vendor_name` varchar(500) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + insert into ${table_tbl1} (unit_name, cur_unit_name) values + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + (null, 'o1'), + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + (null, 'o1'), + (null, 'o2'), + (null, 'o3'), + (null, 'o4'), + (null, 'o5'), + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'); + """ + + sql """ + insert into ${table_tbl2}(org_code, org_name) values + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + ('v1', null), + ('v2', null), + ('v3', null), + ('v4', null), + ('v5', null); + """ + + sql """ + insert into ${table_tbl3} (id, vendor_name, acntm_name) + values(1, 'o1', 'v1'), + (2, 'o2', 'v2'), + (3, 'o3', 'v3'), + (4, 'o4', 'v4'), + (5, 'o5', 'v5'), + (6, null, 'v1'), + (7, null, 'v2'), + (8, null, 'v3'), + (9, null, 'v4'), + (10, null, 'v5'); + """ + + sql " analyze table ${table_tbl1} with sync;" + sql " analyze table ${table_tbl2} with sync;" + sql " analyze table ${table_tbl3} with sync;" + + sql "set disable_join_reorder=0;" + qt_test_right_semi_mark_join """ + select + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + ) v1, + ${table_tbl3}.vendor_name not in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + ) v2 + from + ${table_tbl3} order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=1;" + qt_test_right_semi_mark_join_2 """ + select + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + ) v1, + ${table_tbl3}.vendor_name not in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + ) v2 + from + ${table_tbl3} order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=0;" + qt_test_right_semi_mark_join_no_null """ + select + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null + ) v1, + ${table_tbl3}.vendor_name not in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null + ) v2 + from + ${table_tbl3} order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=1;" + qt_test_right_semi_mark_join_no_null_2 """ + select + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null + ) v1, + ${table_tbl3}.vendor_name not in ( + select + ${table_tbl1}.unit_name + from + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name + where + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null + ) v2 + from + ${table_tbl3} order by 1,2,3,4,5; """ } diff --git a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy new file mode 100644 index 00000000000000..cc287f1a6f791d --- /dev/null +++ b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("right_semi_mark_join") { + String suiteName = "right_semi_mark_join" + String table_tbl1 = "${suiteName}_table_tbl1" + String table_tbl2 = "${suiteName}_table_tbl2" + String table_tbl3 = "${suiteName}_table_tbl3" + + sql "drop table if exists ${table_tbl1};" + sql "drop table if exists ${table_tbl2};" + sql "drop table if exists ${table_tbl3};" + + sql """ + create table ${table_tbl1} (pk int, col1 bigint, col2 bigint) engine = olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 properties("replication_num" = "1"); + """ + + sql """ + insert into + ${table_tbl1}(pk, col1, col2) + values + (0, null, 18332), (1, 788547, null), (2, 4644959, -56), (3, 8364628, 72), (4, null, -5581), + (5, 2344024, -62), (6, -2689177, 22979), (7, 1320, -41), (8, null, -54), (9, 12, -6236), + (10, -8321648, null), (11, 153691, null), (12, -8056, null), (13, -12, -2343514), (14, -35, -3361960); + """ + + sql """ + create table ${table_tbl2} ( + pk int, col1 bigint, col2 bigint + ) engine = olap + distributed by hash(pk) buckets 4 + properties("replication_num" = "1"); + """ + + sql """ + insert into + ${table_tbl2}(pk, col1, col2) + values + (0, 108, 31161), (1, 1479175, 6764263), (2, 110, 25), (3, 110, -18656), (4, null, -51), + (5, 21, 27), (6, -6950217, 1585978), (7, null, null), (8, null, 3453467), (9, null, -6701140); + """ + + sql """ + create table ${table_tbl3} ( + pk int, col1 bigint, col2 bigint, col3 bigint + ) engine = olap + DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 + properties("replication_num" = "1"); + """ + + sql """ + insert into + ${table_tbl3}(pk, col1, col2) + values + (0, 55, -58), (1, 49, 29792), (2, 95, 32361), (3, 31243, -27428), (4, -27400, null), + (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, -21951), (9, 13186, 24466), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428), + (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, -21951), (9, 13186, 24466), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428); + """ + + qt_test """ + SELECT + T1.pk AS C1, + T1.col2 AS C2 + FROM + ${table_tbl1} AS T1 FULL + OUTER JOIN ${table_tbl2} AS T2 ON T1.col1 <= T2.col2 + OR T2.col1 IN ( + SELECT + T3.col2 + FROM + ${table_tbl3} AS T3 + WHERE + T2.col2 = T3.col1 + ) + ORDER BY + C1, + C2 DESC; + """ +} \ No newline at end of file