From b8fd7b0f6c3a2619f64264c9e70fc3e1d235f4eb Mon Sep 17 00:00:00 2001 From: zzzxl1993 <474696115@qq.com> Date: Fri, 15 Dec 2023 19:31:03 +0800 Subject: [PATCH] [optimize](count) optimize pk exact query without reading data --- .../rowset/segment_v2/segment_iterator.cpp | 41 ++++ .../olap/rowset/segment_v2/segment_iterator.h | 2 + .../test_count_on_index_2.out | 103 +++++++++ .../test_count_on_index_2.groovy | 205 ++++++++++++++++++ 4 files changed, 351 insertions(+) create mode 100644 regression-test/data/inverted_index_p0/test_count_on_index_2.out create mode 100644 regression-test/suites/inverted_index_p0/test_count_on_index_2.groovy diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 658b30e67fc778..1cd088c0e3cbb5 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1782,6 +1782,9 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 for (auto cid : _first_read_column_ids) { auto& column = _current_return_columns[cid]; + if (_need_read_key_data(cid, column, nrows_read)) { + continue; + } if (_prune_column(cid, column, true, nrows_read)) { continue; } @@ -2449,5 +2452,43 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root( } } +bool SegmentIterator::_need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column, + size_t nrows_read) { + if (_opts.tablet_schema->keys_type() != KeysType::DUP_KEYS) { + return false; + } + + if (_opts.push_down_agg_type_opt != TPushAggOp::COUNT_ON_INDEX) { + return false; + } + + if (!_opts.tablet_schema->column(cid).is_key()) { + return false; + } + + std::set<uint32_t> cids; + for (auto* pred : _col_predicates) { + cids.insert(pred->column_id()); + } + for (auto* pred : _col_preds_except_leafnode_of_andnode) { + cids.insert(pred->column_id()); + } + + // If the key is present in expr, data needs to be read. 
+ if (cids.contains(cid)) { + return false; + } + + if (column->is_nullable()) { + auto* nullable_col_ptr = reinterpret_cast<vectorized::ColumnNullable*>(column.get()); + nullable_col_ptr->get_null_map_column().insert_many_defaults(nrows_read); + nullable_col_ptr->get_nested_column_ptr()->insert_many_defaults(nrows_read); + } else { + column->insert_many_defaults(nrows_read); + } + + return true; +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 352929678b3588..b84189c106d3d2 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -327,6 +327,8 @@ class SegmentIterator : public RowwiseIterator { return 0; } + bool _need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column, size_t nrows_read); + class BitmapRangeIterator; class BackwardBitmapRangeIterator; diff --git a/regression-test/data/inverted_index_p0/test_count_on_index_2.out b/regression-test/data/inverted_index_p0/test_count_on_index_2.out new file mode 100644 index 00000000000000..94d2a83388b38f --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_count_on_index_2.out @@ -0,0 +1,103 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +974 + +-- !sql -- +974 + +-- !sql -- +839 + +-- !sql -- +839 + +-- !sql -- +271 + +-- !sql -- +271 + +-- !sql -- +913 + +-- !sql -- +913 + +-- !sql -- +14 + +-- !sql -- +14 + +-- !sql -- +15 + +-- !sql -- +15 + +-- !sql -- +4 + +-- !sql -- +4 + +-- !sql -- +15 + +-- !sql -- +15 + +-- !sql -- +827 + +-- !sql -- +827 + +-- !sql -- +970 + +-- !sql -- +970 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +970 + +-- !sql -- +970 + +-- !sql -- +11 + +-- !sql -- +9 + +-- !sql -- +21 + +-- !sql -- +19 + +-- !sql -- +11 + +-- !sql -- +10 + +-- !sql -- +6 + +-- !sql -- +7 + +-- !sql -- +0 + +-- !sql -- +3 + diff --git a/regression-test/suites/inverted_index_p0/test_count_on_index_2.groovy b/regression-test/suites/inverted_index_p0/test_count_on_index_2.groovy new file mode 100644 index 00000000000000..6866c81f964822 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_count_on_index_2.groovy @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +suite("test_count_on_index_2", "p0"){ + def indexTbName1 = "test_count_on_index_2_index" + def indexTbName2 = "test_count_on_index_2_no_index" + def indexTbName3 = "test_count_on_index_2_pk" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX clientip_idx (`clientip`) USING INVERTED COMMENT '', + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '', + INDEX status_idx (`status`) USING INVERTED COMMENT '', + INDEX size_idx (`size`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "DROP TABLE IF EXISTS ${indexTbName2}" + + sql """ + CREATE TABLE ${indexTbName2} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "DROP TABLE IF EXISTS ${indexTbName3}" + + sql """ + CREATE TABLE ${indexTbName3} ( + `a` int NULL COMMENT "", + `b` int NULL COMMENT "", + `c` int NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`a`, `b`, `c`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + INSERT INTO ${indexTbName3} VALUES + (1, 1, 1), + (2, 2, 2), + (3, 3, 3), + (4, 4, 4), + (5, 5, 5), + (6, 6, 6), + (7, 7, 7), + (8, 8, 8), + (9, 9, 9), + (10, 10, 10), + (11, 11, 11), + (12, 12, 12), + (13, 13, 13), + (14, 14, 14), + (15, 
15, 15), + (16, 16, 16), + (17, 17, 17), + (18, 18, 18), + (19, 19, 19), + (20, 20, 20), + (21, 21, 21), + (22, 22, 22), + (23, 23, 23), + (24, 24, 24), + (25, 25, 25), + (26, 26, 26), + (27, 27, 27), + (28, 28, 28), + (29, 29, 29), + (30, 30, 30); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json', 'documents-1000.json') + load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 'documents-1000.json') + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453; """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453; """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` 
<= 893966453 and (request match 'images'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (request match 'images'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (request match 'images' and request match 'english'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (request match 'images' and request match 'english'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (request match 'images' or request match 'english'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (request match 'images' or request match 'english'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' or clientip = '252.0.0.0'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' or clientip = '252.0.0.0'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' and request match 'hm'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' and request match 'hm'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and clientip in 
('247.37.0.0', '252.0.0.0'); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and clientip in ('247.37.0.0', '252.0.0.0'); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (status = 200); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (status = 200); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (status = 200 or status = 304); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (status = 200 or status = 304); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' and status = 200); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and (clientip = '247.37.0.0' and status = 200); """ + + qt_sql """ select count() from ${indexTbName1} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and status in (200, 304); """ + qt_sql """ select count() from ${indexTbName2} where `@timestamp` >= 893964736 and `@timestamp` <= 893966453 and status in (200, 304); """ + + qt_sql """ select count() from ${indexTbName3} where (a >= 5 and a <= 15); """ + qt_sql """ select count() from ${indexTbName3} where (a > 5 and a < 15); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 7 and a <= 27); """ + qt_sql """ select count() from ${indexTbName3} where (a > 7 and a < 27); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 7 and a <= 27) and (b >= 10 and b <= 20); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 7 and a < 27) and (b >= 10 and b < 20); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 7 and a <= 27) and (b 
>= 10 and b < 20) and (c >= 12 and c < 18); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 2 and a < 28) and (b >= 5 and b < 20) and (c >= 8 and c < 15); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 10 and a < 20) and (b >= 5 and b < 14) and (c >= 16 and c < 25); """ + qt_sql """ select count() from ${indexTbName3} where (a >= 10 and a < 20) and (b >= 5 and b < 16) and (c >= 13 and c < 25); """ + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file