From 1f844b1f501596ce5ee6890c7919813c46d82fbd Mon Sep 17 00:00:00 2001 From: Mryange <2319153948@qq.com> Date: Mon, 22 Jul 2024 18:50:31 +0800 Subject: [PATCH 1/3] upd --- be/src/vec/functions/function_string.cpp | 1 + be/src/vec/functions/function_string.h | 98 ++++++++++++++++++ .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../functions/scalar/NgramSearch.java | 78 ++++++++++++++ .../visitor/ScalarFunctionVisitor.java | 5 + .../string_functions/test_string_function.out | Bin 4217 -> 4484 bytes .../test_string_function.groovy | 29 ++++++ 7 files changed, 213 insertions(+) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index fc4fd83214dd7b..b7234ba2311e3f 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -1037,6 +1037,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function>(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_alias(FunctionLeft::name, "strleft"); factory.register_alias(FunctionRight::name, "strright"); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 5e119e2146c91c..952c47c8aa3199 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -56,6 +56,7 @@ #include "vec/columns/column.h" #include "vec/columns/column_const.h" #include "vec/columns/column_vector.h" +#include "vec/common/hash_table/phmap_fwd_decl.h" #include "vec/common/int_exp.h" #include "vec/common/memcmp_small.h" #include "vec/common/memcpy_small.h" @@ -3674,4 +3675,101 @@ class FunctionOverlay : public IFunction { } } }; + +class FunctionNgramSearch : public IFunction { +public: + static constexpr auto name = "ngram_search"; + static FunctionPtr create() { return std::make_shared(); } + String get_name() const override { return name; } + size_t get_number_of_arguments() const override { return 3; } + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared(); + } + + // ngram_search(text,pattern,gram_num) + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + CHECK_EQ(arguments.size(), 3); + auto col_res = ColumnFloat64::create(); + bool col_const[3]; + ColumnPtr argument_columns[3]; + for (int i = 0; i < 3; ++i) { + std::tie(argument_columns[i], col_const[i]) = + unpack_if_const(block.get_by_position(arguments[i]).column); + } + auto pattern = assert_cast(argument_columns[1].get())->get_data_at(0); + auto gram_num = assert_cast(argument_columns[2].get())->get_element(0); + const auto* text_col = assert_cast(argument_columns[0].get()); + + if (col_const[0]) { + _execute_impl(text_col, pattern, gram_num, *col_res, input_rows_count); + } else { + _execute_impl(text_col, pattern, gram_num, *col_res, input_rows_count); + } + + block.replace_by_position(result, std::move(col_res)); + return Status::OK(); + } + +private: + template + void _execute_impl(const ColumnString* text_col, StringRef& pattern, int gram_num, + ColumnFloat64& res, size_t size) const { + auto& res_data = res.get_data(); + res_data.resize_fill(size, 0); + if (pattern.size < gram_num) { + return; + } + phmap::flat_hash_map pattern_map; + int pattern_count = get_pattern_set(pattern_map, pattern, gram_num); + std::vector restore_map; + for (int i = 0; i < size; i++) { + auto text = text_col->get_data_at(index_check_const(i)); + if (text.size < gram_num) { + continue; + } + restore_map.resize(text.size, 0); + auto not_overlap_pattern_count = get_not_overlap_with_text( + text, pattern_count, gram_num, pattern_map, restore_map); + res_data[i] = 1.0F - (not_overlap_pattern_count) * 1.0F / std::max(pattern_count, 1); + } + } + + int get_pattern_set(phmap::flat_hash_map& pattern_map, StringRef& pattern, + int gram_num) const { + int i = 0; + for (i = 0; i + gram_num <= pattern.size; i++) { + uint32_t cur_hash = HashUtil::crc_hash(pattern.data + i, gram_num, 0); + pattern_map[cur_hash]++; + } + return i; + } + + int get_not_overlap_with_text(StringRef& text, int not_overlap_pattern_count, int gram_num, + phmap::flat_hash_map& pattern_map, + std::vector& restore_map) const { + int i; + for (i = 0; i + gram_num <= text.size; i++) { + uint32_t cur_hash = HashUtil::crc_hash(text.data + i, gram_num, 0); + // if this gram is in pattern + if (pattern_map[cur_hash] > 0) { + not_overlap_pattern_count--; + pattern_map[cur_hash]--; + restore_map[i] = cur_hash; + } + } + + // restore pattern_map + for (int j = 0; j < i; j++) { + if (restore_map[j]) { + pattern_map[restore_map[j]]++; + // reset restore_map + restore_map[j] = 0; + } + } + + return not_overlap_pattern_count; + } +}; + } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 0f0dc8b5d2ae63..5d26844e79d1d7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -311,6 +311,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364; import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative; +import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch; import org.apache.doris.nereids.trees.expressions.functions.scalar.NonNullable; import org.apache.doris.nereids.trees.expressions.functions.scalar.NotNullOrEmpty; import org.apache.doris.nereids.trees.expressions.functions.scalar.Now; @@ -779,6 +780,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Negative.class, "negative"), scalar(NonNullable.class, "non_nullable"), scalar(NotNullOrEmpty.class, "not_null_or_empty"), + scalar(NgramSearch.class, "ngram_search"), scalar(Now.class, "now", "current_timestamp", "localtime", "localtimestamp"), scalar(Nullable.class, "nullable"), scalar(NullIf.class, "nullif"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java new file mode 100644 index 00000000000000..8ac713c6a093ee --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DoubleType; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'NgramSearch'. + */ +public class NgramSearch extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(DoubleType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE, + IntegerType.INSTANCE)); + + /** + * constructor with 3 argument. + */ + public NgramSearch(Expression arg0, Expression arg1, Expression arg2) { + super("ngram_search", arg0, arg1, arg2); + if (!(arg1.isConstant())) { + throw new AnalysisException( + "ngram_search(text,pattern,gram_num): pattern support const value only."); + } + if (!(arg2.isConstant())) { + throw new AnalysisException( + "ngram_search(text,pattern,gram_num): gram_num support const value only."); + } + } + + /** + * withChildren. + */ + @Override + public NgramSearch withChildren(List children) { + Preconditions.checkArgument(children.size() == 3); + return new NgramSearch(children.get(0), children.get(1), children.get(2)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitNgramSearch(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 2f5ef3faa851f9..acb761658b4571 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -312,6 +312,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332; import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364; import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative; +import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch; import org.apache.doris.nereids.trees.expressions.functions.scalar.NotNullOrEmpty; import org.apache.doris.nereids.trees.expressions.functions.scalar.Now; import org.apache.doris.nereids.trees.expressions.functions.scalar.NullIf; @@ -1608,6 +1609,10 @@ default R visitNegative(Negative negative, C context) { return visitScalarFunction(negative, context); } + default R visitNgramSearch(NgramSearch ngramSearch, C context) { + return visitScalarFunction(ngramSearch, context); + } + default R visitNotNullOrEmpty(NotNullOrEmpty notNullOrEmpty, C context) { return visitScalarFunction(notNullOrEmpty, context); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out index 69ce99277fa878ba2b1107f9bd1aa430c6b45ae6..f52b37f349e5149620c41ee06d89ad52eb7ae47d 100644 GIT binary patch delta 277 zcmeyV(4xGdQlMT}S3xl^y(lp^zBo0pC^^GWL06Z{lrzQ;h`0sFaUs(Ml%2a delta 7 OcmZos{;9B`QUCx9 Date: Mon, 22 Jul 2024 22:29:19 +0800 Subject: [PATCH 2/3] upd --- be/src/vec/functions/function_string.h | 56 ++++++++++-------- .../string_functions/test_string_function.out | Bin 4484 -> 4562 bytes 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 952c47c8aa3199..c5712579db273d 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -3728,47 +3728,51 @@ class FunctionNgramSearch : public IFunction { if (text.size < gram_num) { continue; } - restore_map.resize(text.size, 0); - auto not_overlap_pattern_count = get_not_overlap_with_text( - text, pattern_count, gram_num, pattern_map, restore_map); - res_data[i] = 1.0F - (not_overlap_pattern_count) * 1.0F / std::max(pattern_count, 1); + restore_map.reserve(text.size); + auto [text_count, union_count] = get_text_set(text, gram_num, pattern_map, restore_map); + + res_data[i] = 2.0F * union_count / (text_count + pattern_count); } } int get_pattern_set(phmap::flat_hash_map& pattern_map, StringRef& pattern, int gram_num) const { - int i = 0; - for (i = 0; i + gram_num <= pattern.size; i++) { + int pattern_count = 0; + for (int i = 0; i + gram_num <= pattern.size; i++) { uint32_t cur_hash = HashUtil::crc_hash(pattern.data + i, gram_num, 0); - pattern_map[cur_hash]++; + if (!pattern_map.contains(cur_hash)) { + pattern_map[cur_hash] = 0b01; + pattern_count++; + } } - return i; + return pattern_count; } - int get_not_overlap_with_text(StringRef& text, int not_overlap_pattern_count, int gram_num, - phmap::flat_hash_map& pattern_map, - std::vector& restore_map) const { - int i; - for (i = 0; i + gram_num <= text.size; i++) { + pair get_text_set(StringRef& text, int gram_num, + phmap::flat_hash_map& pattern_map, + std::vector& restore_map) const { + restore_map.clear(); + int text_count = 0, union_count = 0; + for (int i = 0; i + gram_num <= text.size; i++) { uint32_t cur_hash = HashUtil::crc_hash(text.data + i, gram_num, 0); - // if this gram is in pattern - if (pattern_map[cur_hash] > 0) { - not_overlap_pattern_count--; - pattern_map[cur_hash]--; - restore_map[i] = cur_hash; + auto& val = pattern_map[cur_hash]; + if (val == 0b00) { + val ^= 0b10; + text_count++; + restore_map.push_back(cur_hash); + } else if (val == 0b01) { + val ^= 0b10; + text_count++; + union_count++; + restore_map.push_back(cur_hash); } } - // restore pattern_map - for (int j = 0; j < i; j++) { - if (restore_map[j]) { - pattern_map[restore_map[j]]++; - // reset restore_map - restore_map[j] = 0; - } + for (auto& reset_hash : restore_map) { + pattern_map[reset_hash] ^= 0b10; } - return not_overlap_pattern_count; + return {text_count, union_count}; } }; diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out index f52b37f349e5149620c41ee06d89ad52eb7ae47d..a61c3df00c554357deb41c855068a8204e7cc157 100644 GIT binary patch delta 141 zcmZoszNEZiwScBE1eh2bnVFlI7+7#7CM9zja~kNGnj4xJS(sXyS(+Q07+M-l-X&lp VfUbJ-djVqs43(;arX-mw4FFE|Advt7 delta 90 zcmcbl+@ic;wE(}Nktt_lQZkn@r-7cCk?G_s0!GY0vB?I4;*)g+Wtc%?k%FdLXzDl( Sjlq!1h|^Hd0AvD8fC~Wj@Dv#U From 19ea8afc149d16633cc4a1846d3341a9aa882df6 Mon Sep 17 00:00:00 2001 From: Mryange <2319153948@qq.com> Date: Tue, 23 Jul 2024 11:41:50 +0800 Subject: [PATCH 3/3] uod --- be/src/vec/functions/function_string.h | 69 ++++++++++++------ .../string_functions/test_string_function.out | Bin 4562 -> 4562 bytes 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index c5712579db273d..22eeb93591f931 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -3697,6 +3697,7 @@ class FunctionNgramSearch : public IFunction { std::tie(argument_columns[i], col_const[i]) = unpack_if_const(block.get_by_position(arguments[i]).column); } + // There is no need to check if the 2-th,3-th parameters are const here because fe has already checked them. auto pattern = assert_cast(argument_columns[1].get())->get_data_at(0); auto gram_num = assert_cast(argument_columns[2].get())->get_element(0); const auto* text_col = assert_cast(argument_columns[0].get()); @@ -3712,67 +3713,91 @@ class FunctionNgramSearch : public IFunction { } private: + using NgramMap = phmap::flat_hash_map; + // In the map, the key is the CRC32 hash result of a substring in the string, + // and the value indicates whether this hash is found in the text or pattern. + constexpr static auto not_found = 0b00; + constexpr static auto found_in_pattern = 0b01; + constexpr static auto found_in_text = 0b10; + constexpr static auto found_in_pattern_and_text = 0b11; + + uint32_t sub_str_hash(const char* data, int32_t length) const { + constexpr static uint32_t seed = 0; + return HashUtil::crc_hash(data, length, seed); + } + template void _execute_impl(const ColumnString* text_col, StringRef& pattern, int gram_num, ColumnFloat64& res, size_t size) const { auto& res_data = res.get_data(); res_data.resize_fill(size, 0); + // If the length of the pattern is less than gram_num, return 0. if (pattern.size < gram_num) { return; } - phmap::flat_hash_map pattern_map; + + // Build a map by pattern string, which will be used repeatedly in the following loop. + NgramMap pattern_map; int pattern_count = get_pattern_set(pattern_map, pattern, gram_num); + // Each time a loop is executed, the map will be modified, so it needs to be restored afterward. std::vector restore_map; + for (int i = 0; i < size; i++) { auto text = text_col->get_data_at(index_check_const(i)); if (text.size < gram_num) { + // If the length of the text is less than gram_num, return 0. continue; } restore_map.reserve(text.size); - auto [text_count, union_count] = get_text_set(text, gram_num, pattern_map, restore_map); + auto [text_count, intersection_count] = + get_text_set(text, gram_num, pattern_map, restore_map); - res_data[i] = 2.0F * union_count / (text_count + pattern_count); + // 2 * |Intersection| / (|text substr set| + |pattern substr set|) + res_data[i] = 2.0 * intersection_count / (text_count + pattern_count); } } - int get_pattern_set(phmap::flat_hash_map& pattern_map, StringRef& pattern, - int gram_num) const { - int pattern_count = 0; + size_t get_pattern_set(NgramMap& pattern_map, StringRef& pattern, int gram_num) const { + size_t pattern_count = 0; for (int i = 0; i + gram_num <= pattern.size; i++) { - uint32_t cur_hash = HashUtil::crc_hash(pattern.data + i, gram_num, 0); + uint32_t cur_hash = sub_str_hash(pattern.data + i, gram_num); if (!pattern_map.contains(cur_hash)) { - pattern_map[cur_hash] = 0b01; + pattern_map[cur_hash] = found_in_pattern; pattern_count++; } } return pattern_count; } - pair get_text_set(StringRef& text, int gram_num, - phmap::flat_hash_map& pattern_map, - std::vector& restore_map) const { + pair get_text_set(StringRef& text, int gram_num, NgramMap& pattern_map, + std::vector& restore_map) const { restore_map.clear(); - int text_count = 0, union_count = 0; + //intersection_count indicates a substring both in pattern and text. + size_t text_count = 0, intersection_count = 0; for (int i = 0; i + gram_num <= text.size; i++) { - uint32_t cur_hash = HashUtil::crc_hash(text.data + i, gram_num, 0); + uint32_t cur_hash = sub_str_hash(text.data + i, gram_num); auto& val = pattern_map[cur_hash]; - if (val == 0b00) { - val ^= 0b10; + if (val == not_found) { + val ^= found_in_text; + DCHECK(val == found_in_text); + // only found in text text_count++; restore_map.push_back(cur_hash); - } else if (val == 0b01) { - val ^= 0b10; + } else if (val == found_in_pattern) { + val ^= found_in_text; + DCHECK(val == found_in_pattern_and_text); + // found in text and pattern text_count++; - union_count++; + intersection_count++; restore_map.push_back(cur_hash); } } - - for (auto& reset_hash : restore_map) { - pattern_map[reset_hash] ^= 0b10; + // Restore the pattern_map. + for (auto& restore_hash : restore_map) { + pattern_map[restore_hash] ^= found_in_text; } - return {text_count, union_count}; + return {text_count, intersection_count}; } }; diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out index a61c3df00c554357deb41c855068a8204e7cc157..4cc5f410bbf23d622635a3f20571bd9959ba4edf 100644 GIT binary patch delta 140 zcmcbld`Wr3CIKa57~o7yO6D@=G|)3OH#9M_K#-HK2v~B$R2fe;5EP%RE2zu~S2a0O b(8d-nkJA`VLt`-HGU7DUGXN=r32*@bH3A;o delta 140 zcmcbld`Wr3CIKZAV#OEXJzV-rJ5qsdnUEIHxI nCL0KfPu3Mw=0wOx3fkDh<#8IrX=n_FTt=LRdIlh6Faa(AN$?*Y