From fb750120c658b6cecb66f32ee8ccb5f33da77069 Mon Sep 17 00:00:00 2001 From: zhangstar333 <87313068+zhangstar333@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:14:19 +0800 Subject: [PATCH 1/4] [Feature](function) support split_by_regexp function (#38259) docs link: https://github.com/apache/doris-website/pull/904 --- .../functions/function_split_by_regexp.cpp | 378 ++++++++++++++++++ .../vec/functions/simple_function_factory.h | 2 + .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../functions/scalar/SplitByRegexp.java | 97 +++++ .../visitor/ScalarFunctionVisitor.java | 5 + gensrc/script/doris_builtins_functions.py | 2 + .../string_functions/test_split_by_regexp.out | 52 +++ .../test_split_by_regexp.groovy | 68 ++++ 8 files changed, 606 insertions(+) create mode 100644 be/src/vec/functions/function_split_by_regexp.cpp create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java create mode 100644 regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out create mode 100644 regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy diff --git a/be/src/vec/functions/function_split_by_regexp.cpp b/be/src/vec/functions/function_split_by_regexp.cpp new file mode 100644 index 00000000000000..40628ee2017fc7 --- /dev/null +++ b/be/src/vec/functions/function_split_by_regexp.cpp @@ -0,0 +1,378 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "common/status.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_const.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/functions/function.h" +#include "vec/functions/function_string.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +struct Match { + std::string::size_type offset; + std::string::size_type length; +}; + +class RegexpSplit { +public: + void init(re2::RE2* re2, int32_t max_splits); + void set(const char* pos, const char* end); + bool get(const char*& token_begin, const char*& token_end); + +private: + const char* _pos; + const char* _end; + + std::int32_t _max_splits = 0; + std::vector _matches; + int32_t _splits; + re2::RE2* _re2 = nullptr; + unsigned _number_of_subpatterns = 0; + + unsigned match(const char* subject, size_t subject_size, std::vector& matches, + unsigned limit) const; +}; + +unsigned RegexpSplit::match(const char* subject, size_t subject_size, std::vector& matches, + unsigned limit) const { + matches.clear(); + + if (limit == 0) { + return 0; + } + + limit = std::min(limit, _number_of_subpatterns + 1); + std::vector pieces(limit); + + if (!_re2->Match({subject, subject_size}, 0, subject_size, re2::RE2::UNANCHORED, pieces.data(), + limit)) { + return 0; + } else { + matches.resize(limit); + for (size_t i = 0; i < limit; ++i) { + if (pieces[i].empty()) { + matches[i].offset = std::string::npos; + matches[i].length = 0; + } else { + matches[i].offset = pieces[i].data() - subject; + matches[i].length = pieces[i].length(); + } + } + return limit; + } +} + +void RegexpSplit::init(re2::RE2* re2, int32_t max_splits) { + _max_splits = max_splits; + _re2 = re2; + if (_re2) { + _number_of_subpatterns = _re2->NumberOfCapturingGroups(); + } +} + +// Called for each next string. +void RegexpSplit::set(const char* pos, const char* end) { + _pos = pos; + _end = end; + _splits = 0; +} + +// Get the next token, if any, or return false. +bool RegexpSplit::get(const char*& token_begin, const char*& token_end) { + if (!_re2) { + if (_pos == _end) { + return false; + } + + token_begin = _pos; + if (_max_splits != -1) { + if (_splits == _max_splits - 1) { + token_end = _end; + _pos = _end; + return true; + } + } + + _pos += 1; + token_end = _pos; + ++_splits; + } else { + if (!_pos || _pos > _end) { + return false; + } + + token_begin = _pos; + if (_max_splits != -1) { + if (_splits == _max_splits - 1) { + token_end = _end; + _pos = nullptr; + return true; + } + } + + if (!match(_pos, _end - _pos, _matches, _number_of_subpatterns + 1) || + !_matches[0].length) { + token_end = _end; + _pos = _end + 1; + } else { + token_end = _pos + _matches[0].offset; + _pos = token_end + _matches[0].length; + ++_splits; + } + } + + return true; +} + +template +class SplitByRegexp : public IFunction { +public: + static constexpr auto name = "split_by_regexp"; + + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { + return get_variadic_argument_types_impl().size(); + } + + bool is_variadic() const override { return true; } + + DataTypes get_variadic_argument_types_impl() const override { + return Impl::get_variadic_argument_types(); + } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + DCHECK(is_string(arguments[0])) + << "first argument for function: " << name << " should be string" + << " and arguments[0] is " << arguments[0]->get_name(); + DCHECK(is_string(arguments[1])) + << "second argument for function: " << name << " should be string" + << " and arguments[1] is " << arguments[1]->get_name(); + auto nullable_string_type = make_nullable(std::make_shared()); + return std::make_shared(nullable_string_type); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + return Impl::execute_impl(context, block, arguments, result, input_rows_count); + } +}; + +struct ExecuteImpl { + using NullMapType = PaddedPODArray; + static Status execute_impl(FunctionContext* context, Block& block, + const ColumnNumbers& arguments, size_t result, + size_t input_rows_count) { + const auto& [first_column, left_const] = + unpack_if_const(block.get_by_position(arguments[0]).column); + const auto& [second_column, right_const] = + unpack_if_const(block.get_by_position(arguments[1]).column); + const auto& [three_column, three_is_const] = + unpack_if_const(block.get_by_position(arguments[2]).column); + auto limit_value = assert_cast(*three_column).get_int(0); + const auto& src_column = assert_cast(*first_column); + const auto& pattern_column = assert_cast(*second_column); + + auto nullable_string_type = make_nullable(std::make_shared()); + auto dest_column_ptr = ColumnArray::create(nullable_string_type->create_column(), + ColumnArray::ColumnOffsets::create()); + IColumn* dest_nested_column = &dest_column_ptr->get_data(); + auto& dest_offsets = dest_column_ptr->get_offsets(); + DCHECK(dest_nested_column != nullptr); + + NullMapType* dest_nested_null_map = nullptr; + auto* dest_nullable_col = assert_cast(dest_nested_column); + auto& dest_column_string = + assert_cast(*(dest_nullable_col->get_nested_column_ptr())); + dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + // split_by_regexp(ColumnString, "xxx") + if (right_const) { + RETURN_IF_ERROR(_execute_constant_pattern( + src_column, pattern_column.get_data_at(0), dest_column_string, dest_offsets, + dest_nested_null_map, limit_value, input_rows_count, &opts)); + } else if (left_const) { + // split_by_regexp("xxx", ColumnString) + _execute_constant_src_string(src_column.get_data_at(0), pattern_column, + dest_column_string, dest_offsets, dest_nested_null_map, + limit_value, input_rows_count, &opts); + } else { + // split_by_regexp(ColumnString, ColumnString) + _execute_vector_vector(src_column, pattern_column, dest_column_string, dest_offsets, + dest_nested_null_map, limit_value, input_rows_count, &opts); + } + + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + +private: + static Status _execute_constant_pattern(const ColumnString& src_column_string, + const StringRef& pattern_ref, + ColumnString& dest_column_string, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map, Int64 limit_value, + size_t input_rows_count, RE2::Options* opts) { + const char* token_begin = nullptr; + const char* token_end = nullptr; + UInt64 index = 0; + std::unique_ptr re2_ptr = nullptr; + if (pattern_ref.size) { + re2_ptr = std::make_unique(pattern_ref.to_string_view(), *opts); + } + if (!re2_ptr->ok()) { + return Status::RuntimeError("Invalid pattern: {}", pattern_ref.debug_string()); + } + RegexpSplit RegexpSplit; + RegexpSplit.init(re2_ptr.get(), limit_value); + for (int row = 0; row < input_rows_count; ++row) { + auto str_data = src_column_string.get_data_at(row); + RegexpSplit.set(str_data.begin(), str_data.end()); + while (RegexpSplit.get(token_begin, token_end)) { + size_t token_size = token_end - token_begin; + dest_column_string.insert_data(token_begin, token_size); + dest_nested_null_map->push_back(false); + index += 1; + } + dest_offsets.push_back(index); + } + return Status::OK(); + } + + static void _execute_constant_src_string(const StringRef& str_ref, + const ColumnString& pattern_column, + ColumnString& dest_column_string, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map, Int64 limit_value, + size_t input_rows_count, RE2::Options* opts) { + const char* token_begin = nullptr; + const char* token_end = nullptr; + UInt64 index = 0; + RegexpSplit RegexpSplit; + + for (int row = 0; row < input_rows_count; ++row) { + std::unique_ptr re2_ptr = nullptr; + auto pattern = pattern_column.get_data_at(row); + if (pattern.size) { + re2_ptr = std::make_unique(pattern.to_string_view(), *opts); + if (!re2_ptr->ok()) { + dest_column_string.insert_default(); + dest_nested_null_map->push_back(true); + index += 1; + dest_offsets.push_back(index); + continue; + } + } + + RegexpSplit.init(re2_ptr.get(), limit_value); + RegexpSplit.set(str_ref.begin(), str_ref.end()); + while (RegexpSplit.get(token_begin, token_end)) { + size_t token_size = token_end - token_begin; + dest_column_string.insert_data(token_begin, token_size); + dest_nested_null_map->push_back(false); + index += 1; + } + dest_offsets.push_back(index); + } + } + + static void _execute_vector_vector(const ColumnString& src_column_string, + const ColumnString& pattern_column, + ColumnString& dest_column_string, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map, Int64 limit_value, + size_t input_rows_count, RE2::Options* opts) { + const char* token_begin = nullptr; + const char* token_end = nullptr; + UInt64 index = 0; + RegexpSplit RegexpSplit; + + for (int row = 0; row < input_rows_count; ++row) { + std::unique_ptr re2_ptr = nullptr; + auto str_data = src_column_string.get_data_at(row); + auto pattern = pattern_column.get_data_at(row); + if (pattern.size) { + re2_ptr = std::make_unique(pattern.to_string_view(), *opts); + if (!re2_ptr->ok()) { + dest_column_string.insert_default(); + dest_nested_null_map->push_back(true); + index += 1; + dest_offsets.push_back(index); + continue; + } + } + RegexpSplit.init(re2_ptr.get(), limit_value); + RegexpSplit.set(str_data.begin(), str_data.end()); + while (RegexpSplit.get(token_begin, token_end)) { + size_t token_size = token_end - token_begin; + dest_column_string.insert_data(token_begin, token_size); + dest_nested_null_map->push_back(false); + index += 1; + } + dest_offsets.push_back(index); + } + } +}; + +struct TwoArgumentImpl { + static DataTypes get_variadic_argument_types() { + return {std::make_shared(), std::make_shared()}; + } + + static Status execute_impl(FunctionContext* context, Block& block, + const ColumnNumbers& arguments, size_t result, + size_t input_rows_count) { + DCHECK_EQ(arguments.size(), 2); + auto max_limit = ColumnConst::create(ColumnInt32::create(1, -1), input_rows_count); + block.insert({std::move(max_limit), std::make_shared(), "max_limit"}); + ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1}; + return ExecuteImpl::execute_impl(context, block, temp_arguments, result, input_rows_count); + } +}; + +struct ThreeArgumentImpl { + static DataTypes get_variadic_argument_types() { + return {std::make_shared(), std::make_shared(), + std::make_shared()}; + } + static Status execute_impl(FunctionContext* context, Block& block, + const ColumnNumbers& arguments, size_t result, + size_t input_rows_count) { + DCHECK_EQ(arguments.size(), 3); + return ExecuteImpl::execute_impl(context, block, arguments, result, input_rows_count); + } +}; + +void register_function_split_by_regexp(SimpleFunctionFactory& factory) { + factory.register_function>(); + factory.register_function>(); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h index 7619858153c203..727cf98cda38fd 100644 --- a/be/src/vec/functions/simple_function_factory.h +++ b/be/src/vec/functions/simple_function_factory.h @@ -109,6 +109,7 @@ void register_function_url(SimpleFunctionFactory& factory); void register_function_ip(SimpleFunctionFactory& factory); void register_function_multi_match(SimpleFunctionFactory& factory); void register_function_assert_true(SimpleFunctionFactory& factory); +void register_function_split_by_regexp(SimpleFunctionFactory& factory); class SimpleFunctionFactory { using Creator = std::function; @@ -313,6 +314,7 @@ class SimpleFunctionFactory { register_function_variant_element(instance); register_function_multi_match(instance); register_function_assert_true(instance); + register_function_split_by_regexp(instance); }); return instance; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 1c1a820952f263..7d9314cb74fd75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -385,6 +385,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt; import org.apache.doris.nereids.trees.expressions.functions.scalar.Space; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByString; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitPart; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sqrt; @@ -881,6 +882,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Sm4Encrypt.class, "sm4_encrypt"), scalar(Space.class, "space"), scalar(SplitByChar.class, "split_by_char"), + scalar(SplitByRegexp.class, "split_by_regexp"), scalar(SplitByString.class, "split_by_string"), scalar(SplitPart.class, "split_part"), scalar(Sqrt.class, "sqrt"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java new file mode 100644 index 00000000000000..8d1d0145d71047 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.ArrayType; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'split_by_regexp'. This class is generated by + * GenerateFunction. + */ +public class SplitByRegexp extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(ArrayType.of(VarcharType.SYSTEM_DEFAULT)) + .args(StringType.INSTANCE, StringType.INSTANCE), + FunctionSignature.ret(ArrayType.of(VarcharType.SYSTEM_DEFAULT)) + .args(StringType.INSTANCE, StringType.INSTANCE, IntegerType.INSTANCE)); + + /** + * constructor with 2 arguments. + */ + public SplitByRegexp(Expression arg0, Expression arg1) { + super("split_by_regexp", arg0, arg1); + } + + /** + * constructor with 3 arguments. + */ + public SplitByRegexp(Expression arg0, Expression arg1, Expression arg2) { + super("split_by_regexp", arg0, arg1, arg2); + } + + /** + * withChildren. + */ + @Override + public SplitByRegexp withChildren(List children) { + Preconditions.checkArgument(children.size() == 2 || children.size() == 3); + if (children.size() == 2) { + return new SplitByRegexp(children.get(0), children.get(1)); + } else { + return new SplitByRegexp(children.get(0), children.get(1), children.get(2)); + } + } + + @Override + public void checkLegalityBeforeTypeCoercion() { + if (children().size() == 3) { + if (!child(2).isConstant() || !(child(2) instanceof IntegerLiteral) + || (((IntegerLiteral) child(2)).getValue() < 0)) { + throw new AnalysisException("the third parameter of " + + getName() + " function must be a positive constant: " + toSql()); + } + } + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitSplitByRegexp(this, context); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index b061e2f8d6a6f3..f85ce2fbed3b80 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -383,6 +383,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt; import org.apache.doris.nereids.trees.expressions.functions.scalar.Space; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByString; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitPart; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sqrt; @@ -1912,6 +1913,10 @@ default R visitSplitByChar(SplitByChar splitByChar, C context) { return visitScalarFunction(splitByChar, context); } + default R visitSplitByRegexp(SplitByRegexp splitByRegexp, C context) { + return visitScalarFunction(splitByRegexp, context); + } + default R visitSplitByString(SplitByString splitByString, C context) { return visitScalarFunction(splitByString, context); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 99c17f7dfc835a..3f4614dc337e51 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1659,6 +1659,8 @@ [['money_format'], 'VARCHAR', ['DECIMAL64'], ''], [['money_format'], 'VARCHAR', ['DECIMAL128'], ''], [['split_by_string'],'ARRAY_VARCHAR',['STRING','STRING'], ''], + [['split_by_regexp'],'ARRAY_VARCHAR',['STRING','STRING'], ''], + [['split_by_regexp'],'ARRAY_VARCHAR',['STRING','STRING', 'INT'], ''], [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], 'ALWAYS_NULLABLE'], [['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], 'DEPEND_ON_ARGUMENT'], [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'], ''], diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out new file mode 100644 index 00000000000000..588ad7fa5cb9d2 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out @@ -0,0 +1,52 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- +["a", "b", "c", "d", "e"] + +-- !select2 -- +["a", "bc", "de", "f"] + +-- !select3 -- +\N + +-- !select4 -- +\N + +-- !select5 -- +["abcde"] +["12553"] +[""] +[""] +[""] +["a1b1c1d"] +["", "", "", ""] +["a", "b", "c"] +["a", "b", "c", ""] +\N +["a", "b", "c", "12345", ""] + +-- !select6 -- +["d", "o", ",", "r", "i", "s"] +["d", "o", ",", "r", "i", "s"] +["d", "o", ",", "r", "i", "s"] +["do", "ris"] +["do,ris"] +["do,ris"] +["do", "ris"] +["do", "ris"] +["do", "ris"] +["do", "ris"] +["do", "ris"] + +-- !select7 -- +["a", "b", "c", "d", "e"] +["1", "2", "5", "5", "3"] +[] +[""] +[""] +["a", "b", "c", "d"] +["", "", "", ""] +["a", "b", "c"] +["a", "b", "c", ""] +\N +["a", "b", "c", "12345", ""] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy new file mode 100644 index 00000000000000..c9ace391b5bebb --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_split_by_regexp") { + qt_select1 "select split_by_regexp('abcde','');" + qt_select2 "select split_by_regexp('a12bc23de345f','\\\\d+');" + qt_select3 "select split_by_regexp('a12bc23de345f',NULL);" + qt_select4 "select split_by_regexp(NULL, 'a12bc23de345f');" + + def tableName1 = "test_split_by_regexp" + + sql """DROP TABLE IF EXISTS ${tableName1}""" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName1} ( + `k1` int(11) NULL COMMENT "", + `v1` varchar(20) NULL COMMENT "", + `v2` varchar(1) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO ${tableName1} VALUES(1, 'abcde', '') """ + sql """ INSERT INTO ${tableName1} VALUES(2, '12553', '') """ + sql """ INSERT INTO ${tableName1} VALUES(3, '', '') """ + sql """ INSERT INTO ${tableName1} VALUES(4, '', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(5, '', 'a') """ + sql """ INSERT INTO ${tableName1} VALUES(6, 'a1b1c1d', '1') """ + sql """ INSERT INTO ${tableName1} VALUES(7, ',,,', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(8, 'a,b,c', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(9, 'a,b,c,', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(10, null, ',') """ + sql """ INSERT INTO ${tableName1} VALUES(11, 'a,b,c,12345,', ',') """ + + test { + sql " select split_by_regexp(NULL, 'a12bc23de345f', k1) from test_split_by_regexp" + exception "function must be a positive constant" + } + test { + sql " select split_by_regexp(NULL, 'a12bc23de345f', -10) from test_split_by_regexp" + exception "function must be a positive constant" + } + test { + sql " select split_by_regexp(NULL, 'a12bc23de345f', 1 + 2) from test_split_by_regexp" + exception "function must be a positive constant" + } + qt_select5 "select split_by_regexp(v1, ',') from test_split_by_regexp order by k1;" + qt_select6 "select split_by_regexp('do,ris', v2) from test_split_by_regexp order by k1;" + qt_select7 "select split_by_regexp(v1, v2) from test_split_by_regexp order by k1;" +} + From 966a91ee50a96bb6d42ec1bfdd798892f8cfa6ff Mon Sep 17 00:00:00 2001 From: James Date: Mon, 10 Feb 2025 15:34:43 +0800 Subject: [PATCH 2/4] [fix](function)Fix split_by_regexp function integer parameter couldn't set bug. (#47676) ### What problem does this PR solve? split_by_regexp function's third parameter doesn't support tinyint. This pr is to fix it. ``` mysql> select split_by_regexp('aa,bb,cc', ',', 1); ERROR 1105 (HY000): errCode = 2, detailMessage = the third parameter of split_by_regexp function must be a positive constant: split_by_regexp('aa,bb,cc', ',', 1) ``` --- .../functions/scalar/SplitByRegexp.java | 6 +++--- .../string_functions/test_split_by_regexp.out | 18 ++++++++++++++++++ .../test_split_by_regexp.groovy | 6 ++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java index 8d1d0145d71047..a72ed434cc35f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SplitByRegexp.java @@ -22,7 +22,7 @@ import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; -import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.ArrayType; import org.apache.doris.nereids.types.IntegerType; @@ -77,8 +77,8 @@ public SplitByRegexp withChildren(List children) { @Override public void checkLegalityBeforeTypeCoercion() { if (children().size() == 3) { - if (!child(2).isConstant() || !(child(2) instanceof IntegerLiteral) - || (((IntegerLiteral) child(2)).getValue() < 0)) { + if (!child(2).isConstant() || !(child(2) instanceof IntegerLikeLiteral) + || (((IntegerLikeLiteral) child(2)).getIntValue() < 0)) { throw new AnalysisException("the third parameter of " + getName() + " function must be a positive constant: " + toSql()); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out index 588ad7fa5cb9d2..1fb99f58ab1c7c 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out @@ -50,3 +50,21 @@ \N ["a", "b", "c", "12345", ""] +-- !select8 -- +["aa,bbb,cccc"] + +-- !select9 -- +["aa", "bbb,cccc"] + +-- !select10 -- +["aa", "bbb", "cccc"] + +-- !select11 -- +["aa", "bbb", "cccc"] + +-- !select12 -- +["aa", "bbb", "cccc"] + +-- !select13 -- +["aa", "bbb", "cccc"] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy index c9ace391b5bebb..4b9719068e6606 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy @@ -64,5 +64,11 @@ suite("test_split_by_regexp") { qt_select5 "select split_by_regexp(v1, ',') from test_split_by_regexp order by k1;" qt_select6 "select split_by_regexp('do,ris', v2) from test_split_by_regexp order by k1;" qt_select7 "select split_by_regexp(v1, v2) from test_split_by_regexp order by k1;" + qt_select8 "select split_by_regexp('aa,bbb,cccc', ',', 1);" + qt_select9 "select split_by_regexp('aa,bbb,cccc', ',', 2);" + qt_select10 "select split_by_regexp('aa,bbb,cccc', ',', 3);" + qt_select11 "select split_by_regexp('aa,bbb,cccc', ',', 4);" + qt_select12 "select split_by_regexp('aa,bbb,cccc', ',', 100000000);" + qt_select13 "select split_by_regexp('aa,bbb,cccc', ',', 10000000000000);" } From b367c9859669ef735ba7ff695a8dd2c6a9aff930 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 2 Jun 2025 15:05:33 +0800 Subject: [PATCH 3/4] [bug](function) fix split_by_regexp meet empty string return error (#51293) when pattern is empty, it's should split all alone , not return error msg `mysql> select id,name,score, k,v from table_test lateral view posexplode(split_by_regexp(name,'')) tmp as k,v order by id; ERROR 1105 (HY000): errCode = 2, detailMessage = (10.16.10.2)[RUNTIME_ERROR]Invalid pattern:` --- .../functions/function_split_by_regexp.cpp | 3 --- .../string_functions/test_split_by_regexp.out | 26 +++++++++++++++++++ .../test_split_by_regexp.groovy | 2 ++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/be/src/vec/functions/function_split_by_regexp.cpp b/be/src/vec/functions/function_split_by_regexp.cpp index 40628ee2017fc7..f2c419ec220302 100644 --- a/be/src/vec/functions/function_split_by_regexp.cpp +++ b/be/src/vec/functions/function_split_by_regexp.cpp @@ -247,9 +247,6 @@ struct ExecuteImpl { if (pattern_ref.size) { re2_ptr = std::make_unique(pattern_ref.to_string_view(), *opts); } - if (!re2_ptr->ok()) { - return Status::RuntimeError("Invalid pattern: {}", pattern_ref.debug_string()); - } RegexpSplit RegexpSplit; RegexpSplit.init(re2_ptr.get(), limit_value); for (int row = 0; row < input_rows_count; ++row) { diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out index 1fb99f58ab1c7c..483d9d89f872e5 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_regexp.out @@ -68,3 +68,29 @@ -- !select13 -- ["aa", "bbb", "cccc"] +-- !select14 -- +abcde ["a", "b", "c", "d", "e"] +12553 ["1", "2", "5", "5", "3"] + [] + [] + [] +a1b1c1d ["a", "1", "b", "1", "c", "1", "d"] +,,, [",", ",", ","] +a,b,c ["a", ",", "b", ",", "c"] +a,b,c, ["a", ",", "b", ",", "c", ","] +\N \N +a,b,c,12345, ["a", ",", "b", ",", "c", ",", "1", "2", "3", "4", "5", ","] + +-- !select15 -- + [] + [] + [] +, [","] +a ["a"] +1 ["1"] +, [","] +, [","] +, [","] +, [","] +, [","] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy index 4b9719068e6606..394a8e721aafab 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_regexp.groovy @@ -70,5 +70,7 @@ suite("test_split_by_regexp") { qt_select11 "select split_by_regexp('aa,bbb,cccc', ',', 4);" qt_select12 "select split_by_regexp('aa,bbb,cccc', ',', 100000000);" qt_select13 "select split_by_regexp('aa,bbb,cccc', ',', 10000000000000);" + qt_select14 "select v1,split_by_regexp(v1, '') from test_split_by_regexp order by k1;" + qt_select15 "select v2,split_by_regexp(v2, '') from test_split_by_regexp order by k1;" } From 39d11d1e3b0dbee7e48e669027d4c861ee83ff2b Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Tue, 25 Mar 2025 17:15:22 +0800 Subject: [PATCH 4/4] [improve](function) support lead/lag function input column as third params (#49381) support lead/lag function could input column eg: lead(col,1,col)/lag(col,1,col) --- .../aggregate_function_window.h | 66 +++++++++---------- .../expressions/functions/window/Lag.java | 5 +- .../expressions/functions/window/Lead.java | 5 +- .../functions/window/WindowFunction.java | 6 +- .../correctness_p0/test_lag_lead_window.out | 11 ++++ .../test_lag_lead_window.groovy | 39 +++++++++++ 6 files changed, 86 insertions(+), 46 deletions(-) diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.h b/be/src/vec/aggregate_functions/aggregate_function_window.h index 10eb3866ee31a0..bb0f76c58e97eb 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.h +++ b/be/src/vec/aggregate_functions/aggregate_function_window.h @@ -21,13 +21,11 @@ #pragma once #include -#include -#include #include #include +#include #include -#include #include "gutil/integral_types.h" #include "vec/aggregate_functions/aggregate_function.h" @@ -353,15 +351,10 @@ struct LeadLagData { static constexpr bool result_nullable = result_is_nullable; void reset() { _data_value.reset(); - _default_value.reset(); _is_inited = false; + _offset_value = 0; } - bool default_is_null() { return _default_value.is_null(); } - - // here _ptr pointer default column from third - void set_value_from_default() { this->_data_value = _default_value; } - void insert_result_into(IColumn& to) const { if constexpr (result_is_nullable) { if (_data_value.is_null()) { @@ -378,8 +371,6 @@ struct LeadLagData { } } - void set_is_null() { this->_data_value.reset(); } - void set_value(const IColumn** columns, size_t pos) { if constexpr (arg_is_nullable) { if (assert_cast(columns[0]) @@ -393,40 +384,47 @@ struct LeadLagData { _data_value.set_value(columns[0], pos); } - void check_default(const IColumn* column) { - if (!_is_inited) { - if (is_column_nullable(*column)) { - const auto* nullable_column = - assert_cast(column); - if (nullable_column->is_null_at(0)) { - _default_value.reset(); - } else { - _default_value.set_value(nullable_column->get_nested_column_ptr(), 0); - } + void set_value_from_default(const IColumn* column, size_t pos) { + DCHECK_GE(pos, 0); + if (is_column_nullable(*column)) { + const auto* nullable_column = + assert_cast(column); + if (nullable_column->is_null_at(pos)) { + this->_data_value.reset(); } else { - _default_value.set_value(column, 0); + this->_data_value.set_value(nullable_column->get_nested_column_ptr().get(), pos); } + } else { + this->_data_value.set_value(column, pos); + } + } + + void set_offset_value(const IColumn* column) { + if (!_is_inited) { + const auto* column_number = assert_cast(column); + _offset_value = column_number->get_data()[0]; _is_inited = true; } } + int64_t get_offset_value() const { return _offset_value; } + private: BaseValue _data_value; - BaseValue _default_value; bool _is_inited = false; + int64_t _offset_value = 0; }; template struct WindowFunctionLeadImpl : Data { void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end, const IColumn** columns) { - this->check_default(columns[2]); if (frame_end > partition_end) { //output default value, win end is under partition - if (this->default_is_null()) { - this->set_is_null(); - } else { - this->set_value_from_default(); - } + this->set_offset_value(columns[1]); + // eg: lead(column, 10, default_value), column size maybe 3 rows + // offset value 10 is from second argument, pos: 11 is calculated as frame_end + auto pos = frame_end - 1 - this->get_offset_value(); + this->set_value_from_default(columns[2], pos); return; } this->set_value(columns, frame_end - 1); @@ -439,13 +437,11 @@ template struct WindowFunctionLagImpl : Data { void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end, const IColumn** columns) { - this->check_default(columns[2]); + // window start is beyond partition if (partition_start >= frame_end) { //[unbound preceding(0), offset preceding(-123)] - if (this->default_is_null()) { // win start is beyond partition - this->set_is_null(); - } else { - this->set_value_from_default(); - } + this->set_offset_value(columns[1]); + auto pos = frame_end - 1 + this->get_offset_value(); + this->set_value_from_default(columns[2], pos); return; } this->set_value(columns, frame_end - 1); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lag.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lag.java index fc74cadebfce04..58ff2b820f03f8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lag.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lag.java @@ -99,7 +99,7 @@ public void checkLegalityBeforeTypeCoercion() { return; } if (children().size() >= 2) { - checkValidParams(getOffset(), true); + checkValidParams(getOffset()); if (getOffset() instanceof Literal) { if (((Literal) getOffset()).getDouble() < 0) { throw new AnalysisException( @@ -109,9 +109,6 @@ public void checkLegalityBeforeTypeCoercion() { throw new AnalysisException( "The offset parameter of LAG must be a constant positive integer: " + this.toSql()); } - if (children().size() >= 3) { - checkValidParams(getDefaultValue(), false); - } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lead.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lead.java index 251141a68cb222..b0de4ad571b8e8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lead.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/Lead.java @@ -94,7 +94,7 @@ public void checkLegalityBeforeTypeCoercion() { return; } if (children().size() >= 2) { - checkValidParams(getOffset(), true); + checkValidParams(getOffset()); if (getOffset() instanceof Literal) { if (((Literal) getOffset()).getDouble() < 0) { throw new AnalysisException( @@ -104,9 +104,6 @@ public void checkLegalityBeforeTypeCoercion() { throw new AnalysisException( "The offset parameter of LAG must be a constant positive integer: " + this.toSql()); } - if (children().size() >= 3) { - checkValidParams(getDefaultValue(), false); - } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/WindowFunction.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/WindowFunction.java index 1265f685b26e69..13b4ec58476849 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/WindowFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/window/WindowFunction.java @@ -59,14 +59,14 @@ public int computeHashCode() { /** * LAG/LEAD param must be const, and offset must be number */ - protected void checkValidParams(Expression param, boolean isOffset) { + protected void checkValidParams(Expression param) { DataType type = param.getDataType(); - if (isOffset == true && !type.isNumericType()) { + if (!type.isNumericType()) { throw new AnalysisException("The offset of LAG/LEAD must be a number: " + this.toSql()); } if (!param.isConstant()) { throw new AnalysisException( - "The parameter 2 or parameter 3 of LAG/LEAD must be a constant value: " + this.toSql()); + "The parameter 2 of LAG/LEAD must be a constant value: " + this.toSql()); } } } diff --git a/regression-test/data/correctness_p0/test_lag_lead_window.out b/regression-test/data/correctness_p0/test_lag_lead_window.out index 041314a1c6535d..f9927b708c9130 100644 --- a/regression-test/data/correctness_p0/test_lag_lead_window.out +++ b/regression-test/data/correctness_p0/test_lag_lead_window.out @@ -9,6 +9,11 @@ a aa /wyyt-image/2021/11/13/595345040188712460.jpg b aa /wyyt-image/2022/04/13/1434607674511761493.jpg /wyyt-image/2022/04/13/1434607674511761493.jpg c cc /wyyt-image/2022/04/13/1434607674511761493.jpg +-- !select_default3 -- +a /wyyt-image/2021/11/13/595345040188712460.jpg aa aa aa +b /wyyt-image/2022/04/13/1434607674511761493.jpg aa /wyyt-image/2022/04/13/1434607674511761493.jpg aa +c /wyyt-image/2022/04/13/1434607674511761493.jpg cc cc /wyyt-image/2022/04/13/1434607674511761493.jpg + -- !select_default -- c 2022-09-06T00:00:02 2022-09-06T00:00:01 b 2022-09-06T00:00:01 2022-09-06T00:00 @@ -49,3 +54,9 @@ a 2022-09-06T00:00 \N b 2022-09-06T00:00:01 \N c 2022-09-06T00:00:02 \N +-- !select_lead_7 -- +2023-01-01 1 10 20 10 +2023-01-02 1 20 30 10 +2023-01-03 1 30 \N 20 +2023-01-04 1 \N \N 30 + diff --git a/regression-test/suites/correctness_p0/test_lag_lead_window.groovy b/regression-test/suites/correctness_p0/test_lag_lead_window.groovy index 1dfccca58ee6f9..0cf731e7d9ea5a 100644 --- a/regression-test/suites/correctness_p0/test_lag_lead_window.groovy +++ b/regression-test/suites/correctness_p0/test_lag_lead_window.groovy @@ -41,6 +41,12 @@ suite("test_lag_lead_window") { lead(cc,1,'') over (PARTITION by cc order by aa) as lead_cc from ${tableName} order by aa; """ + + qt_select_default3 """ select aa,cc,bb,lead(cc,1,bb) over (PARTITION by cc order by aa) as lead_res, + lag(cc,1,bb) over (PARTITION by cc order by aa) as lag_res + from ${tableName} + order by aa; """ + sql """ DROP TABLE IF EXISTS test1 """ sql """ CREATE TABLE IF NOT EXISTS test1 (id varchar(255), create_time datetime) DISTRIBUTED BY HASH(id) PROPERTIES("replication_num" = "1"); """ @@ -61,4 +67,37 @@ suite("test_lag_lead_window") { sql """ DROP TABLE IF EXISTS test1 """ + + qt_select_lead_7 """ SELECT + sale_date, + product_id, + quantity, + LEAD (quantity, 1, quantity) OVER ( + PARTITION BY + product_id + ORDER BY + sale_date + ) AS next_day_quantity, + LAG (quantity, 1, quantity) OVER ( + PARTITION BY + product_id + ORDER BY + sale_date + ) AS pre_day_quantity + FROM + ( + select 1 AS product_id, '2023-01-01' AS sale_date, 10 AS quantity + UNION ALL + select 1, '2023-01-02', 20 + UNION ALL + select 1, '2023-01-03', 30 + UNION ALL + select 1, '2023-01-04', NULL + ) AS t + ORDER BY + product_id, + sale_date; + """ + + }