diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index c9b00b653fc..661b1233cfb 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -15,10 +15,12 @@ #pragma once #include +#include #include #include #include +#include #include #include #if USE_RE2_ST @@ -114,11 +116,14 @@ class OptimizedRegularExpressionImpl } Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op); + std::optional substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur); private: - Int64 processEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); - Int64 getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); + Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); + std::optional processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + std::optional substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur); bool is_trivial; bool required_substring_is_prefix; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 4d717f9c602..a73e78562b5 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -17,10 +17,12 @@ #include #include #include +#include #include #include #include +#include #define MIN_LENGTH_FOR_STRSTR 3 #define MAX_SUBPATTERNS 5 @@ -474,7 +476,7 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject } template -Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) +Int64 
OptimizedRegularExpressionImpl::processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) { if (occur != 1) return 0; @@ -483,19 +485,38 @@ Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const return RegexType::FindAndConsume(&expr_sp, *re2) ? pos : 0; } -static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) +template +std::optional OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur) +{ + if (occur != 1 || byte_pos != 1) + return std::nullopt; + + StringPieceType expr_sp(expr, expr_size); + StringPieceType matched_str; + if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) + return std::nullopt; + + return std::optional(StringRef(matched_str.data(), matched_str.size())); +} + +static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) { RUNTIME_CHECK_MSG(!(ret_op != 0 && ret_op != 1), "Incorrect argument to regexp function: return_option must be 1 or 0"); RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function."); } +static inline void checkSubstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos) +{ + RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function."); +} + static inline void makeOccurValid(Int64 & occur) { - occur = occur < 0 ? 1 : occur; + occur = occur < 1 ? 
1 : occur; } template -Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) +Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) { size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched @@ -516,20 +537,53 @@ Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const c return ret_op == 0 ? DB::UTF8::bytePos2Utf8Pos(reinterpret_cast(subject), byte_offset + 1) : DB::UTF8::bytePos2Utf8Pos(reinterpret_cast(subject), byte_offset + matched_str.size() + 1); } +template +std::optional OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur) +{ + size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 + const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched + size_t expr_size = subject_size - byte_offset; + + StringPieceType expr_sp(expr, expr_size); + StringPieceType matched_str; + while (occur > 0) + { + if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) + return std::nullopt; + + --occur; + } + + return std::optional(StringRef(matched_str.data(), matched_str.size())); +} + template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size); + ; + checkInstrArgs(utf8_total_len, subject_size, pos, ret_op); + makeOccurValid(occur); - checkArgs(utf8_total_len, subject_size, pos, ret_op); + if (unlikely(subject_size == 0)) + return processInstrEmptyStringExpr(subject, subject_size, pos, occur); + size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast(subject), pos); + return 
instrImpl(subject, subject_size, byte_pos, occur, ret_op); +} + +template +std::optional OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur) +{ + Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size); + checkSubstrArgs(utf8_total_len, subject_size, pos); makeOccurValid(occur); if (unlikely(subject_size == 0)) - return processEmptyStringExpr(subject, subject_size, pos, occur); + return processSubstrEmptyStringExpr(subject, subject_size, pos, occur); size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast(subject), pos); - return getSubstrMatchedIndex(subject, subject_size, byte_pos, occur, ret_op); + return substrImpl(subject, subject_size, byte_pos, occur); } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index d7a72b0ea32..9f36318c7c5 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -436,7 +436,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"}, {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"}, // {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"}, - // {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, + {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, //{tipb::ScalarFuncSig::JsonExtractSig, "cast"}, //{tipb::ScalarFuncSig::JsonUnquoteSig, "cast"}, diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 18bd60c17fa..bc25b91569b 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -287,6 +287,7 @@ struct ReplaceRegexpImpl using FunctionTiDBRegexp = FunctionStringRegexp; using FunctionRegexpLike = FunctionStringRegexp; using FunctionRegexpInstr = FunctionStringRegexpInstr; +using FunctionRegexpSubstr = FunctionStringRegexpSubstr; using FunctionReplaceRegexpOne = 
FunctionStringReplace, NameReplaceRegexpOne>; using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; @@ -297,6 +298,7 @@ void registerFunctionsRegexp(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index b074ddd7b6b..b1b167a4f15 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -76,6 +76,10 @@ struct NameRegexpInstr { static constexpr auto name = "regexp_instr"; }; +struct NameRegexpSubstr +{ + static constexpr auto name = "regexp_substr"; +}; struct NameReplaceRegexpOne { static constexpr auto name = "replaceRegexpOne"; @@ -175,6 +179,22 @@ GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) } } +// We need to fill something into StringColumn when all elements are null +inline void fillColumnStringWhenAllNull(decltype(ColumnString::create()) & col_res, size_t size) +{ + auto & col_res_data = col_res->getChars(); + auto & col_res_offsets = col_res->getOffsets(); + col_res_data.resize(size); + col_res_offsets.resize(size); + + size_t offset = 0; + for (size_t i = 0; i < size; ++i) + { + col_res_data[offset++] = 0; + col_res_offsets[i] = offset; + } +} + template class ParamString { @@ -1512,6 +1532,348 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase #undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_INSTR +#define EXECUTE_REGEXP_SUBSTR() \ + do \ + { \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, *(EXPR_PARAM_PTR_VAR_NAME), *(PAT_PARAM_PTR_VAR_NAME), *(POS_PARAM_PTR_VAR_NAME), *(OCCUR_PARAM_PTR_VAR_NAME), *(MATCH_TYPE_PARAM_PTR_VAR_NAME)); \ + } while (0); + +// Method to get actual match type param +#define GET_MATCH_TYPE_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(MATCH_TYPE_PV_VAR_NAME, MATCH_TYPE_PARAM_PTR_VAR_NAME, 
({EXECUTE_REGEXP_SUBSTR()})) \ + } while (0); + +// Method to get actual occur param +#define GET_OCCUR_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_INT_PARAM(OCCUR_PV_VAR_NAME, OCCUR_PARAM_PTR_VAR_NAME, ({GET_MATCH_TYPE_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual position param +#define GET_POS_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_INT_PARAM(POS_PV_VAR_NAME, POS_PARAM_PTR_VAR_NAME, ({GET_OCCUR_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual pattern param +#define GET_PAT_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(PAT_PV_VAR_NAME, PAT_PARAM_PTR_VAR_NAME, ({GET_POS_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual expression param +#define GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(EXPR_PV_VAR_NAME, EXPR_PARAM_PTR_VAR_NAME, ({GET_PAT_ACTUAL_PARAM()})) \ + } while (0); + +// The entry to get actual params and execute regexp functions +#define GET_ACTUAL_PARAMS_AND_EXECUTE() \ + do \ + { \ + GET_EXPR_ACTUAL_PARAM() \ + } while (0); + +// Implementation of regexp_substr function +template +class FunctionStringRegexpSubstr : public FunctionStringRegexpBase + , public IFunction +{ +public: + using ResultType = String; + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + bool useDefaultImplementationForNulls() const override { return false; } + size_t getNumberOfArguments() const override { return 0; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + size_t arg_num = arguments.size(); + if (arg_num < REGEXP_MIN_PARAM_NUM) + throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); + else if (arg_num > REGEXP_SUBSTR_MAX_PARAM_NUM) + throw Exception("Too many arguments", 
ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); + + bool has_nullable_col = false; + bool has_data_type_nothing = false; + bool is_str_arg; + + // Check type of arguments + for (size_t i = 0; i < arg_num; ++i) + { + // Index at 0, 1 and 4 arguments should be string type, otherwise int type. + is_str_arg = (i <= 1 || i == 4); + checkInputArg(arguments[i], is_str_arg, &has_nullable_col, &has_data_type_nothing); + } + + if (has_data_type_nothing) + return std::make_shared(std::make_shared()); + + return std::make_shared(std::make_shared()); + } + + template + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const PosT & pos_param, const OccurT & occur_param, const MatchTypeT & match_type_param) const + { + size_t col_size = expr_param.getDataNum(); + + // Get function pointers to process the specific int type + GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); + GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); + + // Container will not be used when parm is const + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); + + // Const value will not be used when param is not const + Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; + Int64 occur_const_val = OccurT::isConst() ? 
occur_param.template getInt(0) : -1; + + // Check if args are all const columns + if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && MatchTypeT::isConst()) + { + if (expr_param.isNullAt(0) || pat_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || match_type_param.isNullAt(0)) + { + res_arg.column = res_arg.type->createColumnConst(col_size, Null()); + return; + } + + String pat = pat_param.getString(0); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + int flags = getDefaultFlags(); + String expr = expr_param.getString(0); + String match_type = match_type_param.getString(0); + pat = fmt::format("({})", pat); + + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + auto res = regexp.substr(expr.c_str(), expr.size(), pos_const_val, occur_const_val); + if (res) + res_arg.column = res_arg.type->createColumnConst(col_size, toField(res.value().toString())); + else + res_arg.column = res_arg.type->createColumnConst(col_size, Null()); + return; + } + + // Initialize result column + auto col_res = ColumnString::create(); + col_res->reserve(col_size); + + constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || MatchTypeT::isNullableCol(); + +#define GET_POS_VALUE(idx) \ + do \ + { \ + if constexpr (PosT::isConst()) \ + pos = pos_const_val; \ + else \ + pos = get_pos_func(pos_container, idx); \ + } while (0); + +#define GET_OCCUR_VALUE(idx) \ + do \ + { \ + if constexpr (OccurT::isConst()) \ + occur = occur_const_val; \ + else \ + occur = get_occur_func(occur_container, idx); \ + } while (0); + + StringRef expr_ref; + String pat; + Int64 pos; + Int64 occur; + String match_type; + + auto null_map_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = null_map_col->getData(); + null_map.resize(col_size, 1); + + // Start to execute substr + if (canMemorize()) + { + 
std::unique_ptr regexp; + if (col_size > 0) + { + regexp = memorize(pat_param, match_type_param, collator); + if (regexp == nullptr) + { + fillColumnStringWhenAllNull(col_res, col_size); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(null_map_col)); + return; + } + } + + if constexpr (has_nullable_col) + { + for (size_t i = 0; i < col_size; ++i) + { + if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) + { + // null_map has been set to 1 in the previous + col_res->insertData("", 0); + continue; + } + + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + + executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); + } + } + else + { + for (size_t i = 0; i < col_size; ++i) + { + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + + executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); + } + } + } + else + { + if constexpr (has_nullable_col) + { + for (size_t i = 0; i < col_size; ++i) + { + if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) + { + // null_map has been set to 1 in the previous + col_res->insertData("", 0); + continue; + } + + pat = pat_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + match_type = match_type_param.getString(i); + pat = fmt::format("({})", pat); + + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); + } + } + else + { + for (size_t i = 0; i < col_size; ++i) + { + pat = pat_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + 
match_type = match_type_param.getString(i); + pat = fmt::format("({})", pat); + + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); + } + } + } +#undef GET_OCCUR_VALUE +#undef GET_POS_VALUE + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(null_map_col)); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + + if (null_presence.has_null_constant) + { + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; + const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; + + size_t arg_num = arguments.size(); + auto & RES_ARG_VAR_NAME = block.getByPosition(result); + + ColumnPtr col_pos; + ColumnPtr col_occur; + ColumnPtr col_match_type; + + // Go through cases to get arguments + switch (arg_num) + { + case REGEXP_SUBSTR_MAX_PARAM_NUM: + col_match_type = block.getByPosition(arguments[4]).column; + case REGEXP_SUBSTR_MAX_PARAM_NUM - 1: + col_occur = block.getByPosition(arguments[3]).column; + case REGEXP_SUBSTR_MAX_PARAM_NUM - 2: + col_pos = block.getByPosition(arguments[2]).column; + }; + + size_t col_size = col_expr->size(); + + ParamVariant EXPR_PV_VAR_NAME(col_expr, col_size, StringRef("", 0)); + ParamVariant PAT_PV_VAR_NAME(col_pat, col_size, StringRef("", 0)); + ParamVariant POS_PV_VAR_NAME(col_pos, col_size, 1); + ParamVariant OCCUR_PV_VAR_NAME(col_occur, col_size, 1); + ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); + + GET_ACTUAL_PARAMS_AND_EXECUTE() + } + +private: + void executeAndSetResult( + Regexps::Regexp & regexp, + ColumnString::MutablePtr & col_res, + typename 
ColumnUInt8::Container & null_map, + size_t idx, + const char * subject, + size_t subject_size, + Int64 pos, + Int64 occur) const + { + auto res = regexp.substr(subject, subject_size, pos, occur); + if (res) + { + col_res->insertData(res.value().data, res.value().size); + null_map[idx] = 0; + } + else + { + col_res->insertData("", 0); + null_map[idx] = 1; + } + } + + TiDB::TiDBCollatorPtr collator = nullptr; +}; + +#undef GET_ACTUAL_PARAMS_AND_EXECUTE +#undef GET_EXPR_ACTUAL_PARAM +#undef GET_PAT_ACTUAL_PARAM +#undef GET_POS_ACTUAL_PARAM +#undef GET_OCCUR_ACTUAL_PARAM +#undef GET_MATCH_TYPE_ACTUAL_PARAM +#undef EXECUTE_REGEXP_SUBSTR + #undef GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 2ed1a1391f2..bdbd61f92a5 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2210,10 +2210,10 @@ TEST_F(Regexp, testRegexpCustomerCases) namespace { -template -std::vector getResultVec(const std::vector & test_cases) +template +std::vector getResultVec(const std::vector & test_cases) { - std::vector vecs; + std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.result); @@ -2313,7 +2313,7 @@ struct RegexpInstrCase static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { - results = getResultVec(test_cases); + results = getResultVec(test_cases); switch (param_num) { case 6: @@ -2327,6 +2327,9 @@ struct RegexpInstrCase case 2: pats = getPatVec(test_cases); exprs = getExprVec(test_cases); + break; + default: + throw DB::Exception("Invalid param_num"); } } @@ -2547,7 +2550,7 @@ TEST_F(Regexp, RegexpInstr) {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, {4, 
"pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2568,7 +2571,7 @@ TEST_F(Regexp, RegexpInstr) {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2725,6 +2728,374 @@ TEST_F(Regexp, RegexpInstr) } } +struct RegexpSubstrCase +{ + RegexpSubstrCase(const String & res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + : result(res) + , expression(expr) + , pattern(pat) + , position(pos) + , occurrence(occur) + , match_type(mt) + {} + + RegexpSubstrCase(const String & res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + : result(res) + , null_map(null_map_) + , expression(expr) + , pattern(pat) + , position(pos) + , occurrence(occur) + , match_type(mt) + {} + + static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) + { + results = getResultVec(test_cases); + switch (param_num) + { + case 5: + match_types = getMatchTypeVec(test_cases); + case 4: + occurs = getOccurVec(test_cases); + case 3: + positions = getPosVec(test_cases); + case 2: + pats = getPatVec(test_cases); + exprs = getExprVec(test_cases); + break; + default: + throw DB::Exception("Invalid param_num"); + } + } + + static void setVecsWithNullMap(int 
param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) + { + null_map.clear(); + null_map.resize(REGEXP_SUBSTR_MAX_PARAM_NUM); + for (const auto & elem : test_cases) + { + null_map[EXPR_NULL_MAP_IDX].push_back(elem.null_map[EXPR_NULL_MAP_IDX]); + null_map[PAT_NULL_MAP_IDX].push_back(elem.null_map[PAT_NULL_MAP_IDX]); + null_map[POS_NULL_MAP_IDX].push_back(elem.null_map[POS_NULL_MAP_IDX]); + null_map[OCCUR_NULL_MAP_IDX].push_back(elem.null_map[OCCUR_NULL_MAP_IDX]); + null_map[MATCH_TYPE_NULL_MAP_IDX].push_back(elem.null_map[MATCH_TYPE_NULL_MAP_IDX]); + } + + setVecsWithoutNullMap(param_num, test_cases, results, exprs, pats, positions, occurs, match_types); + } + + const static UInt8 REGEXP_SUBSTR_MAX_PARAM_NUM = 5; + const static UInt8 EXPR_NULL_MAP_IDX = 0; + const static UInt8 PAT_NULL_MAP_IDX = 1; + const static UInt8 POS_NULL_MAP_IDX = 2; + const static UInt8 OCCUR_NULL_MAP_IDX = 3; + const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 4; + + String result; + std::vector null_map; + String expression; + String pattern; + Int64 position; + Int64 occurrence; + String match_type; +}; + +TEST_F(Regexp, RegexpSubstr) +{ + // Test: All columns are const + { + for (size_t row_size = 1; row_size < 3; ++row_size) + { + ASSERT_COLUMN_EQ(createConstColumn>(row_size, "123"), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, "12"), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); + 
ASSERT_COLUMN_EQ(createConstColumn>(row_size, "ab"), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, "aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, "i"))); + } + } + + // Test: null const + { + for (size_t row_size = 1; row_size < 3; ++row_size) + { + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + } + } + + std::vector test_cases; + std::vector results; + std::vector> null_maps; + std::vector exprs; + std::vector patterns; + std::vector positions; + std::vector occurs; + std::vector match_types; + + // Test: All columns are pure vector + { + // test regexp_substr(vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl"}, + {"tidb", "tidb_tikv", "ti(db|kv)"}, + {"aa", "aaaaaa", "a."}, + {"", "\n", "."}, + {"", "", "^$"}, + {"", "ab\naB", "^ab$"}, + {"跑", "pp跑ppのaaa", "(跑|の|P)"}}; + RegexpSubstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, 
patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 0, 1, 0}), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns))); + + // test regexp_substr(vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2}, + {"aa", "aaaaaa", "aa", 3}, + {"", "\n", ".", 1}, + {"", "ab\naB", "^ab$", 1}, + {"跑", "pp跑ppのaaa", "(跑|の|P)", 2}}; + RegexpSubstrCase::setVecsWithoutNullMap(3, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 1, 0}), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions))); + + // test regexp_substr(vector, vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3, 1}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2, 1}, + {"aa", "aaaaaa", "aa", 3, 2}, + {"", "\n", ".", 1, 1}, + {"", "ab\naB", "^ab$", 1, 1}, + {"の", "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; + RegexpSubstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 1, 0}), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs))); + + // test regexp_substr(vector, vector, vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3, 1, ""}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2, 1, ""}, + {"aa", "aaaaaa", "aa", 3, 2, ""}, + {"\n", "\n", ".", 1, 1, "s"}, + {"aB", "ab\naB", "^ab$", 3, 1, "mi"}, + {"跑", "pp跑ppのaaa", "(跑|の|P)", 2, 2, "i"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 0, 0, 0}), + executeFunction( + "regexp_substr", + 
createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(match_types))); + + // test collation + const auto * utf8mb4_general_ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + test_cases = {{"tiFl", "ttiFl", "tifl", 1, 1, ""}, + {"", "ttiFl", "tifl", 1, 1, "c"}, + {"tiFl", "ttiFl", "tifl", 1, 1, "i"}, + {"tiFl", "ttiFl", "tifl", 1, 1, "ci"}, + {"", "ttiFl", "tifl", 1, 1, "ic"}, + {"", "ttiFl", "tifl", 1, 1, "iccc"}, + {"", "ttiFl", "tifl", 1, 1, "icic"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 0, 0, 1, 1, 1}), + executeFunction( + "regexp_substr", + {createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(match_types)}, + utf8mb4_general_ci_collator)); + } + + // Test: Args include nullable columns + { + // test regexp_substr(nullable vector, vector) + test_cases = {{"", {{1, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {"tidb", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), + executeFunction( + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns))); + + // test regexp_substr(vector, nullable vector) + test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {"", {{0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]), + 
executeFunction( + "regexp_substr", + createColumn(exprs), + createNullableVectorColumn(patterns, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]))); + + // test regexp_substr(vector, vector, nullable vector) + test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, + {"", {{0, 0, 1, 0, 0}}, "ttttifl", "tifl", 3}}; + RegexpSubstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]))); + + // test regexp_substr(vector, vector, vector, nullable vector) + test_cases = {{"tikv", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, + {"", {{0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; + RegexpSubstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createNullableVectorColumn(occurs, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]))); + + // test regexp_substr(vector, vector, vector, vector, nullable vector) + test_cases = {{"b", {{0, 0, 0, 0, 0}}, "b", "B", 1, 1, "i"}, + {"", {{0, 0, 0, 0, 1}}, "b", "B", 1, 1, "i"}}; + RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]), + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createNullableVectorColumn(match_types, 
null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]))); + } + + // Test: nullable and pure vector columns appear together + { + // test regexp_substr(nullable vector, vector, nullable vector, vector, vector) + test_cases = {{"tidb", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{1, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}}; + RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), + executeFunction( + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), + createColumn(occurs), + createColumn(match_types))); + } + + // Test: empty column tests + { + ASSERT_COLUMN_EQ(createConstColumn>(0, ""), + executeFunction( + "regexp_substr", + createConstColumn(0, "m"), + createConstColumn(0, "m"), + createConstColumn(0, 1), + createConstColumn(0, 1), + createConstColumn(0, "m"))); + + ASSERT_COLUMN_EQ(createColumn>({}), + executeFunction( + "regexp_substr", + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}))); + + ASSERT_COLUMN_EQ(createColumn>({}), + executeFunction( + "regexp_substr", + createColumn({}), + createColumn({}), + createConstColumn(0, 1), + createColumn({}), + createConstColumn(0, ""))); + } + + // Test: Invalid parameter handling + { + // test empty pattern + test_cases = {{"", "ttt", ""}}; + RegexpSubstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_THROW(executeFunction("regexp_substr", createNullableVectorColumn(exprs, {0}), createColumn(patterns)), Exception); + + // test invalid
match type + test_cases = {{"", "ttt", "t", 1, 1, "p"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_THROW(executeFunction("regexp_substr", createNullableVectorColumn(exprs, {0}), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(match_types)), Exception); + } +} + TEST_F(Regexp, testRegexpReplaceMatchType) { String res; diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index 89df50a8c56..e4f8e002e15 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# test regexp and regexp_like mysql> drop table if exists test.t mysql> create table test.t (data varchar(30), data_not_null varchar(30) not null, pattern varchar(30), pattern_not_null varchar(30) not null); mysql> insert into test.t values ('aaaa', 'AAAA', '^a.*', '^A.*'), ('abcd', 'abcd', null, '^a..d$'), (null, 'bbb', 'bb$', 'bb$'),('中文测试','中文测试','中文','^....$'),('中English混合','中English混合','^中English','^..nglish..$'); @@ -95,6 +96,7 @@ mysql> set tidb_enforce_mpp=1; select regexp_like(data, pattern, match_type) as | 1 | +------+ +# test regexp_instr mysql> drop table if exists test.t; mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); mysql> alter table test.t set tiflash replica 1; @@ -113,3 +115,23 @@ mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match | 2 | | 4 | +------+ + +# test regexp_substr +mysql> drop table if exists test.t; +mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, match_type varchar(30)); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'1', pos, 
occur, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'', pos, occur, match_type) as res from test.t; + +mysql> insert into test.t values (_utf8mb4'123', _utf8mb4'12.', 1, 1, _utf8mb4''), (_utf8mb4'aBb', _utf8mb4'bb', 1, 1, _utf8mb4'i'), (_utf8mb4'ab\nabc', _utf8mb4'^abc$', 1, 1, _utf8mb4'm'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_substr(expr, pattern, 1, 1, match_type) as res from test.t; ++------+ +| res | ++------+ +| 123 | +| Bb | +| abc | ++------+