From 8c5768b5c9e73998c297a7a6c543e7ab30cf039e Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 23 Sep 2022 17:57:43 +0800 Subject: [PATCH 01/87] save --- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- dbms/src/Functions/FunctionsStringSearch.cpp | 240 ++++++++++++++++++- dbms/src/Functions/IFunction.cpp | 37 ++- dbms/src/Functions/IFunction.h | 8 + dbms/src/Functions/tests/gtest_regexp.cpp | 21 +- 5 files changed, 280 insertions(+), 28 deletions(-) diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 1c4a8e521e9..e43cb7f9a12 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -433,7 +433,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::LikeSig, "like3Args"}, {tipb::ScalarFuncSig::RegexpSig, "regexp"}, {tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"}, - {tipb::ScalarFuncSig::RegexpLikeSig, "regexp"}, + {tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"}, // {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"}, // {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"}, // {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 7278e61dfce..7e17322791b 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1809,6 +1810,237 @@ struct ReplaceStringImpl } }; +// Columns may be const, nullable or plain vector, we can conveniently handle +// these different type columns with Param. 
+template +class Param +{ +public: + DISALLOW_COPY_AND_MOVE(Param); + + Param(const ColumnPtr * ptr, T default_value) : data(default_value), is_const(false) + { + // arg is not provided and we should use default_value + if (ptr == nullptr) return; + + auto type_name = typeid(T).name(); + const ColumnConst * col_const = typeid_cast(&(*(*ptr))) + if (type_name == typeid(Int64).name()) + { + // Handle const + if (col_const != nullptr) + { + // This is a const column + data = col_const->getValue(); + is_const = true; + } + else + { + // This is a vector column + col_str = checkAndGetColumn(&(*(*ptr))); + } + } + else if (type_name == typeid(StringRef).name()) + { + // Handle const + if (col_const != nullptr) + { + // This is a const column + auto const_data = col_const->getValue(); + data.data = const_data.c_str(); + data.size = const_data.size(); + is_const = true; + } + else { + // This is a vector column + col_str = checkAndGetColumn(&(*(*ptr))); + } + } + else + throw Exception(fmt::format("Invalid type: {}", type_name)); + + // Handle nullable + if ((*ptr)->isColumnNullable()) + { + const ColumnPtr & null_map_column = static_cast(*(*ptr)).getNullMapColumnPtr(); + null_map = &(static_cast(null_map_column).getData()); + } + } + + Int64 getInt64(size_t idx) const + { + // Use default value when arg is const or not provided. + // For safety, nullptr should be checked + return !is_const && col_int64 != nullptr ? col_int64->getInt(idx) : data; + } + + const StringRef & getString(size_t idx) const + { + // Use default value when arg is const or not provided. + // For safety, nullptr should be checked + return !is_const && col_str != nullptr ? 
col_str->getDataAt(idx) : data; + } + + bool isNullAt(size_t idx) const + { + if (null_map == nullptr) return false; + + return (*null_map)[idx]; + } + + bool isConstCol() const { return is_const; } + bool isNullableCol() const { return null_map == nullptr; } + size_t getDataNum() const { return (*col_ptr)->size(); } +private: + const ColumnPtr * col_ptr; + const ColumnString * col_str; + const ColumnInt64 * col_int64; + const NullMap * null_map; + bool is_const; // mark as the const column when it's true + T data; +}; + +class FunctionStringRegexpBase +{ +public: + bool memorizeRE2(const char * pattern) + { + memorized_re = std::make_unique(pattern); + return memorized_re->ok(); + } + + bool isMemorized() const { return memorized_re != nullptr; } +private: + // We should pre compile the regular expression when: + // - only pattern column is provided and it's a constant column + // - pattern and match type columns are provided and they are constant columns + std::unique_ptr memorized_re; +}; + +template +class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction +{ +public: + using ResuleType = UInt8; + static constexpr auto name = Name::name; + static FunctionPtr create(const Context &) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { std::make_shared>(); } + bool useDefaultImplementationForNulls() const override { return false; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + if (null_presence.has_null_constant) + { + // This is a null constant column + block.getByPosition(result).column = 
block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; + const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; + + const Param expr_param(&col_expr, ""); + const Param pat_param(&col_pat, ""); + auto arg_num = arguments.size(); + + std::unique_ptr match_type_param; + if constexpr (name == NameRegexpLike::name) + { + const ColumnPtr * col_match_type = nullptr; + // Try to get match type column only when it's a regexp_like function + if (arg_num > 2) + { + col_match_type = &(block.getByPosition(arguments[2]).column); + match_type_param = std::make_unique(col_match_type, ""); + } + else + { + match_type_param = std::make_unique(col_match_type, ""); + } + } + + if (pat_param.getDataNum() == 0) + { + // TODO return empty result + } + + // Check if all args are all const columns + if (expr_param.isConstCol() && pat_param.isConstCol()) + { + // TODO implement 2 param with macro + // TODO check empty pattern + if constexpr (name == NameRegexpLike::name) + { + if (arg_num > 2 && match_type_param.isConstCol()) + { + // TODO calculate return result 3 param + } + else if (arg_num == 2) + { + // TODO calculate return result 2 param + } + // Do nothing + } + else + { + // TODO calculate return result 2 param + } + } + + // TODO check memorization + + // if (col_expr_const && col_pat_const) + // { + // ResultType res{}; + // String expr = col_expr_const->getValue(); + // String pattern = col_pat_const->getValue(); + // if constexpr (name == NameTiDBRegexp::name) + // { + // // TODO calculate + // // judge the empty pattern + // block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_expr_const->size(), toField(res)); + // return + // } else + // { + // if (col_match_type == nullptr || col_match_type_const != nullptr) + // { + // // TODO calculate + // // judge the empty pattern + // block.getByPosition(result).column = 
block.getByPosition(result).type->createColumnConst(col_expr_const->size(), toField(res)); + // return + // } + // } + // } + + // Initialize result column + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(col_expr->size()); + + // Start to calculate + for (size_t i = 0; i < arg_num; ++i) + { + if constexpr (name == NameRegexpLike::name) + { + const StringRef & expr = expr_param.getString(i); + const StringRef & pat = pat_param.getString(i); + const StringRef & match_type = match_type_param->getString(i); + // TODO process + } + else + { + const StringRef & expr = expr_param.getString(i); + const StringRef & pat = pat_param.getString(i); + // TODO process + } + } + } +}; template class FunctionStringReplace : public IFunction @@ -2097,6 +2329,11 @@ struct NameTiDBRegexp static constexpr auto name = "regexp"; }; +struct NameRegexpLike +{ + static constexpr auto name = "regexp_like"; +}; + struct NameLike { static constexpr auto name = "like"; @@ -2137,7 +2374,8 @@ using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; using FunctionMatch = FunctionsStringSearch, NameMatch>; -using FunctionTiDBRegexp = FunctionsStringSearch, NameTiDBRegexp>; +using FunctionTiDBRegexp = FunctionStringRegexp; +// using FunctionTiDBRegexp = FunctionsStringSearch, NameTiDBRegexp>; using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionLike3Args = FunctionsStringSearch, NameLike3Args>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index 8a9d5a17469..51ab7cd2a6c 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -98,21 +98,12 @@ ColumnPtr wrapInNullable(const ColumnPtr & src, Block & block, const ColumnNumbe return ColumnNullable::create(src_not_nullable, result_null_map_column); } - -struct NullPresence -{ - bool has_nullable 
= false; - bool has_null_constant = false; -}; - -NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) +NullPresence getNullPresense(const ColumnsWithTypeAndName & args) { NullPresence res; - for (const auto & arg : args) + for (const auto & elem : args) { - const auto & elem = block.getByPosition(arg); - if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -122,12 +113,23 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) return res; } -NullPresence getNullPresense(const ColumnsWithTypeAndName & args) +bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) +{ + for (auto arg : args) + if (!block.getByPosition(arg).column->isColumnConst()) + return false; + return true; +} +} // namespace + +NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) { NullPresence res; - for (const auto & elem : args) + for (const auto & arg : args) { + const auto & elem = block.getByPosition(arg); + if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -137,15 +139,6 @@ NullPresence getNullPresense(const ColumnsWithTypeAndName & args) return res; } -bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) -{ - for (auto arg : args) - if (!block.getByPosition(arg).column->isColumnConst()) - return false; - return true; -} -} // namespace - bool IExecutableFunction::defaultImplementationForConstantArguments(Block & block, const ColumnNumbers & args, size_t result) const { ColumnNumbers arguments_to_remain_constants = getArgumentsThatAreAlwaysConstant(); diff --git a/dbms/src/Functions/IFunction.h b/dbms/src/Functions/IFunction.h index c1bcdc8b151..aca795ddf29 100644 --- a/dbms/src/Functions/IFunction.h +++ b/dbms/src/Functions/IFunction.h @@ -390,4 +390,12 @@ class DefaultFunctionBuilder : public IFunctionBuilder using FunctionPtr = std::shared_ptr; +struct NullPresence +{ + 
bool has_nullable = false; + bool has_null_constant = false; +}; + +NullPresence getNullPresense(const Block &, const ColumnNumbers &); + } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index d3eb93a0790..f4ffb916b74 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -1809,59 +1809,72 @@ TEST_F(Regexp, testRegexp) auto const_uint8_null_column = createConstColumn>(row_size, {}); auto const_string_null_column = createConstColumn>(row_size, {}); + std::cout << "here 1" << std::endl; /// case 1. regexp(const, const [, const]) for (size_t i = 0; i < row_size; i++) { /// test regexp(const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), executeFunction("regexp", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]))); - +std::cout << "here 1.1" << std::endl; /// test regexp(const, const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), executeFunction("regexp", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); - +std::cout << "here 1.2" << std::endl; /// test regexp(const, const, const) with binary collator ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), executeFunction("regexp", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); } /// case 2. regexp(const, const [, const]) with null value + std::cout << "here 2" << std::endl; for (size_t i = 0; i < row_size; i++) { /// test regexp(const, const) ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results[i]), executeFunction("regexp", input_string_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); + std::cout << "here 2.1" << std::endl; /// test regexp(const, const, const) ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), executeFunction("regexp", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); + std::cout << "here 2.2" << std::endl; /// test regexp(const, const, const) with binary collator ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), executeFunction("regexp", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); } /// case 3 regexp(vector, const[, const]) + std::cout << "here 3" << std::endl; { /// test regexp(vector, const) ASSERT_COLUMN_EQ(createColumn(vec_results), executeFunction("regexp", createColumn(input_strings), createConstColumn(row_size, patterns[0]))); - +std::cout << "here 3.1" << std::endl; /// test regexp(vector, const, const) ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type), executeFunction("regexp", createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); - +std::cout << "here 3.2" << std::endl; /// test regexp(vector, const, const) with binary collator ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type_collator), executeFunction("regexp", {createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } /// case 4 regexp(vector, const[, const]) nullable + std::cout << "here 4" << std::endl; { ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, input_string_nulls), executeFunction("regexp", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]))); + std::cout << "here 4.2" << std::endl; ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, input_string_nulls), executeFunction("regexp", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + std::cout << "here 4.2" << std::endl; ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type_collator, input_string_nulls), executeFunction("regexp", {createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } + + /// issue 5984 + ASSERT_THROW(executeFunction("regexp", createColumn(std::vector{"1"}), createConstColumn(row_size, 
"")), Exception); + ASSERT_THROW(executeFunction("regexp", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); + ASSERT_THROW(executeFunction("regexp", createColumn(std::vector{"1"}), createColumn(std::vector{""})), Exception); } TEST_F(Regexp, testRegexpCustomerCases) From 8b9c1499c68e039b184f4f9328487be5d3c01471 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 26 Sep 2022 18:14:26 +0800 Subject: [PATCH 02/87] save --- dbms/src/Functions/FunctionsStringSearch.cpp | 236 ++++++++++++++----- 1 file changed, 180 insertions(+), 56 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 7e17322791b..f917078007d 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1903,26 +1903,72 @@ class Param class FunctionStringRegexpBase { public: - bool memorizeRE2(const char * pattern) + static constexpr size_t REGEXP_PARAM_NUM = 2; + static constexpr size_t REGEXP_LIKE_PARAM_NUM = 3; + static constexpr size_t REGEXP_INSTR_PARAM_NUM = 6; + static constexpr size_t REGEXP_REPLACE_PARAM_NUM = 6; + static constexpr size_t REGEXP_SUBSTR_PARAM_NUM = 5; + + void memorize(const Param & pat_param, std::unique_ptr> match_type_param) + { + String pat(pat_param.getString(0)); + if (match_type_param != nullptr) + { + // TODO handle match_type_param + } + + int flags = 0; + flags |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE; + memorized_re = std::make_unique(pat, flags); + } + + // Check if we can memorize the regexp + template + static bool canMemorize(size_t arg_num, const Param & pat_param, const std::unique_ptr> & match_type_param) { - memorized_re = std::make_unique(pattern); - return memorized_re->ok(); + size_t total_param_num = 0; + if constexpr (Name::name == NameTiDBRegexp::name) + total_param_num = REGEXP_PARAM_NUM; + else if constexpr (Name::name == NameRegexpLike::name) 
+ total_param_num = REGEXP_LIKE_PARAM_NUM; + else + throw Exception("Unknown regular function."); + + if constexpr (Name::name == NameTiDBRegexp::name) + { + return pat_param.isConstCol(); + } else + { + const bool is_pat_const = pat_param.isConstCol(); + if ((arg_num < total_param_num && is_pat_const) + || (arg_num == total_param_num && is_pat_const && match_type_param->isConstCol())) + { + return true; + } + } + + return false; } bool isMemorized() const { return memorized_re != nullptr; } + + const std::unique_ptr & getRegexp() const { return memorized_re; } private: // We should pre compile the regular expression when: // - only pattern column is provided and it's a constant column - // - pattern and match type columns are provided and they are constant columns - std::unique_ptr memorized_re; + // - pattern and match type columns are provided and they are both constant columns + std::unique_ptr memorized_re; }; +#define SET_FLAGS(flags) flags |= RE_NO_CAPTURE | RE_NO_OPTIMIZE; + template class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { public: using ResuleType = UInt8; static constexpr auto name = Name::name; + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } @@ -1944,11 +1990,12 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - const Param expr_param(&col_expr, ""); - const Param pat_param(&col_pat, ""); + const Param expr_param(&col_expr, ""); + const Param pat_param(&col_pat, ""); auto arg_num = arguments.size(); - std::unique_ptr match_type_param; + // Only when this is a regexp_like function, match_type_param will be initialized + std::unique_ptr> match_type_param; if constexpr (name == NameRegexpLike::name) { const ColumnPtr * 
col_match_type = nullptr; @@ -1956,87 +2003,164 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if (arg_num > 2) { col_match_type = &(block.getByPosition(arguments[2]).column); - match_type_param = std::make_unique(col_match_type, ""); + match_type_param = std::make_unique>(col_match_type, ""); } else { - match_type_param = std::make_unique(col_match_type, ""); + match_type_param = std::make_unique>(col_match_type, ""); } } if (pat_param.getDataNum() == 0) { - // TODO return empty result + auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); + block.getByPosition(result).column = ColumnConst::create(null_col_res, 0); + return; } - // Check if all args are all const columns + // Check if args are all const columns if (expr_param.isConstCol() && pat_param.isConstCol()) { - // TODO implement 2 param with macro - // TODO check empty pattern +#define PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type) \ + do { + int flags = 0; \ + SET_FLAGS(flags) \ + if constexpr (has_match_type) \ + { \ + const StringRef & match_type = match_type_param->getString(0); \ + // TODO should put match_type into pattern + } \ + Regexps::Regexp regexp(pat, flags); \ + ResultType res{regexp.match(expr)}; \ + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(pat_param.getDataNum(), toField(res)); \ + } while(0) + + const StringReg & pat = pat_param.getString(0); + if (pat.size() == 0) + throw Exception("Empty pattern is invalid"); + + const StringRef & expr = expr_param.getString(0); if constexpr (name == NameRegexpLike::name) { if (arg_num > 2 && match_type_param.isConstCol()) { - // TODO calculate return result 3 param + const bool has_match_type = true; + PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + return; } else if (arg_num == 2) { - // TODO calculate return result 2 param + const bool has_match_type = false; + PROCESS(block, expr, 
pat, pat_param, match_type_param, has_match_type); + return; } - // Do nothing + // reach here when arg_num == 3 and match_type is not const } else { - // TODO calculate return result 2 param - } - } - - // TODO check memorization - - // if (col_expr_const && col_pat_const) - // { - // ResultType res{}; - // String expr = col_expr_const->getValue(); - // String pattern = col_pat_const->getValue(); - // if constexpr (name == NameTiDBRegexp::name) - // { - // // TODO calculate - // // judge the empty pattern - // block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_expr_const->size(), toField(res)); - // return - // } else - // { - // if (col_match_type == nullptr || col_match_type_const != nullptr) - // { - // // TODO calculate - // // judge the empty pattern - // block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_expr_const->size(), toField(res)); - // return - // } - // } - // } + const bool has_match_type = false; + PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + return; + } +#undef PROCESS + } + + // Check memorization + if (canMemorize(arg_num, pat_param, match_type_param)) + memorize(pat_param, match_type_param); + // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(col_expr->size()); + vec_res.resize(expr_param->getDataNum()); - // Start to calculate - for (size_t i = 0; i < arg_num; ++i) + // Start to match + if (isMemorized()) { - if constexpr (name == NameRegexpLike::name) + const auto & regexp = getRegexp(); + if (null_presence.has_nullable) { - const StringRef & expr = expr_param.getString(i); - const StringRef & pat = pat_param.getString(i); - const StringRef & match_type = match_type_param->getString(i); - // TODO process + // expr column must be a nullable column here, so we need to check null for each elems + auto nullmap_col = 
ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(expr_param->getDataNum()); + + for (size_t i = 0; i < arg_num; ++i) + { + if (expr_param.isNullAt(i)) + { + nullmap[i] = 1; + continue; + } + + nullmap[i] = 0; + vec_res[i] = regexp->match(expr_param.getString(i)); // match + } + // TODO set result + } + else + { + // expr column is impossible to be a nullable column here + for (size_t i = 0; i < arg_num; ++i) + vec_res[i] = regexp->match(expr_param.getString(i)); // match + // TODO set result + } + } + else + { + if (null_presence.has_nullable) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(expr_param->getDataNum()); + + for (size_t i = 0; i < arg_num; ++i) + { + if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) + { + nullmap[i] = 1; + continue; + } + + const StringRef & expr = expr_param.getString(i); + const StringRef & pat = pat_param.getString(i); + + if constexpr (name == NameTiDBRegexp::name) + { + int flags = 0; + SET_FLAGS(flags); + const auto & regexp = Regexps::get(pat, flags); + vec_res[i] = regexp->match(expr); // match + } + else + { + // TODO handle match_type first and do match action + } + + } + // TODO set result } else { - const StringRef & expr = expr_param.getString(i); - const StringRef & pat = pat_param.getString(i); - // TODO process + for (size_t i = 0; i < arg_num; ++i) + { + const StringRef & expr = expr_param.getString(i); + const StringRef & pat = pat_param.getString(i); + + if constexpr (name == NameTiDBRegexp::name) + { + int flags = 0; + SET_FLAGS(flags); + const auto & regexp = Regexps::get(pat, flags); + vec_res[i] = regexp->match(expr); // match + } + else + { + // TODO handle match_type first and do match action + } + } + // TODO set result } } } From c98b151ad401ca987d7859534ec172630acf72fc Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 27 Sep 2022 10:15:28 +0800 Subject: 
[PATCH 03/87] ready to compile --- dbms/src/Functions/FunctionsStringSearch.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index f917078007d..534e731350f 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1989,11 +1989,11 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - + const Param expr_param(&col_expr, ""); const Param pat_param(&col_pat, ""); auto arg_num = arguments.size(); - + // Only when this is a regexp_like function, match_type_param will be initialized std::unique_ptr> match_type_param; if constexpr (name == NameRegexpLike::name) @@ -2065,11 +2065,11 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction #undef PROCESS } - + // Check memorization if (canMemorize(arg_num, pat_param, match_type_param)) memorize(pat_param, match_type_param); - + // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); @@ -2097,14 +2097,16 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction nullmap[i] = 0; vec_res[i] = regexp->match(expr_param.getString(i)); // match } - // TODO set result + + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { // expr column is impossible to be a nullable column here for (size_t i = 0; i < arg_num; ++i) vec_res[i] = regexp->match(expr_param.getString(i)); // match - // TODO set result + + block.getByPosition(result).column = std::move(col_res); } } else @@ -2139,7 +2141,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } } - // TODO set result + + 
block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { @@ -2160,7 +2163,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // TODO handle match_type first and do match action } } - // TODO set result + + block.getByPosition(result).column = std::move(col_res); } } } From c126dc57a108677f3f1fd3d39e717461ac584ef2 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 27 Sep 2022 14:14:33 +0800 Subject: [PATCH 04/87] successfully compile --- dbms/src/Functions/FunctionsStringSearch.cpp | 252 +++++++++++-------- 1 file changed, 142 insertions(+), 110 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 534e731350f..6abc65d575a 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -31,6 +31,7 @@ #include #include +#include "Columns/IColumn.h" #if USE_RE2_ST #include @@ -1812,73 +1813,79 @@ struct ReplaceStringImpl // Columns may be const, nullable or plain vector, we can conveniently handle // these different type columns with Param. 
-template class Param { public: DISALLOW_COPY_AND_MOVE(Param); - Param(const ColumnPtr * ptr, T default_value) : data(default_value), is_const(false) + Param(const ColumnPtr ptr, const StringRef & default_value) + : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), + is_const(false), data_stringrf(default_value.data, default_value.size), data_int64(0) { // arg is not provided and we should use default_value - if (ptr == nullptr) return; + if (col_ptr == nullptr) return; - auto type_name = typeid(T).name(); - const ColumnConst * col_const = typeid_cast(&(*(*ptr))) - if (type_name == typeid(Int64).name()) + const auto * col_const = typeid_cast(&(*col_ptr)); + + // Handle const + if (col_const != nullptr) { - // Handle const - if (col_const != nullptr) - { - // This is a const column - data = col_const->getValue(); - is_const = true; - } - else - { - // This is a vector column - col_str = checkAndGetColumn(&(*(*ptr))); - } + // This is a const column + auto const_data = col_const->getValue(); + data_stringrf.data = const_data.c_str(); + data_stringrf.size = const_data.size(); + is_const = true; } - else if (type_name == typeid(StringRef).name()) - { - // Handle const - if (col_const != nullptr) - { - // This is a const column - auto const_data = col_const->getValue(); - data.data = const_data.c_str(); - data.size = const_data.size(); - is_const = true; - } - else { - // This is a vector column - col_str = checkAndGetColumn(&(*(*ptr))); - } + else { + // This is a vector column + col_str = checkAndGetColumn(&(*col_ptr)); } - else - throw Exception(fmt::format("Invalid type: {}", type_name)); // Handle nullable - if ((*ptr)->isColumnNullable()) + if (col_ptr->isColumnNullable()) + null_map = &(static_cast(*col_ptr).getNullMapData()); + } + + Param(const ColumnPtr ptr, Int64 default_value) + : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), + is_const(false), data_int64(default_value) + { + // arg is not provided and we should 
use default_value + if (col_ptr == nullptr) return; + + const auto * col_const = typeid_cast(&(*col_ptr)); + + // Handle const + if (col_const != nullptr) + { + // This is a const column + data_int64 = col_const->getValue(); + is_const = true; + } + else { - const ColumnPtr & null_map_column = static_cast(*(*ptr)).getNullMapColumnPtr(); - null_map = &(static_cast(null_map_column).getData()); + // This is a vector column + col_int64 = checkAndGetColumn(&(*col_ptr)); } + + // Handle nullable + if (col_ptr->isColumnNullable()) + null_map = &(static_cast(*col_ptr).getNullMapData()); } Int64 getInt64(size_t idx) const { // Use default value when arg is const or not provided. // For safety, nullptr should be checked - return !is_const && col_int64 != nullptr ? col_int64->getInt(idx) : data; + return !is_const && col_int64 != nullptr ? col_int64->getInt(idx) : data_int64; } - const StringRef & getString(size_t idx) const + // @param to: destination that this function should copy data_stringrf to + void getString(size_t idx, StringRef & to) const { // Use default value when arg is const or not provided. // For safety, nullptr should be checked - return !is_const && col_str != nullptr ? col_str->getDataAt(idx) : data; + !is_const && col_str != nullptr ? 
(to = col_str->getDataAt(idx)) : (to = data_stringrf); } bool isNullAt(size_t idx) const @@ -1890,16 +1897,34 @@ class Param bool isConstCol() const { return is_const; } bool isNullableCol() const { return null_map == nullptr; } - size_t getDataNum() const { return (*col_ptr)->size(); } + size_t getDataNum() const { return col_ptr->size(); } private: - const ColumnPtr * col_ptr; + const ColumnPtr col_ptr; const ColumnString * col_str; const ColumnInt64 * col_int64; - const NullMap * null_map; + ConstNullMapPtr null_map; bool is_const; // mark as the const column when it's true - T data; + StringRef data_stringrf; + Int64 data_int64; +}; + +struct NameTiDBRegexp +{ + static constexpr auto name = "regexp"; +}; + +struct NameRegexpLike +{ + static constexpr auto name = "regexp_like"; +}; + +struct NameLike +{ + static constexpr auto name = "like"; }; +#define SET_FLAGS(flags) ((flags) |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE) + class FunctionStringRegexpBase { public: @@ -1909,27 +1934,32 @@ class FunctionStringRegexpBase static constexpr size_t REGEXP_REPLACE_PARAM_NUM = 6; static constexpr size_t REGEXP_SUBSTR_PARAM_NUM = 5; - void memorize(const Param & pat_param, std::unique_ptr> match_type_param) + void memorize(const Param & pat_param, const std::unique_ptr & match_type_param) const { - String pat(pat_param.getString(0)); + StringRef pat; + pat_param.getString(0, pat); if (match_type_param != nullptr) { // TODO handle match_type_param } int flags = 0; - flags |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE; - memorized_re = std::make_unique(pat, flags); + SET_FLAGS(flags); + memorized_re = std::make_unique(String(pat.data, pat.size), flags); } // Check if we can memorize the regexp template - static bool canMemorize(size_t arg_num, const Param & pat_param, const std::unique_ptr> & match_type_param) + static bool canMemorize(size_t arg_num, const Param & 
pat_param, const std::unique_ptr & match_type_param) { size_t total_param_num = 0; - if constexpr (Name::name == NameTiDBRegexp::name) + constexpr std::string_view class_name_sv(Name::name); + constexpr std::string_view tidb_regexp_name_sv(NameTiDBRegexp::name); + constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); + + if constexpr (class_name_sv == tidb_regexp_name_sv) total_param_num = REGEXP_PARAM_NUM; - else if constexpr (Name::name == NameRegexpLike::name) + else if constexpr (class_name_sv == regexp_like_name_sv) total_param_num = REGEXP_LIKE_PARAM_NUM; else throw Exception("Unknown regular function."); @@ -1957,24 +1987,23 @@ class FunctionStringRegexpBase // We should pre compile the regular expression when: // - only pattern column is provided and it's a constant column // - pattern and match type columns are provided and they are both constant columns - std::unique_ptr memorized_re; + mutable std::unique_ptr memorized_re; }; -#define SET_FLAGS(flags) flags |= RE_NO_CAPTURE | RE_NO_OPTIMIZE; - template class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { public: - using ResuleType = UInt8; + using ResultType = UInt8; static constexpr auto name = Name::name; static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { std::make_shared>(); } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override { return std::make_shared>(); } bool useDefaultImplementationForNulls() const override { return false; } + size_t getNumberOfArguments() const override { return 0; } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { @@ -1990,31 +2019,34 @@ class 
FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - const Param expr_param(&col_expr, ""); - const Param pat_param(&col_pat, ""); + const Param expr_param(col_expr, ""); + const Param pat_param(col_pat, ""); auto arg_num = arguments.size(); // Only when this is a regexp_like function, match_type_param will be initialized - std::unique_ptr> match_type_param; - if constexpr (name == NameRegexpLike::name) + std::unique_ptr match_type_param; + + constexpr std::string_view class_name(name); + constexpr std::string_view regexp_like_name(NameRegexpLike::name); + if constexpr (class_name == regexp_like_name) { - const ColumnPtr * col_match_type = nullptr; + ColumnPtr col_match_type; // Try to get match type column only when it's a regexp_like function if (arg_num > 2) { - col_match_type = &(block.getByPosition(arguments[2]).column); - match_type_param = std::make_unique>(col_match_type, ""); + col_match_type = block.getByPosition(arguments[2]).column; + match_type_param = std::make_unique(*col_match_type, ""); } else { - match_type_param = std::make_unique>(col_match_type, ""); + match_type_param = std::make_unique(*col_match_type, ""); } } if (pat_param.getDataNum() == 0) { auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); - block.getByPosition(result).column = ColumnConst::create(null_col_res, 0); + block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); return; } @@ -2022,35 +2054,36 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if (expr_param.isConstCol() && pat_param.isConstCol()) { #define PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type) \ - do { + do { \ int flags = 0; \ - SET_FLAGS(flags) \ + SET_FLAGS(flags); \ if constexpr (has_match_type) \ { \ - const StringRef & match_type 
= match_type_param->getString(0); \ - // TODO should put match_type into pattern + /* TODO put match_type into pattern */ \ } \ - Regexps::Regexp regexp(pat, flags); \ - ResultType res{regexp.match(expr)}; \ - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(pat_param.getDataNum(), toField(res)); \ + Regexps::Regexp regexp(String((pat).data, (pat).size), flags); \ + ResultType res{regexp.match((expr).data, (expr).size)}; \ + (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ } while(0) - const StringReg & pat = pat_param.getString(0); - if (pat.size() == 0) + StringRef pat; + pat_param.getString(0, pat); + if (pat.size == 0) throw Exception("Empty pattern is invalid"); - const StringRef & expr = expr_param.getString(0); - if constexpr (name == NameRegexpLike::name) + StringRef expr; + expr_param.getString(0, expr); + if constexpr (class_name == regexp_like_name) { - if (arg_num > 2 && match_type_param.isConstCol()) + if (arg_num > 2 && match_type_param->isConstCol()) { - const bool has_match_type = true; + constexpr bool has_match_type = true; PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); return; } else if (arg_num == 2) { - const bool has_match_type = false; + constexpr bool has_match_type = false; PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); return; } @@ -2058,7 +2091,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } else { - const bool has_match_type = false; + constexpr bool has_match_type = false; PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); return; } @@ -2073,7 +2106,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(expr_param->getDataNum()); 
+ vec_res.resize(expr_param.getDataNum()); // Start to match if (isMemorized()) @@ -2084,8 +2117,9 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // expr column must be a nullable column here, so we need to check null for each elems auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param->getDataNum()); + nullmap.resize(expr_param.getDataNum()); + StringRef expr_ref; for (size_t i = 0; i < arg_num; ++i) { if (expr_param.isNullAt(i)) @@ -2095,7 +2129,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } nullmap[i] = 0; - vec_res[i] = regexp->match(expr_param.getString(i)); // match + expr_param.getString(i, expr_ref); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match } block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); @@ -2103,19 +2138,28 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction else { // expr column is impossible to be a nullable column here + StringRef expr_ref; for (size_t i = 0; i < arg_num; ++i) - vec_res[i] = regexp->match(expr_param.getString(i)); // match + { + expr_param.getString(i, expr_ref); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + } block.getByPosition(result).column = std::move(col_res); } } else { + // container used for receiving data + StringRef expr; + StringRef pat; + if (null_presence.has_nullable) { auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param->getDataNum()); + nullmap.resize(expr_param.getDataNum()); + for (size_t i = 0; i < arg_num; ++i) { @@ -2125,15 +2169,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction continue; } - const StringRef & expr = expr_param.getString(i); - const StringRef & pat = pat_param.getString(i); + 
expr_param.getString(i, expr); + pat_param.getString(i, pat); - if constexpr (name == NameTiDBRegexp::name) + if constexpr (class_name == regexp_like_name) { int flags = 0; SET_FLAGS(flags); - const auto & regexp = Regexps::get(pat, flags); - vec_res[i] = regexp->match(expr); // match + const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); + vec_res[i] = regexp->match(expr.data, expr.size); // match } else { @@ -2148,15 +2192,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { for (size_t i = 0; i < arg_num; ++i) { - const StringRef & expr = expr_param.getString(i); - const StringRef & pat = pat_param.getString(i); + expr_param.getString(i, expr); + pat_param.getString(i, pat); - if constexpr (name == NameTiDBRegexp::name) + if constexpr (class_name == regexp_like_name) { int flags = 0; SET_FLAGS(flags); - const auto & regexp = Regexps::get(pat, flags); - vec_res[i] = regexp->match(expr); // match + const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); + vec_res[i] = regexp->match(expr.data, expr.size); // match } else { @@ -2168,6 +2212,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } } } +private: + TiDB::TiDBCollatorPtr collator = nullptr; }; template @@ -2452,20 +2498,6 @@ struct NameMatch static constexpr auto name = "match"; }; -struct NameTiDBRegexp -{ - static constexpr auto name = "regexp"; -}; - -struct NameRegexpLike -{ - static constexpr auto name = "regexp_like"; -}; - -struct NameLike -{ - static constexpr auto name = "like"; -}; struct NameLike3Args { static constexpr auto name = "like3Args"; From 51ff17345c132657cc4da1381edc7d63ca22c0f8 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 28 Sep 2022 10:33:32 +0800 Subject: [PATCH 05/87] clean up --- dbms/src/Functions/FunctionsRegexp.cpp | 1067 +++++++++++ dbms/src/Functions/FunctionsRegexp.h | 727 +++++++ dbms/src/Functions/FunctionsStringSearch.cpp | 1786 +----------------- 
dbms/src/Functions/FunctionsStringSearch.h | 20 +- dbms/src/Functions/re2Util.cpp | 81 + dbms/src/Functions/re2Util.h | 37 + dbms/src/Functions/registerFunctions.cpp | 2 + 7 files changed, 1950 insertions(+), 1770 deletions(-) create mode 100644 dbms/src/Functions/FunctionsRegexp.cpp create mode 100644 dbms/src/Functions/FunctionsRegexp.h create mode 100644 dbms/src/Functions/re2Util.cpp create mode 100644 dbms/src/Functions/re2Util.h diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp new file mode 100644 index 00000000000..cb515f529de --- /dev/null +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -0,0 +1,1067 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ + +namespace +{ +const char flag_i = 'i'; +const char flag_c = 'c'; +const char flag_m = 'm'; +const char flag_s = 's'; + +std::set valid_flags{flag_i, flag_c, flag_m, flag_s}; +} + +// If characters specifying contradictory options are specified +// within match_type, the rightmost one takes precedence. 
+String getMatchType(const String & match_type) +{ + // TODO handle collation + std::set applied_flags; + + for (auto flag : match_type) + { + auto iter = valid_flags.find(flag); + if (iter == valid_flags.end()) + throw Exception(fmt::format("Invalid match type '{}' in regexp function", flag)); + + // re2 is case-sensitive by default, so we only need to delete 'i' flag + // to enable the case-sensitive for the regexp + if (flag == flag_c) + { + auto iter_i = applied_flags.find('i'); + if (iter_i != applied_flags.end()) + applied_flags.erase(iter_i); + + continue; + } + + applied_flags.insert(flag); + } + + // generate match type flag + String flags; + for (auto flag : applied_flags) + flags += flag; + + return flags; +} + +/** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. + * 'replacement' could contain substitutions, for example: '\2-\3-\1' + */ +template +struct ReplaceRegexpImpl +{ + static constexpr bool support_non_const_needle = false; + static constexpr bool support_non_const_replacement = false; + /// need customized escape char when do the string search + static const bool need_customized_escape_char = false; + /// support match type when do the string search, used in regexp + static const bool support_match_type = true; + + /// Sequence of instructions, describing how to get resulting string. 
+ /// Each element is either: + /// - substitution (in that case first element of pair is their number and second element is empty) + /// - string that need to be inserted (in that case, first element of pair is that string and second element is -1) + using Instructions = std::vector>; + + static const size_t max_captures = 10; + + static Instructions createInstructions(const std::string & s, int num_captures) + { + Instructions instructions; + + String now; + for (size_t i = 0; i < s.size(); ++i) + { + if (s[i] == '\\' && i + 1 < s.size()) + { + if (isNumericASCII(s[i + 1])) /// Substitution + { + if (!now.empty()) + { + instructions.emplace_back(-1, now); + now = ""; + } + instructions.emplace_back(s[i + 1] - '0', String()); + } + else + now += s[i + 1]; /// Escaping + ++i; + } + else + now += s[i]; /// Plain character + } + + if (!now.empty()) + { + instructions.emplace_back(-1, now); + now = ""; + } + + for (const auto & it : instructions) + if (it.first >= num_captures) + throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only " + + toString(num_captures - 1) + + " subpatterns", + ErrorCodes::BAD_ARGUMENTS); + + return instructions; + } + + + static void processString(const re2_st::StringPiece & input, + ColumnString::Chars_t & res_data, + ColumnString::Offset & res_offset, + const Int64 & pos, + const Int64 & occ, + re2_st::RE2 & searcher, + int num_captures, + const Instructions & instructions) + { + re2_st::StringPiece matches[max_captures]; + + size_t start_pos = pos <= 0 ? 
0 : pos - 1; + Int64 match_occ = 0; + size_t prefix_length = std::min(start_pos, static_cast(input.length())); + if (prefix_length > 0) + { + /// Copy prefix + res_data.resize(res_data.size() + prefix_length); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data(), prefix_length); + res_offset += prefix_length; + } + while (start_pos < static_cast(input.length())) + { + /// If no more replacements possible for current string + bool can_finish_current_string = false; + + if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) + { + match_occ++; + /// if occ > 0, it will replace all the match expr, otherwise it only replace the occ-th match + if (occ == 0 || match_occ == occ) + { + const auto & match = matches[0]; + size_t bytes_to_copy = (match.data() - input.data()) - start_pos; + + /// Copy prefix before matched regexp without modification + res_data.resize(res_data.size() + bytes_to_copy); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); + res_offset += bytes_to_copy; + start_pos += bytes_to_copy + match.length(); + + /// Do substitution instructions + for (const auto & it : instructions) + { + if (it.first >= 0) + { + res_data.resize(res_data.size() + matches[it.first].length()); + memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length()); + res_offset += matches[it.first].length(); + } + else + { + res_data.resize(res_data.size() + it.second.size()); + memcpy(&res_data[res_offset], it.second.data(), it.second.size()); + res_offset += it.second.size(); + } + } + + /// when occ > 0, just replace the occ-th match even if replace_one is false + if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop. 
+ can_finish_current_string = true; + } + else + { + const auto & match = matches[0]; + size_t bytes_to_copy = (match.data() - input.data()) - start_pos + match.length(); + + /// Copy the matched string without modification + res_data.resize(res_data.size() + bytes_to_copy); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); + res_offset += bytes_to_copy; + start_pos += bytes_to_copy; + if (match.length() == 0) + can_finish_current_string = true; + } + } + else + can_finish_current_string = true; + + /// If ready, append suffix after match to end of string. + if (can_finish_current_string) + { + res_data.resize(res_data.size() + input.length() - start_pos); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos); + res_offset += input.length() - start_pos; + start_pos = input.length(); + } + } + + res_data.resize(res_data.size() + 1); + res_data[res_offset] = 0; + ++res_offset; + } + + + static void vector(const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const std::string & replacement, + const Int64 & pos, + const Int64 & occ, + const std::string & match_type, + TiDB::TiDBCollatorPtr collator, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size()); + size_t size = offsets.size(); + res_offsets.resize(size); + + if (needle.empty()) + { + /// Copy all the data without changing. 
+ res_data.resize(data.size()); + const UInt8 * begin = &data[0]; + memcpy(&res_data[0], begin, data.size()); + memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); + return; + } + + String updated_needle = needle; + if (!match_type.empty() || collator != nullptr) + { + String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator); + if (!mode_modifiers.empty()) + updated_needle = mode_modifiers + updated_needle; + } + re2_st::RE2 searcher(updated_needle); + int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast(max_captures)); + + Instructions instructions = createInstructions(replacement, num_captures); + + /// Cannot perform search for whole block. Will process each string separately. + for (size_t i = 0; i < size; ++i) + { + int from = i > 0 ? offsets[i - 1] : 0; + re2_st::StringPiece input(reinterpret_cast(&data[0] + from), offsets[i] - from - 1); + + processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions); + res_offsets[i] = res_offset; + } + } + + static void vectorFixed(const ColumnString::Chars_t & data, + size_t n, + const std::string & needle, + const std::string & replacement, + const Int64 & pos, + const Int64 & occ, + const std::string & match_type, + TiDB::TiDBCollatorPtr collator, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + ColumnString::Offset res_offset = 0; + size_t size = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(size); + + if (needle.empty()) + { + /// TODO: copy all the data without changing + throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + } + + String updated_needle = needle; + if (!match_type.empty() || collator != nullptr) + { + String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator); + if (!mode_modifiers.empty()) + updated_needle = mode_modifiers + updated_needle; + } + re2_st::RE2 
searcher(updated_needle); + int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast(max_captures)); + + Instructions instructions = createInstructions(replacement, num_captures); + + for (size_t i = 0; i < size; ++i) + { + int from = i * n; + re2_st::StringPiece input(reinterpret_cast(&data[0] + from), n); + + processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions); + res_offsets[i] = res_offset; + } + } + static void constant(const String & input, const String & needle, const String & replacement, const Int64 & pos, const Int64 & occ, const String & match_type, TiDB::TiDBCollatorPtr collator, String & output) + { + ColumnString::Chars_t input_data; + input_data.insert(input_data.end(), input.begin(), input.end()); + ColumnString::Offsets input_offsets; + input_offsets.push_back(input_data.size() + 1); + ColumnString::Chars_t output_data; + ColumnString::Offsets output_offsets; + vector(input_data, input_offsets, needle, replacement, pos, occ, match_type, collator, output_data, output_offsets); + output = String(reinterpret_cast(&output_data[0]), output_offsets[0] - 1); + } +}; + +/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. 
+ */ +template +struct ReplaceStringImpl +{ + static constexpr bool support_non_const_needle = true; + static constexpr bool support_non_const_replacement = true; + /// need customized escape char during the string search + static const bool need_customized_escape_char = false; + /// support match type during the string search, used in regexp + static const bool support_match_type = false; + + static void vector(const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size()); + size_t size = offsets.size(); + res_offsets.resize(size); + + if (needle.empty()) + { + /// Copy all the data without changing. + res_data.resize(data.size()); + memcpy(&res_data[0], begin, data.size()); + memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); + return; + } + + /// The current index in the array of strings. + size_t i = 0; + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + + /// Determine which index it belongs to. 
+ while (i < offsets.size() && begin + offsets[i] <= match) + { + res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); + ++i; + } + res_offset += (match - pos); + + /// If you have reached the end, it's time to stop + if (i == offsets.size()) + break; + + /// Is it true that this line no longer needs to perform transformations. + bool can_finish_current_string = false; + + /// We check that the entry does not go through the boundaries of strings. + if (match + needle.size() < begin + offsets[i]) + { + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle.size(); + if (replace_one) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + res_data.resize(res_data.size() + (begin + offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); + res_offset += (begin + offsets[i] - pos); + res_offsets[i] = res_offset; + pos = begin + offsets[i]; + ++i; + } + } + } + + static void vectorNonConstNeedle( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(offsets.size()); + + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + auto data_offset = StringUtil::offsetAt(offsets, i); + auto data_size = StringUtil::sizeAt(offsets, i); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the 
trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + if (needle_size == 0) + { + /// Copy the whole data to res without changing + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + /// It's time to stop. + break; + } + + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle_size; + + if (replace_one) + { + /// Copy the rest of data and stop. 
+ res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + } + res_offsets[i] = res_offset; + } + } + + static void vectorNonConstReplacement( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size()); + size_t size = offsets.size(); + res_offsets.resize(size); + + if (needle.empty()) + { + /// Copy all the data without changing. + res_data.resize(data.size()); + memcpy(&res_data[0], begin, data.size()); + memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); + return; + } + + /// The current index in the array of strings. + size_t i = 0; + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + + /// Determine which index it belongs to. + while (i < offsets.size() && begin + offsets[i] <= match) + { + res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); + ++i; + } + res_offset += (match - pos); + + /// If you have reached the end, it's time to stop + if (i == offsets.size()) + break; + + /// Is it true that this line no longer needs to perform transformations. 
+ bool can_finish_current_string = false; + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + /// We check that the entry does not go through the boundaries of strings. + if (match + needle.size() < begin + offsets[i]) + { + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle.size(); + if (replace_one) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + res_data.resize(res_data.size() + (begin + offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); + res_offset += (begin + offsets[i] - pos); + res_offsets[i] = res_offset; + pos = begin + offsets[i]; + ++i; + } + } + } + + static void vectorNonConstNeedleReplacement( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(offsets.size()); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + auto data_offset = StringUtil::offsetAt(offsets, i); + auto data_size = StringUtil::sizeAt(offsets, i); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + auto replacement_offset = 
StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + if (needle_size == 0) + { + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + /// It's time to stop. + break; + } + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle_size; + + if (replace_one) + { + /// Copy the rest of data and stop. + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + } + res_offsets[i] = res_offset; + } + } + + /// Note: this function converts fixed-length strings to variable-length strings + /// and each variable-length string should ends with zero byte. 
+ static void vectorFixed(const ColumnString::Chars_t & data, + size_t n, + const std::string & needle, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + + /// The current index in the string array. + size_t i = 0; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = begin + n * (i + 1) - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = begin + n * (i + 1); \ + ++i; \ + } while (false) + + if (needle.empty()) + { + /// Copy all the data without changing. + while (i < count) + { + COPY_REST_OF_CURRENT_STRING(); + } + return; + } + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy skipped strings without any changes but + /// add zero byte to the end of each string. + while (i < count && begin + n * (i + 1) <= match) + { + COPY_REST_OF_CURRENT_STRING(); + } + + /// If you have reached the end, it's time to stop + if (i == count) + break; + + /// Copy unchanged part of current string. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += (match - pos); + + /// Is it true that this line no longer needs to perform conversions. 
+ bool can_finish_current_string = false; + + /// We check that the entry does not pass through the boundaries of strings. + if (match + needle.size() <= begin + n * (i + 1)) + { + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle.size(); + if (replace_one || pos == begin + n * (i + 1)) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + COPY_REST_OF_CURRENT_STRING(); + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstNeedle( + const ColumnString::Chars_t & data, + size_t n, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < count; ++i) + { + const UInt8 * begin = &data[i * n]; + const UInt8 * pos = begin; + const UInt8 * end = pos + n; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = end - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = end; \ + } while (false) + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + if (needle_size == 0) + { + COPY_REST_OF_CURRENT_STRING(); + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); + while 
(pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + if (match == end) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle_size; + + if (replace_one) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstReplacement( + const ColumnString::Chars_t & data, + size_t n, + const std::string & needle, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + + /// The current index in the string array. + size_t i = 0; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = begin + n * (i + 1) - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = begin + n * (i + 1); \ + ++i; \ + } while (false) + + if (needle.empty()) + { + /// Copy all the data without changing. 
+ while (i < count) + { + COPY_REST_OF_CURRENT_STRING(); + } + return; + } + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy skipped strings without any changes but + /// add zero byte to the end of each string. + while (i < count && begin + n * (i + 1) <= match) + { + COPY_REST_OF_CURRENT_STRING(); + } + + /// If you have reached the end, it's time to stop + if (i == count) + break; + + /// Copy unchanged part of current string. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += (match - pos); + + /// Is it true that this line no longer needs to perform conversions. + bool can_finish_current_string = false; + + /// We check that the entry does not pass through the boundaries of strings. + if (match + needle.size() <= begin + n * (i + 1)) + { + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle.size(); + if (replace_one || pos == begin + n * (i + 1)) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + COPY_REST_OF_CURRENT_STRING(); + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstNeedleReplacement( + const ColumnString::Chars_t & data, + size_t n, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const 
Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < count; ++i) + { + const UInt8 * begin = &data[i * n]; + const UInt8 * pos = begin; + const UInt8 * end = pos + n; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = end - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = end; \ + } while (false) + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + if (needle_size == 0) + { + COPY_REST_OF_CURRENT_STRING(); + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + if (match == end) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle_size; + + if (replace_one) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void constant(const std::string & data, const std::string & needle, const 
std::string & replacement, const Int64 & /* pos */, const Int64 & /* occ */, const std::string & /* match_type */, TiDB::TiDBCollatorPtr /* collator */, std::string & res_data) + { + if (needle.empty()) + { + res_data = data; + return; + } + res_data = ""; + int replace_cnt = 0; + for (size_t i = 0; i < data.size(); ++i) + { + bool match = true; + if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0)) + match = false; + for (size_t j = 0; match && j < needle.size(); ++j) + if (data[i + j] != needle[j]) + match = false; + if (match) + { + ++replace_cnt; + res_data += replacement; + i = i + needle.size() - 1; + } + else + res_data += data[i]; + } + } +}; + +using FunctionTiDBRegexp = FunctionStringRegexp; +using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; +using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; +using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; +using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; + +void registerFunctionsRegexp(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); +} + +} // namespace DB diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h new file mode 100644 index 00000000000..950b48b7583 --- /dev/null +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -0,0 +1,727 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if USE_RE2_ST +#include +#else +#define re2_st re2 +#endif + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_COLUMN; +} // namespace ErrorCodes + +struct NameTiDBRegexp +{ + static constexpr auto name = "regexp"; +}; +struct NameRegexpLike +{ + static constexpr auto name = "regexp_like"; +}; +struct NameReplaceOne +{ + static constexpr auto name = "replaceOne"; +}; +struct NameReplaceAll +{ + static constexpr auto name = "replaceAll"; +}; +struct NameReplaceRegexpOne +{ + static constexpr auto name = "replaceRegexpOne"; +}; +struct NameReplaceRegexpAll +{ + static constexpr auto name = "replaceRegexpAll"; +}; + +#define SET_FLAGS(flags) ((flags) |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE) + +String getMatchType(const String & match_type); + +// Columns may be const, nullable or plain vector, we can conveniently handle +// these different type columns with Param. 
+class Param
+{
+public:
+    DISALLOW_COPY_AND_MOVE(Param);
+
+    // Wraps a string argument of a regexp function.
+    // `ptr` may be nullptr, which means the optional argument was not supplied; in that
+    // case getString() always hands out `default_value`.
+    Param(const ColumnPtr ptr, const StringRef & default_value)
+        : col_ptr(ptr)
+        , col_str(nullptr)
+        , col_int64(nullptr)
+        , null_map(nullptr)
+        , is_const(false)
+        , data_stringrf(default_value.data, default_value.size)
+        , data_int64(0)
+    {
+        // arg is not provided and we should use default_value
+        if (col_ptr == nullptr)
+            return;
+
+        const auto * col_const = typeid_cast<const ColumnConst *>(&(*col_ptr));
+
+        if (col_const != nullptr)
+        {
+            // Constant column: a single value is shared by every row.
+            data_stringrf = col_const->getDataAt(0);
+            is_const = true;
+        }
+        else
+        {
+            // Vector column: values are fetched per row in getString().
+            // NOTE(review): the cast is applied to the outer column; if nullable string
+            // columns arrive wrapped in ColumnNullable, col_str presumably stays nullptr
+            // and getString() falls back to the default value - confirm.
+            col_str = checkAndGetColumn<ColumnString>(&(*col_ptr));
+        }
+
+        // Remember the null map so that isNullAt() can answer per row.
+        if (col_ptr->isColumnNullable())
+            null_map = &(static_cast<const ColumnNullable &>(*col_ptr).getNullMapData());
+    }
+
+    // Wraps an integer argument of a regexp function.
+    // `ptr` may be nullptr, which means the optional argument was not supplied; in that
+    // case getInt64() always hands out `default_value`.
+    Param(const ColumnPtr ptr, Int64 default_value)
+        : col_ptr(ptr)
+        , col_str(nullptr)
+        , col_int64(nullptr)
+        , null_map(nullptr)
+        , is_const(false)
+        , data_int64(default_value)
+    {
+        // arg is not provided and we should use default_value
+        if (col_ptr == nullptr)
+            return;
+
+        const auto * col_const = typeid_cast<const ColumnConst *>(&(*col_ptr));
+
+        if (col_const != nullptr)
+        {
+            // Constant column: a single value is shared by every row.
+            data_int64 = col_const->getValue<Int64>();
+            is_const = true;
+        }
+        else
+        {
+            // Vector column: values are fetched per row in getInt64().
+            col_int64 = checkAndGetColumn<ColumnInt64>(&(*col_ptr));
+        }
+
+        // Remember the null map so that isNullAt() can answer per row.
+        if (col_ptr->isColumnNullable())
+            null_map = &(static_cast<const ColumnNullable &>(*col_ptr).getNullMapData());
+    }
+
+    // Returns the value of row `idx`, or the stored constant/default value when the
+    // argument is a constant column, was not provided, or is not an Int64 vector.
+    Int64 getInt64(size_t idx) const
+    {
+        if (!is_const && col_int64 != nullptr)
+            return col_int64->getInt(idx);
+        return data_int64;
+    }
+
+    // Writes the value of row `idx` into `to`, or the stored constant/default value when
+    // the argument is a constant column, was not provided, or is not a string vector.
+    // @param to: destination that this function should copy data_stringrf to
+    void getString(size_t idx, StringRef & to) const
+    {
+        if (!is_const && col_str != nullptr)
+            to = col_str->getDataAt(idx);
+        else
+            to = data_stringrf;
+    }
+
+    // Rows of a column without a null map are never null.
+    bool isNullAt(size_t idx) const
+    {
+        if (null_map == nullptr)
+            return false;
+
+        return (*null_map)[idx];
+    }
+
+    bool isConstCol() const { return is_const; }
+    // A column is nullable exactly when a null map was captured in the constructor.
+    // (The previous `== nullptr` test reported the opposite.)
+    bool isNullableCol() const { return null_map != nullptr; }
+    // Number of rows in the wrapped column; 0 when the argument was not provided.
+    size_t getDataNum() const { return col_ptr == nullptr ? 0 : col_ptr->size(); }
+private:
+    const ColumnPtr col_ptr;
+    const ColumnString * col_str;
+    const ColumnInt64 * col_int64;
+    ConstNullMapPtr null_map;
+    bool is_const; // mark as the const column when it's true
+    StringRef data_stringrf;
+    Int64 data_int64;
+};
+
+class FunctionStringRegexpBase
+{
+public:
+    // Max parameter number the regexp_xxx function could receive
+    static constexpr size_t REGEXP_PARAM_NUM = 2;
+    static constexpr size_t REGEXP_LIKE_PARAM_NUM = 3;
+    static constexpr size_t REGEXP_INSTR_PARAM_NUM = 6;
+    static constexpr size_t REGEXP_REPLACE_PARAM_NUM = 6;
+    static constexpr size_t REGEXP_SUBSTR_PARAM_NUM = 5;
+
+    // Compiles the (constant) pattern once and keeps it for all rows.
+    void memorize(const Param & pat_param, const std::unique_ptr<Param> & match_type_param) const
+    {
+        StringRef pat;
+        pat_param.getString(0, pat);
+        if (match_type_param != nullptr)
+        {
+            // TODO handle match_type_param
+        }
+
+        int flags = 0;
+        SET_FLAGS(flags);
+        memorized_re = std::make_unique<Regexps::Regexp>(String(pat.data, pat.size), flags);
+    }
+
+    // Check if we can memorize the regexp
+    template <typename Name>
+    static bool canMemorize(size_t arg_num, const Param & pat_param, const std::unique_ptr<Param> & match_type_param)
+    {
+        size_t total_param_num = 0;
+        constexpr std::string_view class_name_sv(Name::name);
+        constexpr std::string_view tidb_regexp_name_sv(NameTiDBRegexp::name);
+        constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name);
+
+        if constexpr (class_name_sv == tidb_regexp_name_sv)
+            total_param_num = REGEXP_PARAM_NUM;
+        else if constexpr (class_name_sv == regexp_like_name_sv)
+            total_param_num = REGEXP_LIKE_PARAM_NUM;
+        else
+            throw Exception("Unknown regular function.");
+
+        // Compare the names by content, as above, instead of comparing the raw
+        // `const char *` pointers of two string literals.
+        if constexpr (class_name_sv == tidb_regexp_name_sv)
+        {
+
return pat_param.isConstCol(); + } else + { + const bool is_pat_const = pat_param.isConstCol(); + if ((arg_num < total_param_num && is_pat_const) + || (arg_num == total_param_num && is_pat_const && match_type_param->isConstCol())) + { + return true; + } + } + + return false; + } + + bool isMemorized() const { return memorized_re != nullptr; } + + const std::unique_ptr & getRegexp() const { return memorized_re; } +private: + // We should pre compile the regular expression when: + // - only pattern column is provided and it's a constant column + // - pattern and match type columns are provided and they are both constant columns + mutable std::unique_ptr memorized_re; +}; + +template +class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction +{ +public: + using ResultType = UInt8; + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override { return std::make_shared>(); } + bool useDefaultImplementationForNulls() const override { return false; } + size_t getNumberOfArguments() const override { return 0; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + if (null_presence.has_null_constant) + { + // This is a null constant column + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; + const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; + + if (col_expr->empty()) + { + auto 
null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); + block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); + return; + } + + const Param expr_param(col_expr, StringRef("")); + const Param pat_param(col_pat, StringRef("")); + auto arg_num = arguments.size(); + + // Only when this is a regexp_like function, match_type_param will be initialized + std::unique_ptr match_type_param; + + constexpr std::string_view class_name(name); + constexpr std::string_view regexp_like_name(NameRegexpLike::name); + if constexpr (class_name == regexp_like_name) + { + ColumnPtr col_match_type; + // Try to get match type column only when it's a regexp_like function + if (arg_num > 2) + { + col_match_type = block.getByPosition(arguments[2]).column; + match_type_param = std::make_unique(*col_match_type, ""); + } + else + { + match_type_param = std::make_unique(*col_match_type, ""); + } + } + + // Check if args are all const columns + if (expr_param.isConstCol() && pat_param.isConstCol()) + { +#define PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type) \ + do { \ + int flags = 0; \ + SET_FLAGS(flags); \ + if constexpr (has_match_type) \ + { \ + /* TODO put match_type into pattern */ \ + } \ + Regexps::Regexp regexp(String((pat).data, (pat).size), flags); \ + ResultType res{regexp.match((expr).data, (expr).size)}; \ + (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ + } while(0) + + StringRef pat; + pat_param.getString(0, pat); + if (pat.size == 0) + throw Exception("Empty pattern is invalid"); + + StringRef expr; + expr_param.getString(0, expr); + if constexpr (class_name == regexp_like_name) + { + if (arg_num > 2 && match_type_param->isConstCol()) + { + constexpr bool has_match_type = true; + PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + return; + } + else if (arg_num == 2) + { + constexpr 
bool has_match_type = false; + PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + return; + } + // reach here when arg_num == 3 and match_type is not const + } + else + { + constexpr bool has_match_type = false; + PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + return; + } + +#undef PROCESS + } + + // Check memorization + if (canMemorize(arg_num, pat_param, match_type_param)) + memorize(pat_param, match_type_param); + + // Initialize result column + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(expr_param.getDataNum()); + + // Start to match + if (isMemorized()) + { + const auto & regexp = getRegexp(); + if (null_presence.has_nullable) + { + // expr column must be a nullable column here, so we need to check null for each elems + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(expr_param.getDataNum()); + + StringRef expr_ref; + for (size_t i = 0; i < arg_num; ++i) + { + if (expr_param.isNullAt(i)) + { + nullmap[i] = 1; + continue; + } + + nullmap[i] = 0; + expr_param.getString(i, expr_ref); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + } + + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + // expr column is impossible to be a nullable column here + StringRef expr_ref; + for (size_t i = 0; i < arg_num; ++i) + { + expr_param.getString(i, expr_ref); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + } + + block.getByPosition(result).column = std::move(col_res); + } + } + else + { + // container used for receiving data + StringRef expr; + StringRef pat; + + if (null_presence.has_nullable) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(expr_param.getDataNum()); + + 
+ for (size_t i = 0; i < arg_num; ++i) + { + if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) + { + nullmap[i] = 1; + continue; + } + + expr_param.getString(i, expr); + pat_param.getString(i, pat); + + if constexpr (class_name == regexp_like_name) + { + int flags = 0; + SET_FLAGS(flags); + const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); + vec_res[i] = regexp->match(expr.data, expr.size); // match + } + else + { + // TODO handle match_type first and do match action + } + + } + + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + for (size_t i = 0; i < arg_num; ++i) + { + expr_param.getString(i, expr); + pat_param.getString(i, pat); + + if constexpr (class_name == regexp_like_name) + { + int flags = 0; + SET_FLAGS(flags); + const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); + vec_res[i] = regexp->match(expr.data, expr.size); // match + } + else + { + // TODO handle match_type first and do match action + } + } + + block.getByPosition(result).column = std::move(col_res); + } + } + } +private: + TiDB::TiDBCollatorPtr collator = nullptr; +}; + +template +class FunctionStringReplace : public IFunction +{ +public: + static constexpr auto name = Name::name; + static FunctionPtr create(const Context &) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override + { + return 0; + } + + bool isVariadic() const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + return {3, 4, 5}; + } + else if constexpr (Impl::support_non_const_needle) + { + return {2, 3, 4, 5}; + } + else if constexpr (Impl::support_non_const_replacement) + { + return {1, 3, 4, 5}; + } + else + { + 
return {1, 2, 3, 4, 5};
+        }
+    }
+    void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; }
+
+    // Validates the argument types of replace(src, needle, replacement[, pos[, occ[, match_type]]])
+    // and returns the result type.
+    // Arguments 1-3 and 6 must be String/FixedString, arguments 4 and 5 must be integers.
+    // Throws Exception on a type mismatch or when fewer than three arguments are given.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        // The function is variadic, so the arity has to be checked before indexing.
+        if (arguments.size() < 3)
+            throw Exception("Function " + getName() + " requires at least 3 arguments", ErrorCodes::BAD_ARGUMENTS);
+
+        if (!arguments[0]->isStringOrFixedString())
+            throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        if (!arguments[1]->isStringOrFixedString())
+            throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        if (!arguments[2]->isStringOrFixedString())
+            throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        // Each message reports the type of the argument that actually failed the check.
+        if (arguments.size() > 3 && !arguments[3]->isInteger())
+            throw Exception("Illegal type " + arguments[3]->getName() + " of fourth argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        if (arguments.size() > 4 && !arguments[4]->isInteger())
+            throw Exception("Illegal type " + arguments[4]->getName() + " of fifth argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        if (arguments.size() > 5 && !arguments[5]->isStringOrFixedString())
+            throw Exception("Illegal type " + arguments[5]->getName() + " of sixth argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeString>();
+    }
+
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
+    {
+        const ColumnPtr & column_src = block.getByPosition(arguments[0]).column;
+        const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column;
+        const ColumnPtr & column_replacement = block.getByPosition(arguments[2]).column;
+        // Optional arguments stay nullptr when they are not supplied.
+        const ColumnPtr column_pos = arguments.size() > 3 ? block.getByPosition(arguments[3]).column : nullptr;
+        const ColumnPtr column_occ = arguments.size() > 4 ?
block.getByPosition(arguments[4]).column : nullptr; + const ColumnPtr column_match_type = arguments.size() > 5 ? block.getByPosition(arguments[5]).column : nullptr; + + if ((column_pos != nullptr && !column_pos->isColumnConst()) + || (column_occ != nullptr && !column_occ->isColumnConst()) + || (column_match_type != nullptr && !column_match_type->isColumnConst())) + throw Exception("4th, 5th, 6th arguments of function " + getName() + " must be constants."); + Int64 pos = column_pos == nullptr ? 1 : typeid_cast(column_pos.get())->getInt(0); + Int64 occ = column_occ == nullptr ? 0 : typeid_cast(column_occ.get())->getInt(0); + String match_type = column_match_type == nullptr ? "" : typeid_cast(column_match_type.get())->getValue(); + + ColumnWithTypeAndName & column_result = block.getByPosition(result); + + bool needle_const = column_needle->isColumnConst(); + bool replacement_const = column_replacement->isColumnConst(); + + if (needle_const && replacement_const) + { + executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else if (needle_const) + { + executeImplNonConstReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else if (replacement_const) + { + executeImplNonConstNeedle(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else + { + executeImplNonConstNeedleReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + } + +private: + void executeImpl( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos, + Int64 occ, + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + const auto * c1_const = typeid_cast(column_needle.get()); + const auto * c2_const = typeid_cast(column_replacement.get()); + auto needle = c1_const->getValue(); + auto replacement = c2_const->getValue(); + + if (const auto * 
col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + + void executeImplNonConstNeedle( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement_const = typeid_cast(column_replacement.get()); + auto replacement = col_replacement_const->getValue(); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = 
std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 2 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplNonConstReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_replacement) + { + const auto * col_needle_const = typeid_cast(column_needle.get()); + auto needle = col_needle_const->getValue(); + const auto * col_replacement = typeid_cast(column_replacement.get()); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplNonConstNeedleReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & 
column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + TiDB::TiDBCollatorPtr collator{}; +}; + +#undef SET_FLAGS +} // namespace DB + diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 6abc65d575a..e7525e832e0 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -28,10 +28,13 @@ #include #include #include +#include #include #include #include "Columns/IColumn.h" +#include "Common/Exception.h" +#include 
"common/defines.h" #if USE_RE2_ST #include @@ -335,66 +338,6 @@ struct PositionImpl } }; -static re2_st::RE2::Options getDefaultRe2Options() -{ - re2_st::RE2::Options options(re2_st::RE2::CannedOptions::DefaultOptions); - options.set_case_sensitive(true); - options.set_one_line(true); - options.set_dot_nl(false); - return options; -} - -static String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator) -{ - /// for regexp only ci/cs is supported - re2_st::RE2::Options options = getDefaultRe2Options(); - if (collator != nullptr && collator->isCI()) - options.set_case_sensitive(false); - - /// match_type can overwrite collator - if (!match_type.empty()) - { - for (const auto & c : match_type) - { - switch (c) - { - case 'i': - /// according to MySQL doc: if either argument is a binary string, the arguments are handled in - /// case-sensitive fashion as binary strings, even if match_type contains the i character. - /// However, test in MySQL 8.0.25 shows that i flag still take affect even if the collation is binary, - if (collator == nullptr || !collator->isBinary()) - options.set_case_sensitive(false); - break; - case 'c': - options.set_case_sensitive(true); - break; - case 's': - options.set_dot_nl(true); - break; - case 'm': - options.set_one_line(false); - break; - default: - throw Exception("Incorrect arguments to regexp related functions."); - } - } - } - if (!options.one_line() || options.dot_nl() || !options.case_sensitive()) - { - String mode_modifiers("(?"); - if (!options.one_line()) - mode_modifiers += "m"; - if (!options.case_sensitive()) - mode_modifiers += "i"; - if (options.dot_nl()) - mode_modifiers += "s"; - mode_modifiers += ")"; - return mode_modifiers; - } - else - return ""; -} - /// Is the LIKE expression reduced to finding a substring in a string? 
inline bool likePatternIsStrstr(const String & pattern, String & res) { @@ -572,7 +515,7 @@ struct MatchImpl /// match_type can overwrite collator if (!match_type.empty() || collator != nullptr) { - String mode_modifiers = getRE2ModeModifiers(match_type, collator); + String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator); if (!mode_modifiers.empty()) pattern = mode_modifiers + pattern; } @@ -707,7 +650,7 @@ struct MatchImpl /// match_type can overwrite collator if (!match_type.empty() || collator != nullptr) { - String mode_modifiers = getRE2ModeModifiers(match_type, collator); + String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator); if (!mode_modifiers.empty()) pattern = mode_modifiers + pattern; } @@ -823,1680 +766,30 @@ struct ExtractImpl } }; - -/** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. - * 'replacement' could contain substitutions, for example: '\2-\3-\1' - */ -template -struct ReplaceRegexpImpl +struct NameLike { - static constexpr bool support_non_const_needle = false; - static constexpr bool support_non_const_replacement = false; - /// need customized escape char when do the string search - static const bool need_customized_escape_char = false; - /// support match type when do the string search, used in regexp - static const bool support_match_type = true; - - /// Sequence of instructions, describing how to get resulting string. 
- /// Each element is either: - /// - substitution (in that case first element of pair is their number and second element is empty) - /// - string that need to be inserted (in that case, first element of pair is that string and second element is -1) - using Instructions = std::vector>; - - static const size_t max_captures = 10; - - static Instructions createInstructions(const std::string & s, int num_captures) - { - Instructions instructions; - - String now; - for (size_t i = 0; i < s.size(); ++i) - { - if (s[i] == '\\' && i + 1 < s.size()) - { - if (isNumericASCII(s[i + 1])) /// Substitution - { - if (!now.empty()) - { - instructions.emplace_back(-1, now); - now = ""; - } - instructions.emplace_back(s[i + 1] - '0', String()); - } - else - now += s[i + 1]; /// Escaping - ++i; - } - else - now += s[i]; /// Plain character - } - - if (!now.empty()) - { - instructions.emplace_back(-1, now); - now = ""; - } - - for (const auto & it : instructions) - if (it.first >= num_captures) - throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only " - + toString(num_captures - 1) - + " subpatterns", - ErrorCodes::BAD_ARGUMENTS); - - return instructions; - } - - - static void processString(const re2_st::StringPiece & input, - ColumnString::Chars_t & res_data, - ColumnString::Offset & res_offset, - const Int64 & pos, - const Int64 & occ, - re2_st::RE2 & searcher, - int num_captures, - const Instructions & instructions) - { - re2_st::StringPiece matches[max_captures]; - - size_t start_pos = pos <= 0 ? 
0 : pos - 1; - Int64 match_occ = 0; - size_t prefix_length = std::min(start_pos, static_cast(input.length())); - if (prefix_length > 0) - { - /// Copy prefix - res_data.resize(res_data.size() + prefix_length); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data(), prefix_length); - res_offset += prefix_length; - } - while (start_pos < static_cast(input.length())) - { - /// If no more replacements possible for current string - bool can_finish_current_string = false; - - if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) - { - match_occ++; - /// if occ > 0, it will replace all the match expr, otherwise it only replace the occ-th match - if (occ == 0 || match_occ == occ) - { - const auto & match = matches[0]; - size_t bytes_to_copy = (match.data() - input.data()) - start_pos; - - /// Copy prefix before matched regexp without modification - res_data.resize(res_data.size() + bytes_to_copy); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); - res_offset += bytes_to_copy; - start_pos += bytes_to_copy + match.length(); - - /// Do substitution instructions - for (const auto & it : instructions) - { - if (it.first >= 0) - { - res_data.resize(res_data.size() + matches[it.first].length()); - memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length()); - res_offset += matches[it.first].length(); - } - else - { - res_data.resize(res_data.size() + it.second.size()); - memcpy(&res_data[res_offset], it.second.data(), it.second.size()); - res_offset += it.second.size(); - } - } - - /// when occ > 0, just replace the occ-th match even if replace_one is false - if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop. 
- can_finish_current_string = true; - } - else - { - const auto & match = matches[0]; - size_t bytes_to_copy = (match.data() - input.data()) - start_pos + match.length(); - - /// Copy the matched string without modification - res_data.resize(res_data.size() + bytes_to_copy); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); - res_offset += bytes_to_copy; - start_pos += bytes_to_copy; - if (match.length() == 0) - can_finish_current_string = true; - } - } - else - can_finish_current_string = true; - - /// If ready, append suffix after match to end of string. - if (can_finish_current_string) - { - res_data.resize(res_data.size() + input.length() - start_pos); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos); - res_offset += input.length() - start_pos; - start_pos = input.length(); - } - } - - res_data.resize(res_data.size() + 1); - res_data[res_offset] = 0; - ++res_offset; - } - - - static void vector(const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const std::string & replacement, - const Int64 & pos, - const Int64 & occ, - const std::string & match_type, - TiDB::TiDBCollatorPtr collator, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); - - if (needle.empty()) - { - /// Copy all the data without changing. 
- res_data.resize(data.size()); - const UInt8 * begin = &data[0]; - memcpy(&res_data[0], begin, data.size()); - memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); - return; - } - - String updated_needle = needle; - if (!match_type.empty() || collator != nullptr) - { - String mode_modifiers = getRE2ModeModifiers(match_type, collator); - if (!mode_modifiers.empty()) - updated_needle = mode_modifiers + updated_needle; - } - re2_st::RE2 searcher(updated_needle); - int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast(max_captures)); - - Instructions instructions = createInstructions(replacement, num_captures); - - /// Cannot perform search for whole block. Will process each string separately. - for (size_t i = 0; i < size; ++i) - { - int from = i > 0 ? offsets[i - 1] : 0; - re2_st::StringPiece input(reinterpret_cast(&data[0] + from), offsets[i] - from - 1); - - processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions); - res_offsets[i] = res_offset; - } - } - - static void vectorFixed(const ColumnString::Chars_t & data, - size_t n, - const std::string & needle, - const std::string & replacement, - const Int64 & pos, - const Int64 & occ, - const std::string & match_type, - TiDB::TiDBCollatorPtr collator, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - ColumnString::Offset res_offset = 0; - size_t size = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(size); - - if (needle.empty()) - { - /// TODO: copy all the data without changing - throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - } - - String updated_needle = needle; - if (!match_type.empty() || collator != nullptr) - { - String mode_modifiers = getRE2ModeModifiers(match_type, collator); - if (!mode_modifiers.empty()) - updated_needle = mode_modifiers + updated_needle; - } - re2_st::RE2 searcher(updated_needle); - 
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast(max_captures)); - - Instructions instructions = createInstructions(replacement, num_captures); - - for (size_t i = 0; i < size; ++i) - { - int from = i * n; - re2_st::StringPiece input(reinterpret_cast(&data[0] + from), n); - - processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions); - res_offsets[i] = res_offset; - } - } - static void constant(const String & input, const String & needle, const String & replacement, const Int64 & pos, const Int64 & occ, const String & match_type, TiDB::TiDBCollatorPtr collator, String & output) - { - ColumnString::Chars_t input_data; - input_data.insert(input_data.end(), input.begin(), input.end()); - ColumnString::Offsets input_offsets; - input_offsets.push_back(input_data.size() + 1); - ColumnString::Chars_t output_data; - ColumnString::Offsets output_offsets; - vector(input_data, input_offsets, needle, replacement, pos, occ, match_type, collator, output_data, output_offsets); - output = String(reinterpret_cast(&output_data[0]), output_offsets[0] - 1); - } + static constexpr auto name = "like"; }; - - -/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. 
- */ -template -struct ReplaceStringImpl +struct NamePosition { - static constexpr bool support_non_const_needle = true; - static constexpr bool support_non_const_replacement = true; - /// need customized escape char during the string search - static const bool need_customized_escape_char = false; - /// support match type during the string search, used in regexp - static const bool support_match_type = false; - - static void vector(const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); - - if (needle.empty()) - { - /// Copy all the data without changing. - res_data.resize(data.size()); - memcpy(&res_data[0], begin, data.size()); - memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); - return; - } - - /// The current index in the array of strings. - size_t i = 0; - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - - /// Determine which index it belongs to. 
- while (i < offsets.size() && begin + offsets[i] <= match) - { - res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); - ++i; - } - res_offset += (match - pos); - - /// If you have reached the end, it's time to stop - if (i == offsets.size()) - break; - - /// Is it true that this line no longer needs to perform transformations. - bool can_finish_current_string = false; - - /// We check that the entry does not go through the boundaries of strings. - if (match + needle.size() < begin + offsets[i]) - { - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle.size(); - if (replace_one) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - res_data.resize(res_data.size() + (begin + offsets[i] - pos)); - memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); - res_offset += (begin + offsets[i] - pos); - res_offsets[i] = res_offset; - pos = begin + offsets[i]; - ++i; - } - } - } - - static void vectorNonConstNeedle( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(offsets.size()); - - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < offsets.size(); ++i) - { - auto data_offset = StringUtil::offsetAt(offsets, i); - auto data_size = StringUtil::sizeAt(offsets, i); - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the 
trailing zero - - const UInt8 * begin = &data[data_offset]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data_size; - - if (needle_size == 0) - { - /// Copy the whole data to res without changing - res_data.resize(res_data.size() + data_size); - memcpy(&res_data[res_offset], begin, data_size); - res_offset += data_size; - res_offsets[i] = res_offset; - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - if (match == end) - { - /// It's time to stop. - break; - } - - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle_size; - - if (replace_one) - { - /// Copy the rest of data and stop. 
- res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, (end - pos)); - res_offset += (end - pos); - break; - } - } - res_offsets[i] = res_offset; - } - } - - static void vectorNonConstReplacement( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); - - if (needle.empty()) - { - /// Copy all the data without changing. - res_data.resize(data.size()); - memcpy(&res_data[0], begin, data.size()); - memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); - return; - } - - /// The current index in the array of strings. - size_t i = 0; - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - - /// Determine which index it belongs to. - while (i < offsets.size() && begin + offsets[i] <= match) - { - res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); - ++i; - } - res_offset += (match - pos); - - /// If you have reached the end, it's time to stop - if (i == offsets.size()) - break; - - /// Is it true that this line no longer needs to perform transformations. 
- bool can_finish_current_string = false; - - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - /// We check that the entry does not go through the boundaries of strings. - if (match + needle.size() < begin + offsets[i]) - { - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle.size(); - if (replace_one) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - res_data.resize(res_data.size() + (begin + offsets[i] - pos)); - memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); - res_offset += (begin + offsets[i] - pos); - res_offsets[i] = res_offset; - pos = begin + offsets[i]; - ++i; - } - } - } - - static void vectorNonConstNeedleReplacement( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(offsets.size()); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < offsets.size(); ++i) - { - auto data_offset = StringUtil::offsetAt(offsets, i); - auto data_size = StringUtil::sizeAt(offsets, i); - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - - auto replacement_offset = 
StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - const UInt8 * begin = &data[data_offset]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data_size; - - if (needle_size == 0) - { - res_data.resize(res_data.size() + data_size); - memcpy(&res_data[res_offset], begin, data_size); - res_offset += data_size; - res_offsets[i] = res_offset; - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - if (match == end) - { - /// It's time to stop. - break; - } - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle_size; - - if (replace_one) - { - /// Copy the rest of data and stop. - res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, (end - pos)); - res_offset += (end - pos); - break; - } - } - res_offsets[i] = res_offset; - } - } - - /// Note: this function converts fixed-length strings to variable-length strings - /// and each variable-length string should ends with zero byte. 
- static void vectorFixed(const ColumnString::Chars_t & data, - size_t n, - const std::string & needle, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - - /// The current index in the string array. - size_t i = 0; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = begin + n * (i + 1) - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = begin + n * (i + 1); \ - ++i; \ - } while (false) - - if (needle.empty()) - { - /// Copy all the data without changing. - while (i < count) - { - COPY_REST_OF_CURRENT_STRING(); - } - return; - } - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy skipped strings without any changes but - /// add zero byte to the end of each string. - while (i < count && begin + n * (i + 1) <= match) - { - COPY_REST_OF_CURRENT_STRING(); - } - - /// If you have reached the end, it's time to stop - if (i == count) - break; - - /// Copy unchanged part of current string. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += (match - pos); - - /// Is it true that this line no longer needs to perform conversions. 
- bool can_finish_current_string = false; - - /// We check that the entry does not pass through the boundaries of strings. - if (match + needle.size() <= begin + n * (i + 1)) - { - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle.size(); - if (replace_one || pos == begin + n * (i + 1)) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - COPY_REST_OF_CURRENT_STRING(); - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstNeedle( - const ColumnString::Chars_t & data, - size_t n, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < count; ++i) - { - const UInt8 * begin = &data[i * n]; - const UInt8 * pos = begin; - const UInt8 * end = pos + n; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = end - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = end; \ - } while (false) - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - if (needle_size == 0) - { - COPY_REST_OF_CURRENT_STRING(); - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); - while 
(pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - if (match == end) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle_size; - - if (replace_one) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstReplacement( - const ColumnString::Chars_t & data, - size_t n, - const std::string & needle, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - - /// The current index in the string array. - size_t i = 0; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = begin + n * (i + 1) - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = begin + n * (i + 1); \ - ++i; \ - } while (false) - - if (needle.empty()) - { - /// Copy all the data without changing. 
- while (i < count) - { - COPY_REST_OF_CURRENT_STRING(); - } - return; - } - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy skipped strings without any changes but - /// add zero byte to the end of each string. - while (i < count && begin + n * (i + 1) <= match) - { - COPY_REST_OF_CURRENT_STRING(); - } - - /// If you have reached the end, it's time to stop - if (i == count) - break; - - /// Copy unchanged part of current string. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += (match - pos); - - /// Is it true that this line no longer needs to perform conversions. - bool can_finish_current_string = false; - - /// We check that the entry does not pass through the boundaries of strings. - if (match + needle.size() <= begin + n * (i + 1)) - { - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle.size(); - if (replace_one || pos == begin + n * (i + 1)) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - COPY_REST_OF_CURRENT_STRING(); - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstNeedleReplacement( - const ColumnString::Chars_t & data, - size_t n, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const 
Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < count; ++i) - { - const UInt8 * begin = &data[i * n]; - const UInt8 * pos = begin; - const UInt8 * end = pos + n; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = end - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = end; \ - } while (false) - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - if (needle_size == 0) - { - COPY_REST_OF_CURRENT_STRING(); - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - if (match == end) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle_size; - - if (replace_one) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void constant(const std::string & data, const std::string & needle, const 
std::string & replacement, const Int64 & /* pos */, const Int64 & /* occ */, const std::string & /* match_type */, TiDB::TiDBCollatorPtr /* collator */, std::string & res_data) - { - if (needle.empty()) - { - res_data = data; - return; - } - res_data = ""; - int replace_cnt = 0; - for (size_t i = 0; i < data.size(); ++i) - { - bool match = true; - if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0)) - match = false; - for (size_t j = 0; match && j < needle.size(); ++j) - if (data[i + j] != needle[j]) - match = false; - if (match) - { - ++replace_cnt; - res_data += replacement; - i = i + needle.size() - 1; - } - else - res_data += data[i]; - } - } -}; - -// Columns may be const, nullable or plain vector, we can conveniently handle -// these different type columns with Param. -class Param -{ -public: - DISALLOW_COPY_AND_MOVE(Param); - - Param(const ColumnPtr ptr, const StringRef & default_value) - : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), - is_const(false), data_stringrf(default_value.data, default_value.size), data_int64(0) - { - // arg is not provided and we should use default_value - if (col_ptr == nullptr) return; - - const auto * col_const = typeid_cast(&(*col_ptr)); - - // Handle const - if (col_const != nullptr) - { - // This is a const column - auto const_data = col_const->getValue(); - data_stringrf.data = const_data.c_str(); - data_stringrf.size = const_data.size(); - is_const = true; - } - else { - // This is a vector column - col_str = checkAndGetColumn(&(*col_ptr)); - } - - // Handle nullable - if (col_ptr->isColumnNullable()) - null_map = &(static_cast(*col_ptr).getNullMapData()); - } - - Param(const ColumnPtr ptr, Int64 default_value) - : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), - is_const(false), data_int64(default_value) - { - // arg is not provided and we should use default_value - if (col_ptr == nullptr) return; - - const auto * col_const = typeid_cast(&(*col_ptr)); - 
- // Handle const - if (col_const != nullptr) - { - // This is a const column - data_int64 = col_const->getValue(); - is_const = true; - } - else - { - // This is a vector column - col_int64 = checkAndGetColumn(&(*col_ptr)); - } - - // Handle nullable - if (col_ptr->isColumnNullable()) - null_map = &(static_cast(*col_ptr).getNullMapData()); - } - - Int64 getInt64(size_t idx) const - { - // Use default value when arg is const or not provided. - // For safety, nullptr should be checked - return !is_const && col_int64 != nullptr ? col_int64->getInt(idx) : data_int64; - } - - // @param to: destination that this function should copy data_stringrf to - void getString(size_t idx, StringRef & to) const - { - // Use default value when arg is const or not provided. - // For safety, nullptr should be checked - !is_const && col_str != nullptr ? (to = col_str->getDataAt(idx)) : (to = data_stringrf); - } - - bool isNullAt(size_t idx) const - { - if (null_map == nullptr) return false; - - return (*null_map)[idx]; - } - - bool isConstCol() const { return is_const; } - bool isNullableCol() const { return null_map == nullptr; } - size_t getDataNum() const { return col_ptr->size(); } -private: - const ColumnPtr col_ptr; - const ColumnString * col_str; - const ColumnInt64 * col_int64; - ConstNullMapPtr null_map; - bool is_const; // mark as the const column when it's true - StringRef data_stringrf; - Int64 data_int64; -}; - -struct NameTiDBRegexp -{ - static constexpr auto name = "regexp"; -}; - -struct NameRegexpLike -{ - static constexpr auto name = "regexp_like"; -}; - -struct NameLike -{ - static constexpr auto name = "like"; -}; - -#define SET_FLAGS(flags) ((flags) |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE) - -class FunctionStringRegexpBase -{ -public: - static constexpr size_t REGEXP_PARAM_NUM = 2; - static constexpr size_t REGEXP_LIKE_PARAM_NUM = 3; - static constexpr size_t REGEXP_INSTR_PARAM_NUM = 6; - static constexpr 
size_t REGEXP_REPLACE_PARAM_NUM = 6; - static constexpr size_t REGEXP_SUBSTR_PARAM_NUM = 5; - - void memorize(const Param & pat_param, const std::unique_ptr & match_type_param) const - { - StringRef pat; - pat_param.getString(0, pat); - if (match_type_param != nullptr) - { - // TODO handle match_type_param - } - - int flags = 0; - SET_FLAGS(flags); - memorized_re = std::make_unique(String(pat.data, pat.size), flags); - } - - // Check if we can memorize the regexp - template - static bool canMemorize(size_t arg_num, const Param & pat_param, const std::unique_ptr & match_type_param) - { - size_t total_param_num = 0; - constexpr std::string_view class_name_sv(Name::name); - constexpr std::string_view tidb_regexp_name_sv(NameTiDBRegexp::name); - constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); - - if constexpr (class_name_sv == tidb_regexp_name_sv) - total_param_num = REGEXP_PARAM_NUM; - else if constexpr (class_name_sv == regexp_like_name_sv) - total_param_num = REGEXP_LIKE_PARAM_NUM; - else - throw Exception("Unknown regular function."); - - if constexpr (Name::name == NameTiDBRegexp::name) - { - return pat_param.isConstCol(); - } else - { - const bool is_pat_const = pat_param.isConstCol(); - if ((arg_num < total_param_num && is_pat_const) - || (arg_num == total_param_num && is_pat_const && match_type_param->isConstCol())) - { - return true; - } - } - - return false; - } - - bool isMemorized() const { return memorized_re != nullptr; } - - const std::unique_ptr & getRegexp() const { return memorized_re; } -private: - // We should pre compile the regular expression when: - // - only pattern column is provided and it's a constant column - // - pattern and match type columns are provided and they are both constant columns - mutable std::unique_ptr memorized_re; -}; - -template -class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction -{ -public: - using ResultType = UInt8; - static constexpr auto name = Name::name; - - static 
FunctionPtr create(const Context &) { return std::make_shared(); } - String getName() const override { return name; } - bool isVariadic() const override { return true; } - void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override { return std::make_shared>(); } - bool useDefaultImplementationForNulls() const override { return false; } - size_t getNumberOfArguments() const override { return 0; } - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override - { - // Do something related with nullable columns - NullPresence null_presence = getNullPresense(block, arguments); - if (null_presence.has_null_constant) - { - // This is a null constant column - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); - return; - } - - const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; - const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - - const Param expr_param(col_expr, ""); - const Param pat_param(col_pat, ""); - auto arg_num = arguments.size(); - - // Only when this is a regexp_like function, match_type_param will be initialized - std::unique_ptr match_type_param; - - constexpr std::string_view class_name(name); - constexpr std::string_view regexp_like_name(NameRegexpLike::name); - if constexpr (class_name == regexp_like_name) - { - ColumnPtr col_match_type; - // Try to get match type column only when it's a regexp_like function - if (arg_num > 2) - { - col_match_type = block.getByPosition(arguments[2]).column; - match_type_param = std::make_unique(*col_match_type, ""); - } - else - { - match_type_param = std::make_unique(*col_match_type, ""); - } - } - - if (pat_param.getDataNum() == 0) - { - auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); - 
block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); - return; - } - - // Check if args are all const columns - if (expr_param.isConstCol() && pat_param.isConstCol()) - { -#define PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type) \ - do { \ - int flags = 0; \ - SET_FLAGS(flags); \ - if constexpr (has_match_type) \ - { \ - /* TODO put match_type into pattern */ \ - } \ - Regexps::Regexp regexp(String((pat).data, (pat).size), flags); \ - ResultType res{regexp.match((expr).data, (expr).size)}; \ - (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ - } while(0) - - StringRef pat; - pat_param.getString(0, pat); - if (pat.size == 0) - throw Exception("Empty pattern is invalid"); - - StringRef expr; - expr_param.getString(0, expr); - if constexpr (class_name == regexp_like_name) - { - if (arg_num > 2 && match_type_param->isConstCol()) - { - constexpr bool has_match_type = true; - PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); - return; - } - else if (arg_num == 2) - { - constexpr bool has_match_type = false; - PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); - return; - } - // reach here when arg_num == 3 and match_type is not const - } - else - { - constexpr bool has_match_type = false; - PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); - return; - } - -#undef PROCESS - } - - // Check memorization - if (canMemorize(arg_num, pat_param, match_type_param)) - memorize(pat_param, match_type_param); - - // Initialize result column - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(expr_param.getDataNum()); - - // Start to match - if (isMemorized()) - { - const auto & regexp = getRegexp(); - if (null_presence.has_nullable) - { - // expr column must be a nullable column here, so we need to check null 
for each elems - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param.getDataNum()); - - StringRef expr_ref; - for (size_t i = 0; i < arg_num; ++i) - { - if (expr_param.isNullAt(i)) - { - nullmap[i] = 1; - continue; - } - - nullmap[i] = 0; - expr_param.getString(i, expr_ref); - vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match - } - - block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); - } - else - { - // expr column is impossible to be a nullable column here - StringRef expr_ref; - for (size_t i = 0; i < arg_num; ++i) - { - expr_param.getString(i, expr_ref); - vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match - } - - block.getByPosition(result).column = std::move(col_res); - } - } - else - { - // container used for receiving data - StringRef expr; - StringRef pat; - - if (null_presence.has_nullable) - { - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param.getDataNum()); - - - for (size_t i = 0; i < arg_num; ++i) - { - if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) - { - nullmap[i] = 1; - continue; - } - - expr_param.getString(i, expr); - pat_param.getString(i, pat); - - if constexpr (class_name == regexp_like_name) - { - int flags = 0; - SET_FLAGS(flags); - const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); - vec_res[i] = regexp->match(expr.data, expr.size); // match - } - else - { - // TODO handle match_type first and do match action - } - - } - - block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); - } - else - { - for (size_t i = 0; i < arg_num; ++i) - { - expr_param.getString(i, expr); - pat_param.getString(i, pat); - - if constexpr (class_name == regexp_like_name) - { - int flags = 0; - SET_FLAGS(flags); - const auto & 
regexp = Regexps::get(String(pat.data, pat.size), flags); - vec_res[i] = regexp->match(expr.data, expr.size); // match - } - else - { - // TODO handle match_type first and do match action - } - } - - block.getByPosition(result).column = std::move(col_res); - } - } - } -private: - TiDB::TiDBCollatorPtr collator = nullptr; -}; - -template -class FunctionStringReplace : public IFunction -{ -public: - static constexpr auto name = Name::name; - static FunctionPtr create(const Context &) - { - return std::make_shared(); - } - - String getName() const override - { - return name; - } - - size_t getNumberOfArguments() const override - { - return 0; - } - - bool isVariadic() const override { return true; } - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - return {3, 4, 5}; - } - else if constexpr (Impl::support_non_const_needle) - { - return {2, 3, 4, 5}; - } - else if constexpr (Impl::support_non_const_replacement) - { - return {1, 3, 4, 5}; - } - else - { - return {1, 2, 3, 4, 5}; - } - } - void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (!arguments[0]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (!arguments[1]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (!arguments[2]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 3 && 
!arguments[3]->isInteger()) - throw Exception("Illegal type " + arguments[2]->getName() + " of forth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 4 && !arguments[4]->isInteger()) - throw Exception("Illegal type " + arguments[2]->getName() + " of fifth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 5 && !arguments[5]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[2]->getName() + " of sixth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - return std::make_shared(); - } - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override - { - const ColumnPtr & column_src = block.getByPosition(arguments[0]).column; - const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; - const ColumnPtr & column_replacement = block.getByPosition(arguments[2]).column; - const ColumnPtr column_pos = arguments.size() > 3 ? block.getByPosition(arguments[3]).column : nullptr; - const ColumnPtr column_occ = arguments.size() > 4 ? block.getByPosition(arguments[4]).column : nullptr; - const ColumnPtr column_match_type = arguments.size() > 5 ? block.getByPosition(arguments[5]).column : nullptr; - - if ((column_pos != nullptr && !column_pos->isColumnConst()) - || (column_occ != nullptr && !column_occ->isColumnConst()) - || (column_match_type != nullptr && !column_match_type->isColumnConst())) - throw Exception("4th, 5th, 6th arguments of function " + getName() + " must be constants."); - Int64 pos = column_pos == nullptr ? 1 : typeid_cast(column_pos.get())->getInt(0); - Int64 occ = column_occ == nullptr ? 0 : typeid_cast(column_occ.get())->getInt(0); - String match_type = column_match_type == nullptr ? 
"" : typeid_cast(column_match_type.get())->getValue(); - - ColumnWithTypeAndName & column_result = block.getByPosition(result); - - bool needle_const = column_needle->isColumnConst(); - bool replacement_const = column_replacement->isColumnConst(); - - if (needle_const && replacement_const) - { - executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else if (needle_const) - { - executeImplNonConstReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else if (replacement_const) - { - executeImplNonConstNeedle(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else - { - executeImplNonConstNeedleReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - } - -private: - void executeImpl( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos, - Int64 occ, - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - const auto * c1_const = typeid_cast(column_needle.get()); - const auto * c2_const = typeid_cast(column_replacement.get()); - auto needle = c1_const->getValue(); - auto replacement = c2_const->getValue(); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first 
argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - - void executeImplNonConstNeedle( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement_const = typeid_cast(column_replacement.get()); - auto replacement = col_replacement_const->getValue(); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 2 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplNonConstReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_replacement) - { - const auto * col_needle_const = 
typeid_cast(column_needle.get()); - auto needle = col_needle_const->getValue(); - const auto * col_replacement = typeid_cast(column_replacement.get()); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplNonConstNeedleReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, 
occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - TiDB::TiDBCollatorPtr collator{}; -}; - -struct NamePosition -{ - static constexpr auto name = "position"; -}; -struct NamePositionUTF8 -{ - static constexpr auto name = "positionUTF8"; -}; -struct NamePositionCaseInsensitive -{ - static constexpr auto name = "positionCaseInsensitive"; -}; -struct NamePositionCaseInsensitiveUTF8 -{ - static constexpr auto name = "positionCaseInsensitiveUTF8"; -}; -struct NameMatch -{ - static constexpr auto name = "match"; -}; + static constexpr auto name = "position"; +}; +struct NamePositionUTF8 +{ + static constexpr auto name = "positionUTF8"; +}; +struct NamePositionCaseInsensitive +{ + static constexpr auto name = "positionCaseInsensitive"; +}; +struct NamePositionCaseInsensitiveUTF8 +{ + static constexpr auto name = "positionCaseInsensitiveUTF8"; +}; +struct NameMatch +{ + static constexpr auto name = "match"; +}; struct NameLike3Args { @@ -2510,22 +803,6 @@ struct NameExtract { static constexpr auto name = "extract"; }; -struct NameReplaceOne -{ - static constexpr auto name = "replaceOne"; -}; -struct NameReplaceAll -{ - static constexpr auto name = "replaceAll"; -}; -struct NameReplaceRegexpOne -{ - 
static constexpr auto name = "replaceRegexpOne"; -}; -struct NameReplaceRegexpAll -{ - static constexpr auto name = "replaceRegexpAll"; -}; // using FunctionPosition = FunctionsStringSearch, NamePosition>; using FunctionPositionUTF8 = FunctionsStringSearch, NamePositionUTF8>; @@ -2534,30 +811,19 @@ using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; using FunctionMatch = FunctionsStringSearch, NameMatch>; -using FunctionTiDBRegexp = FunctionStringRegexp; -// using FunctionTiDBRegexp = FunctionsStringSearch, NameTiDBRegexp>; using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionLike3Args = FunctionsStringSearch, NameLike3Args>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; using FunctionExtract = FunctionsStringSearchToString; -using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; -using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; -using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; -using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; void registerFunctionsStringSearch(FunctionFactory & factory) { - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); // factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); - factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/dbms/src/Functions/FunctionsStringSearch.h b/dbms/src/Functions/FunctionsStringSearch.h index 517de05a574..d8db1b7c356 100644 --- a/dbms/src/Functions/FunctionsStringSearch.h +++ b/dbms/src/Functions/FunctionsStringSearch.h @@ -123,11 +123,11 @@ class FunctionsStringSearch : public IFunction const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; const ColumnPtr & column_needle = 
block.getByPosition(arguments[1]).column; - const ColumnConst * col_haystack_const = typeid_cast(&*column_haystack); - const ColumnConst * col_needle_const = typeid_cast(&*column_needle); + const auto * col_haystack_const = typeid_cast(&*column_haystack); + const auto * col_needle_const = typeid_cast(&*column_needle); UInt8 escape_char = CH_ESCAPE_CHAR; - String match_type = ""; + String match_type; if constexpr (Impl::need_customized_escape_char) { const auto * col_escape_const = typeid_cast(&*block.getByPosition(arguments[2]).column); @@ -158,7 +158,7 @@ class FunctionsStringSearch : public IFunction { if (arguments.size() > 2) { - auto * col_match_type_const = typeid_cast(&*block.getByPosition(arguments[2]).column); + const auto * col_match_type_const = typeid_cast(&*block.getByPosition(arguments[2]).column); if (col_match_type_const == nullptr) throw Exception("Match type argument of function " + getName() + " must be constant"); match_type = col_match_type_const->getValue(); @@ -168,7 +168,7 @@ class FunctionsStringSearch : public IFunction if (col_haystack_const && col_needle_const) { ResultType res{}; - String needle_string = col_needle_const->getValue(); + auto needle_string = col_needle_const->getValue(); Impl::constantConstant(col_haystack_const->getValue(), needle_string, escape_char, match_type, collator, res); block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); return; @@ -179,8 +179,8 @@ class FunctionsStringSearch : public IFunction typename ColumnVector::Container & vec_res = col_res->getData(); vec_res.resize(column_haystack->size()); - const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); - const ColumnString * col_needle_vector = checkAndGetColumn(&*column_needle); + const auto * col_haystack_vector = checkAndGetColumn(&*column_haystack); + const auto * col_needle_vector = checkAndGetColumn(&*column_needle); if (col_haystack_vector && 
col_needle_vector) Impl::vectorVector(col_haystack_vector->getChars(), @@ -193,7 +193,7 @@ class FunctionsStringSearch : public IFunction vec_res); else if (col_haystack_vector && col_needle_const) { - String needle_string = col_needle_const->getValue(); + auto needle_string = col_needle_const->getValue(); Impl::vectorConstant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle_string, escape_char, match_type, collator, vec_res); } else if (col_haystack_const && col_needle_vector) @@ -261,11 +261,11 @@ class FunctionsStringSearchToString : public IFunction const ColumnPtr column = block.getByPosition(arguments[0]).column; const ColumnPtr column_needle = block.getByPosition(arguments[1]).column; - const ColumnConst * col_needle = typeid_cast(&*column_needle); + const auto * col_needle = typeid_cast(&*column_needle); if (!col_needle) throw Exception("Second argument of function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_COLUMN); - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnString::create(); diff --git a/dbms/src/Functions/re2Util.cpp b/dbms/src/Functions/re2Util.cpp new file mode 100644 index 00000000000..7687fd8e4f8 --- /dev/null +++ b/dbms/src/Functions/re2Util.cpp @@ -0,0 +1,81 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +namespace DB +{ +namespace re2Util +{ +re2_st::RE2::Options getDefaultRe2Options() +{ + re2_st::RE2::Options options(re2_st::RE2::CannedOptions::DefaultOptions); + options.set_case_sensitive(true); + options.set_one_line(true); + options.set_dot_nl(false); + return options; +} + +String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator) +{ + /// for regexp only ci/cs is supported + re2_st::RE2::Options options = getDefaultRe2Options(); + if (collator != nullptr && collator->isCI()) + options.set_case_sensitive(false); + + /// match_type can overwrite collator + if (!match_type.empty()) + { + for (const auto & c : match_type) + { + switch (c) + { + case 'i': + /// according to MySQL doc: if either argument is a binary string, the arguments are handled in + /// case-sensitive fashion as binary strings, even if match_type contains the i character. + /// However, test in MySQL 8.0.25 shows that i flag still take affect even if the collation is binary, + if (collator == nullptr || !collator->isBinary()) + options.set_case_sensitive(false); + break; + case 'c': + options.set_case_sensitive(true); + break; + case 's': + options.set_dot_nl(true); + break; + case 'm': + options.set_one_line(false); + break; + default: + throw Exception("Incorrect arguments to regexp related functions."); + } + } + } + if (!options.one_line() || options.dot_nl() || !options.case_sensitive()) + { + String mode_modifiers("(?"); + if (!options.one_line()) + mode_modifiers += "m"; + if (!options.case_sensitive()) + mode_modifiers += "i"; + if (options.dot_nl()) + mode_modifiers += "s"; + mode_modifiers += ")"; + return mode_modifiers; + } + else + return ""; +} +} +} // namespace DB diff --git a/dbms/src/Functions/re2Util.h b/dbms/src/Functions/re2Util.h new file mode 100644 index 00000000000..cff69f6e3c5 --- /dev/null +++ b/dbms/src/Functions/re2Util.h @@ -0,0 +1,37 @@ +// Copyright 2022 PingCAP, Ltd. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "Common/Exception.h" +#include + + +#if USE_RE2_ST +#include +#else +#define re2_st re2 +#endif + +namespace DB +{ +namespace re2Util +{ +re2_st::RE2::Options getDefaultRe2Options(); +String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator); +} +} // namespace DB diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp index 2b59231bb4d..d31a95ade4f 100644 --- a/dbms/src/Functions/registerFunctions.cpp +++ b/dbms/src/Functions/registerFunctions.cpp @@ -48,6 +48,7 @@ void registerFunctionsCharset(FunctionFactory &); void registerFunctionsNull(FunctionFactory &); void registerFunctionsStringMath(FunctionFactory &); void registerFunctionsDuration(FunctionFactory &); +void registerFunctionsRegexp(FunctionFactory &); void registerFunctions() @@ -81,6 +82,7 @@ void registerFunctions() registerFunctionsNull(factory); registerFunctionsStringMath(factory); registerFunctionsDuration(factory); + registerFunctionsRegexp(factory); } } // namespace DB From 573f877506696118f483bbf5b098028021e25d45 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 30 Sep 2022 17:31:32 +0800 Subject: [PATCH 06/87] pass tests, for the moment --- dbms/src/Functions/FunctionsRegexp.cpp | 44 ++- dbms/src/Functions/FunctionsRegexp.h | 334 ++++++++++++++++------ dbms/src/Functions/IFunction.cpp | 36 ++- 
dbms/src/Functions/IFunction.h | 8 - dbms/src/Functions/tests/gtest_regexp.cpp | 73 ++--- 5 files changed, 350 insertions(+), 145 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index cb515f529de..2b00e9e57f8 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -14,6 +14,8 @@ #include #include +#include +#include "Functions/Regexps.h" namespace DB { @@ -48,7 +50,7 @@ String getMatchType(const String & match_type) auto iter_i = applied_flags.find('i'); if (iter_i != applied_flags.end()) applied_flags.erase(iter_i); - + continue; } @@ -63,6 +65,44 @@ String getMatchType(const String & match_type) return flags; } +NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) +{ + NullPresence res; + + for (const auto & arg : args) + { + const auto & elem = block.getByPosition(arg); + const auto * col_const = typeid_cast(&(*(elem.column))); + + if (col_const != nullptr) + { + auto col_const_data = col_const->getDataColumnPtr(); + + // It's needless to check if it's a const nullable column when res.has_const_null has been set + if (!res.has_const_null_col) + { + // check const null + if (col_const_data->isColumnNullable()) + { + if (static_cast(*col_const_data).isNullAt(0)) + res.has_const_null_col = true; + } + } + } + else + { + // It's needless to check if it's a nullable column when res.has_nullable_col has been set + if (!res.has_nullable_col) + { + if ((elem.column)->isColumnNullable()) + res.has_nullable_col = true; + } + } + } + + return res; +} + /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. 
* 'replacement' could contain substitutions, for example: '\2-\3-\1' */ @@ -1050,6 +1090,7 @@ struct ReplaceStringImpl }; using FunctionTiDBRegexp = FunctionStringRegexp; +using FunctionRegexpLike = FunctionStringRegexp; using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; @@ -1062,6 +1103,7 @@ void registerFunctionsRegexp(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 950b48b7583..49fae7edca8 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -28,6 +28,12 @@ #include #include +#include "Columns/ColumnNullable.h" +#include "Columns/ColumnsNumber.h" +#include "Common/Exception.h" +#include "Core/Field.h" +#include "DataTypes/DataTypeNullable.h" +#include "common/types.h" #include #if USE_RE2_ST @@ -70,10 +76,35 @@ struct NameReplaceRegexpAll static constexpr auto name = "replaceRegexpAll"; }; -#define SET_FLAGS(flags) ((flags) |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE) - String getMatchType(const String & match_type); +inline int getDefaultFlags() +{ + int flags = 0; + flags |= OptimizedRegularExpressionImpl::RE_NO_CAPTURE | OptimizedRegularExpressionImpl::RE_NO_OPTIMIZE; + return flags; +} + +struct NullPresence +{ + bool has_nullable_col = false; + bool has_const_null_col = false; +}; + +NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); + +inline String addMatchTypeForPattern(const String & pattern, const String & match_type) +{ + String flags = getMatchType(match_type); + return fmt::format("(?{}){}", flags, pattern); +} + +inline Regexps::Pool::Pointer 
createRegexpWithMatchType(const String & pattern, const String & match_type) +{ + String final_pattern = addMatchTypeForPattern(pattern, match_type); + return Regexps::get(final_pattern, getDefaultFlags()); +} + // Columns may be const, nullable or plain vector, we can conveniently handle // these different type columns with Param. class Param @@ -81,9 +112,9 @@ class Param public: DISALLOW_COPY_AND_MOVE(Param); - Param(const ColumnPtr ptr, const StringRef & default_value) + Param(const ColumnPtr & ptr, const String & default_value) : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), - is_const(false), data_stringrf(default_value.data, default_value.size), data_int64(0) + is_const(false), data_string(default_value), data_int64(0) { // arg is not provided and we should use default_value if (col_ptr == nullptr) return; @@ -94,20 +125,44 @@ class Param if (col_const != nullptr) { // This is a const column - data_stringrf = col_const->getDataAt(0); + auto col_const_data = col_const->getDataColumnPtr(); + if (col_const_data->isColumnNullable()) + { + // This is a const nullable column + // const null can't be here as we should have handle it in the previous + Field field; + auto p = static_cast(*col_const_data).getNestedColumnPtr(); + std::cout << fmt::format("family name: {}", p->getFamilyName()) << std::endl; + col_const->get(0, field); + std::cout << "type name: " << field.getTypeName() << std::endl; + data_string = field.safeGet(); + null_map = &(static_cast(*col_const_data).getNullMapData()); + std::cout << fmt::format("cons data string1: {}", data_string) << std::endl; + } + else + { + StringRef tmp_data = col_const->getDataAt(0); + data_string = String(tmp_data.data, tmp_data.size); + std::cout << fmt::format("cons data string2: {}", data_string) << std::endl; + } + is_const = true; } - else { - // This is a vector column - col_str = checkAndGetColumn(&(*col_ptr)); - } - // Handle nullable if (col_ptr->isColumnNullable()) + { + // Handle 
nullable column + auto nested_ptr = static_cast(*col_ptr).getNestedColumnPtr(); + col_str = checkAndGetColumn(&(*nested_ptr)); null_map = &(static_cast(*col_ptr).getNullMapData()); + } + else { + // This is a pure vector column + col_str = checkAndGetColumn(&(*col_ptr)); + } } - Param(const ColumnPtr ptr, Int64 default_value) + Param(const ColumnPtr & ptr, Int64 default_value) : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), is_const(false), data_int64(default_value) { @@ -120,18 +175,38 @@ class Param if (col_const != nullptr) { // This is a const column - data_int64 = col_const->getValue(); + auto col_const_data = col_const->getDataColumnPtr(); + if (col_const_data->isColumnNullable()) + { + // This is a const nullable column + Field field; + col_const->get(0, field); + data_int64 = field.get(); + null_map = &(static_cast(*col_ptr).getNullMapData()); + std::cout << fmt::format("cons data int 1: {}", data_int64) << std::endl; + } + else + { + data_int64 = col_const->getValue(); + std::cout << fmt::format("cons data int 2: {}", data_int64) << std::endl; + } + is_const = true; + return; + } + + if (col_ptr->isColumnNullable()) + { + // Handle nullable column + auto nested_ptr = static_cast(*col_ptr).getNestedColumnPtr(); + col_int64 = checkAndGetColumn(&(*nested_ptr)); + null_map = &(static_cast(*col_ptr).getNullMapData()); } else { - // This is a vector column + // This is a pure vector column col_int64 = checkAndGetColumn(&(*col_ptr)); } - - // Handle nullable - if (col_ptr->isColumnNullable()) - null_map = &(static_cast(*col_ptr).getNullMapData()); } Int64 getInt64(size_t idx) const @@ -141,12 +216,35 @@ class Param return !is_const && col_int64 != nullptr ? col_int64->getInt(idx) : data_int64; } - // @param to: destination that this function should copy data_stringrf to - void getString(size_t idx, StringRef & to) const + void getStringRef(size_t idx, StringRef & dst) const { // Use default value when arg is const or not provided. 
// For safety, nullptr should be checked - !is_const && col_str != nullptr ? (to = col_str->getDataAt(idx)) : (to = data_stringrf); + if (!is_const && col_str != nullptr) + dst = col_str->getDataAt(idx); + else + { + dst.data = data_string.c_str(); + dst.size = data_string.size(); + } + } + + String getString(size_t idx) const + { + // Use default value when arg is const or not provided. + // For safety, nullptr should be checked + if (!is_const && col_str != nullptr) + { + StringRef sr = col_str->getDataAt(idx); + String ret_str(sr.data, sr.size); + std::cout << fmt::format("getString here1: {}", ret_str) << std::endl; + return ret_str; + } + else { + String ret_str(data_string); + std::cout << fmt::format("getString here2: {}", ret_str) << std::endl; + return ret_str; + } } bool isNullAt(size_t idx) const @@ -165,7 +263,7 @@ class Param const ColumnInt64 * col_int64; ConstNullMapPtr null_map; bool is_const; // mark as the const column when it's true - StringRef data_stringrf; + String data_string; Int64 data_int64; }; @@ -181,16 +279,18 @@ class FunctionStringRegexpBase void memorize(const Param & pat_param, const std::unique_ptr & match_type_param) const { - StringRef pat; - pat_param.getString(0, pat); + String && final_pattern = pat_param.getString(0); + if (final_pattern.empty()) + throw Exception("Empty pattern is invalid"); + if (match_type_param != nullptr) { - // TODO handle match_type_param + String && match_type = match_type_param->getString(0); + final_pattern = addMatchTypeForPattern(final_pattern, match_type); } - int flags = 0; - SET_FLAGS(flags); - memorized_re = std::make_unique(String(pat.data, pat.size), flags); + int flags = getDefaultFlags(); + memorized_re = std::make_unique(final_pattern, flags); } // Check if we can memorize the regexp @@ -209,17 +309,14 @@ class FunctionStringRegexpBase else throw Exception("Unknown regular function."); - if constexpr (Name::name == NameTiDBRegexp::name) + if constexpr (class_name_sv == 
tidb_regexp_name_sv) { return pat_param.isConstCol(); } else { const bool is_pat_const = pat_param.isConstCol(); - if ((arg_num < total_param_num && is_pat_const) - || (arg_num == total_param_num && is_pat_const && match_type_param->isConstCol())) - { + if (is_pat_const && (arg_num < total_param_num || (match_type_param->isConstCol()))) return true; - } } return false; @@ -235,6 +332,7 @@ class FunctionStringRegexpBase mutable std::unique_ptr memorized_re; }; +// Implementation of regexp and regexp_like functions template class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { @@ -246,22 +344,49 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction String getName() const override { return name; } bool isVariadic() const override { return true; } void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override { return std::make_shared>(); } bool useDefaultImplementationForNulls() const override { return false; } size_t getNumberOfArguments() const override { return 0; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override + { + size_t args_min_num = 2; + size_t args_max_num = 3; + if (arguments.size() < args_min_num) + throw Exception("Illegal argument number"); + + bool has_nullable_col = false; + + for (size_t i = 0; i < args_min_num; ++i) + checkInputArg(arguments[i], &has_nullable_col); + + constexpr std::string_view class_name_sv(Name::name); + constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); + + // check match_type arg for regexp_like + if constexpr (class_name_sv == regexp_like_name_sv) + if (arguments.size() == args_max_num && !arguments[args_max_num - 1]->isString()) + checkInputArg(arguments[args_max_num - 1], &has_nullable_col); + + if (has_nullable_col) + return std::make_shared(std::make_shared>()); + else + return 
std::make_shared>(); + } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { // Do something related with nullable columns NullPresence null_presence = getNullPresense(block, arguments); - if (null_presence.has_null_constant) + + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; + + if (null_presence.has_const_null_col) { - // This is a null constant column + // There is a const null column in the input block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); return; } - const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; if (col_expr->empty()) @@ -271,77 +396,84 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction return; } - const Param expr_param(col_expr, StringRef("")); - const Param pat_param(col_pat, StringRef("")); + const Param expr_param(col_expr, String("")); + const Param pat_param(col_pat, String("")); auto arg_num = arguments.size(); + size_t col_size = expr_param.getDataNum(); - // Only when this is a regexp_like function, match_type_param will be initialized + std::cout << fmt::format("pat_param get string: {}", pat_param.getString(0)) << std::endl; + + // match_type_param will be initialized, only when this is a regexp_like function std::unique_ptr match_type_param; constexpr std::string_view class_name(name); constexpr std::string_view regexp_like_name(NameRegexpLike::name); if constexpr (class_name == regexp_like_name) { - ColumnPtr col_match_type; // Try to get match type column only when it's a regexp_like function + ColumnPtr col_match_type; if (arg_num > 2) { col_match_type = block.getByPosition(arguments[2]).column; - match_type_param = std::make_unique(*col_match_type, ""); + match_type_param = std::make_unique(col_match_type, String("")); } else { - match_type_param = std::make_unique(*col_match_type, 
""); + match_type_param = std::make_unique(col_match_type, String("")); } } // Check if args are all const columns if (expr_param.isConstCol() && pat_param.isConstCol()) { -#define PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type) \ +#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type) \ do { \ - int flags = 0; \ - SET_FLAGS(flags); \ + int flags = getDefaultFlags(); \ + String final_pattern = pat; \ + std::cout << fmt::format("pat: {}", pat) << std::endl; \ if constexpr (has_match_type) \ { \ - /* TODO put match_type into pattern */ \ + /* put match_type into pattern */ \ + String match_type = (match_type_param)->getString(0); \ + std::cout << fmt::format("match_type: {}", match_type) << std::endl; \ + final_pattern = addMatchTypeForPattern(final_pattern, match_type); \ + std::cout << fmt::format("final_pattern: {}", final_pattern) << std::endl; \ } \ - Regexps::Regexp regexp(String((pat).data, (pat).size), flags); \ - ResultType res{regexp.match((expr).data, (expr).size)}; \ + Regexps::Regexp regexp(final_pattern, flags); \ + ResultType res{regexp.match(expr)}; \ (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ } while(0) - StringRef pat; - pat_param.getString(0, pat); - if (pat.size == 0) + String pat = pat_param.getString(0); + if (pat.empty()) throw Exception("Empty pattern is invalid"); - StringRef expr; - expr_param.getString(0, expr); + String expr = expr_param.getString(0); if constexpr (class_name == regexp_like_name) { + // regexp_like function if (arg_num > 2 && match_type_param->isConstCol()) { constexpr bool has_match_type = true; - PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); return; } else if (arg_num == 2) { constexpr bool has_match_type = false; - PROCESS(block, expr, pat, pat_param, 
match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); return; } // reach here when arg_num == 3 and match_type is not const } else { + // regexp function constexpr bool has_match_type = false; - PROCESS(block, expr, pat, pat_param, match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); return; } - -#undef PROCESS +#undef GET_CONST_RESULT } // Check memorization @@ -357,7 +489,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if (isMemorized()) { const auto & regexp = getRegexp(); - if (null_presence.has_nullable) + if (null_presence.has_nullable_col) { // expr column must be a nullable column here, so we need to check null for each elems auto nullmap_col = ColumnUInt8::create(); @@ -365,8 +497,9 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction nullmap.resize(expr_param.getDataNum()); StringRef expr_ref; - for (size_t i = 0; i < arg_num; ++i) + for (size_t i = 0; i < col_size; ++i) { + if (expr_param.isNullAt(i)) { nullmap[i] = 1; @@ -374,7 +507,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } nullmap[i] = 0; - expr_param.getString(i, expr_ref); + expr_param.getStringRef(i, expr_ref); vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match } @@ -384,10 +517,12 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { // expr column is impossible to be a nullable column here StringRef expr_ref; - for (size_t i = 0; i < arg_num; ++i) + for (size_t i = 0; i < col_size; ++i) { - expr_param.getString(i, expr_ref); - vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + expr_param.getStringRef(i, expr_ref); + auto res = regexp->match(expr_ref.data, expr_ref.size); + std::cout << fmt::format("memorized not null: {}, res: {}", String(expr_ref.data, expr_ref.size), res) << std::endl; + vec_res[i] 
= res; // match } block.getByPosition(result).column = std::move(col_res); @@ -395,61 +530,66 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } else { - // container used for receiving data - StringRef expr; - StringRef pat; - - if (null_presence.has_nullable) + if (null_presence.has_nullable_col) { auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); nullmap.resize(expr_param.getDataNum()); - - for (size_t i = 0; i < arg_num; ++i) + for (size_t i = 0; i < col_size; ++i) { if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) { + // This is a null result nullmap[i] = 1; continue; } - expr_param.getString(i, expr); - pat_param.getString(i, pat); + String && expr = expr_param.getString(i); + String && pat = pat_param.getString(i); + + if (pat.empty()) + throw Exception("Empty pattern is invalid"); if constexpr (class_name == regexp_like_name) { - int flags = 0; - SET_FLAGS(flags); - const auto & regexp = Regexps::get(String(pat.data, pat.size), flags); - vec_res[i] = regexp->match(expr.data, expr.size); // match + // regexp_like function + auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i)); + vec_res[i] = regexp->match(expr); // match } else { - // TODO handle match_type first and do match action + // regexp function + int flags = getDefaultFlags(); + const auto & regexp = Regexps::get(pat, flags); + vec_res[i] = regexp->match(expr); // match } - } block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { - for (size_t i = 0; i < arg_num; ++i) + for (size_t i = 0; i < col_size; ++i) { - expr_param.getString(i, expr); - pat_param.getString(i, pat); + String && expr = expr_param.getString(i); + String && pat = pat_param.getString(i); + + if (pat.empty()) + throw Exception("Empty pattern is invalid"); if constexpr (class_name == regexp_like_name) { - int flags = 0; - SET_FLAGS(flags); - const auto & 
regexp = Regexps::get(String(pat.data, pat.size), flags); - vec_res[i] = regexp->match(expr.data, expr.size); // match + // regexp_like function + auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i)); + vec_res[i] = regexp->match(expr); // match } else { - // TODO handle match_type first and do match action + // regexp function + int flags = getDefaultFlags(); + const auto & regexp = Regexps::get(pat, flags); + vec_res[i] = regexp->match(expr); // match } } @@ -458,6 +598,24 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } } private: + void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col) const + { + std::cout << "type name: " << arg->getName() << std::endl; + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto & null_type = checkAndGetDataType(arg.get()); + const auto & nested_type = null_type->getNestedType(); + if (!nested_type->isString()) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + else + { + if (!arg->isString()) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + } + TiDB::TiDBCollatorPtr collator = nullptr; }; @@ -721,7 +879,5 @@ class FunctionStringReplace : public IFunction TiDB::TiDBCollatorPtr collator{}; }; - -#undef SET_FLAGS } // namespace DB diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index 51ab7cd2a6c..4ace050f7c6 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -98,12 +98,20 @@ ColumnPtr wrapInNullable(const ColumnPtr & src, Block & block, const ColumnNumbe return ColumnNullable::create(src_not_nullable, result_null_map_column); } -NullPresence getNullPresense(const ColumnsWithTypeAndName & args) +struct NullPresence +{ + bool has_nullable = false; + bool has_null_constant = false; +}; 
+ +NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) { NullPresence res; - for (const auto & elem : args) + for (const auto & arg : args) { + const auto & elem = block.getByPosition(arg); + if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -113,23 +121,12 @@ NullPresence getNullPresense(const ColumnsWithTypeAndName & args) return res; } -bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) -{ - for (auto arg : args) - if (!block.getByPosition(arg).column->isColumnConst()) - return false; - return true; -} -} // namespace - -NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) +NullPresence getNullPresense(const ColumnsWithTypeAndName & args) { NullPresence res; - for (const auto & arg : args) + for (const auto & elem : args) { - const auto & elem = block.getByPosition(arg); - if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -139,6 +136,15 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) return res; } +bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) +{ + for (auto arg : args) + if (!block.getByPosition(arg).column->isColumnConst()) + return false; + return true; +} +} // namespace + bool IExecutableFunction::defaultImplementationForConstantArguments(Block & block, const ColumnNumbers & args, size_t result) const { ColumnNumbers arguments_to_remain_constants = getArgumentsThatAreAlwaysConstant(); diff --git a/dbms/src/Functions/IFunction.h b/dbms/src/Functions/IFunction.h index aca795ddf29..c1bcdc8b151 100644 --- a/dbms/src/Functions/IFunction.h +++ b/dbms/src/Functions/IFunction.h @@ -390,12 +390,4 @@ class DefaultFunctionBuilder : public IFunctionBuilder using FunctionPtr = std::shared_ptr; -struct NullPresence -{ - bool has_nullable = false; - bool has_null_constant = false; -}; - -NullPresence getNullPresense(const Block &, 
const ColumnNumbers &); - } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index f4ffb916b74..447dd983752 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -19,6 +19,7 @@ /// this is a hack, include the cpp file so we can test MatchImpl directly #include // NOLINT +#include // NOLINT #include #include @@ -26,6 +27,8 @@ #pragma GCC diagnostic ignored "-Wsign-compare" #include +#include + #pragma GCC diagnostic pop namespace DB @@ -1780,9 +1783,10 @@ TEST_F(Regexp, testRegexpTiDBCase) ASSERT_ANY_THROW((DB::MatchImpl::constantConstant("", "\\", '\\', "", nullptr, res))); } +// fail Regexp.testRegexp TEST_F(Regexp, testRegexp) { - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + // const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); @@ -1809,74 +1813,79 @@ TEST_F(Regexp, testRegexp) auto const_uint8_null_column = createConstColumn>(row_size, {}); auto const_string_null_column = createConstColumn>(row_size, {}); - std::cout << "here 1" << std::endl; + std::cout << "$$$$$$$$case 1" << std::endl; /// case 1. 
regexp(const, const [, const]) for (size_t i = 0; i < row_size; i++) { /// test regexp(const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), - executeFunction("regexp", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]))); -std::cout << "here 1.1" << std::endl; + executeFunction("regexp_like", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]))); + /// test regexp(const, const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); -std::cout << "here 1.2" << std::endl; + executeFunction("regexp_like", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); + + /// Not support binary collator so far /// test regexp(const, const, const) with binary collator - ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), - executeFunction("regexp", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); + // ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), + // executeFunction("regexp_like", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); } + std::cout << "$$$$$$$$case 2" << std::endl; /// case 2. regexp(const, const [, const]) with null value - std::cout << "here 2" << std::endl; for (size_t i = 0; i < row_size; i++) { + std::cout << fmt::format("index: {}", i) << std::endl; /// test regexp(const, const) ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] ? 
const_uint8_null_column : createConstColumn(row_size, results[i]), - executeFunction("regexp", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); - std::cout << "here 2.1" << std::endl; + executeFunction("regexp_like", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); + std::cout << fmt::format("index: {}.1", i) << std::endl; /// test regexp(const, const, const) ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); - std::cout << "here 2.2" << std::endl; + executeFunction("regexp_like", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); /// test regexp(const, const, const) with binary collator - ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), - executeFunction("regexp", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); + // ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), + // executeFunction("regexp_like", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); } + std::cout << "$$$$$$$$case 3" << std::endl; /// case 3 regexp(vector, const[, const]) - std::cout << "here 3" << std::endl; { /// test regexp(vector, const) ASSERT_COLUMN_EQ(createColumn(vec_results), - executeFunction("regexp", createColumn(input_strings), createConstColumn(row_size, patterns[0]))); -std::cout << "here 3.1" << std::endl; + executeFunction("regexp_like", createColumn(input_strings), createConstColumn(row_size, patterns[0]))); + + std::cout << "$$$$$$$$case 3.1" << std::endl; /// test regexp(vector, const, const) ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type), - executeFunction("regexp", createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); -std::cout << "here 3.2" << std::endl; + executeFunction("regexp_like", createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + + /// Not support binary collator so far /// test regexp(vector, const, const) with binary collator - ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type_collator), - executeFunction("regexp", {createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); + // ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type_collator), + // executeFunction("regexp_like", {createColumn(input_strings), 
createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } + std::cout << "$$$$$$$$case 4" << std::endl; /// case 4 regexp(vector, const[, const]) nullable - std::cout << "here 4" << std::endl; { ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, input_string_nulls), - executeFunction("regexp", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]))); - std::cout << "here 4.2" << std::endl; + executeFunction("regexp_like", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]))); + std::cout << "$$$$$$$$case 4.1" << std::endl; ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, input_string_nulls), - executeFunction("regexp", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); - std::cout << "here 4.2" << std::endl; - ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type_collator, input_string_nulls), - executeFunction("regexp", {createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); + executeFunction("regexp_like", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + + /// Not support binary collator so far + // ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type_collator, input_string_nulls), + // executeFunction("regexp_like", {createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } /// issue 5984 - ASSERT_THROW(executeFunction("regexp", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); - ASSERT_THROW(executeFunction("regexp", createConstColumn(row_size, 
""), createConstColumn(row_size, "")), Exception); - ASSERT_THROW(executeFunction("regexp", createColumn(std::vector{"1"}), createColumn(std::vector{""})), Exception); + // ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); + // ASSERT_THROW(executeFunction("regexp_like", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); + // ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createColumn(std::vector{""})), Exception); } +// fail TEST_F(Regexp, testRegexpCustomerCases) { String pattern = "^(53|94)[0-9]{10}$|" From 101c4ff4413fa6a754cbec50a30034af7788a4fa Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Sat, 1 Oct 2022 21:28:37 +0800 Subject: [PATCH 07/87] pass gtests --- dbms/src/Functions/FunctionsRegexp.h | 63 ++++-- dbms/src/Functions/tests/gtest_regexp.cpp | 223 +++++++++++++++------- 2 files changed, 199 insertions(+), 87 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 49fae7edca8..65415d89acf 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -270,12 +270,14 @@ class Param class FunctionStringRegexpBase { public: + static constexpr size_t REGEXP_XXX_MIN_PARAM_NUM = 2; + // Max parameter number the regexp_xxx function could receive - static constexpr size_t REGEXP_PARAM_NUM = 2; - static constexpr size_t REGEXP_LIKE_PARAM_NUM = 3; - static constexpr size_t REGEXP_INSTR_PARAM_NUM = 6; - static constexpr size_t REGEXP_REPLACE_PARAM_NUM = 6; - static constexpr size_t REGEXP_SUBSTR_PARAM_NUM = 5; + static constexpr size_t REGEXP_MAX_PARAM_NUM = 2; + static constexpr size_t REGEXP_LIKE_MAX_PARAM_NUM = 3; + static constexpr size_t REGEXP_INSTR_MAX_PARAM_NUM = 6; + static constexpr size_t REGEXP_REPLACE_MAX_PARAM_NUM = 6; + static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5; void memorize(const Param & pat_param, const 
std::unique_ptr & match_type_param) const { @@ -303,9 +305,9 @@ class FunctionStringRegexpBase constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); if constexpr (class_name_sv == tidb_regexp_name_sv) - total_param_num = REGEXP_PARAM_NUM; + total_param_num = REGEXP_MAX_PARAM_NUM; else if constexpr (class_name_sv == regexp_like_name_sv) - total_param_num = REGEXP_LIKE_PARAM_NUM; + total_param_num = REGEXP_LIKE_MAX_PARAM_NUM; else throw Exception("Unknown regular function."); @@ -347,24 +349,29 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction bool useDefaultImplementationForNulls() const override { return false; } size_t getNumberOfArguments() const override { return 0; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments [[maybe_unused]]) const override + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - size_t args_min_num = 2; - size_t args_max_num = 3; - if (arguments.size() < args_min_num) + size_t args_max_num; + constexpr std::string_view class_name_sv(Name::name); + constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); + + if constexpr (class_name_sv == regexp_like_name_sv) + args_max_num = REGEXP_LIKE_MAX_PARAM_NUM; + else + args_max_num = REGEXP_MAX_PARAM_NUM; + + size_t arg_num = arguments.size(); + if (arg_num < REGEXP_XXX_MIN_PARAM_NUM) throw Exception("Illegal argument number"); bool has_nullable_col = false; - for (size_t i = 0; i < args_min_num; ++i) + for (size_t i = 0; i < REGEXP_XXX_MIN_PARAM_NUM; ++i) checkInputArg(arguments[i], &has_nullable_col); - - constexpr std::string_view class_name_sv(Name::name); - constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); // check match_type arg for regexp_like if constexpr (class_name_sv == regexp_like_name_sv) - if (arguments.size() == args_max_num && !arguments[args_max_num - 1]->isString()) + if (arg_num == args_max_num && !arguments[args_max_num - 1]->isString()) 
checkInputArg(arguments[args_max_num - 1], &has_nullable_col); if (has_nullable_col) @@ -483,7 +490,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(expr_param.getDataNum()); + vec_res.resize(expr_param.getDataNum(), 0); // Start to match if (isMemorized()) @@ -538,13 +545,27 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction for (size_t i = 0; i < col_size; ++i) { - if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) + if constexpr (class_name == regexp_like_name) { - // This is a null result - nullmap[i] = 1; - continue; + if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || (match_type_param != nullptr && match_type_param->isNullAt(i))) + { + // This is a null result + nullmap[i] = 1; + continue; + } + } + else + { + if (expr_param.isNullAt(i) || pat_param.isNullAt(i)) + { + // This is a null result + nullmap[i] = 1; + continue; + } } + + nullmap[i] = 0; String && expr = expr_param.getString(i); String && pat = pat_param.getString(i); diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 447dd983752..dd69686fb8e 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -22,6 +22,8 @@ #include // NOLINT #include #include +#include "DataTypes/DataTypesNumber.h" +#include "common/types.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" @@ -1783,8 +1785,8 @@ TEST_F(Regexp, testRegexpTiDBCase) ASSERT_ANY_THROW((DB::MatchImpl::constantConstant("", "\\", '\\', "", nullptr, res))); } -// fail Regexp.testRegexp -TEST_F(Regexp, testRegexp) +// We can only test regexp_like function as regexp is the subset of regexp_like +TEST_F(Regexp, RegexpLike) { // const auto * binary_collator = 
TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); @@ -1792,97 +1794,186 @@ TEST_F(Regexp, testRegexp) auto uint8_type = std::make_shared(); auto nullable_uint8_type = makeNullable(uint8_type); - std::vector input_strings{"abc", "Abc", "a\nb\nc", "abcd", "hello, 平凯星辰"}; - std::vector input_string_nulls{0, 0, 0, 1, 0}; + std::vector exprs{"abc", "Abc", "a\nb\nc", "a\nb\nc", "a\nb\nc", "abcd", "hello, 平凯星辰", ""}; + std::vector exprs_nulls{0, 0, 0, 0, 1, 0, 0, 0}; - std::vector patterns{"^a", "abc$", "a.*B.*c", "^bc$", "平凯.*"}; - std::vector pattern_nulls{1, 0, 0, 0, 0}; + std::vector patterns{"^a", "abc$", "a.*B.*c", "^a$", "^b$", "^bc$", "平凯.*", "^$"}; + std::vector pattern_nulls{1, 0, 0, 0, 0, 0, 0, 0}; - std::vector match_types{"", "i", "ims", "i", ""}; - std::vector match_type_nulls{0, 1, 0, 0, 0}; + std::vector match_types{"", "i", "ims", "m", "m", "i", "", ""}; + std::vector match_type_nulls{0, 1, 0, 0, 0, 0, 0, 0}; - std::vector results{1, 0, 0, 0, 1}; - std::vector results_with_match_type{1, 1, 1, 0, 1}; - std::vector results_with_match_type_collator{1, 0, 0, 0, 1}; + std::vector results{1, 0, 0, 0, 0, 0, 1, 1}; + std::vector results_with_match_type{1, 1, 1, 1, 1, 0, 1, 1}; + // std::vector results_with_match_type_collator{1, 0, 0, 0, 1}; - std::vector vec_results{1, 0, 1, 1, 0}; - std::vector vec_results_with_match_type{1, 1, 1, 1, 0}; - std::vector vec_results_with_match_type_collator{1, 0, 1, 1, 0}; + std::vector vec_results{1, 0, 1, 1, 1, 1, 0, 0}; + std::vector vec_results_with_match_type{1, 1, 1, 1, 1, 1, 0, 0}; + // std::vector vec_results_with_match_type_collator{1, 0, 1, 1, 0}; - size_t row_size = input_string_nulls.size(); + size_t row_size = exprs_nulls.size(); auto const_uint8_null_column = createConstColumn>(row_size, {}); auto const_string_null_column = createConstColumn>(row_size, {}); - std::cout << "$$$$$$$$case 1" << std::endl; - /// case 1. 
regexp(const, const [, const]) - for (size_t i = 0; i < row_size; i++) + + // case 1. regexp_like(const, const [, const]) { - /// test regexp(const, const) - ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), - executeFunction("regexp_like", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]))); - - /// test regexp(const, const, const) - ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp_like", createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); - - /// Not support binary collator so far - /// test regexp(const, const, const) with binary collator - // ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), - // executeFunction("regexp_like", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); + for (size_t i = 0; i < row_size; i++) + { + // test regexp_like(const, const) + ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), + executeFunction("regexp_like", createConstColumn(row_size, exprs[i]), createConstColumn(row_size, patterns[i]))); + + /// test regexp_like(const, const, const) + ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), + executeFunction("regexp_like", createConstColumn(row_size, exprs[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); + + // Not support binary collator so far + // test regexp_like(const, const, const) with binary collator + // ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), + // executeFunction("regexp_like", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); + } } - std::cout << "$$$$$$$$case 2" << std::endl; - /// case 2. 
regexp(const, const [, const]) with null value - for (size_t i = 0; i < row_size; i++) + + // case 2. regexp_like(const, const [, const]) with null value { - std::cout << fmt::format("index: {}", i) << std::endl; - /// test regexp(const, const) - ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results[i]), - executeFunction("regexp_like", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); - - std::cout << fmt::format("index: {}.1", i) << std::endl; - /// test regexp(const, const, const) - ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp_like", input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); - - /// test regexp(const, const, const) with binary collator - // ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), - // executeFunction("regexp_like", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); + for (size_t i = 0; i < row_size; i++) + { + // test regexp_like(const, const) + ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] ? 
const_uint8_null_column : createConstColumn(row_size, results[i]), + executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); + + // test regexp_like(const, const, const) + ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), + executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); + + // Not support binary collator so far + // test regexp_like(const, const, const) with binary collator + // ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), + // executeFunction("regexp_like", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); + } } - std::cout << "$$$$$$$$case 3" << std::endl; - /// case 3 regexp(vector, const[, const]) + // case 3 regexp_like(vector, const[, const]) { - /// test regexp(vector, const) + // test regexp_like(vector, const) ASSERT_COLUMN_EQ(createColumn(vec_results), - executeFunction("regexp_like", createColumn(input_strings), createConstColumn(row_size, patterns[0]))); + executeFunction("regexp_like", createColumn(exprs), createConstColumn(row_size, patterns[0]))); - std::cout << "$$$$$$$$case 3.1" << std::endl; - /// test regexp(vector, const, const) + // test regexp_like(vector, const, const) ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type), - executeFunction("regexp_like", createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + executeFunction("regexp_like", createColumn(exprs), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); - /// Not support binary collator so far - /// test regexp(vector, const, const) with binary collator + // Not support binary collator so far + // test regexp_like(vector, const, const) with binary collator // ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type_collator), // executeFunction("regexp_like", {createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } - std::cout << "$$$$$$$$case 4" << std::endl; - /// case 4 regexp(vector, const[, const]) nullable + + /// case 4 regexp_like(vector, const[, const]) with null value { - ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, input_string_nulls), - executeFunction("regexp_like", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]))); - std::cout << "$$$$$$$$case 4.1" << std::endl; - ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, input_string_nulls), - 
executeFunction("regexp_like", createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, exprs_nulls), + executeFunction("regexp_like", createNullableVectorColumn(exprs, exprs_nulls), createConstColumn(row_size, patterns[0]))); - /// Not support binary collator so far + ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, exprs_nulls), + executeFunction("regexp_like", createNullableVectorColumn(exprs, exprs_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + + // Not support binary collator so far // ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type_collator, input_string_nulls), // executeFunction("regexp_like", {createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); } + const std::vector vv_res{1, 0, 0, 0, 0, 0, 1, 1}; // vector expr, vector pattern + const std::vector vvc_res{1, 1, 0, 0, 0, 0, 1, 1}; // vector expr, vector pattern, const match_type + + // case 5 regexp_like(vector, vector[, const]) + { + + // test regexp_like(vector, vector) + ASSERT_COLUMN_EQ(createColumn(vv_res), + executeFunction( + "regexp_like", + createColumn(exprs), + createColumn(patterns))); + + // test regexp_like(vector, vector, const) + ASSERT_COLUMN_EQ(createColumn(vvc_res), + executeFunction( + "regexp_like", + createColumn(exprs), + createColumn(patterns), + createConstColumn(row_size, "i"))); + + } + + // case 6 regexp_like(vector, vector[, const]) with null vable + { + // test regexp_like(vector, vector) + ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, exprs_nulls), + executeFunction( + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns))); + + ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, pattern_nulls), + 
executeFunction( + "regexp_like", + createColumn(exprs), + createNullableVectorColumn(patterns, pattern_nulls))); + + // test regexp_like(vector, vector, const) + ASSERT_COLUMN_EQ(createNullableVectorColumn(vvc_res, exprs_nulls), + executeFunction("regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createConstColumn(row_size, "i"))); + } + + const std::vector vvv_res{1, 1, 1, 1, 1, 0, 1, 1}; // vector expr, vector pattern, vector match_type + + // case 7 regexp_like(vector, vector[, vector]) + { + // test regexp_like(vector, vector, vector) + ASSERT_COLUMN_EQ(createColumn(vvv_res), + executeFunction( + "regexp_like", + createColumn(exprs), + createColumn(patterns), + createColumn(match_types))); + } + + // case 8 regexp_like(vector, vector[, vector]) withh null value + { + // test regexp_like(nullable vector, vector, vector) + ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, exprs_nulls), + executeFunction( + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createColumn(match_types))); + + // test regexp_like(vector, nullable vector, vector) + ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, pattern_nulls), + executeFunction( + "regexp_like", + createColumn(exprs), + createNullableVectorColumn(patterns, pattern_nulls), + createColumn(match_types))); + + // test regexp_like(vector, vector, nullable vector) + ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, match_type_nulls), + executeFunction( + "regexp_like", + createColumn(exprs), + createColumn(patterns), + createNullableVectorColumn(match_types, match_type_nulls))); + } + /// issue 5984 - // ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); - // ASSERT_THROW(executeFunction("regexp_like", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); - // ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), 
createColumn(std::vector{""})), Exception); + ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); + ASSERT_THROW(executeFunction("regexp_like", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); + ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createColumn(std::vector{""})), Exception); + ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createNullableVectorColumn(std::vector{""}, std::vector{0})), Exception); + ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn>(row_size, "")), Exception); } // fail From 2be0ea7a4fee561ffd89a4c6339511e6df3f8492 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 7 Oct 2022 15:16:41 +0800 Subject: [PATCH 08/87] ut passed --- dbms/src/Functions/FunctionsRegexp.cpp | 7 +- dbms/src/Functions/FunctionsRegexp.h | 63 ++++--- dbms/src/Functions/tests/gtest_regexp.cpp | 199 +++++++++++++++++----- 3 files changed, 189 insertions(+), 80 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 2b00e9e57f8..15b679d7e62 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -32,10 +32,11 @@ std::set valid_flags{flag_i, flag_c, flag_m, flag_s}; // If characters specifying contradictory options are specified // within match_type, the rightmost one takes precedence. 
-String getMatchType(const String & match_type) +String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) { - // TODO handle collation std::set applied_flags; + if (collator != nullptr && collator->isCI()) + applied_flags.insert(flag_i); for (auto flag : match_type) { @@ -47,7 +48,7 @@ String getMatchType(const String & match_type) // to enable the case-sensitive for the regexp if (flag == flag_c) { - auto iter_i = applied_flags.find('i'); + auto iter_i = applied_flags.find(flag_i); if (iter_i != applied_flags.end()) applied_flags.erase(iter_i); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 65415d89acf..e4f25dea38a 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -76,7 +76,7 @@ struct NameReplaceRegexpAll static constexpr auto name = "replaceRegexpAll"; }; -String getMatchType(const String & match_type); +String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); inline int getDefaultFlags() { @@ -93,18 +93,24 @@ struct NullPresence NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); -inline String addMatchTypeForPattern(const String & pattern, const String & match_type) +inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { - String flags = getMatchType(match_type); + String flags = getMatchType(match_type, collator); return fmt::format("(?{}){}", flags, pattern); } -inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type) +inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { - String final_pattern = addMatchTypeForPattern(pattern, match_type); + String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); return Regexps::get(final_pattern, getDefaultFlags()); } +inline 
void handleCollatorWithoutMatchType(String & pattern, TiDB::TiDBCollatorPtr collator) +{ + if (collator != nullptr && collator->isCI()) + pattern = fmt::format("(?i){}", pattern); +} + // Columns may be const, nullable or plain vector, we can conveniently handle // these different type columns with Param. class Param @@ -132,18 +138,14 @@ class Param // const null can't be here as we should have handle it in the previous Field field; auto p = static_cast(*col_const_data).getNestedColumnPtr(); - std::cout << fmt::format("family name: {}", p->getFamilyName()) << std::endl; col_const->get(0, field); - std::cout << "type name: " << field.getTypeName() << std::endl; data_string = field.safeGet(); null_map = &(static_cast(*col_const_data).getNullMapData()); - std::cout << fmt::format("cons data string1: {}", data_string) << std::endl; } else { StringRef tmp_data = col_const->getDataAt(0); data_string = String(tmp_data.data, tmp_data.size); - std::cout << fmt::format("cons data string2: {}", data_string) << std::endl; } is_const = true; @@ -183,12 +185,10 @@ class Param col_const->get(0, field); data_int64 = field.get(); null_map = &(static_cast(*col_ptr).getNullMapData()); - std::cout << fmt::format("cons data int 1: {}", data_int64) << std::endl; } else { data_int64 = col_const->getValue(); - std::cout << fmt::format("cons data int 2: {}", data_int64) << std::endl; } is_const = true; @@ -237,12 +237,10 @@ class Param { StringRef sr = col_str->getDataAt(idx); String ret_str(sr.data, sr.size); - std::cout << fmt::format("getString here1: {}", ret_str) << std::endl; return ret_str; } else { String ret_str(data_string); - std::cout << fmt::format("getString here2: {}", ret_str) << std::endl; return ret_str; } } @@ -279,7 +277,7 @@ class FunctionStringRegexpBase static constexpr size_t REGEXP_REPLACE_MAX_PARAM_NUM = 6; static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5; - void memorize(const Param & pat_param, const std::unique_ptr & match_type_param) const + void 
memorize(const Param & pat_param, const std::unique_ptr & match_type_param, TiDB::TiDBCollatorPtr collator) const { String && final_pattern = pat_param.getString(0); if (final_pattern.empty()) @@ -288,7 +286,10 @@ class FunctionStringRegexpBase if (match_type_param != nullptr) { String && match_type = match_type_param->getString(0); - final_pattern = addMatchTypeForPattern(final_pattern, match_type); + final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); + } else + { + handleCollatorWithoutMatchType(final_pattern, collator); } int flags = getDefaultFlags(); @@ -361,7 +362,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction args_max_num = REGEXP_MAX_PARAM_NUM; size_t arg_num = arguments.size(); - if (arg_num < REGEXP_XXX_MIN_PARAM_NUM) + if (arg_num < REGEXP_XXX_MIN_PARAM_NUM || arg_num > args_max_num) throw Exception("Illegal argument number"); bool has_nullable_col = false; @@ -408,8 +409,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction auto arg_num = arguments.size(); size_t col_size = expr_param.getDataNum(); - std::cout << fmt::format("pat_param get string: {}", pat_param.getString(0)) << std::endl; - // match_type_param will be initialized, only when this is a regexp_like function std::unique_ptr match_type_param; @@ -433,19 +432,17 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Check if args are all const columns if (expr_param.isConstCol() && pat_param.isConstCol()) { -#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type) \ +#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator) \ do { \ int flags = getDefaultFlags(); \ - String final_pattern = pat; \ - std::cout << fmt::format("pat: {}", pat) << std::endl; \ + String final_pattern = (pat); \ if constexpr (has_match_type) \ { \ /* put match_type into pattern */ \ String match_type = 
(match_type_param)->getString(0); \ - std::cout << fmt::format("match_type: {}", match_type) << std::endl; \ - final_pattern = addMatchTypeForPattern(final_pattern, match_type); \ - std::cout << fmt::format("final_pattern: {}", final_pattern) << std::endl; \ - } \ + final_pattern = addMatchTypeForPattern(final_pattern, match_type, (collator)); \ + } else \ + handleCollatorWithoutMatchType(final_pattern, (collator)); \ Regexps::Regexp regexp(final_pattern, flags); \ ResultType res{regexp.match(expr)}; \ (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ @@ -462,13 +459,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if (arg_num > 2 && match_type_param->isConstCol()) { constexpr bool has_match_type = true; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); return; } else if (arg_num == 2) { constexpr bool has_match_type = false; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); return; } // reach here when arg_num == 3 and match_type is not const @@ -477,7 +474,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { // regexp function constexpr bool has_match_type = false; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type); + GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); return; } #undef GET_CONST_RESULT @@ -485,7 +482,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Check memorization if (canMemorize(arg_num, pat_param, match_type_param)) - memorize(pat_param, match_type_param); + memorize(pat_param, match_type_param, collator); // Initialize result column 
auto col_res = ColumnVector::create(); @@ -528,7 +525,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction { expr_param.getStringRef(i, expr_ref); auto res = regexp->match(expr_ref.data, expr_ref.size); - std::cout << fmt::format("memorized not null: {}, res: {}", String(expr_ref.data, expr_ref.size), res) << std::endl; vec_res[i] = res; // match } @@ -575,12 +571,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if constexpr (class_name == regexp_like_name) { // regexp_like function - auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i)); + auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator); vec_res[i] = regexp->match(expr); // match } else { // regexp function + handleCollatorWithoutMatchType(pat, collator); int flags = getDefaultFlags(); const auto & regexp = Regexps::get(pat, flags); vec_res[i] = regexp->match(expr); // match @@ -602,12 +599,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction if constexpr (class_name == regexp_like_name) { // regexp_like function - auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i)); + auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator); vec_res[i] = regexp->match(expr); // match } else { // regexp function + handleCollatorWithoutMatchType(pat, collator); int flags = getDefaultFlags(); const auto & regexp = Regexps::get(pat, flags); vec_res[i] = regexp->match(expr); // match @@ -621,7 +619,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction private: void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col) const { - std::cout << "type name: " << arg->getName() << std::endl; if (arg->isNullable()) { *has_nullable_col = true; diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index dd69686fb8e..5aa538db1e0 100644 --- 
a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -1788,28 +1788,33 @@ TEST_F(Regexp, testRegexpTiDBCase) // We can only test regexp_like function as regexp is the subset of regexp_like TEST_F(Regexp, RegexpLike) { - // const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * utf8mb4_general_ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); auto nullable_uint8_type = makeNullable(uint8_type); - std::vector exprs{"abc", "Abc", "a\nb\nc", "a\nb\nc", "a\nb\nc", "abcd", "hello, 平凯星辰", ""}; - std::vector exprs_nulls{0, 0, 0, 0, 1, 0, 0, 0}; + std::vector exprs{"abc", "Abc", "a\nb\nc", "a\nb\nc", "a\nb\nc", "abcd", "hello, 平凯星辰", "", "a"}; + std::vector exprs_nulls{0, 0, 0, 0, 1, 0, 0, 0, 0}; - std::vector patterns{"^a", "abc$", "a.*B.*c", "^a$", "^b$", "^bc$", "平凯.*", "^$"}; - std::vector pattern_nulls{1, 0, 0, 0, 0, 0, 0, 0}; + std::vector patterns{"^a", "abc$", "a.*B.*c", "^a$", "^b$", "^bc$", "平凯.*", "^$", "A"}; + std::vector pattern_nulls{1, 0, 0, 0, 0, 0, 0, 0, 0}; - std::vector match_types{"", "i", "ims", "m", "m", "i", "", ""}; - std::vector match_type_nulls{0, 1, 0, 0, 0, 0, 0, 0}; + std::vector match_types{"", "i", "ims", "m", "m", "i", "", "", ""}; + std::vector match_type_nulls{0, 1, 0, 0, 0, 0, 0, 0, 0}; - std::vector results{1, 0, 0, 0, 0, 0, 1, 1}; - std::vector results_with_match_type{1, 1, 1, 1, 1, 0, 1, 1}; - // std::vector results_with_match_type_collator{1, 0, 0, 0, 1}; + std::vector results{1, 0, 0, 0, 0, 0, 1, 1, 0}; + std::vector results_with_match_type{1, 1, 1, 1, 1, 0, 1, 1, 0}; + std::vector results_with_collator{1, 1, 0, 0, 0, 0, 1, 1, 1}; + std::vector results_with_collator_and_match_type{1, 1, 1, 1, 1, 0, 1, 1, 1}; - std::vector vec_results{1, 0, 1, 1, 1, 1, 0, 0}; - std::vector 
vec_results_with_match_type{1, 1, 1, 1, 1, 1, 0, 0}; - // std::vector vec_results_with_match_type_collator{1, 0, 1, 1, 0}; + const String vec_res_match_type{"i"}; + const String vec_res_collator_and_match_type{"m"}; + + std::vector vec_results{1, 0, 1, 1, 1, 1, 0, 0, 1}; + std::vector vec_results_with_match_type{1, 1, 1, 1, 1, 1, 0, 0, 1}; // match type is const 'i' + std::vector vec_results_with_collator{1, 1, 1, 1, 1, 1, 0, 0, 1}; + std::vector vec_results_with_collator_and_match_type{1, 1, 1, 1, 1, 1, 0, 0, 1}; // match type is const 'm' size_t row_size = exprs_nulls.size(); @@ -1822,16 +1827,28 @@ TEST_F(Regexp, RegexpLike) { // test regexp_like(const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), - executeFunction("regexp_like", createConstColumn(row_size, exprs[i]), createConstColumn(row_size, patterns[i]))); + executeFunction( + "regexp_like", + createConstColumn(row_size, exprs[i]), + createConstColumn(row_size, patterns[i]))); /// test regexp_like(const, const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp_like", createConstColumn(row_size, exprs[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i]))); - - // Not support binary collator so far - // test regexp_like(const, const, const) with binary collator - // ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type_collator[i]), - // executeFunction("regexp_like", {createConstColumn(row_size, input_strings[i]), createConstColumn(row_size, patterns[i]), createConstColumn(row_size, match_types[i])}, binary_collator)); + executeFunction( + "regexp_like", + createConstColumn(row_size, exprs[i]), + createConstColumn(row_size, patterns[i]), + createConstColumn(row_size, match_types[i]))); + + // test regexp_like(const, const, const) with ci collator + ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_collator[i]), + executeFunction( + "regexp_like", + { + 
createConstColumn(row_size, exprs[i]), + createConstColumn(row_size, patterns[i]) + }, + utf8mb4_general_ci_collator)); } } @@ -1847,12 +1864,29 @@ TEST_F(Regexp, RegexpLike) ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); - // Not support binary collator so far - // test regexp_like(const, const, const) with binary collator - // ASSERT_COLUMN_EQ(input_string_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type_collator[i]), - // executeFunction("regexp_like", {input_string_nulls[i] ? const_string_null_column : createConstColumn>(row_size, input_strings[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i])}, binary_collator)); + // test regexp_like(const, const) with ci collator + ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_collator[i]), + executeFunction( + "regexp_like", + { + exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), + pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]) + }, + utf8mb4_general_ci_collator)); + + // test regexp_like(const, const, const) with ci collator + ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? 
const_uint8_null_column : createConstColumn(row_size, results_with_collator_and_match_type[i]), + executeFunction( + "regexp_like", + { + exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), + pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), + match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]) + }, + utf8mb4_general_ci_collator)); } } + // case 3 regexp_like(vector, const[, const]) { // test regexp_like(vector, const) @@ -1863,31 +1897,73 @@ TEST_F(Regexp, RegexpLike) ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type), executeFunction("regexp_like", createColumn(exprs), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); - // Not support binary collator so far - // test regexp_like(vector, const, const) with binary collator - // ASSERT_COLUMN_EQ(createColumn(vec_results_with_match_type_collator), - // executeFunction("regexp_like", {createColumn(input_strings), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); + // test regexp_like(vector, const) with ci collator + ASSERT_COLUMN_EQ(createColumn(vec_results_with_collator), + executeFunction( + "regexp_like", + { + createColumn(exprs), + createConstColumn(row_size, patterns[0]) + }, + utf8mb4_general_ci_collator)); + + // test regexp_like(vector, const, const) with ci collator + ASSERT_COLUMN_EQ(createColumn(vec_results_with_collator_and_match_type), + executeFunction( + "regexp_like", + { + createColumn(exprs), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, "m") + }, + utf8mb4_general_ci_collator)); } /// case 4 regexp_like(vector, const[, const]) with null value { + // regexp_like(vector, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, exprs_nulls), - executeFunction("regexp_like", createNullableVectorColumn(exprs, exprs_nulls), createConstColumn(row_size, patterns[0]))); + 
executeFunction( + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]))); + // regexp_like(vector, const, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, exprs_nulls), - executeFunction("regexp_like", createNullableVectorColumn(exprs, exprs_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i"))); + executeFunction( + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, vec_res_match_type))); - // Not support binary collator so far - // ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type_collator, input_string_nulls), - // executeFunction("regexp_like", {createNullableVectorColumn(input_strings, input_string_nulls), createConstColumn(row_size, patterns[0]), createConstColumn(row_size, "i")}, binary_collator)); + // test regexp_like(vector, const) with ci collator + ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_collator, exprs_nulls), + executeFunction( + "regexp_like", + { + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]) + }, + utf8mb4_general_ci_collator)); + + // test regexp_like(vector, const, const) with ci collator + ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_collator_and_match_type, exprs_nulls), + executeFunction( + "regexp_like", + { + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, vec_res_collator_and_match_type) + }, + utf8mb4_general_ci_collator)); } - const std::vector vv_res{1, 0, 0, 0, 0, 0, 1, 1}; // vector expr, vector pattern - const std::vector vvc_res{1, 1, 0, 0, 0, 0, 1, 1}; // vector expr, vector pattern, const match_type + const std::vector vv_res{1, 0, 0, 0, 0, 0, 1, 1, 0}; // vector expr, vector pattern + const std::vector vvc_res{1, 1, 0, 0, 0, 0, 1, 1, 1}; // vector expr, 
vector pattern, const match_type 'i' + const std::vector vvc_collator_res{1, 1, 0, 1, 1, 0, 1, 1, 1}; // vector expr, vector pattern, const match_type 'm', with collator // case 5 regexp_like(vector, vector[, const]) { - // test regexp_like(vector, vector) ASSERT_COLUMN_EQ(createColumn(vv_res), executeFunction( @@ -1901,34 +1977,58 @@ TEST_F(Regexp, RegexpLike) "regexp_like", createColumn(exprs), createColumn(patterns), - createConstColumn(row_size, "i"))); + createConstColumn(row_size, vec_res_match_type))); + // test regexp_like(vector, vector, const) with ci collator + ASSERT_COLUMN_EQ(createColumn(vvc_collator_res), + executeFunction( + "regexp_like", + { + createColumn(exprs), + createColumn(patterns), + createConstColumn(row_size, vec_res_collator_and_match_type) + }, + utf8mb4_general_ci_collator)); } // case 6 regexp_like(vector, vector[, const]) with null vable { - // test regexp_like(vector, vector) + // test regexp_like(nullable vector, vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, exprs_nulls), executeFunction( "regexp_like", createNullableVectorColumn(exprs, exprs_nulls), createColumn(patterns))); + // test regexp_like(vectir, nullable vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, pattern_nulls), executeFunction( "regexp_like", createColumn(exprs), createNullableVectorColumn(patterns, pattern_nulls))); - // test regexp_like(vector, vector, const) + // test regexp_like(nullable vector, vector, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vvc_res, exprs_nulls), - executeFunction("regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createColumn(patterns), - createConstColumn(row_size, "i"))); + executeFunction( + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createConstColumn(row_size, vec_res_match_type))); + + // test regexp_like(nullable vector, vector, const) with ci collator + ASSERT_COLUMN_EQ(createNullableVectorColumn(vvc_collator_res, exprs_nulls), + 
executeFunction( + "regexp_like", + { + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createConstColumn(row_size, vec_res_collator_and_match_type), + }, + utf8mb4_general_ci_collator)); } - const std::vector vvv_res{1, 1, 1, 1, 1, 0, 1, 1}; // vector expr, vector pattern, vector match_type + const std::vector vvv_res{1, 1, 1, 1, 1, 0, 1, 1, 0}; // vector expr, vector pattern, vector match_type + const std::vector vvv_collator_res{1, 1, 1, 1, 1, 0, 1, 1, 1}; // vector expr, vector pattern, vector match_type // case 7 regexp_like(vector, vector[, vector]) { @@ -1939,6 +2039,17 @@ TEST_F(Regexp, RegexpLike) createColumn(exprs), createColumn(patterns), createColumn(match_types))); + + // test regexp_like(vector, vector, vector) with ci collator + ASSERT_COLUMN_EQ(createColumn(vvv_collator_res), + executeFunction( + "regexp_like", + { + createColumn(exprs), + createColumn(patterns), + createColumn(match_types) + }, + utf8mb4_general_ci_collator)); } // case 8 regexp_like(vector, vector[, vector]) withh null value From 61baf637d02a0a5d00111d22c7a33c1b2a76ec0a Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 7 Oct 2022 15:56:22 +0800 Subject: [PATCH 09/87] format --- dbms/src/Functions/FunctionsRegexp.cpp | 3 +- dbms/src/Functions/FunctionsRegexp.h | 105 +++++---- dbms/src/Functions/FunctionsStringSearch.cpp | 5 +- dbms/src/Functions/re2Util.cpp | 2 +- dbms/src/Functions/re2Util.h | 7 +- dbms/src/Functions/tests/gtest_regexp.cpp | 223 +++++++++---------- 6 files changed, 175 insertions(+), 170 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 15b679d7e62..172a621d8f4 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -15,6 +15,7 @@ #include #include #include + #include "Functions/Regexps.h" namespace DB @@ -28,7 +29,7 @@ const char flag_m = 'm'; const char flag_s = 's'; std::set valid_flags{flag_i, flag_c, flag_m, flag_s}; -} 
+} // namespace // If characters specifying contradictory options are specified // within match_type, the rightmost one takes precedence. diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index e4f25dea38a..dba9552acc1 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -14,27 +14,28 @@ #pragma once -#include -#include #include +#include #include +#include +#include #include #include #include #include -#include #include #include -#include +#include +#include #include + #include "Columns/ColumnNullable.h" #include "Columns/ColumnsNumber.h" #include "Common/Exception.h" #include "Core/Field.h" #include "DataTypes/DataTypeNullable.h" #include "common/types.h" -#include #if USE_RE2_ST #include @@ -119,11 +120,17 @@ class Param DISALLOW_COPY_AND_MOVE(Param); Param(const ColumnPtr & ptr, const String & default_value) - : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), - is_const(false), data_string(default_value), data_int64(0) + : col_ptr(ptr) + , col_str(nullptr) + , col_int64(nullptr) + , null_map(nullptr) + , is_const(false) + , data_string(default_value) + , data_int64(0) { // arg is not provided and we should use default_value - if (col_ptr == nullptr) return; + if (col_ptr == nullptr) + return; const auto * col_const = typeid_cast(&(*col_ptr)); @@ -147,7 +154,7 @@ class Param StringRef tmp_data = col_const->getDataAt(0); data_string = String(tmp_data.data, tmp_data.size); } - + is_const = true; } @@ -158,18 +165,24 @@ class Param col_str = checkAndGetColumn(&(*nested_ptr)); null_map = &(static_cast(*col_ptr).getNullMapData()); } - else { + else + { // This is a pure vector column col_str = checkAndGetColumn(&(*col_ptr)); } } Param(const ColumnPtr & ptr, Int64 default_value) - : col_ptr(ptr), col_str(nullptr), col_int64(nullptr), null_map(nullptr), - is_const(false), data_int64(default_value) + : col_ptr(ptr) + , col_str(nullptr) + , col_int64(nullptr) + , 
null_map(nullptr) + , is_const(false) + , data_int64(default_value) { // arg is not provided and we should use default_value - if (col_ptr == nullptr) return; + if (col_ptr == nullptr) + return; const auto * col_const = typeid_cast(&(*col_ptr)); @@ -239,7 +252,8 @@ class Param String ret_str(sr.data, sr.size); return ret_str; } - else { + else + { String ret_str(data_string); return ret_str; } @@ -247,7 +261,8 @@ class Param bool isNullAt(size_t idx) const { - if (null_map == nullptr) return false; + if (null_map == nullptr) + return false; return (*null_map)[idx]; } @@ -255,6 +270,7 @@ class Param bool isConstCol() const { return is_const; } bool isNullableCol() const { return null_map == nullptr; } size_t getDataNum() const { return col_ptr->size(); } + private: const ColumnPtr col_ptr; const ColumnString * col_str; @@ -287,7 +303,8 @@ class FunctionStringRegexpBase { String && match_type = match_type_param->getString(0); final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); - } else + } + else { handleCollatorWithoutMatchType(final_pattern, collator); } @@ -304,30 +321,32 @@ class FunctionStringRegexpBase constexpr std::string_view class_name_sv(Name::name); constexpr std::string_view tidb_regexp_name_sv(NameTiDBRegexp::name); constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); - + if constexpr (class_name_sv == tidb_regexp_name_sv) total_param_num = REGEXP_MAX_PARAM_NUM; else if constexpr (class_name_sv == regexp_like_name_sv) total_param_num = REGEXP_LIKE_MAX_PARAM_NUM; else throw Exception("Unknown regular function."); - + if constexpr (class_name_sv == tidb_regexp_name_sv) { return pat_param.isConstCol(); - } else + } + else { const bool is_pat_const = pat_param.isConstCol(); if (is_pat_const && (arg_num < total_param_num || (match_type_param->isConstCol()))) return true; } - return false; + return false; } bool isMemorized() const { return memorized_re != nullptr; } const std::unique_ptr & getRegexp() const { return 
memorized_re; } + private: // We should pre compile the regular expression when: // - only pattern column is provided and it's a constant column @@ -336,8 +355,9 @@ class FunctionStringRegexpBase }; // Implementation of regexp and regexp_like functions -template -class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction +template +class FunctionStringRegexp : public FunctionStringRegexpBase + , public IFunction { public: using ResultType = UInt8; @@ -366,7 +386,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction throw Exception("Illegal argument number"); bool has_nullable_col = false; - + for (size_t i = 0; i < REGEXP_XXX_MIN_PARAM_NUM; ++i) checkInputArg(arguments[i], &has_nullable_col); @@ -432,21 +452,23 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Check if args are all const columns if (expr_param.isConstCol() && pat_param.isConstCol()) { -#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator) \ - do { \ - int flags = getDefaultFlags(); \ - String final_pattern = (pat); \ - if constexpr (has_match_type) \ - { \ - /* put match_type into pattern */ \ - String match_type = (match_type_param)->getString(0); \ - final_pattern = addMatchTypeForPattern(final_pattern, match_type, (collator)); \ - } else \ - handleCollatorWithoutMatchType(final_pattern, (collator)); \ - Regexps::Regexp regexp(final_pattern, flags); \ - ResultType res{regexp.match(expr)}; \ +#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator) \ + do \ + { \ + int flags = getDefaultFlags(); \ + String final_pattern = (pat); \ + if constexpr (has_match_type) \ + { \ + /* put match_type into pattern */ \ + String match_type = (match_type_param)->getString(0); \ + final_pattern = addMatchTypeForPattern(final_pattern, match_type, (collator)); \ + } \ + else \ + handleCollatorWithoutMatchType(final_pattern, (collator)); \ + 
Regexps::Regexp regexp(final_pattern, flags); \ + ResultType res{regexp.match(expr)}; \ (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ - } while(0) + } while (0) String pat = pat_param.getString(0); if (pat.empty()) @@ -492,7 +514,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction // Start to match if (isMemorized()) { - const auto & regexp = getRegexp(); + const auto & regexp = getRegexp(); if (null_presence.has_nullable_col) { // expr column must be a nullable column here, so we need to check null for each elems @@ -503,7 +525,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction StringRef expr_ref; for (size_t i = 0; i < col_size; ++i) { - if (expr_param.isNullAt(i)) { nullmap[i] = 1; @@ -529,7 +550,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } block.getByPosition(result).column = std::move(col_res); - } + } } else { @@ -583,7 +604,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction vec_res[i] = regexp->match(expr); // match } } - + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else @@ -616,6 +637,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction } } } + private: void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col) const { @@ -898,4 +920,3 @@ class FunctionStringReplace : public IFunction TiDB::TiDBCollatorPtr collator{}; }; } // namespace DB - diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index e7525e832e0..d4f6281b7b5 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -22,16 +22,17 @@ #include #include #include +#include #include #include #include +#include #include #include -#include -#include #include #include + 
#include "Columns/IColumn.h" #include "Common/Exception.h" #include "common/defines.h" diff --git a/dbms/src/Functions/re2Util.cpp b/dbms/src/Functions/re2Util.cpp index 7687fd8e4f8..21aa7ce09f8 100644 --- a/dbms/src/Functions/re2Util.cpp +++ b/dbms/src/Functions/re2Util.cpp @@ -77,5 +77,5 @@ String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBColla else return ""; } -} +} // namespace re2Util } // namespace DB diff --git a/dbms/src/Functions/re2Util.h b/dbms/src/Functions/re2Util.h index cff69f6e3c5..f91a9e3ab9e 100644 --- a/dbms/src/Functions/re2Util.h +++ b/dbms/src/Functions/re2Util.h @@ -15,11 +15,12 @@ #pragma once #include -#include #include -#include "Common/Exception.h" +#include #include +#include "Common/Exception.h" + #if USE_RE2_ST #include @@ -33,5 +34,5 @@ namespace re2Util { re2_st::RE2::Options getDefaultRe2Options(); String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator); -} +} // namespace re2Util } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 5aa538db1e0..64cb951ce35 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -18,18 +18,18 @@ #include /// this is a hack, include the cpp file so we can test MatchImpl directly -#include // NOLINT #include // NOLINT +#include // NOLINT #include #include + #include "DataTypes/DataTypesNumber.h" #include "common/types.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" -#include - #include +#include #pragma GCC diagnostic pop @@ -1827,28 +1827,26 @@ TEST_F(Regexp, RegexpLike) { // test regexp_like(const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results[i]), - executeFunction( - "regexp_like", - createConstColumn(row_size, exprs[i]), - createConstColumn(row_size, patterns[i]))); + executeFunction( + "regexp_like", + createConstColumn(row_size, exprs[i]), + 
createConstColumn(row_size, patterns[i]))); /// test regexp_like(const, const, const) ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_match_type[i]), - executeFunction( - "regexp_like", - createConstColumn(row_size, exprs[i]), - createConstColumn(row_size, patterns[i]), - createConstColumn(row_size, match_types[i]))); + executeFunction( + "regexp_like", + createConstColumn(row_size, exprs[i]), + createConstColumn(row_size, patterns[i]), + createConstColumn(row_size, match_types[i]))); // test regexp_like(const, const, const) with ci collator ASSERT_COLUMN_EQ(createConstColumn(row_size, results_with_collator[i]), executeFunction( - "regexp_like", - { - createConstColumn(row_size, exprs[i]), - createConstColumn(row_size, patterns[i]) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createConstColumn(row_size, exprs[i]), + createConstColumn(row_size, patterns[i])}, + utf8mb4_general_ci_collator)); } } @@ -1858,32 +1856,28 @@ TEST_F(Regexp, RegexpLike) { // test regexp_like(const, const) ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results[i]), - executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); + executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]))); // test regexp_like(const, const, const) ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_match_type[i]), - executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, match_types[i]))); + executeFunction("regexp_like", exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]))); // test regexp_like(const, const) with ci collator ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_collator[i]), executeFunction( - "regexp_like", - { - exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), - pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), + pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i])}, + utf8mb4_general_ci_collator)); // test regexp_like(const, const, const) with ci collator ASSERT_COLUMN_EQ(exprs_nulls[i] || pattern_nulls[i] || match_type_nulls[i] ? const_uint8_null_column : createConstColumn(row_size, results_with_collator_and_match_type[i]), executeFunction( - "regexp_like", - { - exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), - pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), - match_type_nulls[i] ? const_string_null_column : createConstColumn>(row_size, match_types[i]) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {exprs_nulls[i] ? const_string_null_column : createConstColumn>(row_size, exprs[i]), + pattern_nulls[i] ? const_string_null_column : createConstColumn>(row_size, patterns[i]), + match_type_nulls[i] ? 
const_string_null_column : createConstColumn>(row_size, match_types[i])}, + utf8mb4_general_ci_collator)); } } @@ -1900,23 +1894,19 @@ TEST_F(Regexp, RegexpLike) // test regexp_like(vector, const) with ci collator ASSERT_COLUMN_EQ(createColumn(vec_results_with_collator), executeFunction( - "regexp_like", - { - createColumn(exprs), - createConstColumn(row_size, patterns[0]) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createColumn(exprs), + createConstColumn(row_size, patterns[0])}, + utf8mb4_general_ci_collator)); // test regexp_like(vector, const, const) with ci collator ASSERT_COLUMN_EQ(createColumn(vec_results_with_collator_and_match_type), executeFunction( - "regexp_like", - { - createColumn(exprs), - createConstColumn(row_size, patterns[0]), - createConstColumn(row_size, "m") - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createColumn(exprs), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, "m")}, + utf8mb4_general_ci_collator)); } /// case 4 regexp_like(vector, const[, const]) with null value @@ -1924,38 +1914,34 @@ TEST_F(Regexp, RegexpLike) // regexp_like(vector, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results, exprs_nulls), executeFunction( - "regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createConstColumn(row_size, patterns[0]))); + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]))); // regexp_like(vector, const, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_match_type, exprs_nulls), executeFunction( - "regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createConstColumn(row_size, patterns[0]), - createConstColumn(row_size, vec_res_match_type))); + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, vec_res_match_type))); // test regexp_like(vector, const) with ci collator 
ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_collator, exprs_nulls), executeFunction( - "regexp_like", - { - createNullableVectorColumn(exprs, exprs_nulls), - createConstColumn(row_size, patterns[0]) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0])}, + utf8mb4_general_ci_collator)); // test regexp_like(vector, const, const) with ci collator ASSERT_COLUMN_EQ(createNullableVectorColumn(vec_results_with_collator_and_match_type, exprs_nulls), executeFunction( - "regexp_like", - { - createNullableVectorColumn(exprs, exprs_nulls), - createConstColumn(row_size, patterns[0]), - createConstColumn(row_size, vec_res_collator_and_match_type) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createNullableVectorColumn(exprs, exprs_nulls), + createConstColumn(row_size, patterns[0]), + createConstColumn(row_size, vec_res_collator_and_match_type)}, + utf8mb4_general_ci_collator)); } const std::vector vv_res{1, 0, 0, 0, 0, 0, 1, 1, 0}; // vector expr, vector pattern @@ -1967,28 +1953,26 @@ TEST_F(Regexp, RegexpLike) // test regexp_like(vector, vector) ASSERT_COLUMN_EQ(createColumn(vv_res), executeFunction( - "regexp_like", - createColumn(exprs), - createColumn(patterns))); + "regexp_like", + createColumn(exprs), + createColumn(patterns))); // test regexp_like(vector, vector, const) ASSERT_COLUMN_EQ(createColumn(vvc_res), executeFunction( - "regexp_like", - createColumn(exprs), - createColumn(patterns), - createConstColumn(row_size, vec_res_match_type))); + "regexp_like", + createColumn(exprs), + createColumn(patterns), + createConstColumn(row_size, vec_res_match_type))); // test regexp_like(vector, vector, const) with ci collator ASSERT_COLUMN_EQ(createColumn(vvc_collator_res), executeFunction( - "regexp_like", - { - createColumn(exprs), - createColumn(patterns), - createConstColumn(row_size, vec_res_collator_and_match_type) - }, - 
utf8mb4_general_ci_collator)); + "regexp_like", + {createColumn(exprs), + createColumn(patterns), + createConstColumn(row_size, vec_res_collator_and_match_type)}, + utf8mb4_general_ci_collator)); } // case 6 regexp_like(vector, vector[, const]) with null vable @@ -1996,35 +1980,35 @@ TEST_F(Regexp, RegexpLike) // test regexp_like(nullable vector, vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, exprs_nulls), executeFunction( - "regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createColumn(patterns))); + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns))); // test regexp_like(vectir, nullable vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vv_res, pattern_nulls), executeFunction( - "regexp_like", - createColumn(exprs), - createNullableVectorColumn(patterns, pattern_nulls))); + "regexp_like", + createColumn(exprs), + createNullableVectorColumn(patterns, pattern_nulls))); // test regexp_like(nullable vector, vector, const) ASSERT_COLUMN_EQ(createNullableVectorColumn(vvc_res, exprs_nulls), executeFunction( - "regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createColumn(patterns), - createConstColumn(row_size, vec_res_match_type))); + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createConstColumn(row_size, vec_res_match_type))); // test regexp_like(nullable vector, vector, const) with ci collator ASSERT_COLUMN_EQ(createNullableVectorColumn(vvc_collator_res, exprs_nulls), executeFunction( - "regexp_like", - { - createNullableVectorColumn(exprs, exprs_nulls), - createColumn(patterns), - createConstColumn(row_size, vec_res_collator_and_match_type), - }, - utf8mb4_general_ci_collator)); + "regexp_like", + { + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createConstColumn(row_size, vec_res_collator_and_match_type), + }, + utf8mb4_general_ci_collator)); } const std::vector vvv_res{1, 1, 1, 1, 1, 0, 1, 1, 0}; 
// vector expr, vector pattern, vector match_type @@ -2035,21 +2019,19 @@ TEST_F(Regexp, RegexpLike) // test regexp_like(vector, vector, vector) ASSERT_COLUMN_EQ(createColumn(vvv_res), executeFunction( - "regexp_like", - createColumn(exprs), - createColumn(patterns), - createColumn(match_types))); + "regexp_like", + createColumn(exprs), + createColumn(patterns), + createColumn(match_types))); // test regexp_like(vector, vector, vector) with ci collator ASSERT_COLUMN_EQ(createColumn(vvv_collator_res), executeFunction( - "regexp_like", - { - createColumn(exprs), - createColumn(patterns), - createColumn(match_types) - }, - utf8mb4_general_ci_collator)); + "regexp_like", + {createColumn(exprs), + createColumn(patterns), + createColumn(match_types)}, + utf8mb4_general_ci_collator)); } // case 8 regexp_like(vector, vector[, vector]) withh null value @@ -2057,26 +2039,26 @@ TEST_F(Regexp, RegexpLike) // test regexp_like(nullable vector, vector, vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, exprs_nulls), executeFunction( - "regexp_like", - createNullableVectorColumn(exprs, exprs_nulls), - createColumn(patterns), - createColumn(match_types))); + "regexp_like", + createNullableVectorColumn(exprs, exprs_nulls), + createColumn(patterns), + createColumn(match_types))); // test regexp_like(vector, nullable vector, vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, pattern_nulls), executeFunction( - "regexp_like", - createColumn(exprs), - createNullableVectorColumn(patterns, pattern_nulls), - createColumn(match_types))); + "regexp_like", + createColumn(exprs), + createNullableVectorColumn(patterns, pattern_nulls), + createColumn(match_types))); // test regexp_like(vector, vector, nullable vector) ASSERT_COLUMN_EQ(createNullableVectorColumn(vvv_res, match_type_nulls), executeFunction( - "regexp_like", - createColumn(exprs), - createColumn(patterns), - createNullableVectorColumn(match_types, match_type_nulls))); + "regexp_like", + createColumn(exprs), + 
createColumn(patterns), + createNullableVectorColumn(match_types, match_type_nulls))); } /// issue 5984 @@ -2087,7 +2069,6 @@ TEST_F(Regexp, RegexpLike) ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn>(row_size, "")), Exception); } -// fail TEST_F(Regexp, testRegexpCustomerCases) { String pattern = "^(53|94)[0-9]{10}$|" From 74f090edd108110815f5cbf973749065f02fd056 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 7 Oct 2022 21:11:08 +0800 Subject: [PATCH 10/87] fix ut --- dbms/src/Functions/FunctionsRegexp.cpp | 9 ++++-- dbms/src/Functions/FunctionsRegexp.h | 36 ++++++++++++++++++----- dbms/src/Functions/tests/gtest_regexp.cpp | 2 +- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 172a621d8f4..0013f6c48b5 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -14,10 +14,9 @@ #include #include +#include #include -#include "Functions/Regexps.h" - namespace DB { @@ -76,6 +75,12 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) const auto & elem = block.getByPosition(arg); const auto * col_const = typeid_cast(&(*(elem.column))); + if (elem.type->getTypeId() == TypeIndex::Nothing) + { + res.has_data_type_nothing = true; + break; + } + if (col_const != nullptr) { auto col_const_data = col_const->getDataColumnPtr(); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index dba9552acc1..62b3b96a6a5 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -34,7 +34,10 @@ #include "Columns/ColumnsNumber.h" #include "Common/Exception.h" #include "Core/Field.h" +#include "Core/Types.h" +#include "DataTypes/DataTypeNothing.h" #include "DataTypes/DataTypeNullable.h" +#include "Parsers/Lexer.h" #include "common/types.h" #if USE_RE2_ST @@ -90,6 +93,7 @@ struct NullPresence { 
bool has_nullable_col = false; bool has_const_null_col = false; + bool has_data_type_nothing = false; }; NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); @@ -386,19 +390,28 @@ class FunctionStringRegexp : public FunctionStringRegexpBase throw Exception("Illegal argument number"); bool has_nullable_col = false; + bool has_data_type_nothing = false; for (size_t i = 0; i < REGEXP_XXX_MIN_PARAM_NUM; ++i) - checkInputArg(arguments[i], &has_nullable_col); + checkInputArg(arguments[i], &has_nullable_col, &has_data_type_nothing); // check match_type arg for regexp_like if constexpr (class_name_sv == regexp_like_name_sv) if (arg_num == args_max_num && !arguments[args_max_num - 1]->isString()) - checkInputArg(arguments[args_max_num - 1], &has_nullable_col); + checkInputArg(arguments[args_max_num - 1], &has_nullable_col, &has_data_type_nothing); if (has_nullable_col) + { + if (has_data_type_nothing) + return std::make_shared(std::make_shared()); return std::make_shared(std::make_shared>()); + } else + { + if (has_data_type_nothing) + return std::make_shared(); return std::make_shared>(); + } } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -408,7 +421,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; - if (null_presence.has_const_null_col) + if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) { // There is a const null column in the input block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); @@ -581,7 +594,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } } - nullmap[i] = 0; String && expr = expr_param.getString(i); String && pat = pat_param.getString(i); @@ -639,7 +651,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } private: - void checkInputArg(const DataTypePtr & arg, bool * 
has_nullable_col) const + void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const { if (arg->isNullable()) { @@ -647,12 +659,22 @@ class FunctionStringRegexp : public FunctionStringRegexpBase const auto & null_type = checkAndGetDataType(arg.get()); const auto & nested_type = null_type->getNestedType(); if (!nested_type->isString()) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } } else { if (!arg->isString()) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + { + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } } } diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 64cb951ce35..14da67b57cf 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2061,7 +2061,7 @@ TEST_F(Regexp, RegexpLike) createNullableVectorColumn(match_types, match_type_nulls))); } - /// issue 5984 + // empty pattern is not allowed ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); ASSERT_THROW(executeFunction("regexp_like", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createColumn(std::vector{""})), Exception); From 5f1d1f8fbcc6065d9fc09ea0a6d29dab22c1704a Mon 
Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 10 Oct 2022 18:33:56 +0800 Subject: [PATCH 11/87] save works --- dbms/src/Functions/FunctionsRegexp.cpp | 14 +- dbms/src/Functions/FunctionsRegexp.h | 744 ++++++++++++++----------- 2 files changed, 424 insertions(+), 334 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 0013f6c48b5..860a314b1fa 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -16,6 +16,7 @@ #include #include #include +#include "Columns/ColumnNullable.h" namespace DB { @@ -48,10 +49,7 @@ String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) // to enable the case-sensitive for the regexp if (flag == flag_c) { - auto iter_i = applied_flags.find(flag_i); - if (iter_i != applied_flags.end()) - applied_flags.erase(iter_i); - + applied_flags.erase(flag_i); continue; } @@ -102,7 +100,15 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) if (!res.has_nullable_col) { if ((elem.column)->isColumnNullable()) + { res.has_nullable_col = true; + + // Check if nullable column wrap a DataTypeNothing type + const auto * type_null = typeid_cast(&(*elem.type)); + const auto & nested_type = type_null->getNestedType(); + if (nested_type->getTypeId() == TypeIndex::Nothing) + res.has_data_type_nothing = true; + } } } } diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 62b3b96a6a5..4e9005b560e 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -29,16 +29,20 @@ #include #include +#include #include "Columns/ColumnNullable.h" #include "Columns/ColumnsNumber.h" +#include "Columns/IColumn.h" #include "Common/Exception.h" #include "Core/Field.h" #include "Core/Types.h" #include "DataTypes/DataTypeNothing.h" #include "DataTypes/DataTypeNullable.h" #include "Parsers/Lexer.h" +#include "common/StringRef.h" #include 
"common/types.h" +#include "Columns/ColumnString.h" #if USE_RE2_ST #include @@ -80,6 +84,9 @@ struct NameReplaceRegexpAll static constexpr auto name = "replaceRegexpAll"; }; +static constexpr std::string_view regexp_name(NameTiDBRegexp::name); +static constexpr std::string_view regexp_like_name(NameRegexpLike::name); + String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); inline int getDefaultFlags() @@ -101,6 +108,8 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String flags = getMatchType(match_type, collator); + if (flags.empty()) + return pattern; return fmt::format("(?{}){}", flags, pattern); } @@ -110,185 +119,370 @@ inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, return Regexps::get(final_pattern, getDefaultFlags()); } -inline void handleCollatorWithoutMatchType(String & pattern, TiDB::TiDBCollatorPtr collator) +// Only int types used in ColumnsNumber.h can be valid +template +inline constexpr bool check_int_type() { - if (collator != nullptr && collator->isCI()) - pattern = fmt::format("(?i){}", pattern); + return static_cast(std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v); } -// Columns may be const, nullable or plain vector, we can conveniently handle -// these different type columns with Param. 
-class Param +// Use this type when param is not provided +class ParamDefault { public: - DISALLOW_COPY_AND_MOVE(Param); + explicit ParamDefault(Int64 val) : default_int(val), default_string("") {} + explicit ParamDefault(const StringRef & str) : default_int(0), default_string(str) {} - Param(const ColumnPtr & ptr, const String & default_value) - : col_ptr(ptr) - , col_str(nullptr) - , col_int64(nullptr) - , null_map(nullptr) - , is_const(false) - , data_string(default_value) - , data_int64(0) + // For passing compilation + explicit ParamDefault(const void *) : default_int(0), default_string("") { - // arg is not provided and we should use default_value - if (col_ptr == nullptr) - return; + throw Exception("Shouldn't call this constructor"); + } - const auto * col_const = typeid_cast(&(*col_ptr)); + // For passing compilation + ParamDefault(const void *, const void *) : default_int(0), default_string("") + { + throw Exception("Shouldn't call this constructor"); + } - // Handle const - if (col_const != nullptr) - { - // This is a const column - auto col_const_data = col_const->getDataColumnPtr(); - if (col_const_data->isColumnNullable()) - { - // This is a const nullable column - // const null can't be here as we should have handle it in the previous - Field field; - auto p = static_cast(*col_const_data).getNestedColumnPtr(); - col_const->get(0, field); - data_string = field.safeGet(); - null_map = &(static_cast(*col_const_data).getNullMapData()); - } - else - { - StringRef tmp_data = col_const->getDataAt(0); - data_string = String(tmp_data.data, tmp_data.size); - } + Int64 getInt(size_t) const { return default_int; } + String getString(size_t) const { return String(""); } + void getStringRef(size_t, StringRef &) const {} + constexpr bool isConst() const { return true; } - is_const = true; - } +private: + Int64 default_int; + StringRef default_string; +}; - if (col_ptr->isColumnNullable()) - { - // Handle nullable column - auto nested_ptr = 
static_cast(*col_ptr).getNestedColumnPtr(); - col_str = checkAndGetColumn(&(*nested_ptr)); - null_map = &(static_cast(*col_ptr).getNullMapData()); - } - else - { - // This is a pure vector column - col_str = checkAndGetColumn(&(*col_ptr)); - } +template +class ParamString +{ +public: + DISALLOW_COPY_AND_MOVE(ParamString); + + using Chars_t = ColumnString::Chars_t; + using Offsets = ColumnString::Offsets; + + // For passing compilation + explicit ParamString(Int64) + : const_string(nullptr, 0), chars(nullptr), offsets(nullptr) + { + throw Exception("Shouldn't call this constructor"); } - Param(const ColumnPtr & ptr, Int64 default_value) - : col_ptr(ptr) - , col_str(nullptr) - , col_int64(nullptr) - , null_map(nullptr) - , is_const(false) - , data_int64(default_value) + explicit ParamString(const StringRef & str_ref) + : const_string(str_ref), chars(nullptr), offsets(nullptr) { - // arg is not provided and we should use default_value - if (col_ptr == nullptr) - return; + if constexpr (!is_const) + throw Exception("non-const parm should not call this constructor"); + } - const auto * col_const = typeid_cast(&(*col_ptr)); + // For passing compilation + explicit ParamString(const void *) + : const_string(nullptr, 0), chars(nullptr), offsets(nullptr) + { + throw Exception("Shouldn't call this constructor"); + } - // Handle const - if (col_const != nullptr) - { - // This is a const column - auto col_const_data = col_const->getDataColumnPtr(); - if (col_const_data->isColumnNullable()) - { - // This is a const nullable column - Field field; - col_const->get(0, field); - data_int64 = field.get(); - null_map = &(static_cast(*col_ptr).getNullMapData()); - } - else - { - data_int64 = col_const->getValue(); - } + ParamString(const void * chars_, const void * offsets_) + : const_string(nullptr, 0) + , chars(reinterpret_cast(chars_)) + , offsets(reinterpret_cast(offsets_)) + { + if constexpr (is_const) + throw Exception("const parm should not call this constructor"); + } - 
is_const = true; - return; - } + Int64 getInt(size_t) const { throw Exception("ParamString not supports this function"); } - if (col_ptr->isColumnNullable()) + String getString(size_t idx) const + { + if constexpr (is_const) + return String(const_string.data, const_string.size); + else + return String(&chars[offsetAt(idx)], sizeAt(idx) - 1); + } + + void getStringRef(size_t idx, StringRef & dst) const + { + if constexpr (is_const) { - // Handle nullable column - auto nested_ptr = static_cast(*col_ptr).getNestedColumnPtr(); - col_int64 = checkAndGetColumn(&(*nested_ptr)); - null_map = &(static_cast(*col_ptr).getNullMapData()); + dst.data = const_string.data; + dst.size = const_string.size; } else { - // This is a pure vector column - col_int64 = checkAndGetColumn(&(*col_ptr)); + auto tmp = StringRef(&chars[offsetAt(idx)], sizeAt(idx) - 1); + dst.data = tmp.data; + dst.size = tmp.size; } } - Int64 getInt64(size_t idx) const + constexpr bool isConst() const { return is_const; } + +private: + size_t offsetAt(size_t i) const { return i == 0 ? 0 : (*offsets)[i - 1]; } + size_t sizeAt(size_t i) const { return i == 0 ? (*offsets)[0] : ((*offsets)[i] - (*offsets)[i - 1]); } + + StringRef const_string; + + // for vector string + const Chars_t * chars; + const Offsets * offsets; +}; + +template +class ParamInt +{ +public: + DISALLOW_COPY_AND_MOVE(ParamInt); + using Container = typename ColumnVector(), T>>::Container; + + explicit ParamInt(Int64 val) : const_int_val(val), int_container(nullptr) { - // Use default value when arg is const or not provided. - // For safety, nullptr should be checked - return !is_const && col_int64 != nullptr ? 
col_int64->getInt(idx) : data_int64; + if constexpr (!is_const) + throw Exception("non-const parm should not call this constructor"); } - void getStringRef(size_t idx, StringRef & dst) const + // For passing compilation + explicit ParamInt(const StringRef &) + : const_int_val(0), int_container(nullptr) { - // Use default value when arg is const or not provided. - // For safety, nullptr should be checked - if (!is_const && col_str != nullptr) - dst = col_str->getDataAt(idx); - else - { - dst.data = data_string.c_str(); - dst.size = data_string.size(); - } + throw Exception("Shouldn't call this constructor"); } - String getString(size_t idx) const + explicit ParamInt(const void * int_container_) + : const_int_val(0) + , int_container(reinterpret_cast(int_container_)) { - // Use default value when arg is const or not provided. - // For safety, nullptr should be checked - if (!is_const && col_str != nullptr) - { - StringRef sr = col_str->getDataAt(idx); - String ret_str(sr.data, sr.size); - return ret_str; - } - else - { - String ret_str(data_string); - return ret_str; - } + if constexpr (is_const) + throw Exception("const parm should not call this constructor"); } - bool isNullAt(size_t idx) const + // For passing compilation + ParamInt(const void *, const void *) + : const_int_val(0) + , int_container(nullptr) { - if (null_map == nullptr) - return false; + throw Exception("Shouldn't call this constructor"); + } - return (*null_map)[idx]; + Int64 getInt(size_t idx) const + { + if constexpr (is_const) + return const_int_val; + else + return static_cast((*int_container)[idx]); } - bool isConstCol() const { return is_const; } - bool isNullableCol() const { return null_map == nullptr; } - size_t getDataNum() const { return col_ptr->size(); } + String getString(size_t) const { throw Exception("ParamInt not supports this function"); } + void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports this function"); } + constexpr bool isConst() const { 
return is_const; } private: - const ColumnPtr col_ptr; - const ColumnString * col_str; - const ColumnInt64 * col_int64; + Int64 const_int_val; + + // for vector int + const Container * int_container; +}; + +// Columns may be const, nullable or plain vector, we can conveniently handle +// these different type columns with Param. +template +class Param +{ +public: + DISALLOW_COPY_AND_MOVE(Param); + + // const string param + Param(size_t col_size_, const StringRef & str_ref) + : col_size(col_size_) + , null_map(nullptr) + , data(str_ref) {} + + // const int param + Param(size_t col_size_, Int64 val) + : col_size(col_size_) + , null_map(nullptr) + , data(val) {} + + // pure vector string param + // chars_ type: ParamImplType::Chars_t + // offsets_ type: ParamImplType::Offsets + Param(size_t col_size_, const void * chars_, const void * offsets_) + : col_size(col_size_) + , null_map(nullptr) + , data(chars_, offsets_) {} + + // pure vector int param + // int_container_ type: ParamImplType::Container + Param(size_t col_size_, const void * int_container_) + : col_size(col_size_) + , null_map(nullptr) + , data(int_container_) {} + + // nullable vector string param + // chars_ type: ParamImplType::Chars_t + // offsets_ type: ParamImplType::Offsets + Param(size_t col_size_, ConstNullMapPtr null_map_, const void * chars_, const void * offsets_) + : col_size(col_size_) + , null_map(null_map_) + , data(chars_, offsets_) {} + + // nullable vector int param + // int_container_ type: ParamImplType::Container + Param(size_t col_size_, ConstNullMapPtr null_map_, const void * int_container_) + : col_size(col_size_) + , null_map(null_map_) + , data(int_container_) {} + + Int64 getInt(size_t idx) const { return data.getInt(idx); } + void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } + String getString(size_t idx) const { return data.getString(idx); } + + constexpr bool isNullAt(size_t idx) const + { + // null_map works only when we are non-const 
nullable column + if constexpr (is_null && !data.isConst()) + return (*null_map)[idx]; + return false; + } + + constexpr bool isNullableCol() const { return is_null; } + size_t getDataNum() const { return col_size; } + constexpr bool isConst() const { return data.isConst(); } + +private: + const size_t col_size; ConstNullMapPtr null_map; - bool is_const; // mark as the const column when it's true - String data_string; - Int64 data_int64; + ParamImplType data; }; +#define EXPR_COL_PTR_VAR_NAME col_expr +#define PAT_COL_PTR_VAR_NAME col_pat +#define MATCH_TYPE_COL_PTR_VAR_NAME col_match_type + +#define RES_ARG_VAR_NAME res_arg + +#define EXPR_PARAM_VAR_NAME expr_param +#define PAT_PARAM_VAR_NAME pat_param +#define MATCH_TYPE_PARAM_VAR_NAME match_type_param + +#define SELF_CLASS_NAME (name) +#define ARG_NUM_VAR_NAME arg_num + +// Unify the name of functions that actually execute regexp +#define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process + +// processed_col is impossible to be const here +#define PROCESS_STRING_PARAM_NULL(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + if (((processed_col)->isColumnNullable())) \ + { \ + auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ + const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ + const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ + Param, true> (param_name)(col_size, null_map, &(tmp->getChars()), &(tmp->getOffsets())); \ + next_process; \ + } \ + else \ + { \ + /* This is a pure string vector column */ \ + const auto * tmp = checkAndGetColumn(&(*(processed_col))); \ + Param, false> (param_name)(col_size, &(tmp->getChars()), &(tmp->getOffsets())); \ + next_process; \ + } \ + } while(0); + +#define PROCESS_STRING_PARAM_CONST(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + const auto * col_const = typeid_cast(&(*(processed_col))); \ + if (col_const != nullptr) \ + { 
\ + auto col_const_data = col_const->getDataColumnPtr(); \ + if (col_const_data->isColumnNullable()) \ + { \ + /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.safeGet(); \ + /* const col */ \ + Param, true> (param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ + next_process; \ + } \ + else \ + { \ + /* const col */ \ + Param, false> (param_name)(col_size, col_const->getDataAt(0)); \ + next_process; \ + \ + } \ + } \ + else \ + { \ + PROCESS_STRING_PARAM_NULL((param_name), (processed_col), next_process); \ + } \ + } while(0); + +// processed_col is impossible to be const here +#define PROCESS_INT_PARAM_NULL(param_name, processed_col, next_process) \ + +#define PROCESS_INT_PARAM_CONST(param_name, processed_col, next_process) \ + +// regexp and regexp_like functions are processed in this macro +#define PROCESS_REGEXP_LIKE() \ + do \ + { \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ + } while(0); + +#define PROCESS_MATCH_TYPE_PARAM_CONST() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ + { \ + if (ARG_NUM_VAR_NAME == 3) \ + { \ + PROCESS_STRING_PARAM_CONST(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({PROCESS_REGEXP_LIKE()})); \ + } \ + else \ + { \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + PROCESS_REGEXP_LIKE(); \ + } \ + } \ + } while(0); + +#define PROCESS_PAT_PARAM_CONST() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ + PROCESS_STRING_PARAM_CONST(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({PROCESS_MATCH_TYPE_PARAM_CONST()})); \ + } while (0); + +#define PROCESS_EXPR_PARAM_CONST() \ + do \ + { \ + PROCESS_STRING_PARAM_CONST(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAM_CONST()})); \ + } 
while(0); + class FunctionStringRegexpBase { public: - static constexpr size_t REGEXP_XXX_MIN_PARAM_NUM = 2; + static constexpr size_t REGEXP_MIN_PARAM_NUM = 2; // Max parameter number the regexp_xxx function could receive static constexpr size_t REGEXP_MAX_PARAM_NUM = 2; @@ -297,54 +491,25 @@ class FunctionStringRegexpBase static constexpr size_t REGEXP_REPLACE_MAX_PARAM_NUM = 6; static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5; - void memorize(const Param & pat_param, const std::unique_ptr & match_type_param, TiDB::TiDBCollatorPtr collator) const + template + void memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { - String && final_pattern = pat_param.getString(0); + String final_pattern = pat_param.getString(0); if (final_pattern.empty()) throw Exception("Empty pattern is invalid"); - if (match_type_param != nullptr) - { - String && match_type = match_type_param->getString(0); - final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); - } - else - { - handleCollatorWithoutMatchType(final_pattern, collator); - } + String match_type = match_type_param.getString(0); + final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); int flags = getDefaultFlags(); memorized_re = std::make_unique(final_pattern, flags); } // Check if we can memorize the regexp - template - static bool canMemorize(size_t arg_num, const Param & pat_param, const std::unique_ptr & match_type_param) + template + static bool canMemorize(const ExprT & pat_param, const MatchTypeT & match_type_param) { - size_t total_param_num = 0; - constexpr std::string_view class_name_sv(Name::name); - constexpr std::string_view tidb_regexp_name_sv(NameTiDBRegexp::name); - constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); - - if constexpr (class_name_sv == tidb_regexp_name_sv) - total_param_num = REGEXP_MAX_PARAM_NUM; - else if constexpr (class_name_sv == regexp_like_name_sv) - 
total_param_num = REGEXP_LIKE_MAX_PARAM_NUM; - else - throw Exception("Unknown regular function."); - - if constexpr (class_name_sv == tidb_regexp_name_sv) - { - return pat_param.isConstCol(); - } - else - { - const bool is_pat_const = pat_param.isConstCol(); - if (is_pat_const && (arg_num < total_param_num || (match_type_param->isConstCol()))) - return true; - } - - return false; + return (pat_param.isConst() && match_type_param.isConst()); } bool isMemorized() const { return memorized_re != nullptr; } @@ -377,28 +542,22 @@ class FunctionStringRegexp : public FunctionStringRegexpBase DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { size_t args_max_num; - constexpr std::string_view class_name_sv(Name::name); - constexpr std::string_view regexp_like_name_sv(NameRegexpLike::name); + constexpr std::string_view class_name(Name::name); - if constexpr (class_name_sv == regexp_like_name_sv) + if constexpr (class_name == regexp_like_name) args_max_num = REGEXP_LIKE_MAX_PARAM_NUM; else args_max_num = REGEXP_MAX_PARAM_NUM; size_t arg_num = arguments.size(); - if (arg_num < REGEXP_XXX_MIN_PARAM_NUM || arg_num > args_max_num) + if (arg_num < REGEXP_MIN_PARAM_NUM || arg_num > args_max_num) throw Exception("Illegal argument number"); bool has_nullable_col = false; bool has_data_type_nothing = false; - for (size_t i = 0; i < REGEXP_XXX_MIN_PARAM_NUM; ++i) - checkInputArg(arguments[i], &has_nullable_col, &has_data_type_nothing); - - // check match_type arg for regexp_like - if constexpr (class_name_sv == regexp_like_name_sv) - if (arg_num == args_max_num && !arguments[args_max_num - 1]->isString()) - checkInputArg(arguments[args_max_num - 1], &has_nullable_col, &has_data_type_nothing); + for (const auto & arg : arguments) + checkInputArg(arg, &has_nullable_col, &has_data_type_nothing); if (has_nullable_col) { @@ -414,109 +573,27 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } } - void executeImpl(Block & block, const ColumnNumbers & 
arguments, size_t result) const override + template + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT &expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const { - // Do something related with nullable columns - NullPresence null_presence = getNullPresense(block, arguments); - - const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; - - if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) - { - // There is a const null column in the input - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); - return; - } - - const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - - if (col_expr->empty()) - { - auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); - block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); - return; - } - - const Param expr_param(col_expr, String("")); - const Param pat_param(col_pat, String("")); - auto arg_num = arguments.size(); size_t col_size = expr_param.getDataNum(); - // match_type_param will be initialized, only when this is a regexp_like function - std::unique_ptr match_type_param; - - constexpr std::string_view class_name(name); - constexpr std::string_view regexp_like_name(NameRegexpLike::name); - if constexpr (class_name == regexp_like_name) - { - // Try to get match type column only when it's a regexp_like function - ColumnPtr col_match_type; - if (arg_num > 2) - { - col_match_type = block.getByPosition(arguments[2]).column; - match_type_param = std::make_unique(col_match_type, String("")); - } - else - { - match_type_param = std::make_unique(col_match_type, String("")); - } - } - // Check if args are all const columns - if (expr_param.isConstCol() && pat_param.isConstCol()) + if constexpr (expr_param.isConst() && pat_param.isConst() && match_type_param.isConst()) { -#define GET_CONST_RESULT(block, 
expr, pat, pat_param, match_type_param, has_match_type, collator) \ - do \ - { \ - int flags = getDefaultFlags(); \ - String final_pattern = (pat); \ - if constexpr (has_match_type) \ - { \ - /* put match_type into pattern */ \ - String match_type = (match_type_param)->getString(0); \ - final_pattern = addMatchTypeForPattern(final_pattern, match_type, (collator)); \ - } \ - else \ - handleCollatorWithoutMatchType(final_pattern, (collator)); \ - Regexps::Regexp regexp(final_pattern, flags); \ - ResultType res{regexp.match(expr)}; \ - (block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \ - } while (0) - + int flags = getDefaultFlags(); + String expr = expr_param.getString(0); String pat = pat_param.getString(0); - if (pat.empty()) - throw Exception("Empty pattern is invalid"); + String match_type = match_type_param.getString(0); - String expr = expr_param.getString(0); - if constexpr (class_name == regexp_like_name) - { - // regexp_like function - if (arg_num > 2 && match_type_param->isConstCol()) - { - constexpr bool has_match_type = true; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); - return; - } - else if (arg_num == 2) - { - constexpr bool has_match_type = false; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); - return; - } - // reach here when arg_num == 3 and match_type is not const - } - else - { - // regexp function - constexpr bool has_match_type = false; - GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator); - return; - } -#undef GET_CONST_RESULT + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + ResultType res{regexp.match(expr)}; + res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); + return; } // Check memorization - if (canMemorize(arg_num, pat_param, match_type_param)) + if 
(canMemorize(pat_param, match_type_param)) memorize(pat_param, match_type_param, collator); // Initialize result column @@ -524,11 +601,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase typename ColumnVector::Container & vec_res = col_res->getData(); vec_res.resize(expr_param.getDataNum(), 0); + constexpr bool has_nullable_col = expr_param.isNullableCol() || pat_param.isNullableCol() || match_type_param.isNullableCol(); + // Start to match if (isMemorized()) { const auto & regexp = getRegexp(); - if (null_presence.has_nullable_col) + if constexpr (has_nullable_col) { // expr column must be a nullable column here, so we need to check null for each elems auto nullmap_col = ColumnUInt8::create(); @@ -549,7 +628,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match } - block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { @@ -562,94 +641,99 @@ class FunctionStringRegexp : public FunctionStringRegexpBase vec_res[i] = res; // match } - block.getByPosition(result).column = std::move(col_res); + res_arg.column = std::move(col_res); } } else { - if (null_presence.has_nullable_col) + if constexpr (has_nullable_col) { auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); nullmap.resize(expr_param.getDataNum()); + StringRef expr_ref; + String pat; + String match_type; for (size_t i = 0; i < col_size; ++i) { - if constexpr (class_name == regexp_like_name) - { - if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || (match_type_param != nullptr && match_type_param->isNullAt(i))) - { - // This is a null result - nullmap[i] = 1; - continue; - } - } - else + if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || match_type_param.isNullAt(i)) { - if (expr_param.isNullAt(i) || 
pat_param.isNullAt(i)) - { - // This is a null result - nullmap[i] = 1; - continue; - } + // This is a null result + nullmap[i] = 1; + continue; } nullmap[i] = 0; - String && expr = expr_param.getString(i); - String && pat = pat_param.getString(i); + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); + match_type = match_type_param.getString(i); - if (pat.empty()) + if (unlikely(pat.empty())) throw Exception("Empty pattern is invalid"); - if constexpr (class_name == regexp_like_name) - { - // regexp_like function - auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator); - vec_res[i] = regexp->match(expr); // match - } - else - { - // regexp function - handleCollatorWithoutMatchType(pat, collator); - int flags = getDefaultFlags(); - const auto & regexp = Regexps::get(pat, flags); - vec_res[i] = regexp->match(expr); // match - } + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match } - block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { + StringRef expr_ref; + String pat; + String match_type; for (size_t i = 0; i < col_size; ++i) { - String && expr = expr_param.getString(i); - String && pat = pat_param.getString(i); + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); + match_type = match_type_param.getString(i); - if (pat.empty()) + if (unlikely(pat.empty())) throw Exception("Empty pattern is invalid"); - if constexpr (class_name == regexp_like_name) - { - // regexp_like function - auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator); - vec_res[i] = regexp->match(expr); // match - } - else - { - // regexp function - handleCollatorWithoutMatchType(pat, collator); - int flags = getDefaultFlags(); - const auto & regexp = 
Regexps::get(pat, flags); - vec_res[i] = regexp->match(expr); // match - } + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match } - block.getByPosition(result).column = std::move(col_res); + res_arg.column = std::move(col_res); } } } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + + const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; + + if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) + { + // There is a const null column in the input + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & PAT_COL_PTR_VAR_NAME = block.getByPosition(arguments[1]).column; + + if ((EXPR_COL_PTR_VAR_NAME)->empty()) + { + auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); + block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); + return; + } + + size_t ARG_NUM_VAR_NAME = arguments.size(); + auto & RES_ARG_VAR_NAME = block.getByPosition(result); + + ColumnPtr MATCH_TYPE_COL_PTR_VAR_NAME; + if ((ARG_NUM_VAR_NAME) == 3) + MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; + + PROCESS_EXPR_PARAM_CONST(); + } + private: void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const { From 1de27473e69dfc263ecafe8ebed58f46ad7e3147 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 10 Oct 2022 23:16:20 +0800 Subject: [PATCH 12/87] pass compilation --- dbms/src/Functions/FunctionsRegexp.h | 29 +++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h 
b/dbms/src/Functions/FunctionsRegexp.h index 4e9005b560e..da6bc2d700b 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -154,9 +154,9 @@ class ParamDefault } Int64 getInt(size_t) const { return default_int; } - String getString(size_t) const { return String(""); } + static String getString(size_t) { return String(""); } void getStringRef(size_t, StringRef &) const {} - constexpr bool isConst() const { return true; } + constexpr static bool isConst() { return true; } private: Int64 default_int; @@ -209,7 +209,7 @@ class ParamString if constexpr (is_const) return String(const_string.data, const_string.size); else - return String(&chars[offsetAt(idx)], sizeAt(idx) - 1); + return String(reinterpret_cast(&chars[offsetAt(idx)]), sizeAt(idx) - 1); } void getStringRef(size_t idx, StringRef & dst) const @@ -221,13 +221,13 @@ class ParamString } else { - auto tmp = StringRef(&chars[offsetAt(idx)], sizeAt(idx) - 1); + auto tmp = StringRef(reinterpret_cast(&chars[offsetAt(idx)]), sizeAt(idx) - 1); dst.data = tmp.data; dst.size = tmp.size; } } - constexpr bool isConst() const { return is_const; } + constexpr static bool isConst() { return is_const; } private: size_t offsetAt(size_t i) const { return i == 0 ? 
0 : (*offsets)[i - 1]; } @@ -286,7 +286,7 @@ class ParamInt String getString(size_t) const { throw Exception("ParamInt not supports this function"); } void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports this function"); } - constexpr bool isConst() const { return is_const; } + constexpr static bool isConst() { return is_const; } private: Int64 const_int_val; @@ -349,17 +349,17 @@ class Param void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } String getString(size_t idx) const { return data.getString(idx); } - constexpr bool isNullAt(size_t idx) const + bool isNullAt(size_t idx) const { // null_map works only when we are non-const nullable column - if constexpr (is_null && !data.isConst()) + if constexpr (is_null && !ParamImplType::isConst()) return (*null_map)[idx]; return false; } - constexpr bool isNullableCol() const { return is_null; } size_t getDataNum() const { return col_size; } - constexpr bool isConst() const { return data.isConst(); } + constexpr static bool isNullableCol() { return is_null; } + constexpr static bool isConst() { return ParamImplType::isConst(); } private: const size_t col_size; @@ -523,6 +523,7 @@ class FunctionStringRegexpBase mutable std::unique_ptr memorized_re; }; + // Implementation of regexp and regexp_like functions template class FunctionStringRegexp : public FunctionStringRegexpBase @@ -573,13 +574,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } } + constexpr static bool func() { return true; } + template - void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT &expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const { size_t col_size = expr_param.getDataNum(); // Check if args are all const columns - if constexpr 
(expr_param.isConst() && pat_param.isConst() && match_type_param.isConst()) + if constexpr (ExprT::isConst() && PatT::isConst() && MatchTypeT::isConst()) { int flags = getDefaultFlags(); String expr = expr_param.getString(0); @@ -601,7 +604,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase typename ColumnVector::Container & vec_res = col_res->getData(); vec_res.resize(expr_param.getDataNum(), 0); - constexpr bool has_nullable_col = expr_param.isNullableCol() || pat_param.isNullableCol() || match_type_param.isNullableCol(); + constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || MatchTypeT::isNullableCol(); // Start to match if (isMemorized()) From fdade390f1ca262749c4cf363aeb01029988f358 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 11 Oct 2022 15:09:59 +0800 Subject: [PATCH 13/87] format --- dbms/src/Functions/FunctionsRegexp.cpp | 3 +- dbms/src/Functions/FunctionsRegexp.h | 307 +++++++++++----------- dbms/src/Functions/tests/gtest_regexp.cpp | 1 + 3 files changed, 163 insertions(+), 148 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 860a314b1fa..d42c49ff242 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -16,6 +16,7 @@ #include #include #include + #include "Columns/ColumnNullable.h" namespace DB @@ -102,7 +103,7 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) if ((elem.column)->isColumnNullable()) { res.has_nullable_col = true; - + // Check if nullable column wrap a DataTypeNothing type const auto * type_null = typeid_cast(&(*elem.type)); const auto & nested_type = type_null->getNestedType(); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index da6bc2d700b..6e485b00608 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -32,6 +32,7 @@ #include #include 
"Columns/ColumnNullable.h" +#include "Columns/ColumnString.h" #include "Columns/ColumnsNumber.h" #include "Columns/IColumn.h" #include "Common/Exception.h" @@ -41,8 +42,8 @@ #include "DataTypes/DataTypeNullable.h" #include "Parsers/Lexer.h" #include "common/StringRef.h" +#include "common/defines.h" #include "common/types.h" -#include "Columns/ColumnString.h" #if USE_RE2_ST #include @@ -59,6 +60,8 @@ extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; } // namespace ErrorCodes +const char * empty_pat_err_msg = "Empty pattern is invalid"; + struct NameTiDBRegexp { static constexpr auto name = "regexp"; @@ -123,32 +126,34 @@ inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, template inline constexpr bool check_int_type() { - return static_cast(std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v - || std::is_same_v); + return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); } // Use this type when param is not provided class ParamDefault { public: - explicit ParamDefault(Int64 val) : default_int(val), default_string("") {} - explicit ParamDefault(const StringRef & str) : default_int(0), default_string(str) {} + explicit ParamDefault(Int64 val) + : default_int(val) + , default_string("") + {} + explicit ParamDefault(const StringRef & str) + : default_int(0) + , default_string(str) + {} // For passing compilation - explicit ParamDefault(const void *) : default_int(0), default_string("") + explicit ParamDefault(const void *) + : default_int(0) + , default_string("") { throw Exception("Shouldn't call this constructor"); } // For passing compilation - ParamDefault(const void *, const void *) : default_int(0), default_string("") + ParamDefault(const void *, const void *) + : default_int(0) + , 
default_string("") { throw Exception("Shouldn't call this constructor"); } @@ -174,13 +179,17 @@ class ParamString // For passing compilation explicit ParamString(Int64) - : const_string(nullptr, 0), chars(nullptr), offsets(nullptr) + : const_string(nullptr, 0) + , chars(nullptr) + , offsets(nullptr) { throw Exception("Shouldn't call this constructor"); } explicit ParamString(const StringRef & str_ref) - : const_string(str_ref), chars(nullptr), offsets(nullptr) + : const_string(str_ref) + , chars(nullptr) + , offsets(nullptr) { if constexpr (!is_const) throw Exception("non-const parm should not call this constructor"); @@ -188,7 +197,9 @@ class ParamString // For passing compilation explicit ParamString(const void *) - : const_string(nullptr, 0), chars(nullptr), offsets(nullptr) + : const_string(nullptr, 0) + , chars(nullptr) + , offsets(nullptr) { throw Exception("Shouldn't call this constructor"); } @@ -209,7 +220,7 @@ class ParamString if constexpr (is_const) return String(const_string.data, const_string.size); else - return String(reinterpret_cast(&chars[offsetAt(idx)]), sizeAt(idx) - 1); + return String(reinterpret_cast(&(*chars)[offsetAt(idx)]), sizeAt(idx) - 1); } void getStringRef(size_t idx, StringRef & dst) const @@ -221,7 +232,7 @@ class ParamString } else { - auto tmp = StringRef(reinterpret_cast(&chars[offsetAt(idx)]), sizeAt(idx) - 1); + auto tmp = StringRef(reinterpret_cast(&(*chars)[offsetAt(idx)]), sizeAt(idx) - 1); dst.data = tmp.data; dst.size = tmp.size; } @@ -247,7 +258,9 @@ class ParamInt DISALLOW_COPY_AND_MOVE(ParamInt); using Container = typename ColumnVector(), T>>::Container; - explicit ParamInt(Int64 val) : const_int_val(val), int_container(nullptr) + explicit ParamInt(Int64 val) + : const_int_val(val) + , int_container(nullptr) { if constexpr (!is_const) throw Exception("non-const parm should not call this constructor"); @@ -255,7 +268,8 @@ class ParamInt // For passing compilation explicit ParamInt(const StringRef &) - : 
const_int_val(0), int_container(nullptr) + : const_int_val(0) + , int_container(nullptr) { throw Exception("Shouldn't call this constructor"); } @@ -307,13 +321,15 @@ class Param Param(size_t col_size_, const StringRef & str_ref) : col_size(col_size_) , null_map(nullptr) - , data(str_ref) {} - + , data(str_ref) + {} + // const int param Param(size_t col_size_, Int64 val) : col_size(col_size_) , null_map(nullptr) - , data(val) {} + , data(val) + {} // pure vector string param // chars_ type: ParamImplType::Chars_t @@ -321,14 +337,16 @@ class Param Param(size_t col_size_, const void * chars_, const void * offsets_) : col_size(col_size_) , null_map(nullptr) - , data(chars_, offsets_) {} + , data(chars_, offsets_) + {} // pure vector int param // int_container_ type: ParamImplType::Container Param(size_t col_size_, const void * int_container_) : col_size(col_size_) , null_map(nullptr) - , data(int_container_) {} + , data(int_container_) + {} // nullable vector string param // chars_ type: ParamImplType::Chars_t @@ -336,14 +354,16 @@ class Param Param(size_t col_size_, ConstNullMapPtr null_map_, const void * chars_, const void * offsets_) : col_size(col_size_) , null_map(null_map_) - , data(chars_, offsets_) {} + , data(chars_, offsets_) + {} // nullable vector int param // int_container_ type: ParamImplType::Container Param(size_t col_size_, ConstNullMapPtr null_map_, const void * int_container_) : col_size(col_size_) , null_map(null_map_) - , data(int_container_) {} + , data(int_container_) + {} Int64 getInt(size_t idx) const { return data.getInt(idx); } void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } @@ -354,7 +374,8 @@ class Param // null_map works only when we are non-const nullable column if constexpr (is_null && !ParamImplType::isConst()) return (*null_map)[idx]; - return false; + else + return false; } size_t getDataNum() const { return col_size; } @@ -367,6 +388,7 @@ class Param ParamImplType data; }; +// Unifying 
these names is necessary in macros #define EXPR_COL_PTR_VAR_NAME col_expr #define PAT_COL_PTR_VAR_NAME col_pat #define MATCH_TYPE_COL_PTR_VAR_NAME col_match_type @@ -384,100 +406,101 @@ class Param #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process // processed_col is impossible to be const here -#define PROCESS_STRING_PARAM_NULL(param_name, processed_col, next_process) \ - do \ - { \ - size_t col_size = (processed_col)->size(); \ - if (((processed_col)->isColumnNullable())) \ - { \ - auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ - const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ - Param, true> (param_name)(col_size, null_map, &(tmp->getChars()), &(tmp->getOffsets())); \ - next_process; \ - } \ - else \ - { \ - /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(processed_col))); \ - Param, false> (param_name)(col_size, &(tmp->getChars()), &(tmp->getOffsets())); \ - next_process; \ - } \ - } while(0); - -#define PROCESS_STRING_PARAM_CONST(param_name, processed_col, next_process) \ - do \ - { \ - size_t col_size = (processed_col)->size(); \ - const auto * col_const = typeid_cast(&(*(processed_col))); \ - if (col_const != nullptr) \ - { \ - auto col_const_data = col_const->getDataColumnPtr(); \ - if (col_const_data->isColumnNullable()) \ - { \ - /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.safeGet(); \ - /* const col */ \ - Param, true> (param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ - next_process; \ - } \ - else \ - { \ - /* const col */ \ - Param, false> (param_name)(col_size, col_const->getDataAt(0)); \ - next_process; \ - \ - } \ - } \ - else \ - { \ - PROCESS_STRING_PARAM_NULL((param_name), (processed_col), next_process); \ - } \ - } while(0); - -// processed_col is impossible 
to be const here -#define PROCESS_INT_PARAM_NULL(param_name, processed_col, next_process) \ +#define PROCESS_STRING_PARAM_NULL(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + if (((processed_col)->isColumnNullable())) \ + { \ + auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ + const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ + const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ + Param, true>(param_name)(col_size, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ + next_process; \ + } \ + else \ + { \ + /* This is a pure string vector column */ \ + const auto * tmp = checkAndGetColumn(&(*(processed_col))); \ + Param, false>(param_name)(col_size, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ + next_process; \ + } \ + } while (0); -#define PROCESS_INT_PARAM_CONST(param_name, processed_col, next_process) \ +#define PROCESS_STRING_PARAM_CONST(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + const auto * col_const = typeid_cast(&(*(processed_col))); \ + if (col_const != nullptr) \ + { \ + auto col_const_data = col_const->getDataColumnPtr(); \ + if (col_const_data->isColumnNullable()) \ + { \ + /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.safeGet(); \ + /* const col */ \ + Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ + next_process; \ + } \ + else \ + { \ + /* const col */ \ + Param, false>(param_name)(col_size, col_const->getDataAt(0)); \ + next_process; \ + } \ + } \ + else \ + { \ + PROCESS_STRING_PARAM_NULL((param_name), (processed_col), next_process); \ + } \ + } while (0); // regexp and regexp_like functions are processed in this macro -#define PROCESS_REGEXP_LIKE() \ - do \ - { \ 
+#define EXECUTE_REGEXP_LIKE() \ + do \ + { \ REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ - } while(0); - -#define PROCESS_MATCH_TYPE_PARAM_CONST() \ - do \ - { \ - if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - { \ - if (ARG_NUM_VAR_NAME == 3) \ - { \ - PROCESS_STRING_PARAM_CONST(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({PROCESS_REGEXP_LIKE()})); \ - } \ - else \ - { \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - PROCESS_REGEXP_LIKE(); \ - } \ - } \ - } while(0); - -#define PROCESS_PAT_PARAM_CONST() \ - do \ - { \ - if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - PROCESS_STRING_PARAM_CONST(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({PROCESS_MATCH_TYPE_PARAM_CONST()})); \ } while (0); -#define PROCESS_EXPR_PARAM_CONST() \ - do \ - { \ - PROCESS_STRING_PARAM_CONST(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAM_CONST()})); \ - } while(0); +#define PROCESS_PAT_PARAMPROCESS_MATCH_TYPE_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ + { \ + if (ARG_NUM_VAR_NAME == 3) \ + { \ + PROCESS_STRING_PARAM_CONST(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})); \ + } \ + else \ + { \ + /* match_type is not provided here */ \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_LIKE(); \ + } \ + } \ + } while (0); + +#define PROCESS_PAT_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ + PROCESS_STRING_PARAM_CONST(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAMPROCESS_MATCH_TYPE_PARAM()})); \ + } while (0); + +#define PROCESS_EXPR_PARAM() \ + do \ + { \ + PROCESS_STRING_PARAM_CONST(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAM()})); \ + } while (0); + +#define 
PROCESS_PARAMS_AND_EXECUTE() \ + do \ + { \ + PROCESS_EXPR_PARAM() \ + } while (0); class FunctionStringRegexpBase { @@ -495,8 +518,8 @@ class FunctionStringRegexpBase void memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { String final_pattern = pat_param.getString(0); - if (final_pattern.empty()) - throw Exception("Empty pattern is invalid"); + if (unlikely(final_pattern.empty())) + throw Exception(empty_pat_err_msg); String match_type = match_type_param.getString(0); final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); @@ -506,10 +529,10 @@ class FunctionStringRegexpBase } // Check if we can memorize the regexp - template - static bool canMemorize(const ExprT & pat_param, const MatchTypeT & match_type_param) + template + constexpr static bool canMemorize() { - return (pat_param.isConst() && match_type_param.isConst()); + return (PatT::isConst() && MatchTypeT::isConst()); } bool isMemorized() const { return memorized_re != nullptr; } @@ -560,22 +583,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase for (const auto & arg : arguments) checkInputArg(arg, &has_nullable_col, &has_data_type_nothing); + if (has_data_type_nothing) + return std::make_shared(std::make_shared()); + if (has_nullable_col) - { - if (has_data_type_nothing) - return std::make_shared(std::make_shared()); return std::make_shared(std::make_shared>()); - } else - { - if (has_data_type_nothing) - return std::make_shared(); return std::make_shared>(); - } } - constexpr static bool func() { return true; } - template void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const { @@ -587,6 +603,9 @@ class FunctionStringRegexp : public FunctionStringRegexpBase int flags = getDefaultFlags(); String expr = expr_param.getString(0); String pat = pat_param.getString(0); + if (unlikely(pat.empty())) + throw 
Exception(empty_pat_err_msg); + String match_type = match_type_param.getString(0); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); @@ -596,13 +615,14 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } // Check memorization - if (canMemorize(pat_param, match_type_param)) - memorize(pat_param, match_type_param, collator); + if constexpr (canMemorize()) + if (likely(col_size) > 0) + memorize(pat_param, match_type_param, collator); // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(expr_param.getDataNum(), 0); + vec_res.resize(col_size, 0); constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || MatchTypeT::isNullableCol(); @@ -615,7 +635,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // expr column must be a nullable column here, so we need to check null for each elems auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param.getDataNum()); + nullmap.resize(col_size); StringRef expr_ref; for (size_t i = 0; i < col_size; ++i) @@ -653,7 +673,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase { auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(expr_param.getDataNum()); + nullmap.resize(col_size); StringRef expr_ref; String pat; @@ -673,7 +693,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase match_type = match_type_param.getString(i); if (unlikely(pat.empty())) - throw Exception("Empty pattern is invalid"); + throw Exception(empty_pat_err_msg); auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match @@ -693,7 +713,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase match_type = 
match_type_param.getString(i); if (unlikely(pat.empty())) - throw Exception("Empty pattern is invalid"); + throw Exception(empty_pat_err_msg); auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match @@ -720,13 +740,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase const ColumnPtr & PAT_COL_PTR_VAR_NAME = block.getByPosition(arguments[1]).column; - if ((EXPR_COL_PTR_VAR_NAME)->empty()) - { - auto null_col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); - block.getByPosition(result).column = ColumnConst::create(std::move(null_col_res), 0); - return; - } - size_t ARG_NUM_VAR_NAME = arguments.size(); auto & RES_ARG_VAR_NAME = block.getByPosition(result); @@ -734,7 +747,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == 3) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - PROCESS_EXPR_PARAM_CONST(); + PROCESS_PARAMS_AND_EXECUTE(); } private: diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 14da67b57cf..ab1982e940a 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -1785,6 +1785,7 @@ TEST_F(Regexp, testRegexpTiDBCase) ASSERT_ANY_THROW((DB::MatchImpl::constantConstant("", "\\", '\\', "", nullptr, res))); } +// TODO test empty columns // We can only test regexp_like function as regexp is the subset of regexp_like TEST_F(Regexp, RegexpLike) { From 9a03b7ae392411232299b6429a501f6f91bd5714 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 11 Oct 2022 17:04:31 +0800 Subject: [PATCH 14/87] add the convertion of int col --- dbms/src/Functions/FunctionsRegexp.h | 230 ++++++++++++++++++++++----- 1 file changed, 192 insertions(+), 38 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 6e485b00608..f1617038768 100644 --- 
a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -60,7 +60,7 @@ extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; } // namespace ErrorCodes -const char * empty_pat_err_msg = "Empty pattern is invalid"; +const char * EMPTY_PAT_ERR_MSG = "Empty pattern is invalid"; struct NameTiDBRegexp { @@ -256,6 +256,8 @@ class ParamInt { public: DISALLOW_COPY_AND_MOVE(ParamInt); + + // raise error in compile-time when type is incorrect using Container = typename ColumnVector(), T>>::Container; explicit ParamInt(Int64 val) @@ -405,8 +407,9 @@ class Param // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process +// Common method to convert nullable string column // processed_col is impossible to be const here -#define PROCESS_STRING_PARAM_NULL(param_name, processed_col, next_process) \ +#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ size_t col_size = (processed_col)->size(); \ @@ -427,7 +430,8 @@ class Param } \ } while (0); -#define PROCESS_STRING_PARAM_CONST(param_name, processed_col, next_process) \ +// Common method to convert const string column +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ size_t col_size = (processed_col)->size(); \ @@ -454,52 +458,202 @@ class Param } \ else \ { \ - PROCESS_STRING_PARAM_NULL((param_name), (processed_col), next_process); \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process); \ } \ } while (0); -// regexp and regexp_like functions are processed in this macro -#define EXECUTE_REGEXP_LIKE() \ - do \ - { \ - REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ +// Common method to convert nullable int column +// processed_col is impossible to be const here +#define CONVERT_NULL_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size 
= (processed_col)->size(); \ + if ((processed_col)->isColumnNullable()) \ + { \ + auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ + null_map = &(static_cast(*(processed_col)).getNullMapData()); \ + /* various int types may be input, we need to check them one by one */ \ + if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + { \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ + next_process; \ + } \ + else \ + throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ + } \ + else \ + { \ + /* This is a pure vector column */ \ + /* various int types may be input, we need to check them one by 
one */ \ + if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + { \ + Param, false>(param_name)(col_size, &(ptr->getData())); \ + next_process; \ + } \ + else \ + throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ + } \ } while (0); -#define PROCESS_PAT_PARAMPROCESS_MATCH_TYPE_PARAM() \ +// Common method to convert const int column +#define CONVERT_CONST_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + const auto * col_const = typeid_cast(&(*(processed_col))); \ + if (col_const != nullptr) \ + { \ + auto col_const_data = col_const->getDataColumnPtr(); \ + if (col_const_data->isColumnNullable()) \ + 
{ \ + /* This is a const nullable column */ \ + Field field; \ + col_const->get(0, field); \ + data_int64 = field.get(); \ + /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ + Param, true>(param_name)(col_size, data_int64); \ + next_process; \ + } \ + else \ + { \ + /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ + data_int64 = col_const->getValue(); \ + Param, false>(param_name)(col_size, data_int64); \ + next_process; \ + } \ + } \ + else \ + { \ + CONVERT_NULL_INT_COL_TO_PARAM((param_name), (processed_col), next_process); \ + } \ + } while (0); + +// regexp and regexp_like functions are executed in this macro +#define EXECUTE_REGEXP_LIKE() \ do \ { \ - if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - { \ - if (ARG_NUM_VAR_NAME == 3) \ - { \ - PROCESS_STRING_PARAM_CONST(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})); \ - } \ - else \ - { \ - /* match_type is not provided here */ \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_LIKE(); \ - } \ - } \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ } while (0); -#define PROCESS_PAT_PARAM() \ +// Common method to convert match type column +#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ { \ if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - PROCESS_STRING_PARAM_CONST(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAMPROCESS_MATCH_TYPE_PARAM()})); \ + { \ + if (ARG_NUM_VAR_NAME == 3) \ + { \ + CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})); \ + } \ + else \ + { \ + /* match_type is not provided here */ \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + 
EXECUTE_REGEXP_LIKE(); \ + } \ + } \ + } while (0); + +// Common method to convert pattern column +#define CONVERT_PAT_COL_TO_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ + CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})); \ } while (0); -#define PROCESS_EXPR_PARAM() \ - do \ - { \ - PROCESS_STRING_PARAM_CONST(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({PROCESS_PAT_PARAM()})); \ +// Common method to convert expression column +#define CONVERT_EXPR_COL_TO_PARAM() \ + do \ + { \ + CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})); \ } while (0); -#define PROCESS_PARAMS_AND_EXECUTE() \ - do \ - { \ - PROCESS_EXPR_PARAM() \ +// The entry to convert columns to params and execute regexp_xxx functions +#define CONVERT_COLS_TO_PARAMS_AND_EXECUTE() \ + do \ + { \ + CONVERT_EXPR_COL_TO_PARAM() \ } while (0); class FunctionStringRegexpBase @@ -519,7 +673,7 @@ class FunctionStringRegexpBase { String final_pattern = pat_param.getString(0); if (unlikely(final_pattern.empty())) - throw Exception(empty_pat_err_msg); + throw Exception(EMPTY_PAT_ERR_MSG); String match_type = match_type_param.getString(0); final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); @@ -604,7 +758,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase String expr = expr_param.getString(0); String pat = pat_param.getString(0); if (unlikely(pat.empty())) - throw Exception(empty_pat_err_msg); + throw Exception(EMPTY_PAT_ERR_MSG); String match_type = match_type_param.getString(0); @@ -693,7 +847,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase match_type = match_type_param.getString(i); if (unlikely(pat.empty())) - throw Exception(empty_pat_err_msg); + throw Exception(EMPTY_PAT_ERR_MSG); auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = 
regexp->match(expr_ref.data, expr_ref.size); // match @@ -713,7 +867,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase match_type = match_type_param.getString(i); if (unlikely(pat.empty())) - throw Exception(empty_pat_err_msg); + throw Exception(EMPTY_PAT_ERR_MSG); auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match @@ -747,7 +901,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == 3) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - PROCESS_PARAMS_AND_EXECUTE(); + CONVERT_COLS_TO_PARAMS_AND_EXECUTE(); } private: From c5e667209b5336953de9271f3b254750215c785c Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 12 Oct 2022 13:39:40 +0800 Subject: [PATCH 15/87] undef --- dbms/src/Functions/FunctionsRegexp.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index f1617038768..dfba2db1722 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -770,8 +770,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Check memorization if constexpr (canMemorize()) - if (likely(col_size) > 0) - memorize(pat_param, match_type_param, collator); + memorize(pat_param, match_type_param, collator); // Initialize result column auto col_res = ColumnVector::create(); @@ -1195,4 +1194,25 @@ class FunctionStringReplace : public IFunction TiDB::TiDBCollatorPtr collator{}; }; + +#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE +#undef CONVERT_EXPR_COL_TO_PARAM +#undef CONVERT_PAT_COL_TO_PARAM +#undef CONVERT_MATCH_TYPE_COL_TO_PARAM +#undef EXECUTE_REGEXP_LIKE +#undef CONVERT_CONST_INT_COL_TO_PARAM +#undef CONVERT_NULL_INT_COL_TO_PARAM +#undef CONVERT_CONST_STR_COL_TO_PARAM +#undef CONVERT_NULL_STR_COL_TO_PARAM +#undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME +#undef 
ARG_NUM_VAR_NAME +#undef SELF_CLASS_NAME +#undef MATCH_TYPE_PARAM_VAR_NAME +#undef PAT_PARAM_VAR_NAME +#undef EXPR_PARAM_VAR_NAME +#undef RES_ARG_VAR_NAME +#undef MATCH_TYPE_COL_PTR_VAR_NAME +#undef PAT_COL_PTR_VAR_NAME +#undef EXPR_COL_PTR_VAR_NAME + } // namespace DB From 8f44a5d4e0a66c44d2c42edc760a5667c8c4a8dd Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 12 Oct 2022 17:45:42 +0800 Subject: [PATCH 16/87] save works --- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- dbms/src/Functions/FunctionsRegexp.cpp | 2 + dbms/src/Functions/FunctionsRegexp.h | 314 ++++++++++++++++++++---- 3 files changed, 263 insertions(+), 55 deletions(-) diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index e43cb7f9a12..f11ba5a79e0 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -434,7 +434,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::RegexpSig, "regexp"}, {tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"}, {tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"}, - // {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"}, + {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"}, // {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"}, // {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index d42c49ff242..c90d09e55b7 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -1105,6 +1105,7 @@ struct ReplaceStringImpl using FunctionTiDBRegexp = FunctionStringRegexp; using FunctionRegexpLike = FunctionStringRegexp; +using FunctionRegexpInstr = FunctionStringRegexpInstr; using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; @@ -1118,6 +1119,7 @@ void 
registerFunctionsRegexp(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index dfba2db1722..edd2dd8db31 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -58,6 +58,8 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; +extern const int TOO_LESS_ARGUMENTS_FOR_FUNCTION; +extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } // namespace ErrorCodes const char * EMPTY_PAT_ERR_MSG = "Empty pattern is invalid"; @@ -70,6 +72,10 @@ struct NameRegexpLike { static constexpr auto name = "regexp_like"; }; +struct NameRegexpInstr +{ + static constexpr auto name = "regexp_instr"; +}; struct NameReplaceOne { static constexpr auto name = "replaceOne"; @@ -89,6 +95,7 @@ struct NameReplaceRegexpAll static constexpr std::string_view regexp_name(NameTiDBRegexp::name); static constexpr std::string_view regexp_like_name(NameRegexpLike::name); +static constexpr std::string_view regexp_instr_name(NameRegexpInstr::name); String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); @@ -393,19 +400,25 @@ class Param // Unifying these names is necessary in macros #define EXPR_COL_PTR_VAR_NAME col_expr #define PAT_COL_PTR_VAR_NAME col_pat +#define POS_COL_PTR_VAR_NAME col_pos +#define OCCUR_COL_PTR_VAR_NAME col_occur +#define RET_OP_COL_PTR_VAR_NAME col_ret_op #define MATCH_TYPE_COL_PTR_VAR_NAME col_match_type #define RES_ARG_VAR_NAME res_arg #define EXPR_PARAM_VAR_NAME expr_param #define PAT_PARAM_VAR_NAME pat_param +#define POS_PARAM_VAR_NAME pos_param +#define OCCUR_PARAM_VAR_NAME occur_param +#define RET_OP_PARAM_VAR_NAME ret_op_param #define MATCH_TYPE_PARAM_VAR_NAME match_type_param #define SELF_CLASS_NAME (name) #define ARG_NUM_VAR_NAME arg_num -// Unify the name of functions 
that actually execute regexp -#define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process +#define EXECUTE_REGEXP_LIKE_FUNC_NAME execute_regexp_like +#define EXECUTE_REGEXP_INSTR_FUNC_NAME execute_regexp_instr // Common method to convert nullable string column // processed_col is impossible to be const here @@ -458,7 +471,7 @@ class Param } \ else \ { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process); \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ } \ } while (0); @@ -471,52 +484,43 @@ class Param if ((processed_col)->isColumnNullable()) \ { \ auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ - null_map = &(static_cast(*(processed_col)).getNullMapData()); \ + const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ /* various int types may be input, we need to check them one by one */ \ - if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if 
((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ + else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ { \ Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ } \ else \ throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ @@ -525,50 +529,41 @@ class Param { \ /* This is a pure vector column */ \ /* various int types may be input, we need to check them one by one */ \ - if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if 
((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ + else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ { \ Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ } \ else \ throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ @@ -589,7 +584,7 @@ class Param /* This is a const nullable column */ \ Field field; \ col_const->get(0, field); \ - data_int64 = field.get(); \ + auto data_int64 = field.get(); \ /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ Param, true>(param_name)(col_size, data_int64); \ next_process; \ @@ -597,7 +592,7 @@ class Param else \ { \ /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ - data_int64 = col_const->getValue(); \ + auto data_int64 = col_const->getValue(); \ Param, false>(param_name)(col_size, data_int64); \ next_process; \ } \ @@ -612,7 +607,13 @@ class Param #define EXECUTE_REGEXP_LIKE() \ do \ { \ - 
REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ + EXECUTE_REGEXP_LIKE_FUNC_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ + } while (0); + +#define EXECUTE_REGEXP_INSTR() \ + do \ + { \ + EXECUTE_REGEXP_INSTR_FUNC_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, POS_PARAM_VAR_NAME, OCCUR_PARAM_VAR_NAME, RET_OP_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ } while (0); // Common method to convert match type column @@ -622,16 +623,41 @@ class Param if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ { \ if (ARG_NUM_VAR_NAME == 3) \ - { \ - CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})); \ - } \ + CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ else \ { \ /* match_type is not provided here */ \ Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_LIKE(); \ + EXECUTE_REGEXP_LIKE() \ } \ } \ + if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ + { \ + } \ + } while (0); + +// Common method to convert return option column +#define CONVERT_RET_OP_COL_TO_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ + CONVERT_CONST_INT_COL_TO_PARAM(RET_OP_PARAM_VAR_NAME, RET_OP_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ + } while (0); + +// Common method to convert occurrence column +#define CONVERT_OCCUR_COL_TO_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ + CONVERT_CONST_INT_COL_TO_PARAM(OCCUR_PARAM_VAR_NAME, OCCUR_COL_PTR_VAR_NAME, ({CONVERT_RET_OP_COL_TO_PARAM()})) \ + } while (0); + +// Common method to convert position column +#define CONVERT_POS_COL_TO_PARAM() \ + do \ + { \ + if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ + 
CONVERT_CONST_INT_COL_TO_PARAM(POS_PARAM_VAR_NAME, POS_COL_PTR_VAR_NAME, ({CONVERT_OCCUR_COL_TO_PARAM()})) \ } while (0); // Common method to convert pattern column @@ -639,14 +665,16 @@ class Param do \ { \ if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})); \ + CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ + else if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ + CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_POS_COL_TO_PARAM()})) \ } while (0); // Common method to convert expression column #define CONVERT_EXPR_COL_TO_PARAM() \ do \ { \ - CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})); \ + CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ } while (0); // The entry to convert columns to params and execute regexp_xxx functions @@ -747,7 +775,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } template - void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + void EXECUTE_REGEXP_LIKE_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const { size_t col_size = expr_param.getDataNum(); @@ -877,13 +905,17 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } } + template + void EXECUTE_REGEXP_INSTR_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + { + throw Exception("Shouldn't call this function"); + } + void 
executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { // Do something related with nullable columns NullPresence null_presence = getNullPresense(block, arguments); - const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; - if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) { // There is a const null column in the input @@ -891,16 +923,22 @@ class FunctionStringRegexp : public FunctionStringRegexpBase return; } + const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; const ColumnPtr & PAT_COL_PTR_VAR_NAME = block.getByPosition(arguments[1]).column; size_t ARG_NUM_VAR_NAME = arguments.size(); auto & RES_ARG_VAR_NAME = block.getByPosition(result); ColumnPtr MATCH_TYPE_COL_PTR_VAR_NAME; - if ((ARG_NUM_VAR_NAME) == 3) + if ((ARG_NUM_VAR_NAME) == REGEXP_LIKE_MAX_PARAM_NUM) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - CONVERT_COLS_TO_PARAMS_AND_EXECUTE(); + // For passing compilation + ColumnPtr POS_COL_PTR_VAR_NAME; + ColumnPtr OCCUR_COL_PTR_VAR_NAME; + ColumnPtr RET_OP_COL_PTR_VAR_NAME; + + CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: @@ -934,6 +972,174 @@ class FunctionStringRegexp : public FunctionStringRegexpBase TiDB::TiDBCollatorPtr collator = nullptr; }; +// Implementation of regexp_instr function +template +class FunctionStringRegexpInstr : public FunctionStringRegexpBase + , public IFunction +{ +public: + using ResultType = Int64; + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + bool useDefaultImplementationForNulls() const override { return false; } + size_t getNumberOfArguments() const override { return 0; } + + DataTypePtr 
getReturnTypeImpl(const DataTypes & arguments) const override + { + size_t arg_num = arguments.size(); + if (arg_num < REGEXP_MIN_PARAM_NUM) + throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); + else if (arg_num > REGEXP_INSTR_MAX_PARAM_NUM) + throw Exception("Too many arguments", ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); + + bool has_nullable_col = false; + bool has_data_type_nothing = false; + bool is_str_arg; + + // Check type of arguments + for (size_t i = 0; i < arg_num; ++i) + { + // Index at 0, 1 and 5 arguments should be string type, otherwise int type. + is_str_arg = (i <= 1 || i == 5); + checkInputArg(arguments[i], is_str_arg, &has_nullable_col, &has_data_type_nothing); + } + + if (has_data_type_nothing) + return std::make_shared(std::make_shared()); + + if (has_nullable_col) + return std::make_shared(std::make_shared>()); + else + return std::make_shared>(); + } + + template + void EXECUTE_REGEXP_LIKE_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + { + throw Exception("Shouldn't call this function"); + } + + template + void EXECUTE_REGEXP_INSTR_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + {} + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + + if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) + { + // There is a const null column in the input + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; + const 
ColumnPtr & PAT_COL_PTR_VAR_NAME = block.getByPosition(arguments[1]).column; + + size_t ARG_NUM_VAR_NAME = arguments.size(); + auto & RES_ARG_VAR_NAME = block.getByPosition(result); + + ColumnPtr POS_COL_PTR_VAR_NAME; + ColumnPtr OCCUR_COL_PTR_VAR_NAME; + ColumnPtr RET_OP_COL_PTR_VAR_NAME; + ColumnPtr MATCH_TYPE_COL_PTR_VAR_NAME; + + // Go through cases to get arguments + switch((ARG_NUM_VAR_NAME)) + { + case REGEXP_INSTR_MAX_PARAM_NUM: + MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[5]).column; + case REGEXP_MIN_PARAM_NUM + 3: + RET_OP_COL_PTR_VAR_NAME = block.getByPosition(arguments[4]).column; + case REGEXP_MIN_PARAM_NUM + 2: + OCCUR_COL_PTR_VAR_NAME = block.getByPosition(arguments[3]).column; + case REGEXP_MIN_PARAM_NUM + 1: + POS_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; + }; + + CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + } + +private: + void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) const + { + if (is_str) + { + // Check string type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isString()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isString()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + 
*has_data_type_nothing = true; + } + } + } + else + { + // Check int type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isInteger()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isInteger()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + } + } + + TiDB::TiDBCollatorPtr collator = nullptr; +}; + template class FunctionStringReplace : public IFunction { From efb2d73fd6871f593162192540764a4d965ef7ab Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 13 Oct 2022 09:57:45 +0800 Subject: [PATCH 17/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 267 ++++++--------------------- 1 file changed, 59 insertions(+), 208 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index dfba2db1722..88a03d5e11f 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -58,6 +58,8 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; +extern const int TOO_LESS_ARGUMENTS_FOR_FUNCTION; +extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } // namespace ErrorCodes const char * EMPTY_PAT_ERR_MSG = "Empty pattern is invalid"; @@ -457,203 +459,7 @@ class Param } \ 
} \ else \ - { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process); \ - } \ - } while (0); - -// Common method to convert nullable int column -// processed_col is impossible to be const here -#define CONVERT_NULL_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ - do \ - { \ - size_t col_size = (processed_col)->size(); \ - if ((processed_col)->isColumnNullable()) \ - { \ - auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ - null_map = &(static_cast(*(processed_col)).getNullMapData()); \ - /* various int types may be input, we need to check them one by one */ \ - if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(nested_ptr))))) \ - 
{ \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - next_process; \ - } \ - else \ - throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ - } \ - else \ - { \ - /* This is a pure vector column */ \ - /* various int types may be input, we need to check them one by one */ \ - if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else if ((const auto * ptr = typeid_cast(&(*(processed_col))))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - next_process; \ - } \ - else \ - throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ - } \ - } while (0); - -// Common method to convert const int column -#define CONVERT_CONST_INT_COL_TO_PARAM(param_name, 
processed_col, next_process) \ - do \ - { \ - size_t col_size = (processed_col)->size(); \ - const auto * col_const = typeid_cast(&(*(processed_col))); \ - if (col_const != nullptr) \ - { \ - auto col_const_data = col_const->getDataColumnPtr(); \ - if (col_const_data->isColumnNullable()) \ - { \ - /* This is a const nullable column */ \ - Field field; \ - col_const->get(0, field); \ - data_int64 = field.get(); \ - /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ - Param, true>(param_name)(col_size, data_int64); \ - next_process; \ - } \ - else \ - { \ - /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ - data_int64 = col_const->getValue(); \ - Param, false>(param_name)(col_size, data_int64); \ - next_process; \ - } \ - } \ - else \ - { \ - CONVERT_NULL_INT_COL_TO_PARAM((param_name), (processed_col), next_process); \ - } \ - } while (0); - -// regexp and regexp_like functions are executed in this macro -#define EXECUTE_REGEXP_LIKE() \ - do \ - { \ - REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ - } while (0); - -// Common method to convert match type column -#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ - do \ - { \ - if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - { \ - if (ARG_NUM_VAR_NAME == 3) \ - { \ - CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})); \ - } \ - else \ - { \ - /* match_type is not provided here */ \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_LIKE(); \ - } \ - } \ - } while (0); - -// Common method to convert pattern column -#define CONVERT_PAT_COL_TO_PARAM() \ - do \ - { \ - if constexpr (SELF_CLASS_NAME == regexp_name || SELF_CLASS_NAME == regexp_like_name) \ - 
CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})); \ - } while (0); - -// Common method to convert expression column -#define CONVERT_EXPR_COL_TO_PARAM() \ - do \ - { \ - CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})); \ - } while (0); - -// The entry to convert columns to params and execute regexp_xxx functions -#define CONVERT_COLS_TO_PARAMS_AND_EXECUTE() \ - do \ - { \ - CONVERT_EXPR_COL_TO_PARAM() \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ } while (0); class FunctionStringRegexpBase @@ -700,6 +506,47 @@ class FunctionStringRegexpBase mutable std::unique_ptr memorized_re; }; +// regexp and regexp_like functions are executed in this macro +#define EXECUTE_REGEXP_LIKE() \ + do \ + { \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ + } while (0); + +// Method to convert match type column +#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ + do \ + { \ + if ((ARG_NUM_VAR_NAME) == 3) \ + CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ + else \ + { \ + /* match_type is not provided here */ \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_LIKE() \ + } \ + } while (0); + +// Method to convert pattern column +#define CONVERT_PAT_COL_TO_PARAM() \ + do \ + { \ + CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ + } while (0); + +// Method to convert expression column +#define CONVERT_EXPR_COL_TO_PARAM() \ + do \ + { \ + CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ + } while (0); + +// The entry to convert columns to params and execute regexp_xxx functions +#define CONVERT_COLS_TO_PARAMS_AND_EXECUTE() \ + do \ + { \ + 
CONVERT_EXPR_COL_TO_PARAM() \ + } while (0); // Implementation of regexp and regexp_like functions template @@ -728,8 +575,10 @@ class FunctionStringRegexp : public FunctionStringRegexpBase args_max_num = REGEXP_MAX_PARAM_NUM; size_t arg_num = arguments.size(); - if (arg_num < REGEXP_MIN_PARAM_NUM || arg_num > args_max_num) - throw Exception("Illegal argument number"); + if (arg_num < REGEXP_MIN_PARAM_NUM) + throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); + else if (arg_num > args_max_num) + throw Exception("Too mant arguments", ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); bool has_nullable_col = false; bool has_data_type_nothing = false; @@ -900,16 +749,25 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == 3) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - CONVERT_COLS_TO_PARAMS_AND_EXECUTE(); + CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } +#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE +#undef CONVERT_EXPR_COL_TO_PARAM +#undef CONVERT_PAT_COL_TO_PARAM +#undef CONVERT_MATCH_TYPE_COL_TO_PARAM +#undef EXECUTE_REGEXP_LIKE + private: void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const { if (arg->isNullable()) { *has_nullable_col = true; - const auto & null_type = checkAndGetDataType(arg.get()); + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexp", ErrorCodes::LOGICAL_ERROR); + const auto & nested_type = null_type->getNestedType(); if (!nested_type->isString()) { @@ -1195,13 +1053,6 @@ class FunctionStringReplace : public IFunction TiDB::TiDBCollatorPtr collator{}; }; -#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE -#undef CONVERT_EXPR_COL_TO_PARAM -#undef CONVERT_PAT_COL_TO_PARAM -#undef CONVERT_MATCH_TYPE_COL_TO_PARAM -#undef EXECUTE_REGEXP_LIKE -#undef CONVERT_CONST_INT_COL_TO_PARAM -#undef CONVERT_NULL_INT_COL_TO_PARAM #undef 
CONVERT_CONST_STR_COL_TO_PARAM #undef CONVERT_NULL_STR_COL_TO_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME From 456105000fe58bd6681153aed8957def43737540 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 13 Oct 2022 10:04:41 +0800 Subject: [PATCH 18/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 88a03d5e11f..08906058a1f 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -752,12 +752,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } -#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE -#undef CONVERT_EXPR_COL_TO_PARAM -#undef CONVERT_PAT_COL_TO_PARAM -#undef CONVERT_MATCH_TYPE_COL_TO_PARAM -#undef EXECUTE_REGEXP_LIKE - private: void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const { @@ -792,6 +786,12 @@ class FunctionStringRegexp : public FunctionStringRegexpBase TiDB::TiDBCollatorPtr collator = nullptr; }; +#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE +#undef CONVERT_EXPR_COL_TO_PARAM +#undef CONVERT_PAT_COL_TO_PARAM +#undef CONVERT_MATCH_TYPE_COL_TO_PARAM +#undef EXECUTE_REGEXP_LIKE + template class FunctionStringReplace : public IFunction { From 35ac32a30335ae53d5be969286ee6eb08bb27399 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 13 Oct 2022 10:36:41 +0800 Subject: [PATCH 19/87] save --- dbms/src/Functions/FunctionsRegexp.h | 211 +++++++++++---------------- 1 file changed, 82 insertions(+), 129 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 41c7a45bc29..161bb338cfe 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -417,10 +417,10 @@ class Param #define SELF_CLASS_NAME (name) #define ARG_NUM_VAR_NAME arg_num -#define 
EXECUTE_REGEXP_LIKE_FUNC_NAME execute_regexp_like -#define EXECUTE_REGEXP_INSTR_FUNC_NAME execute_regexp_instr +// Unify the name of functions that actually execute regexp +#define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process -// Common method to convert nullable string column +// Method to convert nullable string column // processed_col is impossible to be const here #define CONVERT_NULL_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ @@ -443,7 +443,7 @@ class Param } \ } while (0); -// Common method to convert const string column +// Method to convert const string column #define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ @@ -475,7 +475,7 @@ class Param } \ } while (0); -// Common method to convert nullable int column +// Method to convert nullable int column // processed_col is impossible to be const here #define CONVERT_NULL_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ @@ -570,7 +570,7 @@ class Param } \ } while (0); -// Common method to convert const int column +// Method to convert const int column #define CONVERT_CONST_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ @@ -598,9 +598,7 @@ class Param } \ } \ else \ - { \ - CONVERT_NULL_INT_COL_TO_PARAM((param_name), (processed_col), next_process); \ - } \ + CONVERT_NULL_INT_COL_TO_PARAM((param_name), (processed_col), next_process) \ } while (0); class FunctionStringRegexpBase @@ -636,6 +634,76 @@ class FunctionStringRegexpBase return (PatT::isConst() && MatchTypeT::isConst()); } + void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) const + { + if (is_str) + { + // Check string type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = 
null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isString()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isString()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + } + else + { + // Check int type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isInteger()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isInteger()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + } + } + bool isMemorized() const { return memorized_re != nullptr; } const std::unique_ptr & getRegexp() const { return memorized_re; } @@ -725,7 +793,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase bool has_data_type_nothing = false; for (const auto & arg : arguments) - checkInputArg(arg, &has_nullable_col, 
&has_data_type_nothing); + checkInputArg(arg, false, &has_nullable_col, &has_data_type_nothing); if (has_data_type_nothing) return std::make_shared(std::make_shared()); @@ -737,7 +805,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } template - void EXECUTE_REGEXP_LIKE_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const { size_t col_size = expr_param.getDataNum(); @@ -867,12 +935,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } } - template - void EXECUTE_REGEXP_INSTR_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const - { - throw Exception("Shouldn't call this function"); - } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { // Do something related with nullable columns @@ -899,36 +961,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } private: - void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const - { - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexp", ErrorCodes::LOGICAL_ERROR); - - const auto & nested_type = null_type->getNestedType(); - if (!nested_type->isString()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else - { - if (!arg->isString()) 
- { - if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - } - TiDB::TiDBCollatorPtr collator = nullptr; }; @@ -941,16 +973,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define EXECUTE_REGEXP_INSTR() \ do \ { \ - EXECUTE_REGEXP_INSTR_FUNC_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, POS_PARAM_VAR_NAME, OCCUR_PARAM_VAR_NAME, RET_OP_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, POS_PARAM_VAR_NAME, OCCUR_PARAM_VAR_NAME, RET_OP_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ } while (0); // Method to convert match type column #define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ - { \ \ - if constexpr (SELF_CLASS_NAME == regexp_instr_name) \ - { \ - } \ + { \ } while (0); // Method to convert return option column @@ -1040,14 +1069,8 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase return std::make_shared>(); } - template - void EXECUTE_REGEXP_LIKE_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const MatchTypeT & match_type_param) const - { - throw Exception("Shouldn't call this function"); - } - template - void EXECUTE_REGEXP_INSTR_FUNC_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const {} void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -1090,76 
+1113,6 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } private: - void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) const - { - if (is_str) - { - // Check string type argument - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); - - const auto & nested_type = null_type->getNestedType(); - - // It may be DataTypeNothing if it's not string - if (!nested_type->isString()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else - { - if (!arg->isString()) - { - // It may be DataTypeNothing if it's not string - if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - } - else - { - // Check int type argument - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); - - const auto & nested_type = null_type->getNestedType(); - - // It may be DataTypeNothing if it's not string - if (!nested_type->isInteger()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else - { - if (!arg->isInteger()) - { - // It may be DataTypeNothing if it's not string - 
if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - } - } - TiDB::TiDBCollatorPtr collator = nullptr; }; From 910598f9edcc71ca399703e69994664c1bd822ee Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 13 Oct 2022 18:41:23 +0800 Subject: [PATCH 20/87] save works --- dbms/src/Common/OptimizedRegularExpression.h | 3 + .../Common/OptimizedRegularExpression.inl.h | 6 + dbms/src/Functions/FunctionsRegexp.h | 397 ++++++++++++++---- 3 files changed, 325 insertions(+), 81 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 0c8f7660568..af6629a7566 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -112,6 +113,8 @@ class OptimizedRegularExpressionImpl out_required_substring_is_prefix = required_substring_is_prefix; } + Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op); + private: bool is_trivial; bool required_substring_is_prefix; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 3e2c09869eb..17ca63f3eaf 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -469,5 +469,11 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject } } +template +Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) +{ + +} + #undef MIN_LENGTH_FOR_STRSTR #undef MAX_SUBPATTERNS diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 161bb338cfe..cd0b0e39608 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ 
b/dbms/src/Functions/FunctionsRegexp.h @@ -33,6 +33,7 @@ #include "Columns/ColumnNullable.h" #include "Columns/ColumnString.h" +#include "Columns/ColumnVector.h" #include "Columns/ColumnsNumber.h" #include "Columns/IColumn.h" #include "Common/Exception.h" @@ -115,12 +116,14 @@ struct NullPresence NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); +// add '()' outside of the pattern to get the matched substr inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String flags = getMatchType(match_type, collator); if (flags.empty()) - return pattern; - return fmt::format("(?{}){}", flags, pattern); + return fmt::format("({})", pattern); + + return fmt::format("(?{})({})", flags, pattern); } inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) @@ -136,6 +139,8 @@ inline constexpr bool check_int_type() return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); } +enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64 }; + // Use this type when param is not provided class ParamDefault { @@ -165,6 +170,9 @@ class ParamDefault throw Exception("Shouldn't call this constructor"); } + static IntType getIntType() { return IntType::Int64; } + + template Int64 getInt(size_t) const { return default_int; } static String getString(size_t) { return String(""); } void getStringRef(size_t, StringRef &) const {} @@ -220,6 +228,9 @@ class ParamString throw Exception("const parm should not call this constructor"); } + static IntType getIntType() { throw Exception("ParamString not supports this function"); } + + template Int64 getInt(size_t) const { throw Exception("ParamString not supports this function"); } String getString(size_t idx) const @@ -258,17 
+269,16 @@ class ParamString const Offsets * offsets; }; -template +template class ParamInt { public: DISALLOW_COPY_AND_MOVE(ParamInt); - // raise error in compile-time when type is incorrect - using Container = typename ColumnVector(), T>>::Container; explicit ParamInt(Int64 val) : const_int_val(val) + , int_type(IntType::UInt8) , int_container(nullptr) { if constexpr (!is_const) @@ -278,14 +288,16 @@ class ParamInt // For passing compilation explicit ParamInt(const StringRef &) : const_int_val(0) + , int_type(IntType::UInt8) , int_container(nullptr) { throw Exception("Shouldn't call this constructor"); } - explicit ParamInt(const void * int_container_) + explicit ParamInt(const void * int_container_, IntType int_type_) : const_int_val(0) - , int_container(reinterpret_cast(int_container_)) + , int_type(int_type_) + , int_container(int_container_) { if constexpr (is_const) throw Exception("const parm should not call this constructor"); @@ -294,28 +306,36 @@ class ParamInt // For passing compilation ParamInt(const void *, const void *) : const_int_val(0) + , int_type(IntType::UInt8) , int_container(nullptr) { throw Exception("Shouldn't call this constructor"); } + template Int64 getInt(size_t idx) const { if constexpr (is_const) return const_int_val; else - return static_cast((*int_container)[idx]); + { + const auto * tmp = reinterpret_cast, T>>::Container *>(int_container); + return static_cast((*tmp)[idx]); + } } + IntType getIntType() const { return int_type; } String getString(size_t) const { throw Exception("ParamInt not supports this function"); } void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports this function"); } constexpr static bool isConst() { return is_const; } private: Int64 const_int_val; + IntType int_type; // for vector int - const Container * int_container; + // type: ColumnVector::Container + const void * int_container; }; // Columns may be const, nullable or plain vector, we can conveniently handle @@ -351,10 
+371,10 @@ class Param // pure vector int param // int_container_ type: ParamImplType::Container - Param(size_t col_size_, const void * int_container_) + Param(size_t col_size_, const void * int_container_, IntType int_type) : col_size(col_size_) , null_map(nullptr) - , data(int_container_) + , data(int_container_, int_type) {} // nullable vector string param @@ -368,13 +388,14 @@ class Param // nullable vector int param // int_container_ type: ParamImplType::Container - Param(size_t col_size_, ConstNullMapPtr null_map_, const void * int_container_) + Param(size_t col_size_, ConstNullMapPtr null_map_, const void * int_container_, IntType int_type) : col_size(col_size_) , null_map(null_map_) - , data(int_container_) + , data(int_container_, int_type) {} - Int64 getInt(size_t idx) const { return data.getInt(idx); } + template + Int64 getInt(size_t idx) const { return data.template getInt(idx); } void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } String getString(size_t idx) const { return data.getString(idx); } @@ -387,6 +408,7 @@ class Param return false; } + IntType getIntType() const { return data.getIntType(); } size_t getDataNum() const { return col_size; } constexpr static bool isNullableCol() { return is_null; } constexpr static bool isConst() { return ParamImplType::isConst(); } @@ -416,6 +438,8 @@ class Param #define SELF_CLASS_NAME (name) #define ARG_NUM_VAR_NAME arg_num +#define NULL_MAP_VAR_NAME null_map +#define VEC_RES_VAR_NAME vec_res // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process @@ -487,41 +511,23 @@ class Param const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ /* various int types may be input, we need to check them one by one */ \ if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, 
&(ptr->getData()), IntType::UInt8); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::UInt16); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::UInt32); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::UInt64); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::UInt128); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::Int8); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::Int16); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::Int32); \ else if (const auto * ptr = typeid_cast(&(*(nested_ptr)))) \ - { \ - Param, true>(param_name)(col_size, null_map, &(ptr->getData())); \ - } \ + Param, true>(param_name)(col_size, null_map, &(ptr->getData()), IntType::Int64); \ else \ throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ } \ @@ -530,41 +536,23 @@ class Param 
/* This is a pure vector column */ \ /* various int types may be input, we need to check them one by one */ \ if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::UInt8); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::UInt16); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::UInt32); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::UInt64); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::UInt128); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::Int8); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::Int16); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), IntType::Int32); \ else if (const auto * ptr = typeid_cast(&(*(processed_col)))) \ - { \ - Param, false>(param_name)(col_size, &(ptr->getData())); \ - } \ + Param, false>(param_name)(col_size, &(ptr->getData()), 
IntType::Int64); \ else \ throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); \ } \ @@ -586,14 +574,14 @@ class Param col_const->get(0, field); \ auto data_int64 = field.get(); \ /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ - Param, true>(param_name)(col_size, data_int64); \ + Param, true>(param_name)(col_size, data_int64); \ next_process; \ } \ else \ { \ /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ auto data_int64 = col_const->getValue(); \ - Param, false>(param_name)(col_size, data_int64); \ + Param, false>(param_name)(col_size, data_int64); \ next_process; \ } \ } \ @@ -601,6 +589,7 @@ class Param CONVERT_NULL_INT_COL_TO_PARAM((param_name), (processed_col), next_process) \ } while (0); + class FunctionStringRegexpBase { public: @@ -634,7 +623,11 @@ class FunctionStringRegexpBase return (PatT::isConst() && MatchTypeT::isConst()); } - void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) const + bool isMemorized() const { return memorized_re != nullptr; } + + const std::unique_ptr & getRegexp() const { return memorized_re; } + + static void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) { if (is_str) { @@ -704,10 +697,6 @@ class FunctionStringRegexpBase } } - bool isMemorized() const { return memorized_re != nullptr; } - - const std::unique_ptr & getRegexp() const { return memorized_re; } - private: // We should pre compile the regular expression when: // - only pattern column is provided and it's a constant column @@ -730,7 +719,7 @@ class FunctionStringRegexpBase CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ else \ { \ - /* match_type is not provided here */ \ + /* 
match_type is not provided here and set default values */ \ Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ EXECUTE_REGEXP_LIKE() \ } \ @@ -793,7 +782,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase bool has_data_type_nothing = false; for (const auto & arg : arguments) - checkInputArg(arg, false, &has_nullable_col, &has_data_type_nothing); + checkInputArg(arg, true, &has_nullable_col, &has_data_type_nothing); if (has_data_type_nothing) return std::make_shared(std::make_shared()); @@ -957,7 +946,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == REGEXP_LIKE_MAX_PARAM_NUM) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + // CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: @@ -980,6 +969,40 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ { \ + if (ARG_NUM_VAR_NAME == REGEXP_INSTR_MAX_PARAM_NUM) \ + { \ + CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_INSTR()})) \ + } \ + else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 3) \ + { \ + /* match_type is not provided here and set default values */ \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_INSTR() \ + } \ + else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 2) \ + { \ + /* return_option and match_type are not provided here and set default values */ \ + Param RET_OP_PARAM_VAR_NAME(-1, 0); \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_INSTR() \ + } \ + else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 1) \ + { \ + /* occurrence, return_option and match_type are not provided here and set default values */ \ + Param OCCUR_PARAM_VAR_NAME(-1, 1); \ + Param RET_OP_PARAM_VAR_NAME(-1, 0); \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_INSTR() \ + } \ + else \ + { \ + /* position, 
occurrence, return_option and match_type are not provided here and set default values */ \ + Param POS_PARAM_VAR_NAME(-1, 1); \ + Param OCCUR_PARAM_VAR_NAME(-1, 1); \ + Param RET_OP_PARAM_VAR_NAME(-1, 0); \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + EXECUTE_REGEXP_INSTR() \ + } \ } while (0); // Method to convert return option column @@ -1024,6 +1047,114 @@ class FunctionStringRegexp : public FunctionStringRegexpBase CONVERT_EXPR_COL_TO_PARAM() \ } while (0); +// Choose int type for return option param and execute +#define CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, occur_type) \ + do \ + { \ + switch (RET_OP_PARAM_VAR_NAME.getIntType()) \ + { \ + case IntType::UInt8: \ + EXECUTE_INSTR(pos_type, occur_type, UInt8) \ + break; \ + case IntType::UInt16: \ + EXECUTE_INSTR(pos_type, occur_type, UInt16) \ + break; \ + case IntType::UInt32: \ + EXECUTE_INSTR(pos_type, occur_type, UInt32) \ + break; \ + case IntType::UInt64: \ + EXECUTE_INSTR(pos_type, occur_type, UInt64) \ + break; \ + case IntType::UInt128: \ + EXECUTE_INSTR(pos_type, occur_type, UInt128) \ + break; \ + case IntType::Int8: \ + EXECUTE_INSTR(pos_type, occur_type, Int8) \ + break; \ + case IntType::Int16: \ + EXECUTE_INSTR(pos_type, occur_type, Int16) \ + break; \ + case IntType::Int32: \ + EXECUTE_INSTR(pos_type, occur_type, Int32) \ + break; \ + case IntType::Int64: \ + EXECUTE_INSTR(pos_type, occur_type, Int64) \ + break; \ + } \ + } while (0); + +// Choose int type for occurrance param and execute +#define CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(pos_type) \ + do \ + { \ + switch (OCCUR_PARAM_VAR_NAME.getIntType()) \ + { \ + case IntType::UInt8: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt8) \ + break; \ + case IntType::UInt16: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt16) \ + break; \ + case IntType::UInt32: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt32) \ + break; \ + case IntType::UInt64: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt64) \ + break; \ + case 
IntType::UInt128: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt128) \ + break; \ + case IntType::Int8: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int8) \ + break; \ + case IntType::Int16: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int16) \ + break; \ + case IntType::Int32: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int32) \ + break; \ + case IntType::Int64: \ + CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int64) \ + break; \ + } \ + } while (0); + +// Choose int type for position param and execute +#define CHOOSE_AND_EXEC_FOR_POS_PARAM(execute) \ + do \ + { \ + switch (POS_PARAM_VAR_NAME.getIntType()) \ + { \ + case IntType::UInt8: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt8) \ + break; \ + case IntType::UInt16: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt16) \ + break; \ + case IntType::UInt32: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt32) \ + break; \ + case IntType::UInt64: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt64) \ + break; \ + case IntType::UInt128: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt128) \ + break; \ + case IntType::Int8: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int8) \ + break; \ + case IntType::Int16: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int16) \ + break; \ + case IntType::Int32: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int32) \ + break; \ + case IntType::Int64: \ + CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int64) \ + break; \ + } \ + } while (0); + // Implementation of regexp_instr function template class FunctionStringRegexpInstr : public FunctionStringRegexpBase @@ -1070,8 +1201,100 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } template - void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const PatT & pat_param, const MatchTypeT & match_type_param) const - {} + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & EXPR_PARAM_VAR_NAME, const PatT & PAT_PARAM_VAR_NAME, const PosT & 
POS_PARAM_VAR_NAME, const OccurT & OCCUR_PARAM_VAR_NAME, const RetOpT & RET_OP_PARAM_VAR_NAME, const MatchTypeT & MATCH_TYPE_PARAM_VAR_NAME) const + { + size_t col_size = EXPR_PARAM_VAR_NAME.getDataNum(); + + // Check if args are all const columns + if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && RetOpT::isConst() && MatchTypeT::isConst()) + { + int flags = getDefaultFlags(); + String expr = EXPR_PARAM_VAR_NAME.getString(0); + String pat = PAT_PARAM_VAR_NAME.getString(0); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + Int64 pos = POS_PARAM_VAR_NAME.template getInt(0); + Int64 occur = OCCUR_PARAM_VAR_NAME.template getInt(0); + Int64 ret_op = RET_OP_PARAM_VAR_NAME.template getInt(0); + String match_type = MATCH_TYPE_PARAM_VAR_NAME.getString(0); + + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + ResultType res = regexp.instr(expr.c_str(), expr.size(), pos, occur, ret_op); + res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); + return; + } + + // Check memorization + if constexpr (canMemorize()) + memorize(PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME, collator); + + // Initialize result column + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & VEC_RES_VAR_NAME = col_res->getData(); + VEC_RES_VAR_NAME.resize(col_size, 0); + + constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullable() || OccurT::isNullable() || RetOpT::isNullable() || MatchTypeT::isNullableCol(); + + // Start to instr + if (isMemorized()) + { +#define REGEXP_VAR_NAME regexp + const auto & REGEXP_VAR_NAME = getRegexp(); + if constexpr (has_nullable_col) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & NULL_MAP_VAR_NAME = nullmap_col->getData(); + NULL_MAP_VAR_NAME.resize(col_size); + +#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ + do \ + { \ + StringRef 
expr_ref; \ + Int64 pos; \ + Int64 occur; \ + Int64 ret_op; \ + for (size_t i = 0; i < col_size; ++i) \ + { \ + if (EXPR_PARAM_VAR_NAME.isNullAt(i) || POS_PARAM_VAR_NAME.isNullAt(i) || OCCUR_PARAM_VAR_NAME.isNullAt(i) || RET_OP_PARAM_VAR_NAME.isNullAt(i)) \ + { \ + NULL_MAP_VAR_NAME[i] = 1; \ + continue; \ + } \ + NULL_MAP_VAR_NAME[i] = 0; \ + EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ + POS_PARAM_VAR_NAME.template getInt(i); \ + OCCUR_PARAM_VAR_NAME.template getInt(i); \ + RET_OP_PARAM_VAR_NAME.template getInt(i); \ + VEC_RES_VAR_NAME[i] = REGEXP_VAR_NAME->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ + } \ + } while (0); + + // Identify int type of position, occurrance and return option, and execute the instr + CHOOSE_AND_EXEC_FOR_POS_PARAM() + +#undef EXECUTE_INSTR + + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + // expr column is impossible to be a nullable column here + StringRef expr_ref; + for (size_t i = 0; i < col_size; ++i) + { + EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); + auto res = regexp->match(expr_ref.data, expr_ref.size); + vec_res[i] = res; // match + } + + res_arg.column = std::move(col_res); + } +#undef REGEXP_VAR_NAME + } + else + {} + } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { @@ -1109,13 +1332,23 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase POS_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; }; - CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + // CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: + // Int64 executeRegexpInstr(StringRef & expr, StringRef & pat, Int64 pos, Int64 occur, Int64 ret_op, StringRef & match_type) const + // {} + + // Int64 executeMemorizedRegexpInstr(StringRef & expr, Int64 pos, Int64 occur, Int64 ret_op) const + // {} + TiDB::TiDBCollatorPtr collator = nullptr; }; +#undef CHOOSE_AND_EXEC_FOR_POS_PARAM +#undef CHOOSE_AND_EXEC_FOR_OCCUR_PARAM +#undef 
CHOOSE_AND_EXEC_FOR_RET_OP_PARAM + #undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE #undef CONVERT_EXPR_COL_TO_PARAM #undef CONVERT_PAT_COL_TO_PARAM @@ -1389,6 +1622,8 @@ class FunctionStringReplace : public IFunction #undef CONVERT_CONST_STR_COL_TO_PARAM #undef CONVERT_NULL_STR_COL_TO_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME +#undef VEC_RES_VAR_NAME +#undef NULL_MAP_VAR_NAME #undef ARG_NUM_VAR_NAME #undef SELF_CLASS_NAME #undef MATCH_TYPE_PARAM_VAR_NAME From 85074aff474ed348a61418313700e838262efb19 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 14 Oct 2022 18:04:34 +0800 Subject: [PATCH 21/87] save works --- .../Common/OptimizedRegularExpression.inl.h | 21 +++ dbms/src/Common/StringUtils/StringUtils.h | 29 ++++ dbms/src/Functions/FunctionsRegexp.h | 151 ++++++++++++++---- 3 files changed, 172 insertions(+), 29 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 17ca63f3eaf..23cd662c721 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -14,8 +14,10 @@ #include #include +#include #include +#include "Common/Exception.h" #define MIN_LENGTH_FOR_STRSTR 3 @@ -472,7 +474,26 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { + Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); + if (pos <= 0 || pos > utf8_total_len) + throw DB::Exception("Index out of bounds in regular expression search."); + + String matched_str; // store the matched substring + const char * expr = subject; // expr is the string actually passed into regexp to be matched + size_t expr_size = subject_size; + + // TODO convert utf8 pos to binary pos. 
+ // size_t offset = /*todo*/; // This is a offset for bytes, not utf8 + + while (occur > 0) + { + bool success = RegexType::FindandConsume(StringPieceType(expr, expr_size), *re2, &matched_str); + if (!success) + return 0; + + --occur; + } } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Common/StringUtils/StringUtils.h b/dbms/src/Common/StringUtils/StringUtils.h index 61b85f0912b..9060dbc53e9 100644 --- a/dbms/src/Common/StringUtils/StringUtils.h +++ b/dbms/src/Common/StringUtils/StringUtils.h @@ -149,3 +149,32 @@ inline bool equalsCaseInsensitive(char a, char b) { return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b); } + +// Get how many bytes this utf8 character needs. +// Input must be the first byte of a utf8 character. +inline size_t getUtf8Len(uint8_t utf8_first_byte) +{ + uint8_t flag = 128; + size_t len = 0; + while (flag & utf8_first_byte) + { + len++; + flag >>= 1; + } + + return flag == 128 ? 0 : len; +} + +inline size_t getStringUtf8Len(const char * str, size_t total_len) +{ + size_t len = 0; + size_t utf8_len; + + for (size_t i = 0; i < total_len; i += utf8_len) + { + utf8_len = getUtf8Len(str[i]); + len++; + } + + return len; +} diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index cd0b0e39608..0a13805e691 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -440,6 +440,7 @@ class Param #define ARG_NUM_VAR_NAME arg_num #define NULL_MAP_VAR_NAME null_map #define VEC_RES_VAR_NAME vec_res +#define COLLATOR_VAR_NAME collator // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process @@ -467,7 +468,7 @@ class Param } \ } while (0); -// Method to convert const string column +// Method to convert const string column to param #define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ @@ -499,7 +500,7 @@ class Param } \ } while (0); -// Method to convert nullable int column 
+// Method to convert nullable int column to param // processed_col is impossible to be const here #define CONVERT_NULL_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ @@ -965,7 +966,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, POS_PARAM_VAR_NAME, OCCUR_PARAM_VAR_NAME, RET_OP_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ } while (0); -// Method to convert match type column +// Method to convert match type column to param #define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ { \ @@ -1012,28 +1013,28 @@ class FunctionStringRegexp : public FunctionStringRegexpBase CONVERT_CONST_INT_COL_TO_PARAM(RET_OP_PARAM_VAR_NAME, RET_OP_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ } while (0); -// Method to convert occurrence column +// Method to convert occurrence column to param #define CONVERT_OCCUR_COL_TO_PARAM() \ do \ { \ CONVERT_CONST_INT_COL_TO_PARAM(OCCUR_PARAM_VAR_NAME, OCCUR_COL_PTR_VAR_NAME, ({CONVERT_RET_OP_COL_TO_PARAM()})) \ } while (0); -// Method to convert position column +// Method to convert position column to param #define CONVERT_POS_COL_TO_PARAM() \ do \ { \ CONVERT_CONST_INT_COL_TO_PARAM(POS_PARAM_VAR_NAME, POS_COL_PTR_VAR_NAME, ({CONVERT_OCCUR_COL_TO_PARAM()})) \ } while (0); -// Method to convert pattern column +// Method to convert pattern column to param #define CONVERT_PAT_COL_TO_PARAM() \ do \ { \ CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_POS_COL_TO_PARAM()})) \ } while (0); -// Method to convert expression column +// Method to convert expression column to param #define CONVERT_EXPR_COL_TO_PARAM() \ do \ { \ @@ -1167,7 +1168,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } - void 
setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { COLLATOR_VAR_NAME = collator_; } bool useDefaultImplementationForNulls() const override { return false; } size_t getNumberOfArguments() const override { return 0; } @@ -1219,7 +1220,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase Int64 ret_op = RET_OP_PARAM_VAR_NAME.template getInt(0); String match_type = MATCH_TYPE_PARAM_VAR_NAME.getString(0); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, COLLATOR_VAR_NAME), flags); ResultType res = regexp.instr(expr.c_str(), expr.size(), pos, occur, ret_op); res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -1227,7 +1228,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Check memorization if constexpr (canMemorize()) - memorize(PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME, collator); + memorize(PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME, COLLATOR_VAR_NAME); // Initialize result column auto col_res = ColumnVector::create(); @@ -1236,13 +1237,16 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullable() || OccurT::isNullable() || RetOpT::isNullable() || MatchTypeT::isNullableCol(); - // Start to instr + // Start to execute instr if (isMemorized()) { + // Codes in this if-condition execute instr with memorized regexp #define REGEXP_VAR_NAME regexp + const auto & REGEXP_VAR_NAME = getRegexp(); if constexpr (has_nullable_col) { + // Process nullable columns with memorized regexp auto nullmap_col = ColumnUInt8::create(); typename ColumnUInt8::Container & NULL_MAP_VAR_NAME = nullmap_col->getData(); NULL_MAP_VAR_NAME.resize(col_size); @@ -1262,6 +1266,34 @@ class 
FunctionStringRegexpInstr : public FunctionStringRegexpBase continue; \ } \ NULL_MAP_VAR_NAME[i] = 0; \ + EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ + pos = POS_PARAM_VAR_NAME.template getInt(i); \ + occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ + ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ + VEC_RES_VAR_NAME[i] = REGEXP_VAR_NAME->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ + } \ + } while (0); + + // Identify int type of position, occurrance and return option, and execute the instr + CHOOSE_AND_EXEC_FOR_POS_PARAM() + +#undef EXECUTE_INSTR + + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + // Process pure vector columns with memorized regexp +#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ + do \ + { \ + /* columns are impossible to be a nullable column here */ \ + StringRef expr_ref; \ + Int64 pos; \ + Int64 occur; \ + Int64 ret_op; \ + for (size_t i = 0; i < col_size; ++i) \ + { \ EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ POS_PARAM_VAR_NAME.template getInt(i); \ OCCUR_PARAM_VAR_NAME.template getInt(i); \ @@ -1273,27 +1305,93 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Identify int type of position, occurrance and return option, and execute the instr CHOOSE_AND_EXEC_FOR_POS_PARAM() +#undef EXECUTE_INSTR + + res_arg.column = std::move(col_res); + } +#undef REGEXP_VAR_NAME + } + else + { + // Codes in this if-condition execute instr without memorized regexp + if constexpr (has_nullable_col) + { + // Process nullable columns without memorized regexp + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & NULL_MAP_VAR_NAME = nullmap_col->getData(); + NULL_MAP_VAR_NAME.resize(col_size); + +#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ + do \ + { \ + StringRef expr_ref; \ + String pat; \ + Int64 pos; \ + Int64 occur; \ + Int64 ret_op; \ + String match_type; \ + for (size_t i = 0; i < col_size; ++i) \ 
+ { \ + if (EXPR_PARAM_VAR_NAME.isNullAt(i) || POS_PARAM_VAR_NAME.isNullAt(i) || OCCUR_PARAM_VAR_NAME.isNullAt(i) || RET_OP_PARAM_VAR_NAME.isNullAt(i)) \ + { \ + NULL_MAP_VAR_NAME[i] = 1; \ + continue; \ + } \ + NULL_MAP_VAR_NAME[i] = 0; \ + EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ + pat = PAT_PARAM_VAR_NAME.getString(i); \ + if (unlikely(pat.empty())) \ + throw Exception(EMPTY_PAT_ERR_MSG); \ + pos = POS_PARAM_VAR_NAME.template getInt(i); \ + occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ + ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ + match_type = match_type_param.getString(i); \ + auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); \ + VEC_RES_VAR_NAME[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ + } \ + } while (0); + + // Identify int type of position, occurrance and return option, and execute the instr + CHOOSE_AND_EXEC_FOR_POS_PARAM() + #undef EXECUTE_INSTR res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { - // expr column is impossible to be a nullable column here - StringRef expr_ref; - for (size_t i = 0; i < col_size; ++i) - { - EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); - auto res = regexp->match(expr_ref.data, expr_ref.size); - vec_res[i] = res; // match - } + // Process pure vector columns without memorized regexp +#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ + do \ + { \ + StringRef expr_ref; \ + String pat; \ + Int64 pos; \ + Int64 occur; \ + Int64 ret_op; \ + String match_type; \ + for (size_t i = 0; i < col_size; ++i) \ + { \ + EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ + pat = PAT_PARAM_VAR_NAME.getString(i); \ + if (unlikely(pat.empty())) \ + throw Exception(EMPTY_PAT_ERR_MSG); \ + pos = POS_PARAM_VAR_NAME.template getInt(i); \ + occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ + ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ + match_type = match_type_param.getString(i); \ + auto regexp = 
createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); \ + vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ + } \ + } while (0); + + // Identify int type of position, occurrance and return option, and execute the instr + CHOOSE_AND_EXEC_FOR_POS_PARAM() +#undef EXECUTE_INSTR res_arg.column = std::move(col_res); } -#undef REGEXP_VAR_NAME } - else - {} } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -1336,13 +1434,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } private: - // Int64 executeRegexpInstr(StringRef & expr, StringRef & pat, Int64 pos, Int64 occur, Int64 ret_op, StringRef & match_type) const - // {} - - // Int64 executeMemorizedRegexpInstr(StringRef & expr, Int64 pos, Int64 occur, Int64 ret_op) const - // {} - - TiDB::TiDBCollatorPtr collator = nullptr; + TiDB::TiDBCollatorPtr COLLATOR_VAR_NAME = nullptr; }; #undef CHOOSE_AND_EXEC_FOR_POS_PARAM @@ -1622,6 +1714,7 @@ class FunctionStringReplace : public IFunction #undef CONVERT_CONST_STR_COL_TO_PARAM #undef CONVERT_NULL_STR_COL_TO_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME +#undef COLLATOR_VAR_NAME #undef VEC_RES_VAR_NAME #undef NULL_MAP_VAR_NAME #undef ARG_NUM_VAR_NAME From ceeb1a2f661256a4ee401e580e62a3e59a84aa36 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 18 Oct 2022 16:46:47 +0800 Subject: [PATCH 22/87] need gtest --- .../Common/OptimizedRegularExpression.inl.h | 104 ++++++++++++++++-- dbms/src/Common/StringUtils/StringUtils.h | 2 +- dbms/src/Functions/tests/gtest_regexp.cpp | 5 + 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 23cd662c721..7020f9bfb8d 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -18,6 +18,7 @@ #include #include "Common/Exception.h" +#include "common/types.h" #define 
MIN_LENGTH_FOR_STRSTR 3 @@ -471,29 +472,118 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject } } +// Convert utf8 position to byte position. +// For Example: +// Taking string "ni好a" as an example. utf8 position 4 is corresponding to byte position 6. +static inline size_t utf8Pos2bytePos(const char * str, size_t utf8_pos) +{ + size_t byte_index = 0; + utf8_pos--; + while (utf8_pos > 0) + { + byte_index += getUtf8Len(str[byte_index]); + utf8_pos--; + } + return byte_index + 1; +} + +static inline size_t bytePos2Utf8Pos(const char * str, size_t byte_pos) +{ + // byte_num means the number of byte before this byte_pos + size_t byte_num = byte_pos - 1; + size_t utf8_num = getStringUtf8Len(str, byte_num); + return utf8_num + 1; +} + +static inline size_t getMatchedIndex(const char * str, const char * sub_str, size_t sub_str_size) +{ + const size_t stride = sizeof(int64_t); + size_t single_checked_num = sub_str_size >= stride ? sub_str_size % stride : sub_str_size; + size_t start_offset = -1; // offset that the head of sub_str in the str + size_t str_offset = 0; + size_t sub_str_offset = 0; + + // sub_str must be in the str, so while loop condition could be true + while (true) + { + // PRINT("while"); + sub_str_offset = 0; + start_offset += 1; + str_offset = start_offset; + + bool is_same = true; + while (sub_str_offset < single_checked_num) + { + if (str[str_offset++] != sub_str[sub_str_offset++]) + { + is_same = false; + break; + } + } + + if (!is_same) + continue; + + while (sub_str_offset < sub_str_size && is_same) + { + if (static_cast(str[str_offset]) == static_cast(sub_str[str_offset])) + { + is_same = false; + break; + } + + sub_str_offset += stride; + str_offset += stride; + } + + if (sub_str_offset >= sub_str_size) + break; + } + + return start_offset; +} template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { - Int64 utf8_total_len = getStringUtf8Len(subject, 
subject_size); + int64_t utf8_total_len = getStringUtf8Len(subject, subject_size); if (pos <= 0 || pos > utf8_total_len) throw DB::Exception("Index out of bounds in regular expression search."); String matched_str; // store the matched substring - const char * expr = subject; // expr is the string actually passed into regexp to be matched - size_t expr_size = subject_size; - // TODO convert utf8 pos to binary pos. - // size_t offset = /*todo*/; // This is a offset for bytes, not utf8 + // This is a offset for bytes, not utf8 + size_t byte_pos = utf8Pos2bytePos(subject, pos); + size_t byte_offset = byte_pos - 1; + + const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched + size_t expr_size = subject_size - byte_offset; + + size_t ret_pos = 0; + size_t matched_index = 0; + StringPieceType expr_sp(expr, expr_size); + size_t matched_str_size = 0; while (occur > 0) { - bool success = RegexType::FindandConsume(StringPieceType(expr, expr_size), *re2, &matched_str); + bool success = re2::RE2::FindAndConsume(&expr_sp, *re2, &matched_str); if (!success) return 0; - + + matched_str_size = matched_str.size(); + + // get the start index of matched string in expr + matched_index = getMatchedIndex(expr, matched_str.c_str(), matched_str_size); + byte_offset += matched_index + matched_str_size; + + // expr is truncated each time we get a matched string + expr = subject + byte_offset; + --occur; } + + byte_offset -= matched_str_size; + return ret_op == 0 ? bytePos2Utf8Pos(subject, byte_offset + 1) : bytePos2Utf8Pos(subject, byte_offset + matched_str.size() + 1); } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Common/StringUtils/StringUtils.h b/dbms/src/Common/StringUtils/StringUtils.h index 9060dbc53e9..bc865b9fdba 100644 --- a/dbms/src/Common/StringUtils/StringUtils.h +++ b/dbms/src/Common/StringUtils/StringUtils.h @@ -162,7 +162,7 @@ inline size_t getUtf8Len(uint8_t utf8_first_byte) flag >>= 1; } - return flag == 128 ? 
0 : len; + return flag == 128 ? 1 : len; } inline size_t getStringUtf8Len(const char * str, size_t total_len) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index ab1982e940a..ed424a527ba 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2157,6 +2157,11 @@ TEST_F(Regexp, testRegexpCustomerCases) } } +TEST_F(Regexp, RegexpInstr) +{ + +} + TEST_F(Regexp, testRegexpReplaceMatchType) { String res; From 0b18c2ae3edc47bb88ed7affe605d51f81627406 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 18 Oct 2022 16:54:04 +0800 Subject: [PATCH 23/87] fix integration test --- tests/fullstack-test/expr/regexp.test | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index fc29e30958b..268ed99fd46 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -71,13 +71,12 @@ mysql> set @@tidb_isolation_read_engines='tiflash'; set @@tidb_enforce_mpp=1; se mysql> drop table if exists test.t mysql> create table test.t (data varchar(30), pattern varchar(30)); -mysql> insert into test.t values ('', ''), ('abcd', 'abcd'); +mysql> insert into test.t values ('abcd', 'abcd'); mysql> alter table test.t set tiflash replica 1 func> wait_table test t -mysql> set @@tidb_isolation_read_engines='tiflash'; set @@tidb_enforce_mpp=1; select data regexp pattern, data regexp '', '' regexp pattern from test.t; -+---------------------+----------------+-------------------+ -| data regexp pattern | data regexp '' | '' regexp pattern | -+---------------------+----------------+-------------------+ -| 1 | 1 | 1 | -| 1 | 1 | 0 | -+---------------------+----------------+-------------------+ +mysql> set @@tidb_isolation_read_engines='tiflash'; set @@tidb_enforce_mpp=1; select data regexp pattern, '' regexp pattern from test.t; 
++---------------------+-------------------+ +| data regexp pattern | '' regexp pattern | ++---------------------+-------------------+ +| 1 | 0 | ++---------------------+-------------------+ From 5474fd05eaa80cb2af27b709615ca7d1e3b05e8b Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 19 Oct 2022 10:19:00 +0800 Subject: [PATCH 24/87] fix --- dbms/src/Common/OptimizedRegularExpression.inl.h | 3 +-- dbms/src/Functions/FunctionsRegexp.h | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 7020f9bfb8d..fca5157c517 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -559,14 +559,13 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched size_t expr_size = subject_size - byte_offset; - size_t ret_pos = 0; size_t matched_index = 0; StringPieceType expr_sp(expr, expr_size); size_t matched_str_size = 0; while (occur > 0) { - bool success = re2::RE2::FindAndConsume(&expr_sp, *re2, &matched_str); + bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); if (!success) return 0; diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 0a13805e691..5459db90846 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1121,7 +1121,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } while (0); // Choose int type for position param and execute -#define CHOOSE_AND_EXEC_FOR_POS_PARAM(execute) \ +#define CHOOSE_AND_EXEC_FOR_POS_PARAM() \ do \ { \ switch (POS_PARAM_VAR_NAME.getIntType()) \ @@ -1235,7 +1235,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase typename ColumnVector::Container & VEC_RES_VAR_NAME = col_res->getData(); 
VEC_RES_VAR_NAME.resize(col_size, 0); - constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullable() || OccurT::isNullable() || RetOpT::isNullable() || MatchTypeT::isNullableCol(); + constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || RetOpT::isNullableCol() || MatchTypeT::isNullableCol(); // Start to execute instr if (isMemorized()) @@ -1352,7 +1352,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } while (0); // Identify int type of position, occurrance and return option, and execute the instr - CHOOSE_AND_EXEC_FOR_POS_PARAM() + // CHOOSE_AND_EXEC_FOR_POS_PARAM() #undef EXECUTE_INSTR @@ -1386,7 +1386,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } while (0); // Identify int type of position, occurrance and return option, and execute the instr - CHOOSE_AND_EXEC_FOR_POS_PARAM() + // CHOOSE_AND_EXEC_FOR_POS_PARAM() #undef EXECUTE_INSTR res_arg.column = std::move(col_res); @@ -1430,7 +1430,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase POS_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; }; - // CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: From 6368c9d8042b45b24917544d710fb2dc275d8074 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 19 Oct 2022 15:55:34 +0800 Subject: [PATCH 25/87] workaround --- dbms/src/Functions/FunctionsRegexp.h | 451 +++++++++++++-------------- 1 file changed, 211 insertions(+), 240 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 5459db90846..22d6e916aae 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -141,6 +141,89 @@ inline constexpr bool check_int_type() enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64 }; +Int64 getUInt8(const void * 
container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getUInt16(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getUInt32(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getUInt64(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getUInt128(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getInt8(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getInt16(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getInt32(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +Int64 getInt64(const void * container, size_t idx) +{ + const auto * tmp = reinterpret_cast::Container *>(container); + return static_cast((*tmp)[idx]); +} + +using GetIntFuncPointerType = Int64 (*)(const void *, size_t); + +GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) +{ + switch (int_type) + { + case IntType::UInt8: + return getUInt8; + case IntType::UInt16: + return getUInt16; + case IntType::UInt32: + return getUInt32; + case IntType::UInt64: + return getUInt64; + case IntType::UInt128: + return getUInt128; + case IntType::Int8: + return getInt8; + case IntType::Int16: + return getInt16; + case IntType::Int32: + return getInt32; + case IntType::Int64: + return getInt64; + default: + throw Exception("Unexpected int type"); 
+ } +} + // Use this type when param is not provided class ParamDefault { @@ -177,6 +260,7 @@ class ParamDefault static String getString(size_t) { return String(""); } void getStringRef(size_t, StringRef &) const {} constexpr static bool isConst() { return true; } + static const void * getContainer() { throw Exception("ParamDefault not supports this function"); } private: Int64 default_int; @@ -258,6 +342,8 @@ class ParamString constexpr static bool isConst() { return is_const; } + const void * getContainer() const { return nullptr; } + private: size_t offsetAt(size_t i) const { return i == 0 ? 0 : (*offsets)[i - 1]; } size_t sizeAt(size_t i) const { return i == 0 ? (*offsets)[0] : ((*offsets)[i] - (*offsets)[i - 1]); } @@ -328,6 +414,7 @@ class ParamInt String getString(size_t) const { throw Exception("ParamInt not supports this function"); } void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports this function"); } constexpr static bool isConst() { return is_const; } + const void * getContainer() const { return int_container; } private: Int64 const_int_val; @@ -412,6 +499,7 @@ class Param size_t getDataNum() const { return col_size; } constexpr static bool isNullableCol() { return is_null; } constexpr static bool isConst() { return ParamImplType::isConst(); } + const void * getContainer() const { return data.getContainer(); } private: const size_t col_size; @@ -438,7 +526,6 @@ class Param #define SELF_CLASS_NAME (name) #define ARG_NUM_VAR_NAME arg_num -#define NULL_MAP_VAR_NAME null_map #define VEC_RES_VAR_NAME vec_res #define COLLATOR_VAR_NAME collator @@ -947,7 +1034,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == REGEXP_LIKE_MAX_PARAM_NUM) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - // CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: @@ -1048,114 +1135,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase 
CONVERT_EXPR_COL_TO_PARAM() \ } while (0); -// Choose int type for return option param and execute -#define CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, occur_type) \ - do \ - { \ - switch (RET_OP_PARAM_VAR_NAME.getIntType()) \ - { \ - case IntType::UInt8: \ - EXECUTE_INSTR(pos_type, occur_type, UInt8) \ - break; \ - case IntType::UInt16: \ - EXECUTE_INSTR(pos_type, occur_type, UInt16) \ - break; \ - case IntType::UInt32: \ - EXECUTE_INSTR(pos_type, occur_type, UInt32) \ - break; \ - case IntType::UInt64: \ - EXECUTE_INSTR(pos_type, occur_type, UInt64) \ - break; \ - case IntType::UInt128: \ - EXECUTE_INSTR(pos_type, occur_type, UInt128) \ - break; \ - case IntType::Int8: \ - EXECUTE_INSTR(pos_type, occur_type, Int8) \ - break; \ - case IntType::Int16: \ - EXECUTE_INSTR(pos_type, occur_type, Int16) \ - break; \ - case IntType::Int32: \ - EXECUTE_INSTR(pos_type, occur_type, Int32) \ - break; \ - case IntType::Int64: \ - EXECUTE_INSTR(pos_type, occur_type, Int64) \ - break; \ - } \ - } while (0); - -// Choose int type for occurrance param and execute -#define CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(pos_type) \ - do \ - { \ - switch (OCCUR_PARAM_VAR_NAME.getIntType()) \ - { \ - case IntType::UInt8: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt8) \ - break; \ - case IntType::UInt16: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt16) \ - break; \ - case IntType::UInt32: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt32) \ - break; \ - case IntType::UInt64: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt64) \ - break; \ - case IntType::UInt128: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, UInt128) \ - break; \ - case IntType::Int8: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int8) \ - break; \ - case IntType::Int16: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int16) \ - break; \ - case IntType::Int32: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int32) \ - break; \ - case IntType::Int64: \ - CHOOSE_AND_EXEC_FOR_RET_OP_PARAM(pos_type, Int64) \ - break; \ 
- } \ - } while (0); - -// Choose int type for position param and execute -#define CHOOSE_AND_EXEC_FOR_POS_PARAM() \ - do \ - { \ - switch (POS_PARAM_VAR_NAME.getIntType()) \ - { \ - case IntType::UInt8: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt8) \ - break; \ - case IntType::UInt16: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt16) \ - break; \ - case IntType::UInt32: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt32) \ - break; \ - case IntType::UInt64: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt64) \ - break; \ - case IntType::UInt128: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(UInt128) \ - break; \ - case IntType::Int8: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int8) \ - break; \ - case IntType::Int16: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int16) \ - break; \ - case IntType::Int32: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int32) \ - break; \ - case IntType::Int64: \ - CHOOSE_AND_EXEC_FOR_OCCUR_PARAM(Int64) \ - break; \ - } \ - } while (0); - // Implementation of regexp_instr function template class FunctionStringRegexpInstr : public FunctionStringRegexpBase @@ -1202,23 +1181,23 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } template - void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & EXPR_PARAM_VAR_NAME, const PatT & PAT_PARAM_VAR_NAME, const PosT & POS_PARAM_VAR_NAME, const OccurT & OCCUR_PARAM_VAR_NAME, const RetOpT & RET_OP_PARAM_VAR_NAME, const MatchTypeT & MATCH_TYPE_PARAM_VAR_NAME) const + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expar_param, const PatT & par_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const MatchTypeT & match_type_param) const { - size_t col_size = EXPR_PARAM_VAR_NAME.getDataNum(); + size_t col_size = expar_param.getDataNum(); // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && RetOpT::isConst() && MatchTypeT::isConst()) { int flags = getDefaultFlags(); 
- String expr = EXPR_PARAM_VAR_NAME.getString(0); - String pat = PAT_PARAM_VAR_NAME.getString(0); + String expr = expar_param.getString(0); + String pat = par_param.getString(0); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - Int64 pos = POS_PARAM_VAR_NAME.template getInt(0); - Int64 occur = OCCUR_PARAM_VAR_NAME.template getInt(0); - Int64 ret_op = RET_OP_PARAM_VAR_NAME.template getInt(0); - String match_type = MATCH_TYPE_PARAM_VAR_NAME.getString(0); + Int64 pos = pos_param.template getInt(0); + Int64 occur = occur_param.template getInt(0); + Int64 ret_op = ret_op_param.template getInt(0); + String match_type = match_type_param.getString(0); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, COLLATOR_VAR_NAME), flags); ResultType res = regexp.instr(expr.c_str(), expr.size(), pos, occur, ret_op); @@ -1228,7 +1207,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Check memorization if constexpr (canMemorize()) - memorize(PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME, COLLATOR_VAR_NAME); + memorize(par_param, match_type_param, COLLATOR_VAR_NAME); // Initialize result column auto col_res = ColumnVector::create(); @@ -1237,79 +1216,96 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || RetOpT::isNullableCol() || MatchTypeT::isNullableCol(); + // Get function pointers to process the specific int type + GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); + GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); + GetIntFuncPointerType get_ret_op_func = getGetIntFuncPointer(ret_op_param.getIntType()); + + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); + const void * ret_op_container = ret_op_param.getContainer(); + + Int64 pos_const_val = 
PosT::isConst() ? pos_param.template getInt(0) : -1; + Int64 occur_const_val = OccurT::isConst() ? occur_param. template getInt(0) : -1; + Int64 ret_op_const_val = RetOpT::isConst() ? ret_op_param. template getInt(0) : -1; + +#define GET_POS_VALUE(idx) \ + do \ + { \ + if constexpr (PosT::isConst()) \ + pos = pos_const_val; \ + else \ + pos = get_pos_func(pos_container, idx); \ + } while (0); + +#define GET_OCCUR_VALUE(idx) \ + do \ + { \ + if constexpr (OccurT::isConst()) \ + occur = occur_const_val; \ + else \ + occur = get_occur_func(occur_container, idx); \ + } while (0); + +#define GET_RET_OP_VALUE(idx) \ + do \ + { \ + if constexpr (RetOpT::isConst()) \ + ret_op = ret_op_const_val; \ + else \ + ret_op = get_ret_op_func(ret_op_container, idx); \ + } while (0); + + StringRef expr_ref; + String pat; + Int64 pos; + Int64 occur; + Int64 ret_op; + String match_type; + // Start to execute instr if (isMemorized()) { // Codes in this if-condition execute instr with memorized regexp -#define REGEXP_VAR_NAME regexp - - const auto & REGEXP_VAR_NAME = getRegexp(); + const auto & regexp = getRegexp(); if constexpr (has_nullable_col) { // Process nullable columns with memorized regexp auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & NULL_MAP_VAR_NAME = nullmap_col->getData(); - NULL_MAP_VAR_NAME.resize(col_size); - -#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ - do \ - { \ - StringRef expr_ref; \ - Int64 pos; \ - Int64 occur; \ - Int64 ret_op; \ - for (size_t i = 0; i < col_size; ++i) \ - { \ - if (EXPR_PARAM_VAR_NAME.isNullAt(i) || POS_PARAM_VAR_NAME.isNullAt(i) || OCCUR_PARAM_VAR_NAME.isNullAt(i) || RET_OP_PARAM_VAR_NAME.isNullAt(i)) \ - { \ - NULL_MAP_VAR_NAME[i] = 1; \ - continue; \ - } \ - NULL_MAP_VAR_NAME[i] = 0; \ - EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ - pos = POS_PARAM_VAR_NAME.template getInt(i); \ - occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ - ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ - 
VEC_RES_VAR_NAME[i] = REGEXP_VAR_NAME->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ - } \ - } while (0); - - // Identify int type of position, occurrance and return option, and execute the instr - CHOOSE_AND_EXEC_FOR_POS_PARAM() - -#undef EXECUTE_INSTR + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); + for (size_t i = 0; i < col_size; ++i) + { + if (expar_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || ret_op_param.isNullAt(i)) + { + null_map[i] = 1; + continue; + } + null_map[i] = 0; + expar_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + GET_RET_OP_VALUE(i) + VEC_RES_VAR_NAME[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + } res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { - // Process pure vector columns with memorized regexp -#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ - do \ - { \ - /* columns are impossible to be a nullable column here */ \ - StringRef expr_ref; \ - Int64 pos; \ - Int64 occur; \ - Int64 ret_op; \ - for (size_t i = 0; i < col_size; ++i) \ - { \ - EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ - POS_PARAM_VAR_NAME.template getInt(i); \ - OCCUR_PARAM_VAR_NAME.template getInt(i); \ - RET_OP_PARAM_VAR_NAME.template getInt(i); \ - VEC_RES_VAR_NAME[i] = REGEXP_VAR_NAME->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ - } \ - } while (0); - - // Identify int type of position, occurrance and return option, and execute the instr - CHOOSE_AND_EXEC_FOR_POS_PARAM() - -#undef EXECUTE_INSTR + // Process pure vector columns with memorized regexp. + // columns are impossible to be a nullable column here. 
+ for (size_t i = 0; i < col_size; ++i) + { + expar_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + GET_RET_OP_VALUE(i) + VEC_RES_VAR_NAME[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + } res_arg.column = std::move(col_res); } -#undef REGEXP_VAR_NAME } else { @@ -1318,80 +1314,55 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { // Process nullable columns without memorized regexp auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & NULL_MAP_VAR_NAME = nullmap_col->getData(); - NULL_MAP_VAR_NAME.resize(col_size); - -#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ - do \ - { \ - StringRef expr_ref; \ - String pat; \ - Int64 pos; \ - Int64 occur; \ - Int64 ret_op; \ - String match_type; \ - for (size_t i = 0; i < col_size; ++i) \ - { \ - if (EXPR_PARAM_VAR_NAME.isNullAt(i) || POS_PARAM_VAR_NAME.isNullAt(i) || OCCUR_PARAM_VAR_NAME.isNullAt(i) || RET_OP_PARAM_VAR_NAME.isNullAt(i)) \ - { \ - NULL_MAP_VAR_NAME[i] = 1; \ - continue; \ - } \ - NULL_MAP_VAR_NAME[i] = 0; \ - EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ - pat = PAT_PARAM_VAR_NAME.getString(i); \ - if (unlikely(pat.empty())) \ - throw Exception(EMPTY_PAT_ERR_MSG); \ - pos = POS_PARAM_VAR_NAME.template getInt(i); \ - occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ - ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ - match_type = match_type_param.getString(i); \ - auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); \ - VEC_RES_VAR_NAME[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ - } \ - } while (0); - - // Identify int type of position, occurrance and return option, and execute the instr - // CHOOSE_AND_EXEC_FOR_POS_PARAM() + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); -#undef EXECUTE_INSTR + for (size_t i = 0; i < col_size; ++i) + { + if (expar_param.isNullAt(i) || pos_param.isNullAt(i) 
|| occur_param.isNullAt(i) || ret_op_param.isNullAt(i)) + { + null_map[i] = 1; + continue; + } + null_map[i] = 0; + expar_param.getStringRef(i, expr_ref); + pat = par_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + GET_RET_OP_VALUE(i) + match_type = match_type_param.getString(i); + auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); + VEC_RES_VAR_NAME[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + } res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { // Process pure vector columns without memorized regexp -#define EXECUTE_INSTR(pos_type, occur_type, ret_op_type) \ - do \ - { \ - StringRef expr_ref; \ - String pat; \ - Int64 pos; \ - Int64 occur; \ - Int64 ret_op; \ - String match_type; \ - for (size_t i = 0; i < col_size; ++i) \ - { \ - EXPR_PARAM_VAR_NAME.getStringRef(i, expr_ref); \ - pat = PAT_PARAM_VAR_NAME.getString(i); \ - if (unlikely(pat.empty())) \ - throw Exception(EMPTY_PAT_ERR_MSG); \ - pos = POS_PARAM_VAR_NAME.template getInt(i); \ - occur = OCCUR_PARAM_VAR_NAME.template getInt(i); \ - ret_op = RET_OP_PARAM_VAR_NAME.template getInt(i); \ - match_type = match_type_param.getString(i); \ - auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); \ - vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); \ - } \ - } while (0); - - // Identify int type of position, occurrance and return option, and execute the instr - // CHOOSE_AND_EXEC_FOR_POS_PARAM() + for (size_t i = 0; i < col_size; ++i) + { + expar_param.getStringRef(i, expr_ref); + pat = par_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + GET_RET_OP_VALUE(i) + match_type = match_type_param.getString(i); + auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); + vec_res[i] = 
regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + } -#undef EXECUTE_INSTR res_arg.column = std::move(col_res); } } + +#undef GET_RET_OP_VALUE +#undef GET_OCCUR_VALUE +#undef GET_POS_VALUE } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -1437,6 +1408,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase TiDB::TiDBCollatorPtr COLLATOR_VAR_NAME = nullptr; }; +#undef CHOOSE_TYPE_AND_EXECUTE #undef CHOOSE_AND_EXEC_FOR_POS_PARAM #undef CHOOSE_AND_EXEC_FOR_OCCUR_PARAM #undef CHOOSE_AND_EXEC_FOR_RET_OP_PARAM @@ -1716,7 +1688,6 @@ class FunctionStringReplace : public IFunction #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME #undef COLLATOR_VAR_NAME #undef VEC_RES_VAR_NAME -#undef NULL_MAP_VAR_NAME #undef ARG_NUM_VAR_NAME #undef SELF_CLASS_NAME #undef MATCH_TYPE_PARAM_VAR_NAME From 6892ea088e321ff12af294bbb946b3751d3aba1f Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 21 Oct 2022 10:45:47 +0800 Subject: [PATCH 26/87] add todo --- dbms/src/Functions/tests/gtest_regexp.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index ab1982e940a..dff218d28fd 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -1786,6 +1786,7 @@ TEST_F(Regexp, testRegexpTiDBCase) } // TODO test empty columns +// TODO test const null // We can only test regexp_like function as regexp is the subset of regexp_like TEST_F(Regexp, RegexpLike) { From 72877ce0dd9358616a50a00fc136fe9d528e170b Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 21 Oct 2022 11:48:36 +0800 Subject: [PATCH 27/87] fix --- dbms/src/Functions/FunctionsRegexp.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 08906058a1f..5b7bbce7049 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ 
b/dbms/src/Functions/FunctionsRegexp.h @@ -441,13 +441,12 @@ class Param if (col_const != nullptr) \ { \ auto col_const_data = col_const->getDataColumnPtr(); \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.isNull() ? String("") : field.safeGet(); \ if (col_const_data->isColumnNullable()) \ { \ /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.safeGet(); \ - /* const col */ \ Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ next_process; \ } \ @@ -459,7 +458,9 @@ class Param } \ } \ else \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ + { \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ + } \ } while (0); class FunctionStringRegexpBase From 34c784989f6b62d10f26ddd0a741c6524515fbb6 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 21 Oct 2022 13:54:20 +0800 Subject: [PATCH 28/87] pass const test --- dbms/src/Functions/FunctionsRegexp.h | 100 +++++++++++++--------- dbms/src/Functions/tests/gtest_regexp.cpp | 49 ++++++++++- 2 files changed, 106 insertions(+), 43 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 22d6e916aae..507fee8967a 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -139,6 +139,21 @@ inline constexpr bool check_int_type() return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); } +// Field field; +// field.safeGet(); + case Field::Types::Which::UInt64: + return field.safeGet(); + default: + throw Exception("Unexpected int type"); + } +} + enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64 }; Int64 getUInt8(const void * container, size_t idx) @@ 
-564,13 +579,12 @@ class Param if (col_const != nullptr) \ { \ auto col_const_data = col_const->getDataColumnPtr(); \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.isNull() ? String("") : field.safeGet(); \ if (col_const_data->isColumnNullable()) \ { \ /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.safeGet(); \ - /* const col */ \ Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ next_process; \ } \ @@ -650,25 +664,24 @@ class Param #define CONVERT_CONST_INT_COL_TO_PARAM(param_name, processed_col, next_process) \ do \ { \ + std::cout << "CONVERT_CONST_INT_COL_TO_PARAM1\n"; \ size_t col_size = (processed_col)->size(); \ const auto * col_const = typeid_cast(&(*(processed_col))); \ if (col_const != nullptr) \ { \ + std::cout << "CONVERT_CONST_INT_COL_TO_PARAM4\n"; \ + Field field; \ + col_const->get(0, field); \ + auto data_int64 = field.isNull() ? 
-1 : getIntFromField(field); \ auto col_const_data = col_const->getDataColumnPtr(); \ if (col_const_data->isColumnNullable()) \ { \ /* This is a const nullable column */ \ - Field field; \ - col_const->get(0, field); \ - auto data_int64 = field.get(); \ - /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ Param, true>(param_name)(col_size, data_int64); \ next_process; \ } \ else \ { \ - /* type template of ParamInt is useless when column is const, so we can arbitrary designate a valid as template parameter */ \ - auto data_int64 = col_const->getValue(); \ Param, false>(param_name)(col_size, data_int64); \ next_process; \ } \ @@ -1034,7 +1047,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if ((ARG_NUM_VAR_NAME) == REGEXP_LIKE_MAX_PARAM_NUM) MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; - CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + // CONVERT_COLS_TO_PARAMS_AND_EXECUTE() } private: @@ -1056,39 +1069,16 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Method to convert match type column to param #define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ - { \ + { \ + std::cout << "CONVERT_MATCH_TYPE_COL_TO_PARAM1\n"; \ if (ARG_NUM_VAR_NAME == REGEXP_INSTR_MAX_PARAM_NUM) \ { \ + std::cout << "CONVERT_MATCH_TYPE_COL_TO_PARAM2\n"; \ CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_INSTR()})) \ } \ - else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 3) \ - { \ - /* match_type is not provided here and set default values */ \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_INSTR() \ - } \ - else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 2) \ + else \ { \ - /* return_option and match_type are not provided here and set default values */ \ - Param RET_OP_PARAM_VAR_NAME(-1, 0); \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_INSTR() \ - } 
\ - else if (ARG_NUM_VAR_NAME == REGEXP_MIN_PARAM_NUM + 1) \ - { \ - /* occurrence, return_option and match_type are not provided here and set default values */ \ - Param OCCUR_PARAM_VAR_NAME(-1, 1); \ - Param RET_OP_PARAM_VAR_NAME(-1, 0); \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ - EXECUTE_REGEXP_INSTR() \ - } \ - else \ - { \ - /* position, occurrence, return_option and match_type are not provided here and set default values */ \ - Param POS_PARAM_VAR_NAME(-1, 1); \ - Param OCCUR_PARAM_VAR_NAME(-1, 1); \ - Param RET_OP_PARAM_VAR_NAME(-1, 0); \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ EXECUTE_REGEXP_INSTR() \ } \ } while (0); @@ -1097,27 +1087,52 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_RET_OP_COL_TO_PARAM() \ do \ { \ - CONVERT_CONST_INT_COL_TO_PARAM(RET_OP_PARAM_VAR_NAME, RET_OP_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ + std::cout << "CONVERT_RET_OP_COL_TO_PARAM1\n"; \ + if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 3) \ + { \ + std::cout << "CONVERT_RET_OP_COL_TO_PARAM2\n"; \ + Param RET_OP_PARAM_VAR_NAME(-1, 0); \ + CONVERT_MATCH_TYPE_COL_TO_PARAM() \ + } \ + else \ + CONVERT_CONST_INT_COL_TO_PARAM(RET_OP_PARAM_VAR_NAME, RET_OP_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ } while (0); // Method to convert occurrence column to param #define CONVERT_OCCUR_COL_TO_PARAM() \ do \ { \ - CONVERT_CONST_INT_COL_TO_PARAM(OCCUR_PARAM_VAR_NAME, OCCUR_COL_PTR_VAR_NAME, ({CONVERT_RET_OP_COL_TO_PARAM()})) \ + std::cout << "CONVERT_OCCUR_COL_TO_PARAM1\n"; \ + if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 2) \ + { \ + std::cout << "CONVERT_OCCUR_COL_TO_PARAM2\n"; \ + Param OCCUR_PARAM_VAR_NAME(-1, 1); \ + CONVERT_RET_OP_COL_TO_PARAM() \ + } \ + else \ + CONVERT_CONST_INT_COL_TO_PARAM(OCCUR_PARAM_VAR_NAME, OCCUR_COL_PTR_VAR_NAME, ({CONVERT_RET_OP_COL_TO_PARAM()})) \ } while (0); // Method to convert position 
column to param #define CONVERT_POS_COL_TO_PARAM() \ do \ { \ - CONVERT_CONST_INT_COL_TO_PARAM(POS_PARAM_VAR_NAME, POS_COL_PTR_VAR_NAME, ({CONVERT_OCCUR_COL_TO_PARAM()})) \ + std::cout << "CONVERT_POS_COL_TO_PARAM1\n"; \ + if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 1) \ + { \ + std::cout << "CONVERT_POS_COL_TO_PARAM2\n"; \ + Param POS_PARAM_VAR_NAME(-1, 1); \ + CONVERT_OCCUR_COL_TO_PARAM() \ + } \ + else \ + CONVERT_CONST_INT_COL_TO_PARAM(POS_PARAM_VAR_NAME, POS_COL_PTR_VAR_NAME, ({CONVERT_OCCUR_COL_TO_PARAM()})) \ } while (0); // Method to convert pattern column to param #define CONVERT_PAT_COL_TO_PARAM() \ do \ { \ + std::cout << "CONVERT_PAT_COL_TO_PARAM\n"; \ CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_POS_COL_TO_PARAM()})) \ } while (0); @@ -1125,6 +1140,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_EXPR_COL_TO_PARAM() \ do \ { \ + std::cout << "CONVERT_EXPR_COL_TO_PARAM\n"; \ CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ } while (0); diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index ed424a527ba..a43536cc41f 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2159,7 +2159,54 @@ TEST_F(Regexp, testRegexpCustomerCases) TEST_F(Regexp, RegexpInstr) { - + { + // Test: All parameters are const + std::cout << "here1\n"; + size_t row_size = 2; + ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); + std::cout << "here2\n"; + ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); + std::cout << "here3\n"; + ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), + executeFunction( 
+ "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); + std::cout << "here4\n"; + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1))); + std::cout << "here5\n"; + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, "aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1), + createConstColumn(row_size, "i"))); + } + + { + // Test: null const + } } TEST_F(Regexp, testRegexpReplaceMatchType) From 4003f736c1143cd26a895f87178293751f32fee3 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 21 Oct 2022 18:07:07 +0800 Subject: [PATCH 29/87] pass some gtests --- .../Common/OptimizedRegularExpression.inl.h | 3 +- dbms/src/Functions/FunctionsRegexp.h | 4 +- dbms/src/Functions/tests/gtest_regexp.cpp | 86 +++++++++++++++++-- 3 files changed, 84 insertions(+), 9 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index fca5157c517..540f2351a80 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -18,6 +18,7 @@ #include #include "Common/Exception.h" +#include "common/defines.h" #include "common/types.h" @@ -547,7 +548,7 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s { int64_t utf8_total_len = getStringUtf8Len(subject, subject_size); - if (pos <= 0 || pos > utf8_total_len) + if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) throw DB::Exception("Index out of bounds in regular expression search."); String matched_str; // 
store the matched substring diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 507fee8967a..70cbbeb6ffe 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -275,7 +275,7 @@ class ParamDefault static String getString(size_t) { return String(""); } void getStringRef(size_t, StringRef &) const {} constexpr static bool isConst() { return true; } - static const void * getContainer() { throw Exception("ParamDefault not supports this function"); } + static const void * getContainer() { return nullptr; } private: Int64 default_int; @@ -1237,10 +1237,12 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); GetIntFuncPointerType get_ret_op_func = getGetIntFuncPointer(ret_op_param.getIntType()); + // Container will not be used when parm is const const void * pos_container = pos_param.getContainer(); const void * occur_container = occur_param.getContainer(); const void * ret_op_container = ret_op_param.getContainer(); + // Const value will not be used when the param is not const Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; Int64 occur_const_val = OccurT::isConst() ? occur_param. template getInt(0) : -1; Int64 ret_op_const_val = RetOpT::isConst() ? ret_op_param. 
template getInt(0) : -1; diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index a43536cc41f..6bccf3285c5 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2159,23 +2159,20 @@ TEST_F(Regexp, testRegexpCustomerCases) TEST_F(Regexp, RegexpInstr) { + // Test: All columns are const { - // Test: All parameters are const - std::cout << "here1\n"; size_t row_size = 2; ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), executeFunction( "regexp_instr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."))); - std::cout << "here2\n"; ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), executeFunction( "regexp_instr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."), createConstColumn(row_size, 2))); - std::cout << "here3\n"; ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), executeFunction( "regexp_instr", @@ -2183,7 +2180,6 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, "12"), createConstColumn(row_size, 2), createConstColumn(row_size, 2))); - std::cout << "here4\n"; ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), executeFunction( "regexp_instr", @@ -2192,7 +2188,6 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, 2), createConstColumn(row_size, 2), createConstColumn(row_size, 1))); - std::cout << "here5\n"; ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), executeFunction( "regexp_instr", @@ -2204,9 +2199,86 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, "i"))); } + // Test: null const { - // Test: null const + size_t row_size = 2; + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); + + 
ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + } + + // Test: All columns are pure vector + { + std::vector exprs{"ttttifl", "tidb_tikv", "aaaaaa", "\n", "", "ab\naB", "pp跑ppのaaa"}; + std::vector patterns{"tifl", "ti(db|kv)", "aa", ".", "^$", "^ab$", "(跑|の|P)"}; + std::vector results{4, 1, 1, 0, 1, 0, 3}; + + // test regexp_instr(vector, vector) + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns))); + + std::vector positions{}; + results = {}; + + std::vector occurs{}; + results = {}; + + std::vector return_options{}; + results = {}; + + std::vector match_types{}; + results = {}; + + // TODO collation } + + // Test: Invalid parameter handling + {} } TEST_F(Regexp, testRegexpReplaceMatchType) From f89f4eb2c2eb4b3766504accb66203673223e407 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 24 Oct 2022 12:57:03 +0800 Subject: [PATCH 30/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 51 +++++++++++++++-------- 
dbms/src/Functions/tests/gtest_regexp.cpp | 8 ++++ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 70cbbeb6ffe..39e53ded34b 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1201,18 +1201,48 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { size_t col_size = expar_param.getDataNum(); + // Get function pointers to process the specific int type + GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); + GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); + GetIntFuncPointerType get_ret_op_func = getGetIntFuncPointer(ret_op_param.getIntType()); + + // Container will not be used when parm is const + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); + const void * ret_op_container = ret_op_param.getContainer(); + + // Const value will not be used when the param is not const + Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; + Int64 occur_const_val = OccurT::isConst() ? occur_param. template getInt(0) : -1; + Int64 ret_op_const_val = RetOpT::isConst() ? ret_op_param. 
template getInt(0) : -1; + // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && RetOpT::isConst() && MatchTypeT::isConst()) { + // TODO check + if (expar_param.isNullAt(0) || par_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || ret_op_param.isNullAt(0) || match_type_param.isNullAt(0)) + { + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & VEC_RES_VAR_NAME = col_res->getData(); + VEC_RES_VAR_NAME.resize(col_size, 0); + + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size, 1); + + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + res_arg.column = res_arg.type->createColumn(); + } + int flags = getDefaultFlags(); String expr = expar_param.getString(0); String pat = par_param.getString(0); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - Int64 pos = pos_param.template getInt(0); - Int64 occur = occur_param.template getInt(0); - Int64 ret_op = ret_op_param.template getInt(0); + Int64 pos = if PosT::isConst() ? pos_const_val : get_pos_func(pos_container, idx); + Int64 occur = if OccurT::isConst() ? occur_const_val : get_occur_func(occur_container, idx); + Int64 ret_op = if RetOpT::isConst() ? 
ret_op_const_val : get_ret_op_func(ret_op_container, idx); String match_type = match_type_param.getString(0); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, COLLATOR_VAR_NAME), flags); @@ -1232,21 +1262,6 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || RetOpT::isNullableCol() || MatchTypeT::isNullableCol(); - // Get function pointers to process the specific int type - GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); - GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); - GetIntFuncPointerType get_ret_op_func = getGetIntFuncPointer(ret_op_param.getIntType()); - - // Container will not be used when parm is const - const void * pos_container = pos_param.getContainer(); - const void * occur_container = occur_param.getContainer(); - const void * ret_op_container = ret_op_param.getContainer(); - - // Const value will not be used when the param is not const - Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; - Int64 occur_const_val = OccurT::isConst() ? occur_param. template getInt(0) : -1; - Int64 ret_op_const_val = RetOpT::isConst() ? ret_op_param. 
template getInt(0) : -1; - #define GET_POS_VALUE(idx) \ do \ { \ diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 6bccf3285c5..57f3f1bbe85 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2265,15 +2265,23 @@ TEST_F(Regexp, RegexpInstr) std::vector positions{}; results = {}; + // test regexp_instr(vector, vector, vector) + std::vector occurs{}; results = {}; + // test regexp_instr(vector, vector, vector, vector) + std::vector return_options{}; results = {}; + // test regexp_instr(vector, vector, vector, vector, vector) + std::vector match_types{}; results = {}; + // test regexp_instr(vector, vector, vector, vector, vector, vector) + // TODO collation } From 02f56a66e59a0160ea1f4806ef408e12658af07b Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 24 Oct 2022 16:23:56 +0800 Subject: [PATCH 31/87] modify the processing of const null etc... --- dbms/src/Functions/FunctionsRegexp.cpp | 52 ------------- dbms/src/Functions/FunctionsRegexp.h | 90 ++++++++++++----------- dbms/src/Functions/IFunction.cpp | 36 ++++----- dbms/src/Functions/IFunction.h | 8 ++ dbms/src/Functions/tests/gtest_regexp.cpp | 49 +++++++++--- 5 files changed, 110 insertions(+), 125 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index d42c49ff242..fc80f3e513d 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -65,58 +65,6 @@ String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) return flags; } -NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) -{ - NullPresence res; - - for (const auto & arg : args) - { - const auto & elem = block.getByPosition(arg); - const auto * col_const = typeid_cast(&(*(elem.column))); - - if (elem.type->getTypeId() == TypeIndex::Nothing) - { - res.has_data_type_nothing = true; - break; - } - - if 
(col_const != nullptr) - { - auto col_const_data = col_const->getDataColumnPtr(); - - // It's needless to check if it's a const nullable column when res.has_const_null has been set - if (!res.has_const_null_col) - { - // check const null - if (col_const_data->isColumnNullable()) - { - if (static_cast(*col_const_data).isNullAt(0)) - res.has_const_null_col = true; - } - } - } - else - { - // It's needless to check if it's a nullable column when res.has_nullable_col has been set - if (!res.has_nullable_col) - { - if ((elem.column)->isColumnNullable()) - { - res.has_nullable_col = true; - - // Check if nullable column wrap a DataTypeNothing type - const auto * type_null = typeid_cast(&(*elem.type)); - const auto & nested_type = type_null->getNestedType(); - if (nested_type->getTypeId() == TypeIndex::Nothing) - res.has_data_type_nothing = true; - } - } - } - } - - return res; -} - /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. * 'replacement' could contain substitutions, for example: '\2-\3-\1' */ diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 5b7bbce7049..e81ea211e33 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -101,15 +101,6 @@ inline int getDefaultFlags() return flags; } -struct NullPresence -{ - bool has_nullable_col = false; - bool has_const_null_col = false; - bool has_data_type_nothing = false; -}; - -NullPresence getNullPresense(const Block & block, const ColumnNumbers & args); - inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String flags = getMatchType(match_type, collator); @@ -328,6 +319,13 @@ class Param , data(str_ref) {} + // const nullable string param + Param(size_t col_size_, const StringRef & str_ref, ConstNullMapPtr null_map_) + : col_size(col_size_) + , null_map(null_map_) + , data(str_ref) + {} + // const int param 
Param(size_t col_size_, Int64 val) : col_size(col_size_) @@ -335,6 +333,13 @@ class Param , data(val) {} + // const nullable int param + Param(size_t col_size_, Int64 val, ConstNullMapPtr null_map_) + : col_size(col_size_) + , null_map(null_map_) + , data(val) + {} + // pure vector string param // chars_ type: ParamImplType::Chars_t // offsets_ type: ParamImplType::Offsets @@ -375,8 +380,7 @@ class Param bool isNullAt(size_t idx) const { - // null_map works only when we are non-const nullable column - if constexpr (is_null && !ParamImplType::isConst()) + if constexpr (is_null) return (*null_map)[idx]; else return false; @@ -433,34 +437,33 @@ class Param } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ - do \ - { \ - size_t col_size = (processed_col)->size(); \ - const auto * col_const = typeid_cast(&(*(processed_col))); \ - if (col_const != nullptr) \ - { \ - auto col_const_data = col_const->getDataColumnPtr(); \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.isNull() ? 
String("") : field.safeGet(); \ - if (col_const_data->isColumnNullable()) \ - { \ - /* This is a const column and it can't be const null column as we should have handled it in the previous */ \ - Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size())); \ - next_process; \ - } \ - else \ - { \ - /* const col */ \ - Param, false>(param_name)(col_size, col_const->getDataAt(0)); \ - next_process; \ - } \ - } \ - else \ - { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ - } \ +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ + do \ + { \ + size_t col_size = (processed_col)->size(); \ + const auto * col_const = typeid_cast(&(*(processed_col))); \ + if (col_const != nullptr) \ + { \ + auto col_const_data = col_const->getDataColumnPtr(); \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.isNull() ? String("") : field.safeGet(); \ + if (col_const_data->isColumnNullable()) \ + { \ + const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ + Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size()), null_map); \ + next_process; \ + } \ + else \ + { \ + Param, false>(param_name)(col_size, col_const->getDataAt(0)); \ + next_process; \ + } \ + } \ + else \ + { \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ + } \ } while (0); class FunctionStringRegexpBase @@ -604,6 +607,12 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && MatchTypeT::isConst()) { + if (col_size == 0 || expr_param.isNullAt(0) || pat_param.isNullAt(0) || match_type_param.isNullAt(0)) + { + res_arg.column = res_arg.type->createColumnConst(col_size, Null()); + return; + } + int flags = getDefaultFlags(); String expr = expr_param.getString(0); String pat = pat_param.getString(0); @@ -734,9 +743,8 @@ class FunctionStringRegexp : public 
FunctionStringRegexpBase const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; - if (null_presence.has_const_null_col || null_presence.has_data_type_nothing) + if (null_presence.has_null_constant) { - // There is a const null column in the input block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); return; } diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index 4ace050f7c6..51ab7cd2a6c 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -98,20 +98,12 @@ ColumnPtr wrapInNullable(const ColumnPtr & src, Block & block, const ColumnNumbe return ColumnNullable::create(src_not_nullable, result_null_map_column); } -struct NullPresence -{ - bool has_nullable = false; - bool has_null_constant = false; -}; - -NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) +NullPresence getNullPresense(const ColumnsWithTypeAndName & args) { NullPresence res; - for (const auto & arg : args) + for (const auto & elem : args) { - const auto & elem = block.getByPosition(arg); - if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -121,12 +113,23 @@ NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) return res; } -NullPresence getNullPresense(const ColumnsWithTypeAndName & args) +bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) +{ + for (auto arg : args) + if (!block.getByPosition(arg).column->isColumnConst()) + return false; + return true; +} +} // namespace + +NullPresence getNullPresense(const Block & block, const ColumnNumbers & args) { NullPresence res; - for (const auto & elem : args) + for (const auto & arg : args) { + const auto & elem = block.getByPosition(arg); + if (!res.has_nullable) res.has_nullable = elem.type->isNullable(); if (!res.has_null_constant) @@ -136,15 +139,6 @@ NullPresence 
getNullPresense(const ColumnsWithTypeAndName & args) return res; } -bool allArgumentsAreConstants(const Block & block, const ColumnNumbers & args) -{ - for (auto arg : args) - if (!block.getByPosition(arg).column->isColumnConst()) - return false; - return true; -} -} // namespace - bool IExecutableFunction::defaultImplementationForConstantArguments(Block & block, const ColumnNumbers & args, size_t result) const { ColumnNumbers arguments_to_remain_constants = getArgumentsThatAreAlwaysConstant(); diff --git a/dbms/src/Functions/IFunction.h b/dbms/src/Functions/IFunction.h index c1bcdc8b151..46287627c21 100644 --- a/dbms/src/Functions/IFunction.h +++ b/dbms/src/Functions/IFunction.h @@ -32,6 +32,14 @@ extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NOT_IMPLEMENTED; } // namespace ErrorCodes +struct NullPresence +{ + bool has_nullable = false; + bool has_null_constant = false; +}; + +NullPresence getNullPresense(const Block &, const ColumnNumbers &); + /// The simplest executable object. /// Motivation: /// * Prepare something heavy once before main execution loop instead of doing it for each block. 
diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index dff218d28fd..24df4483604 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -23,6 +23,7 @@ #include #include +#include "Core/ColumnWithTypeAndName.h" #include "DataTypes/DataTypesNumber.h" #include "common/types.h" @@ -44,6 +45,10 @@ class Regexp : public FunctionTest { return column_with_type.column->isColumnConst() && column_with_type.column->isNullAt(0); } + static bool isColumnConst(const ColumnWithTypeAndName & column_with_type) + { + return column_with_type.column->isColumnConst(); + } static bool isColumnConstNotNull(const ColumnWithTypeAndName & column_with_type) { return column_with_type.column->isColumnConst() && !column_with_type.column->isNullAt(0); @@ -1786,7 +1791,6 @@ TEST_F(Regexp, testRegexpTiDBCase) } // TODO test empty columns -// TODO test const null // We can only test regexp_like function as regexp is the subset of regexp_like TEST_F(Regexp, RegexpLike) { @@ -1820,8 +1824,8 @@ TEST_F(Regexp, RegexpLike) size_t row_size = exprs_nulls.size(); - auto const_uint8_null_column = createConstColumn>(row_size, {}); - auto const_string_null_column = createConstColumn>(row_size, {}); + auto const_uint8_null_column = createOnlyNullColumnConst(row_size); + auto const_string_null_column = createOnlyNullColumnConst(row_size); // case 1. 
regexp_like(const, const [, const]) { @@ -2108,26 +2112,49 @@ TEST_F(Regexp, testRegexpCustomerCases) "^756[0-9]{11}$"; std::vector patterns{pattern, pattern, pattern, pattern, pattern}; std::vector inputs{"73228012343218", "530101343498", "540101323298", "31111191919191", "78200000000000"}; + size_t col_size = inputs.size(); /// columnNothing, columnConstNull, columnConstNotNull, columnVectorNullable, columnVectorNotNull - ColumnsWithTypeAndName input_columns{createOnlyNullColumnConst(5), createConstColumn>(5, {}), createConstColumn>(5, inputs[0]), createConstColumn(5, inputs[0]), createColumn>({inputs[0], {}, {}, inputs[3], inputs[4]}), createColumn(inputs)}; - ColumnsWithTypeAndName pattern_columns{createOnlyNullColumnConst(5), createConstColumn>(5, {}), createConstColumn>(5, patterns[0]), createConstColumn(5, patterns[0]), createColumn>({patterns[0], {}, {}, patterns[3], patterns[4]}), createColumn(patterns)}; + ColumnsWithTypeAndName input_columns{createOnlyNullColumnConst(col_size), createConstColumn>(col_size, {}), createConstColumn>(col_size, inputs[0]), createConstColumn(col_size, inputs[0]), createColumn>({inputs[0], {}, {}, inputs[3], inputs[4]}), createColumn(inputs)}; + ColumnsWithTypeAndName pattern_columns{createOnlyNullColumnConst(col_size), createConstColumn>(col_size, patterns[0]), createConstColumn(col_size, patterns[0]), createColumn>({patterns[0], {}, {}, patterns[3], patterns[4]}), createColumn(patterns)}; + for (const auto & input_column : input_columns) { for (const auto & pattern_column : pattern_columns) { if (input_column.type->onlyNull() || pattern_column.type->onlyNull()) { - ASSERT_COLUMN_EQ(createOnlyNullColumnConst(5), + ASSERT_COLUMN_EQ(createOnlyNullColumnConst(col_size), executeFunction("regexp", input_column, pattern_column)); } - else if (isColumnConstNull(input_column) || isColumnConstNull(pattern_column)) + else if (isColumnConst(input_column) && isColumnConst(pattern_column)) // All columns are const { - 
ASSERT_COLUMN_EQ(createConstColumn>(5, {}), - executeFunction("regexp", input_column, pattern_column)); + if (isColumnConstNull(input_column) && isColumnConstNull(pattern_column)) + { + ASSERT_COLUMN_EQ(createOnlyNullColumnConst(col_size, {}), + executeFunction("regexp", input_column, pattern_column)); + } + else if ((isColumnConstNotNull(input_column) && isColumnConstNotNull(pattern_column))) + { + ASSERT_COLUMN_EQ(createConstColumn(col_size, 1), + executeFunction("regexp", input_column, pattern_column)); + } + else if (isColumnConstNull(input_column) || isColumnConstNull(pattern_column)) + { + DataTypePtr data_type = std::make_shared(std::make_shared>()); + auto col = data_type->createColumnConst(col_size, Null()); + + ASSERT_COLUMN_EQ(ColumnWithTypeAndName(std::move(col), data_type, ""), + executeFunction("regexp", input_column, pattern_column)); + } } - else if (isColumnConstNotNull(input_column) && isColumnConstNotNull(pattern_column)) + else if (isColumnConstNull(input_column) || isColumnConstNull(pattern_column)) { - ASSERT_COLUMN_EQ(createConstColumn(5, 1), + DataTypePtr data_type = std::make_shared(std::make_shared>()); + auto col = data_type->createColumn(); + for (size_t i = 0; i < col_size; i++) + col->insert(Null()); + + ASSERT_COLUMN_EQ(ColumnWithTypeAndName(std::move(col), data_type, ""), executeFunction("regexp", input_column, pattern_column)); } else From 8f89448a0dfc4564017a9961eefb5f0127d2c07f Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 24 Oct 2022 16:42:42 +0800 Subject: [PATCH 32/87] refine macro name --- dbms/src/Functions/FunctionsRegexp.h | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index e81ea211e33..56e485351a8 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -414,34 +414,34 @@ class Param #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process // Common method 
to convert nullable string column -// processed_col is impossible to be const here -#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ +// converted_col is impossible to be const here +#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ do \ { \ - size_t col_size = (processed_col)->size(); \ - if (((processed_col)->isColumnNullable())) \ + size_t col_size = (converted_col)->size(); \ + if (((converted_col)->isColumnNullable())) \ { \ - auto nested_ptr = static_cast(*(processed_col)).getNestedColumnPtr(); \ + auto nested_ptr = static_cast(*(converted_col)).getNestedColumnPtr(); \ const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(processed_col)).getNullMapData()); \ + const auto * null_map = &(static_cast(*(converted_col)).getNullMapData()); \ Param, true>(param_name)(col_size, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_process; \ + next_convertion; \ } \ else \ { \ /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(processed_col))); \ + const auto * tmp = checkAndGetColumn(&(*(converted_col))); \ Param, false>(param_name)(col_size, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_process; \ + next_convertion; \ } \ } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, processed_col, next_process) \ +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ do \ { \ - size_t col_size = (processed_col)->size(); \ - const auto * col_const = typeid_cast(&(*(processed_col))); \ + size_t col_size = (converted_col)->size(); \ + const auto * col_const = typeid_cast(&(*(converted_col))); \ if (col_const != nullptr) \ { \ auto col_const_data = col_const->getDataColumnPtr(); \ @@ -452,17 +452,17 @@ class Param { \ const auto * null_map = 
&(static_cast(*(col_const_data)).getNullMapData()); \ Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size()), null_map); \ - next_process; \ + next_convertion; \ } \ else \ { \ Param, false>(param_name)(col_size, col_const->getDataAt(0)); \ - next_process; \ + next_convertion; \ } \ } \ else \ { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (processed_col), next_process) \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col), next_convertion) \ } \ } while (0); From 28c14a64e1f1e80cbcef5ab6c8906b170b808c9d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 26 Oct 2022 11:31:41 +0800 Subject: [PATCH 33/87] pass collation --- dbms/src/Functions/tests/gtest_regexp.cpp | 84 ++++++++++++++++------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 0772df42cc1..1524d88de2b 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2197,6 +2197,17 @@ struct RegexpInstrCase , match_type(mt) {} + RegexpInstrCase(Int64 res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, Int64 ret_op = 0, const String & mt = "") + : result(res) + , null_map(null_map_) + , expression(expr) + , pattern(pat) + , position(pos) + , occurrence(occur) + , return_option(ret_op) + , match_type(mt) + {} + static std::vector getResultVec(const std::vector & test_cases) { std::vector vecs; @@ -2267,7 +2278,26 @@ struct RegexpInstrCase return vecs; } + static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) + { + results = getResultVec(test_cases); + switch (param_num) { + case 6: + match_types = getMatchTypeVec(test_cases); + case 5: + ret_ops = getRetOpVec(test_cases); + case 4: + 
occurs = getOccurVec(test_cases); + case 3: + positions = getPosVec(test_cases); + case 2: + pats = getPatVec(test_cases); + exprs = getExprVec(test_cases); + } + } + Int64 result; + std::vector null_map; String expression; String pattern; Int64 position; @@ -2402,9 +2432,7 @@ TEST_F(Regexp, RegexpInstr) {1, "", "^$"}, {0, "ab\naB", "^ab$"}, {3, "pp跑ppのaaa", "(跑|の|P)"}}; - exprs = RegexpInstrCase::getExprVec(test_cases); - patterns = RegexpInstrCase::getPatVec(test_cases); - results = RegexpInstrCase::getResultVec(test_cases); + RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2420,10 +2448,7 @@ TEST_F(Regexp, RegexpInstr) {3, "", "^$", 3}, {0, "ab\naB", "^ab$", 1}, {3, "pp跑ppのaaa", "(跑|の|P)", 2}}; - exprs = RegexpInstrCase::getExprVec(test_cases); - patterns = RegexpInstrCase::getPatVec(test_cases); - positions = RegexpInstrCase::getPosVec(test_cases); - results = RegexpInstrCase::getResultVec(test_cases); + RegexpInstrCase::setVecsWithoutNullMap(3, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2439,11 +2464,7 @@ TEST_F(Regexp, RegexpInstr) {0, "\n", ".", 1, 1}, {0, "", "^$", 3, 2}, {0, "ab\naB", "^ab$", 1, 1}, {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; - exprs = RegexpInstrCase::getExprVec(test_cases); - patterns = RegexpInstrCase::getPatVec(test_cases); - positions = RegexpInstrCase::getPosVec(test_cases); - occurs = RegexpInstrCase::getOccurVec(test_cases); - results = RegexpInstrCase::getResultVec(test_cases); + RegexpInstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2461,12 +2482,7 @@ TEST_F(Regexp, RegexpInstr) {0, "", "^$", 3, 2, 1}, {0, "ab\naB", 
"^ab$", 1, 1, 1}, {7, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1}}; - exprs = RegexpInstrCase::getExprVec(test_cases); - patterns = RegexpInstrCase::getPatVec(test_cases); - positions = RegexpInstrCase::getPosVec(test_cases); - occurs = RegexpInstrCase::getOccurVec(test_cases); - return_options = RegexpInstrCase::getRetOpVec(test_cases); - results = RegexpInstrCase::getResultVec(test_cases); + RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2483,14 +2499,9 @@ TEST_F(Regexp, RegexpInstr) {7, "aaaaaa", "aa", 3, 2, 1, ""}, {2, "\n", ".", 1, 1, 1, "s"}, {0, "", "^$", 3, 2, 1, ""}, - {3, "ab\naB", "^ab$", 1, 1, 1, "mi"}, + {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; - exprs = RegexpInstrCase::getExprVec(test_cases); - patterns = RegexpInstrCase::getPatVec(test_cases); - positions = RegexpInstrCase::getPosVec(test_cases); - occurs = RegexpInstrCase::getOccurVec(test_cases); - return_options = RegexpInstrCase::getRetOpVec(test_cases); - match_types = RegexpInstrCase::getMatchTypeVec(test_cases); + RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); results = RegexpInstrCase::getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( @@ -2502,7 +2513,28 @@ TEST_F(Regexp, RegexpInstr) createColumn(return_options), createColumn(match_types))); - // TODO collation + // test collation + const auto * utf8mb4_general_ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + test_cases = {{2, "ttiFl", "tifl", 1, 1, 0, ""}, + {0, "ttiFl", "tifl", 1, 1, 0, "c"}, + {2, "ttiFl", "tifl", 1, 1, 0, "i"}, + {2, "ttiFl", "tifl", 1, 1, 0, "ci"}, + {0, "ttiFl", "tifl", 1, 1, 0, "ic"}, + {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, + {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; + 
RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + results = RegexpInstrCase::getResultVec(test_cases); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + {createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createColumn(match_types)}, + utf8mb4_general_ci_collator)); + } // Test: Invalid parameter handling From 3400eb60d9471bc086164535effa91d52c7bc073 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 26 Oct 2022 14:52:52 +0800 Subject: [PATCH 34/87] unport replace and make memorization in multi-threads --- dbms/src/Functions/FunctionsRegexp.cpp | 786 +------------------ dbms/src/Functions/FunctionsRegexp.h | 292 +------ dbms/src/Functions/FunctionsStringReplace.h | 295 +++++++ dbms/src/Functions/FunctionsStringSearch.cpp | 736 ++++++++++++++++- dbms/src/Functions/re2Util.cpp | 4 +- dbms/src/Functions/re2Util.h | 2 +- dbms/src/Functions/tests/gtest_regexp.cpp | 2 +- 7 files changed, 1074 insertions(+), 1043 deletions(-) create mode 100644 dbms/src/Functions/FunctionsStringReplace.h diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index fc80f3e513d..5d5643fa581 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -34,36 +34,36 @@ std::set valid_flags{flag_i, flag_c, flag_m, flag_s}; // If characters specifying contradictory options are specified // within match_type, the rightmost one takes precedence. 
-String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) -{ - std::set applied_flags; - if (collator != nullptr && collator->isCI()) - applied_flags.insert(flag_i); - - for (auto flag : match_type) - { - auto iter = valid_flags.find(flag); - if (iter == valid_flags.end()) - throw Exception(fmt::format("Invalid match type '{}' in regexp function", flag)); - - // re2 is case-sensitive by default, so we only need to delete 'i' flag - // to enable the case-sensitive for the regexp - if (flag == flag_c) - { - applied_flags.erase(flag_i); - continue; - } - - applied_flags.insert(flag); - } - - // generate match type flag - String flags; - for (auto flag : applied_flags) - flags += flag; - - return flags; -} +// String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) +// { +// std::set applied_flags; +// if (collator != nullptr && collator->isCI()) +// applied_flags.insert(flag_i); + +// for (auto flag : match_type) +// { +// auto iter = valid_flags.find(flag); +// if (iter == valid_flags.end()) +// throw Exception(fmt::format("Invalid match type '{}' in regexp function", flag)); + +// // re2 is case-sensitive by default, so we only need to delete 'i' flag +// // to enable the case-sensitive for the regexp +// if (flag == flag_c) +// { +// applied_flags.erase(flag_i); +// continue; +// } + +// applied_flags.insert(flag); +// } + +// // generate match type flag +// String flags; +// for (auto flag : applied_flags) +// flags += flag; + +// return flags; +// } /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. * 'replacement' could contain substitutions, for example: '\2-\3-\1' @@ -329,739 +329,13 @@ struct ReplaceRegexpImpl } }; -/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. 
- */ -template -struct ReplaceStringImpl -{ - static constexpr bool support_non_const_needle = true; - static constexpr bool support_non_const_replacement = true; - /// need customized escape char during the string search - static const bool need_customized_escape_char = false; - /// support match type during the string search, used in regexp - static const bool support_match_type = false; - - static void vector(const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); - - if (needle.empty()) - { - /// Copy all the data without changing. - res_data.resize(data.size()); - memcpy(&res_data[0], begin, data.size()); - memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); - return; - } - - /// The current index in the array of strings. - size_t i = 0; - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - - /// Determine which index it belongs to. 
- while (i < offsets.size() && begin + offsets[i] <= match) - { - res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); - ++i; - } - res_offset += (match - pos); - - /// If you have reached the end, it's time to stop - if (i == offsets.size()) - break; - - /// Is it true that this line no longer needs to perform transformations. - bool can_finish_current_string = false; - - /// We check that the entry does not go through the boundaries of strings. - if (match + needle.size() < begin + offsets[i]) - { - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle.size(); - if (replace_one) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - res_data.resize(res_data.size() + (begin + offsets[i] - pos)); - memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); - res_offset += (begin + offsets[i] - pos); - res_offsets[i] = res_offset; - pos = begin + offsets[i]; - ++i; - } - } - } - - static void vectorNonConstNeedle( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(offsets.size()); - - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < offsets.size(); ++i) - { - auto data_offset = StringUtil::offsetAt(offsets, i); - auto data_size = StringUtil::sizeAt(offsets, i); - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the 
trailing zero - - const UInt8 * begin = &data[data_offset]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data_size; - - if (needle_size == 0) - { - /// Copy the whole data to res without changing - res_data.resize(res_data.size() + data_size); - memcpy(&res_data[res_offset], begin, data_size); - res_offset += data_size; - res_offsets[i] = res_offset; - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - if (match == end) - { - /// It's time to stop. - break; - } - - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle_size; - - if (replace_one) - { - /// Copy the rest of data and stop. 
- res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, (end - pos)); - res_offset += (end - pos); - break; - } - } - res_offsets[i] = res_offset; - } - } - - static void vectorNonConstReplacement( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); - - if (needle.empty()) - { - /// Copy all the data without changing. - res_data.resize(data.size()); - memcpy(&res_data[0], begin, data.size()); - memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); - return; - } - - /// The current index in the array of strings. - size_t i = 0; - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - - /// Determine which index it belongs to. - while (i < offsets.size() && begin + offsets[i] <= match) - { - res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); - ++i; - } - res_offset += (match - pos); - - /// If you have reached the end, it's time to stop - if (i == offsets.size()) - break; - - /// Is it true that this line no longer needs to perform transformations. 
- bool can_finish_current_string = false; - - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - /// We check that the entry does not go through the boundaries of strings. - if (match + needle.size() < begin + offsets[i]) - { - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle.size(); - if (replace_one) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - res_data.resize(res_data.size() + (begin + offsets[i] - pos)); - memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); - res_offset += (begin + offsets[i] - pos); - res_offsets[i] = res_offset; - pos = begin + offsets[i]; - ++i; - } - } - } - - static void vectorNonConstNeedleReplacement( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(offsets.size()); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < offsets.size(); ++i) - { - auto data_offset = StringUtil::offsetAt(offsets, i); - auto data_size = StringUtil::sizeAt(offsets, i); - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - - auto replacement_offset = 
StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - const UInt8 * begin = &data[data_offset]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data_size; - - if (needle_size == 0) - { - res_data.resize(res_data.size() + data_size); - memcpy(&res_data[res_offset], begin, data_size); - res_offset += data_size; - res_offsets[i] = res_offset; - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy the data without changing. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - if (match == end) - { - /// It's time to stop. - break; - } - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle_size; - - if (replace_one) - { - /// Copy the rest of data and stop. - res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, (end - pos)); - res_offset += (end - pos); - break; - } - } - res_offsets[i] = res_offset; - } - } - - /// Note: this function converts fixed-length strings to variable-length strings - /// and each variable-length string should ends with zero byte. 
- static void vectorFixed(const ColumnString::Chars_t & data, - size_t n, - const std::string & needle, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - - /// The current index in the string array. - size_t i = 0; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = begin + n * (i + 1) - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = begin + n * (i + 1); \ - ++i; \ - } while (false) - - if (needle.empty()) - { - /// Copy all the data without changing. - while (i < count) - { - COPY_REST_OF_CURRENT_STRING(); - } - return; - } - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy skipped strings without any changes but - /// add zero byte to the end of each string. - while (i < count && begin + n * (i + 1) <= match) - { - COPY_REST_OF_CURRENT_STRING(); - } - - /// If you have reached the end, it's time to stop - if (i == count) - break; - - /// Copy unchanged part of current string. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += (match - pos); - - /// Is it true that this line no longer needs to perform conversions. 
- bool can_finish_current_string = false; - - /// We check that the entry does not pass through the boundaries of strings. - if (match + needle.size() <= begin + n * (i + 1)) - { - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle.size(); - if (replace_one || pos == begin + n * (i + 1)) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - COPY_REST_OF_CURRENT_STRING(); - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstNeedle( - const ColumnString::Chars_t & data, - size_t n, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const std::string & replacement, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < count; ++i) - { - const UInt8 * begin = &data[i * n]; - const UInt8 * pos = begin; - const UInt8 * end = pos + n; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = end - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = end; \ - } while (false) - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - if (needle_size == 0) - { - COPY_REST_OF_CURRENT_STRING(); - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); - while 
(pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - if (match == end) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - res_data.resize(res_data.size() + replacement.size()); - memcpy(&res_data[res_offset], replacement.data(), replacement.size()); - res_offset += replacement.size(); - pos = match + needle_size; - - if (replace_one) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstReplacement( - const ColumnString::Chars_t & data, - size_t n, - const std::string & needle, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - const UInt8 * begin = &data[0]; - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); - - ColumnString::Offset res_offset = 0; - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - - /// The current index in the string array. - size_t i = 0; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = begin + n * (i + 1) - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = begin + n * (i + 1); \ - ++i; \ - } while (false) - - if (needle.empty()) - { - /// Copy all the data without changing. 
- while (i < count) - { - COPY_REST_OF_CURRENT_STRING(); - } - return; - } - - Volnitsky searcher(needle.data(), needle.size(), end - pos); - - /// We will search for the next occurrence in all rows at once. - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - /// Copy skipped strings without any changes but - /// add zero byte to the end of each string. - while (i < count && begin + n * (i + 1) <= match) - { - COPY_REST_OF_CURRENT_STRING(); - } - - /// If you have reached the end, it's time to stop - if (i == count) - break; - - /// Copy unchanged part of current string. - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += (match - pos); - - /// Is it true that this line no longer needs to perform conversions. - bool can_finish_current_string = false; - - /// We check that the entry does not pass through the boundaries of strings. - if (match + needle.size() <= begin + n * (i + 1)) - { - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle.size(); - if (replace_one || pos == begin + n * (i + 1)) - can_finish_current_string = true; - } - else - { - pos = match; - can_finish_current_string = true; - } - - if (can_finish_current_string) - { - COPY_REST_OF_CURRENT_STRING(); - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void vectorFixedNonConstNeedleReplacement( - const ColumnString::Chars_t & data, - size_t n, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const 
Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < count; ++i) - { - const UInt8 * begin = &data[i * n]; - const UInt8 * pos = begin; - const UInt8 * end = pos + n; - -#define COPY_REST_OF_CURRENT_STRING() \ - do \ - { \ - const size_t len = end - pos; \ - res_data.resize(res_data.size() + len + 1); \ - memcpy(&res_data[res_offset], pos, len); \ - res_offset += len; \ - res_data[res_offset++] = 0; \ - res_offsets[i] = res_offset; \ - pos = end; \ - } while (false) - - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - - if (needle_size == 0) - { - COPY_REST_OF_CURRENT_STRING(); - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); - while (pos < end) - { - const UInt8 * match = searcher.search(pos, end - pos); - - if (match == end) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - - /// Copy the data without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos = match + needle_size; - - if (replace_one) - { - COPY_REST_OF_CURRENT_STRING(); - break; - } - } -#undef COPY_REST_OF_CURRENT_STRING - } - } - - static void constant(const std::string & data, const std::string & needle, const 
std::string & replacement, const Int64 & /* pos */, const Int64 & /* occ */, const std::string & /* match_type */, TiDB::TiDBCollatorPtr /* collator */, std::string & res_data) - { - if (needle.empty()) - { - res_data = data; - return; - } - res_data = ""; - int replace_cnt = 0; - for (size_t i = 0; i < data.size(); ++i) - { - bool match = true; - if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0)) - match = false; - for (size_t j = 0; match && j < needle.size(); ++j) - if (data[i + j] != needle[j]) - match = false; - if (match) - { - ++replace_cnt; - res_data += replacement; - i = i + needle.size() - 1; - } - else - res_data += data[i]; - } - } -}; - using FunctionTiDBRegexp = FunctionStringRegexp; using FunctionRegexpLike = FunctionStringRegexp; -using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; -using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; void registerFunctionsRegexp(FunctionFactory & factory) { - factory.registerFunction(); - factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 56e485351a8..2f0ffe649cf 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -72,14 +73,6 @@ struct NameRegexpLike { static constexpr auto name = "regexp_like"; }; -struct NameReplaceOne -{ - static constexpr auto name = "replaceOne"; -}; -struct NameReplaceAll -{ - static constexpr auto name = "replaceAll"; -}; struct NameReplaceRegexpOne { static constexpr auto name = "replaceRegexpOne"; @@ -92,7 +85,7 @@ struct NameReplaceRegexpAll static constexpr std::string_view regexp_name(NameTiDBRegexp::name); 
static constexpr std::string_view regexp_like_name(NameRegexpLike::name); -String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); +// String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); inline int getDefaultFlags() { @@ -103,10 +96,10 @@ inline int getDefaultFlags() inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { - String flags = getMatchType(match_type, collator); - if (flags.empty()) + String mode = re2Util::getRE2ModeModifiers(match_type, collator); + if (mode.empty()) return pattern; - return fmt::format("(?{}){}", flags, pattern); + return fmt::format("{}{}", mode, pattern); } inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) @@ -479,7 +472,7 @@ class FunctionStringRegexpBase static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5; template - void memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const + std::unique_ptr memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { String final_pattern = pat_param.getString(0); if (unlikely(final_pattern.empty())) @@ -489,7 +482,7 @@ class FunctionStringRegexpBase final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); int flags = getDefaultFlags(); - memorized_re = std::make_unique(final_pattern, flags); + return std::make_unique(final_pattern, flags); } // Check if we can memorize the regexp @@ -499,8 +492,6 @@ class FunctionStringRegexpBase return (PatT::isConst() && MatchTypeT::isConst()); } - bool isMemorized() const { return memorized_re != nullptr; } - const std::unique_ptr & getRegexp() const { return memorized_re; } private: @@ -627,10 +618,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase return; } - // Check memorization - if 
constexpr (canMemorize()) - memorize(pat_param, match_type_param, collator); - // Initialize result column auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); @@ -639,9 +626,9 @@ class FunctionStringRegexp : public FunctionStringRegexpBase constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || MatchTypeT::isNullableCol(); // Start to match - if (isMemorized()) + if constexpr (canMemorize()) { - const auto & regexp = getRegexp(); + const auto & regexp = memorize(pat_param, match_type_param, collator); if constexpr (has_nullable_col) { // expr column must be a nullable column here, so we need to check null for each elems @@ -801,267 +788,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef CONVERT_MATCH_TYPE_COL_TO_PARAM #undef EXECUTE_REGEXP_LIKE -template -class FunctionStringReplace : public IFunction -{ -public: - static constexpr auto name = Name::name; - static FunctionPtr create(const Context &) - { - return std::make_shared(); - } - - String getName() const override - { - return name; - } - - size_t getNumberOfArguments() const override - { - return 0; - } - - bool isVariadic() const override { return true; } - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - return {3, 4, 5}; - } - else if constexpr (Impl::support_non_const_needle) - { - return {2, 3, 4, 5}; - } - else if constexpr (Impl::support_non_const_replacement) - { - return {1, 3, 4, 5}; - } - else - { - return {1, 2, 3, 4, 5}; - } - } - void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (!arguments[0]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[0]->getName() 
+ " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (!arguments[1]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (!arguments[2]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 3 && !arguments[3]->isInteger()) - throw Exception("Illegal type " + arguments[2]->getName() + " of forth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 4 && !arguments[4]->isInteger()) - throw Exception("Illegal type " + arguments[2]->getName() + " of fifth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - if (arguments.size() > 5 && !arguments[5]->isStringOrFixedString()) - throw Exception("Illegal type " + arguments[2]->getName() + " of sixth argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - return std::make_shared(); - } - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override - { - const ColumnPtr & column_src = block.getByPosition(arguments[0]).column; - const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; - const ColumnPtr & column_replacement = block.getByPosition(arguments[2]).column; - const ColumnPtr column_pos = arguments.size() > 3 ? block.getByPosition(arguments[3]).column : nullptr; - const ColumnPtr column_occ = arguments.size() > 4 ? block.getByPosition(arguments[4]).column : nullptr; - const ColumnPtr column_match_type = arguments.size() > 5 ? 
block.getByPosition(arguments[5]).column : nullptr; - - if ((column_pos != nullptr && !column_pos->isColumnConst()) - || (column_occ != nullptr && !column_occ->isColumnConst()) - || (column_match_type != nullptr && !column_match_type->isColumnConst())) - throw Exception("4th, 5th, 6th arguments of function " + getName() + " must be constants."); - Int64 pos = column_pos == nullptr ? 1 : typeid_cast(column_pos.get())->getInt(0); - Int64 occ = column_occ == nullptr ? 0 : typeid_cast(column_occ.get())->getInt(0); - String match_type = column_match_type == nullptr ? "" : typeid_cast(column_match_type.get())->getValue(); - - ColumnWithTypeAndName & column_result = block.getByPosition(result); - - bool needle_const = column_needle->isColumnConst(); - bool replacement_const = column_replacement->isColumnConst(); - - if (needle_const && replacement_const) - { - executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else if (needle_const) - { - executeImplNonConstReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else if (replacement_const) - { - executeImplNonConstNeedle(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - else - { - executeImplNonConstNeedleReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); - } - } - -private: - void executeImpl( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos, - Int64 occ, - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - const auto * c1_const = typeid_cast(column_needle.get()); - const auto * c2_const = typeid_cast(column_replacement.get()); - auto needle = c1_const->getValue(); - auto replacement = c2_const->getValue(); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - 
Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - - void executeImplNonConstNeedle( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement_const = typeid_cast(column_replacement.get()); - auto replacement = col_replacement_const->getValue(); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of 
first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 2 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplNonConstReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_replacement) - { - const auto * col_needle_const = typeid_cast(column_needle.get()); - auto needle = col_needle_const->getValue(); - const auto * col_replacement = typeid_cast(column_replacement.get()); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplNonConstNeedleReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & 
match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - - if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - TiDB::TiDBCollatorPtr collator{}; -}; - #undef CONVERT_CONST_STR_COL_TO_PARAM #undef CONVERT_NULL_STR_COL_TO_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h new file mode 100644 index 00000000000..5bb39f52902 --- /dev/null +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -0,0 +1,295 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "Columns/IColumn.h" +#include "Common/Exception.h" +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_COLUMN; +} // namespace ErrorCodes + +template +class FunctionStringReplace : public IFunction +{ +public: + static constexpr auto name = Name::name; + static FunctionPtr create(const Context &) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override + { + return 0; + } + + bool isVariadic() const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + return {3, 4, 5}; + } + else if constexpr (Impl::support_non_const_needle) + { + return {2, 3, 4, 5}; + } + else if constexpr (Impl::support_non_const_replacement) + { + return {1, 3, 4, 5}; + } + else + { + return {1, 2, 3, 4, 5}; + } + } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!arguments[0]->isStringOrFixedString()) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (!arguments[1]->isStringOrFixedString()) + throw Exception("Illegal type " + 
arguments[1]->getName() + " of second argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (!arguments[2]->isStringOrFixedString()) + throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (arguments.size() > 3 && !arguments[3]->isInteger()) + throw Exception("Illegal type " + arguments[2]->getName() + " of forth argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (arguments.size() > 4 && !arguments[4]->isInteger()) + throw Exception("Illegal type " + arguments[2]->getName() + " of fifth argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (arguments.size() > 5 && !arguments[5]->isStringOrFixedString()) + throw Exception("Illegal type " + arguments[2]->getName() + " of sixth argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + const ColumnPtr & column_src = block.getByPosition(arguments[0]).column; + const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; + const ColumnPtr & column_replacement = block.getByPosition(arguments[2]).column; + const ColumnPtr column_pos = arguments.size() > 3 ? block.getByPosition(arguments[3]).column : nullptr; + const ColumnPtr column_occ = arguments.size() > 4 ? block.getByPosition(arguments[4]).column : nullptr; + const ColumnPtr column_match_type = arguments.size() > 5 ? block.getByPosition(arguments[5]).column : nullptr; + + if ((column_pos != nullptr && !column_pos->isColumnConst()) + || (column_occ != nullptr && !column_occ->isColumnConst()) + || (column_match_type != nullptr && !column_match_type->isColumnConst())) + throw Exception("4th, 5th, 6th arguments of function " + getName() + " must be constants."); + Int64 pos = column_pos == nullptr ? 
1 : typeid_cast(column_pos.get())->getInt(0); + Int64 occ = column_occ == nullptr ? 0 : typeid_cast(column_occ.get())->getInt(0); + String match_type = column_match_type == nullptr ? "" : typeid_cast(column_match_type.get())->getValue(); + + ColumnWithTypeAndName & column_result = block.getByPosition(result); + + bool needle_const = column_needle->isColumnConst(); + bool replacement_const = column_replacement->isColumnConst(); + + if (needle_const && replacement_const) + { + executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else if (needle_const) + { + executeImplNonConstReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else if (replacement_const) + { + executeImplNonConstNeedle(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + else + { + executeImplNonConstNeedleReplacement(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); + } + } + +private: + void executeImpl( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos, + Int64 occ, + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + const auto * c1_const = typeid_cast(column_needle.get()); + const auto * c2_const = typeid_cast(column_replacement.get()); + auto needle = c1_const->getValue(); + auto replacement = c2_const->getValue(); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, 
col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + + void executeImplNonConstNeedle( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement_const = typeid_cast(column_replacement.get()); + auto replacement = col_replacement_const->getValue(); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 2 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplNonConstReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + 
const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_replacement) + { + const auto * col_needle_const = typeid_cast(column_needle.get()); + auto needle = col_needle_const->getValue(); + const auto * col_replacement = typeid_cast(column_replacement.get()); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplNonConstNeedleReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + 
Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + TiDB::TiDBCollatorPtr collator{}; +}; +} // namespace DB diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index d4f6281b7b5..9dfb9e30ca7 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -767,6 +768,728 @@ struct ExtractImpl } }; +/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. 
+ */ +template +struct ReplaceStringImpl +{ + static constexpr bool support_non_const_needle = true; + static constexpr bool support_non_const_replacement = true; + /// need customized escape char during the string search + static const bool need_customized_escape_char = false; + /// support match type during the string search, used in regexp + static const bool support_match_type = false; + + static void vector(const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size()); + size_t size = offsets.size(); + res_offsets.resize(size); + + if (needle.empty()) + { + /// Copy all the data without changing. + res_data.resize(data.size()); + memcpy(&res_data[0], begin, data.size()); + memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); + return; + } + + /// The current index in the array of strings. + size_t i = 0; + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + + /// Determine which index it belongs to. 
+ while (i < offsets.size() && begin + offsets[i] <= match) + { + res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); + ++i; + } + res_offset += (match - pos); + + /// If you have reached the end, it's time to stop + if (i == offsets.size()) + break; + + /// Is it true that this line no longer needs to perform transformations. + bool can_finish_current_string = false; + + /// We check that the entry does not go through the boundaries of strings. + if (match + needle.size() < begin + offsets[i]) + { + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle.size(); + if (replace_one) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + res_data.resize(res_data.size() + (begin + offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); + res_offset += (begin + offsets[i] - pos); + res_offsets[i] = res_offset; + pos = begin + offsets[i]; + ++i; + } + } + } + + static void vectorNonConstNeedle( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(offsets.size()); + + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + auto data_offset = StringUtil::offsetAt(offsets, i); + auto data_size = StringUtil::sizeAt(offsets, i); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the 
trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + if (needle_size == 0) + { + /// Copy the whole data to res without changing + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + /// It's time to stop. + break; + } + + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle_size; + + if (replace_one) + { + /// Copy the rest of data and stop. 
+ res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + } + res_offsets[i] = res_offset; + } + } + + static void vectorNonConstReplacement( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size()); + size_t size = offsets.size(); + res_offsets.resize(size); + + if (needle.empty()) + { + /// Copy all the data without changing. + res_data.resize(data.size()); + memcpy(&res_data[0], begin, data.size()); + memcpy(&res_offsets[0], &offsets[0], size * sizeof(UInt64)); + return; + } + + /// The current index in the array of strings. + size_t i = 0; + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + + /// Determine which index it belongs to. + while (i < offsets.size() && begin + offsets[i] <= match) + { + res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); + ++i; + } + res_offset += (match - pos); + + /// If you have reached the end, it's time to stop + if (i == offsets.size()) + break; + + /// Is it true that this line no longer needs to perform transformations. 
+ bool can_finish_current_string = false; + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + /// We check that the entry does not go through the boundaries of strings. + if (match + needle.size() < begin + offsets[i]) + { + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle.size(); + if (replace_one) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + res_data.resize(res_data.size() + (begin + offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); + res_offset += (begin + offsets[i] - pos); + res_offsets[i] = res_offset; + pos = begin + offsets[i]; + ++i; + } + } + } + + static void vectorNonConstNeedleReplacement( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(offsets.size()); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + auto data_offset = StringUtil::offsetAt(offsets, i); + auto data_size = StringUtil::sizeAt(offsets, i); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + auto replacement_offset = 
StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + if (needle_size == 0) + { + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + /// It's time to stop. + break; + } + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle_size; + + if (replace_one) + { + /// Copy the rest of data and stop. + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + } + res_offsets[i] = res_offset; + } + } + + /// Note: this function converts fixed-length strings to variable-length strings + /// and each variable-length string should ends with zero byte. 
+ static void vectorFixed(const ColumnString::Chars_t & data, + size_t n, + const std::string & needle, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + + /// The current index in the string array. + size_t i = 0; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = begin + n * (i + 1) - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = begin + n * (i + 1); \ + ++i; \ + } while (false) + + if (needle.empty()) + { + /// Copy all the data without changing. + while (i < count) + { + COPY_REST_OF_CURRENT_STRING(); + } + return; + } + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy skipped strings without any changes but + /// add zero byte to the end of each string. + while (i < count && begin + n * (i + 1) <= match) + { + COPY_REST_OF_CURRENT_STRING(); + } + + /// If you have reached the end, it's time to stop + if (i == count) + break; + + /// Copy unchanged part of current string. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += (match - pos); + + /// Is it true that this line no longer needs to perform conversions. 
+ bool can_finish_current_string = false; + + /// We check that the entry does not pass through the boundaries of strings. + if (match + needle.size() <= begin + n * (i + 1)) + { + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle.size(); + if (replace_one || pos == begin + n * (i + 1)) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + COPY_REST_OF_CURRENT_STRING(); + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstNeedle( + const ColumnString::Chars_t & data, + size_t n, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < count; ++i) + { + const UInt8 * begin = &data[i * n]; + const UInt8 * pos = begin; + const UInt8 * end = pos + n; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = end - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = end; \ + } while (false) + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + if (needle_size == 0) + { + COPY_REST_OF_CURRENT_STRING(); + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); + while 
(pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + if (match == end) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle_size; + + if (replace_one) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstReplacement( + const ColumnString::Chars_t & data, + size_t n, + const std::string & needle, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data.size(); + + ColumnString::Offset res_offset = 0; + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + + /// The current index in the string array. + size_t i = 0; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = begin + n * (i + 1) - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = begin + n * (i + 1); \ + ++i; \ + } while (false) + + if (needle.empty()) + { + /// Copy all the data without changing. 
+ while (i < count) + { + COPY_REST_OF_CURRENT_STRING(); + } + return; + } + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy skipped strings without any changes but + /// add zero byte to the end of each string. + while (i < count && begin + n * (i + 1) <= match) + { + COPY_REST_OF_CURRENT_STRING(); + } + + /// If you have reached the end, it's time to stop + if (i == count) + break; + + /// Copy unchanged part of current string. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += (match - pos); + + /// Is it true that this line no longer needs to perform conversions. + bool can_finish_current_string = false; + + /// We check that the entry does not pass through the boundaries of strings. + if (match + needle.size() <= begin + n * (i + 1)) + { + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle.size(); + if (replace_one || pos == begin + n * (i + 1)) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } + + if (can_finish_current_string) + { + COPY_REST_OF_CURRENT_STRING(); + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void vectorFixedNonConstNeedleReplacement( + const ColumnString::Chars_t & data, + size_t n, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const 
Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + size_t count = data.size() / n; + res_data.reserve(data.size()); + res_offsets.resize(count); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < count; ++i) + { + const UInt8 * begin = &data[i * n]; + const UInt8 * pos = begin; + const UInt8 * end = pos + n; + +#define COPY_REST_OF_CURRENT_STRING() \ + do \ + { \ + const size_t len = end - pos; \ + res_data.resize(res_data.size() + len + 1); \ + memcpy(&res_data[res_offset], pos, len); \ + res_offset += len; \ + res_data[res_offset++] = 0; \ + res_offsets[i] = res_offset; \ + pos = end; \ + } while (false) + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + if (needle_size == 0) + { + COPY_REST_OF_CURRENT_STRING(); + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, n); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + if (match == end) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos = match + needle_size; + + if (replace_one) + { + COPY_REST_OF_CURRENT_STRING(); + break; + } + } +#undef COPY_REST_OF_CURRENT_STRING + } + } + + static void constant(const std::string & data, const std::string & needle, const 
std::string & replacement, const Int64 & /* pos */, const Int64 & /* occ */, const std::string & /* match_type */, TiDB::TiDBCollatorPtr /* collator */, std::string & res_data) + { + if (needle.empty()) + { + res_data = data; + return; + } + res_data = ""; + int replace_cnt = 0; + for (size_t i = 0; i < data.size(); ++i) + { + bool match = true; + if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0)) + match = false; + for (size_t j = 0; match && j < needle.size(); ++j) + if (data[i + j] != needle[j]) + match = false; + if (match) + { + ++replace_cnt; + res_data += replacement; + i = i + needle.size() - 1; + } + else + res_data += data[i]; + } + } +}; + struct NameLike { static constexpr auto name = "like"; @@ -804,6 +1527,14 @@ struct NameExtract { static constexpr auto name = "extract"; }; +struct NameReplaceOne +{ + static constexpr auto name = "replaceOne"; +}; +struct NameReplaceAll +{ + static constexpr auto name = "replaceAll"; +}; // using FunctionPosition = FunctionsStringSearch, NamePosition>; using FunctionPositionUTF8 = FunctionsStringSearch, NamePositionUTF8>; @@ -816,10 +1547,13 @@ using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionLike3Args = FunctionsStringSearch, NameLike3Args>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; using FunctionExtract = FunctionsStringSearchToString; - +using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; +using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; void registerFunctionsStringSearch(FunctionFactory & factory) { + factory.registerFunction(); + factory.registerFunction(); // factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/dbms/src/Functions/re2Util.cpp b/dbms/src/Functions/re2Util.cpp index 21aa7ce09f8..b74fdb58bdc 100644 --- a/dbms/src/Functions/re2Util.cpp +++ b/dbms/src/Functions/re2Util.cpp @@ -27,6 +27,8 @@ re2_st::RE2::Options getDefaultRe2Options() return options; } +// If 
characters specifying contradictory options are specified +// within match_type, the rightmost one takes precedence. String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator) { /// for regexp only ci/cs is supported @@ -58,7 +60,7 @@ String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBColla options.set_one_line(false); break; default: - throw Exception("Incorrect arguments to regexp related functions."); + throw Exception("Invalid match type in regexp related functions."); } } } diff --git a/dbms/src/Functions/re2Util.h b/dbms/src/Functions/re2Util.h index f91a9e3ab9e..4a7f44a5ac5 100644 --- a/dbms/src/Functions/re2Util.h +++ b/dbms/src/Functions/re2Util.h @@ -33,6 +33,6 @@ namespace DB namespace re2Util { re2_st::RE2::Options getDefaultRe2Options(); -String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator); +String getRE2ModeModifiers(const std::string & match_type, const TiDB::TiDBCollatorPtr collator = nullptr); } // namespace re2Util } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 24df4483604..101b28fb50d 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2396,4 +2396,4 @@ TEST_F(Regexp, testRegexpReplace) } } } // namespace tests -} // namespace DB \ No newline at end of file +} // namespace DB From 640461dffac58ce659d6ee358125f230ab458da7 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 26 Oct 2022 15:10:47 +0800 Subject: [PATCH 35/87] remove ParamDefault --- dbms/src/Functions/FunctionsRegexp.h | 152 +++++++------------ dbms/src/Functions/FunctionsStringReplace.h | 5 +- dbms/src/Functions/FunctionsStringSearch.cpp | 2 +- 3 files changed, 59 insertions(+), 100 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 2f0ffe649cf..9c6c4dae87a 100644 --- 
a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -22,11 +22,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -115,45 +115,6 @@ inline constexpr bool check_int_type() return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); } -// Use this type when param is not provided -class ParamDefault -{ -public: - explicit ParamDefault(Int64 val) - : default_int(val) - , default_string("") - {} - explicit ParamDefault(const StringRef & str) - : default_int(0) - , default_string(str) - {} - - // For passing compilation - explicit ParamDefault(const void *) - : default_int(0) - , default_string("") - { - throw Exception("Shouldn't call this constructor"); - } - - // For passing compilation - ParamDefault(const void *, const void *) - : default_int(0) - , default_string("") - { - throw Exception("Shouldn't call this constructor"); - } - - Int64 getInt(size_t) const { return default_int; } - static String getString(size_t) { return String(""); } - void getStringRef(size_t, StringRef &) const {} - constexpr static bool isConst() { return true; } - -private: - Int64 default_int; - StringRef default_string; -}; - template class ParamString { @@ -395,6 +356,7 @@ class Param #define MATCH_TYPE_COL_PTR_VAR_NAME col_match_type #define RES_ARG_VAR_NAME res_arg +#define COL_SIZE_VAR_NAME col_size #define EXPR_PARAM_VAR_NAME expr_param #define PAT_PARAM_VAR_NAME pat_param @@ -404,59 +366,57 @@ class Param #define ARG_NUM_VAR_NAME arg_num // Unify the name of functions that actually execute regexp -#define REGEXP_CLASS_MEM_FUNC_IMPL_NAME process +#define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc // Common method to convert nullable string column // converted_col is impossible to be const here -#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col, 
next_convertion) \ - do \ - { \ - size_t col_size = (converted_col)->size(); \ - if (((converted_col)->isColumnNullable())) \ - { \ - auto nested_ptr = static_cast(*(converted_col)).getNestedColumnPtr(); \ - const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(converted_col)).getNullMapData()); \ - Param, true>(param_name)(col_size, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_convertion; \ - } \ - else \ - { \ - /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(converted_col))); \ - Param, false>(param_name)(col_size, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_convertion; \ - } \ +#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ + do \ + { \ + if (((converted_col)->isColumnNullable())) \ + { \ + auto nested_ptr = static_cast(*(converted_col)).getNestedColumnPtr(); \ + const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ + const auto * null_map = &(static_cast(*(converted_col)).getNullMapData()); \ + Param, true>(param_name)(COL_SIZE_VAR_NAME, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ + next_convertion; \ + } \ + else \ + { \ + /* This is a pure string vector column */ \ + const auto * tmp = checkAndGetColumn(&(*(converted_col))); \ + Param, false>(param_name)(COL_SIZE_VAR_NAME, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ + next_convertion; \ + } \ } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ - do \ - { \ - size_t col_size = (converted_col)->size(); \ - const auto * col_const = typeid_cast(&(*(converted_col))); \ - if (col_const != nullptr) \ - { \ - auto col_const_data = col_const->getDataColumnPtr(); \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.isNull() ? 
String("") : field.safeGet(); \ - if (col_const_data->isColumnNullable()) \ - { \ - const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ - Param, true>(param_name)(col_size, StringRef(tmp.data(), tmp.size()), null_map); \ - next_convertion; \ - } \ - else \ - { \ - Param, false>(param_name)(col_size, col_const->getDataAt(0)); \ - next_convertion; \ - } \ - } \ - else \ - { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col), next_convertion) \ - } \ +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ + do \ + { \ + const auto * col_const = typeid_cast(&(*(converted_col))); \ + if (col_const != nullptr) \ + { \ + auto col_const_data = col_const->getDataColumnPtr(); \ + Field field; \ + col_const->get(0, field); \ + String tmp = field.isNull() ? String("") : field.safeGet(); \ + if (col_const_data->isColumnNullable()) \ + { \ + const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ + Param, true>(param_name)(COL_SIZE_VAR_NAME, StringRef(tmp.data(), tmp.size()), null_map); \ + next_convertion; \ + } \ + else \ + { \ + Param, false>(param_name)(COL_SIZE_VAR_NAME, col_const->getDataAt(0)); \ + next_convertion; \ + } \ + } \ + else \ + { \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col), next_convertion) \ + } \ } while (0); class FunctionStringRegexpBase @@ -471,6 +431,9 @@ class FunctionStringRegexpBase static constexpr size_t REGEXP_REPLACE_MAX_PARAM_NUM = 6; static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5; + // We should pre compile the regular expression when: + // - only pattern column is provided and it's a constant column + // - pattern and match type columns are provided and they are both constant columns template std::unique_ptr memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { @@ -491,14 +454,6 @@ class FunctionStringRegexpBase { return (PatT::isConst() && MatchTypeT::isConst()); } - - 
const std::unique_ptr & getRegexp() const { return memorized_re; } - -private: - // We should pre compile the regular expression when: - // - only pattern column is provided and it's a constant column - // - pattern and match type columns are provided and they are both constant columns - mutable std::unique_ptr memorized_re; }; // regexp and regexp_like functions are executed in this macro @@ -517,7 +472,7 @@ class FunctionStringRegexpBase else \ { \ /* match_type is not provided here */ \ - Param MATCH_TYPE_PARAM_VAR_NAME(-1, StringRef("", 0)); \ + Param, false> MATCH_TYPE_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, StringRef("", 0)); \ EXECUTE_REGEXP_LIKE() \ } \ } while (0); @@ -533,10 +488,12 @@ class FunctionStringRegexpBase #define CONVERT_EXPR_COL_TO_PARAM() \ do \ { \ + /* Getting column size from expr col */ \ + size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ } while (0); -// The entry to convert columns to params and execute regexp_xxx functions +// The entry to convert columns to params and execute regexp functions #define CONVERT_COLS_TO_PARAMS_AND_EXECUTE() \ do \ { \ @@ -796,6 +753,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef MATCH_TYPE_PARAM_VAR_NAME #undef PAT_PARAM_VAR_NAME #undef EXPR_PARAM_VAR_NAME +#undef COL_SIZE_VAR_NAME col_size #undef RES_ARG_VAR_NAME #undef MATCH_TYPE_COL_PTR_VAR_NAME #undef PAT_COL_PTR_VAR_NAME diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 5bb39f52902..9260f504bba 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -17,12 +17,13 @@ #include #include #include -#include #include #include +#include +#include + #include "Columns/IColumn.h" #include "Common/Exception.h" -#include namespace DB { diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp 
b/dbms/src/Functions/FunctionsStringSearch.cpp index 9dfb9e30ca7..0e6d78e48c8 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -19,11 +19,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include From f7e3302ee95678fee114ee4ad5e71bba3c73a769 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 26 Oct 2022 15:49:26 +0800 Subject: [PATCH 36/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 102 +++++++++++++++++++-------- 1 file changed, 71 insertions(+), 31 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 9c6c4dae87a..49b35666432 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -454,6 +454,76 @@ class FunctionStringRegexpBase { return (PatT::isConst() && MatchTypeT::isConst()); } + + static void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) + { + if (is_str) + { + // Check string type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isString()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isString()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), 
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + } + else + { + // Check int type argument + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + if (null_type == nullptr) + throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isInteger()) + { + if (nested_type->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + else + { + if (!arg->isInteger()) + { + // It may be DataTypeNothing if it's not string + if (arg->getTypeId() != TypeIndex::Nothing) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; + } + } + } + } }; // regexp and regexp_like functions are executed in this macro @@ -536,7 +606,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase bool has_data_type_nothing = false; for (const auto & arg : arguments) - checkInputArg(arg, &has_nullable_col, &has_data_type_nothing); + checkInputArg(arg, true, &has_nullable_col, &has_data_type_nothing); if (has_data_type_nothing) return std::make_shared(std::make_shared()); @@ -706,36 +776,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase } private: - void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) const - { - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexp", ErrorCodes::LOGICAL_ERROR); - - const auto & nested_type = 
null_type->getNestedType(); - if (!nested_type->isString()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else - { - if (!arg->isString()) - { - if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of function {}", arg->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - } - TiDB::TiDBCollatorPtr collator = nullptr; }; From bfcf3fbee58947c7aa23db833f8a7042eaadbccd Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 27 Oct 2022 11:02:55 +0800 Subject: [PATCH 37/87] resolve comments --- dbms/src/Functions/FunctionsRegexp.h | 127 +++++++++++++-------------- 1 file changed, 61 insertions(+), 66 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 49b35666432..1b132eade55 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -32,19 +32,19 @@ #include #include -#include "Columns/ColumnNullable.h" -#include "Columns/ColumnString.h" -#include "Columns/ColumnsNumber.h" -#include "Columns/IColumn.h" -#include "Common/Exception.h" -#include "Core/Field.h" -#include "Core/Types.h" -#include "DataTypes/DataTypeNothing.h" -#include "DataTypes/DataTypeNullable.h" -#include "Parsers/Lexer.h" -#include "common/StringRef.h" -#include "common/defines.h" -#include "common/types.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #if USE_RE2_ST #include @@ -260,7 +260,7 @@ class ParamInt // Columns may be const, nullable or plain vector, we can conveniently handle // these different type columns with Param. 
-template +template class Param { public: @@ -334,14 +334,16 @@ class Param bool isNullAt(size_t idx) const { - if constexpr (is_null) + if constexpr (is_nullable && ParamImplType::isConst()) + return (*null_map)[0]; + else if constexpr (is_nullable) return (*null_map)[idx]; else return false; } size_t getDataNum() const { return col_size; } - constexpr static bool isNullableCol() { return is_null; } + constexpr static bool isNullableCol() { return is_nullable; } constexpr static bool isConst() { return ParamImplType::isConst(); } private: @@ -362,6 +364,8 @@ class Param #define PAT_PARAM_VAR_NAME pat_param #define MATCH_TYPE_PARAM_VAR_NAME match_type_param +#define COL_CONST_VAR_NAME col_const + #define SELF_CLASS_NAME (name) #define ARG_NUM_VAR_NAME arg_num @@ -369,54 +373,57 @@ class Param #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc // Common method to convert nullable string column -// converted_col is impossible to be const here -#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ +// converted column referred by converted_col_name is impossible to be const here +#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ do \ { \ - if (((converted_col)->isColumnNullable())) \ + if (((converted_col_name)->isColumnNullable())) \ { \ - auto nested_ptr = static_cast(*(converted_col)).getNestedColumnPtr(); \ + auto nested_ptr = static_cast(*(converted_col_name)).getNestedColumnPtr(); \ const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(converted_col)).getNullMapData()); \ + const auto * null_map = &(static_cast(*(converted_col_name)).getNullMapData()); \ Param, true>(param_name)(COL_SIZE_VAR_NAME, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ next_convertion; \ } \ else \ { \ /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(converted_col))); \ + const auto * tmp = 
checkAndGetColumn(&(*(converted_col_name))); \ Param, false>(param_name)(COL_SIZE_VAR_NAME, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ next_convertion; \ } \ } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col, next_convertion) \ - do \ - { \ - const auto * col_const = typeid_cast(&(*(converted_col))); \ - if (col_const != nullptr) \ - { \ - auto col_const_data = col_const->getDataColumnPtr(); \ - Field field; \ - col_const->get(0, field); \ - String tmp = field.isNull() ? String("") : field.safeGet(); \ - if (col_const_data->isColumnNullable()) \ - { \ - const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ - Param, true>(param_name)(COL_SIZE_VAR_NAME, StringRef(tmp.data(), tmp.size()), null_map); \ - next_convertion; \ - } \ - else \ - { \ - Param, false>(param_name)(COL_SIZE_VAR_NAME, col_const->getDataAt(0)); \ - next_convertion; \ - } \ - } \ - else \ - { \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col), next_convertion) \ - } \ +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ + do \ + { \ + auto col_const_data = COL_CONST_VAR_NAME->getDataColumnPtr(); \ + Field field; \ + COL_CONST_VAR_NAME->get(0, field); \ + String tmp = field.isNull() ? 
String("") : field.safeGet(); \ + if (col_const_data->isColumnNullable()) \ + { \ + const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ + Param, true>(param_name)(COL_SIZE_VAR_NAME, StringRef(tmp.data(), tmp.size()), null_map); \ + next_convertion; \ + } \ + else \ + { \ + Param, false>(param_name)(COL_SIZE_VAR_NAME, COL_CONST_VAR_NAME->getDataAt(0)); \ + next_convertion; \ + } \ + } while (0); + +// Common method to convert string column +#define CONVERT_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ + do \ + { \ + const auto * COL_CONST_VAR_NAME = typeid_cast(&(*(converted_col_name))); \ + if (COL_CONST_VAR_NAME != nullptr) \ + CONVERT_CONST_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ + else \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ } while (0); class FunctionStringRegexpBase @@ -481,13 +488,7 @@ class FunctionStringRegexpBase else { if (!arg->isString()) - { - // It may be DataTypeNothing if it's not string - if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } } else @@ -514,13 +515,7 @@ class FunctionStringRegexpBase else { if (!arg->isInteger()) - { - // It may be DataTypeNothing if it's not string - if (arg->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } } } @@ -538,7 +533,7 @@ class FunctionStringRegexpBase do \ { \ if 
((ARG_NUM_VAR_NAME) == 3) \ - CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ + CONVERT_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ else \ { \ /* match_type is not provided here */ \ @@ -551,7 +546,7 @@ class FunctionStringRegexpBase #define CONVERT_PAT_COL_TO_PARAM() \ do \ { \ - CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ + CONVERT_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ } while (0); // Method to convert expression column @@ -560,7 +555,7 @@ class FunctionStringRegexpBase { \ /* Getting column size from expr col */ \ size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ - CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ + CONVERT_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ } while (0); // The entry to convert columns to params and execute regexp functions @@ -793,7 +788,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef MATCH_TYPE_PARAM_VAR_NAME #undef PAT_PARAM_VAR_NAME #undef EXPR_PARAM_VAR_NAME -#undef COL_SIZE_VAR_NAME col_size +#undef COL_SIZE_VAR_NAME #undef RES_ARG_VAR_NAME #undef MATCH_TYPE_COL_PTR_VAR_NAME #undef PAT_COL_PTR_VAR_NAME From 54aa32473cba642d5a13d39c4304c7b302ed56f1 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 27 Oct 2022 15:45:29 +0800 Subject: [PATCH 38/87] finish --- .../Common/OptimizedRegularExpression.inl.h | 5 +- dbms/src/Functions/FunctionsRegexp.h | 40 +-- dbms/src/Functions/tests/gtest_regexp.cpp | 247 +++++++++++++----- 3 files changed, 200 insertions(+), 92 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 3c98d86e90e..a83da92366a 100644 --- 
a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -562,7 +562,10 @@ template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { int64_t utf8_total_len = getStringUtf8Len(subject, subject_size); - + + if (unlikely(ret_op != 0 && ret_op != 1)) + throw DB::Exception("Incorrect arguments to regexp_instr: return_option must be 1 or 0"); + if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) throw DB::Exception("Index out of bounds in regular expression search."); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index fd894fea1db..47dd68bd62f 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -674,11 +674,9 @@ class Param #define CONVERT_CONST_INT_COL_TO_PARAM(param_name, converted_col, next_convertion) \ do \ { \ - std::cout << "CONVERT_CONST_INT_COL_TO_PARAM1\n"; \ const auto * col_const = typeid_cast(&(*(converted_col))); \ if (col_const != nullptr) \ { \ - std::cout << "CONVERT_CONST_INT_COL_TO_PARAM4\n"; \ Field field; \ col_const->get(0, field); \ auto data_int64 = field.isNull() ? 
-1 : getIntFromField(field); \ @@ -1075,10 +1073,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_MATCH_TYPE_COL_TO_PARAM1\n"; \ if (ARG_NUM_VAR_NAME == REGEXP_INSTR_MAX_PARAM_NUM) \ { \ - std::cout << "CONVERT_MATCH_TYPE_COL_TO_PARAM2\n"; \ CONVERT_CONST_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_INSTR()})) \ } \ else \ @@ -1092,10 +1088,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_RET_OP_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_RET_OP_COL_TO_PARAM1\n"; \ if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 3) \ { \ - std::cout << "CONVERT_RET_OP_COL_TO_PARAM2\n"; \ Param, false> RET_OP_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, static_cast(0)); \ CONVERT_MATCH_TYPE_COL_TO_PARAM() \ } \ @@ -1107,10 +1101,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_OCCUR_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_OCCUR_COL_TO_PARAM1\n"; \ if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 2) \ { \ - std::cout << "CONVERT_OCCUR_COL_TO_PARAM2\n"; \ Param, false> OCCUR_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, static_cast(1)); \ CONVERT_RET_OP_COL_TO_PARAM() \ } \ @@ -1122,10 +1114,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_POS_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_POS_COL_TO_PARAM1\n"; \ if (ARG_NUM_VAR_NAME < REGEXP_MIN_PARAM_NUM + 1) \ { \ - std::cout << "CONVERT_POS_COL_TO_PARAM2\n"; \ Param, false> POS_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, static_cast(1)); \ CONVERT_OCCUR_COL_TO_PARAM() \ } \ @@ -1137,7 +1127,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #define CONVERT_PAT_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_PAT_COL_TO_PARAM\n"; \ CONVERT_CONST_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_POS_COL_TO_PARAM()})) \ } while (0); @@ -1145,7 +1134,6 @@ class FunctionStringRegexp : 
public FunctionStringRegexpBase #define CONVERT_EXPR_COL_TO_PARAM() \ do \ { \ - std::cout << "CONVERT_EXPR_COL_TO_PARAM\n"; \ /* Getting column size from expr col */ \ size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ CONVERT_CONST_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ @@ -1204,9 +1192,9 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase } template - void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expar_param, const PatT & par_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const MatchTypeT & match_type_param) const + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const PosT & pos_param, const OccurT & occur_param, const RetOpT & ret_op_param, const MatchTypeT & match_type_param) const { - size_t col_size = expar_param.getDataNum(); + size_t col_size = expr_param.getDataNum(); // Get function pointers to process the specific int type GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); @@ -1226,15 +1214,15 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && RetOpT::isConst() && MatchTypeT::isConst()) { - if (col_size == 0 || expar_param.isNullAt(0) || par_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || ret_op_param.isNullAt(0) || match_type_param.isNullAt(0)) + if (col_size == 0 || expr_param.isNullAt(0) || pat_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || ret_op_param.isNullAt(0) || match_type_param.isNullAt(0)) { res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; } int flags = getDefaultFlags(); - String expr = expar_param.getString(0); - String pat = par_param.getString(0); + 
String expr = expr_param.getString(0); + String pat = pat_param.getString(0); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); @@ -1295,7 +1283,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { // Codes in this if branch execute instr with memorized regexp - const auto & regexp = memorize(par_param, match_type_param, COLLATOR_VAR_NAME); + const auto & regexp = memorize(pat_param, match_type_param, COLLATOR_VAR_NAME); if constexpr (has_nullable_col) { // Process nullable columns with memorized regexp @@ -1305,13 +1293,13 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase for (size_t i = 0; i < col_size; ++i) { - if (expar_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || ret_op_param.isNullAt(i)) + if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || ret_op_param.isNullAt(i)) { null_map[i] = 1; continue; } null_map[i] = 0; - expar_param.getStringRef(i, expr_ref); + expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) @@ -1326,7 +1314,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase for (size_t i = 0; i < col_size; ++i) { - expar_param.getStringRef(i, expr_ref); + expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) @@ -1348,14 +1336,14 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase for (size_t i = 0; i < col_size; ++i) { - if (expar_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || ret_op_param.isNullAt(i)) + if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || ret_op_param.isNullAt(i) || match_type_param.isNullAt(i)) { null_map[i] = 1; continue; } null_map[i] = 0; - expar_param.getStringRef(i, expr_ref); - pat = par_param.getString(i); + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); if (unlikely(pat.empty())) throw 
Exception(EMPTY_PAT_ERR_MSG); GET_POS_VALUE(i) @@ -1373,8 +1361,8 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Process pure vector columns without memorized regexp for (size_t i = 0; i < col_size; ++i) { - expar_param.getStringRef(i, expr_ref); - pat = par_param.getString(i); + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); GET_POS_VALUE(i) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index de1a7892b09..d324bd5d258 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -23,6 +23,7 @@ #include #include +#include "Common/Exception.h" #include "Core/ColumnWithTypeAndName.h" #include "DataTypes/DataTypesNumber.h" #include "common/types.h" @@ -2296,6 +2297,31 @@ struct RegexpInstrCase } } + static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) + { + null_map.clear(); + null_map.resize(REGEXP_INSTR_MAX_PARAM_NUM); + for (const auto & elem : test_cases) + { + null_map[EXPR_NULL_MAP_IDX].push_back(elem.null_map[EXPR_NULL_MAP_IDX]); + null_map[PAT_NULL_MAP_IDX].push_back(elem.null_map[PAT_NULL_MAP_IDX]); + null_map[POS_NULL_MAP_IDX].push_back(elem.null_map[POS_NULL_MAP_IDX]); + null_map[OCCUR_NULL_MAP_IDX].push_back(elem.null_map[OCCUR_NULL_MAP_IDX]); + null_map[RET_OP_NULL_MAP_IDX].push_back(elem.null_map[RET_OP_NULL_MAP_IDX]); + null_map[MATCH_TYPE_NULL_MAP_IDX].push_back(elem.null_map[MATCH_TYPE_NULL_MAP_IDX]); + } + + setVecsWithoutNullMap(param_num, test_cases, results, exprs, pats, positions, occurs, ret_ops, match_types); + } + + const static UInt8 REGEXP_INSTR_MAX_PARAM_NUM = 6; + const static UInt8 EXPR_NULL_MAP_IDX = 0; + const static UInt8 
PAT_NULL_MAP_IDX = 1; + const static UInt8 POS_NULL_MAP_IDX = 2; + const static UInt8 OCCUR_NULL_MAP_IDX = 3; + const static UInt8 RET_OP_NULL_MAP_IDX = 4; + const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 5; + Int64 result; std::vector null_map; String expression; @@ -2308,70 +2334,62 @@ struct RegexpInstrCase TEST_F(Regexp, RegexpInstr) { - std::cout << "test1\n"; // Test: All columns are const { - size_t row_size = 2; - std::cout << "test1.1\n"; - ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."))); - std::cout << "test1.2\n"; - ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2))); - std::cout << "test1.3\n"; - ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2))); - std::cout << "test1.4\n"; - ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 1))); - std::cout << "test1.5\n"; - ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "aabab"), - createConstColumn(row_size, "aB"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 1), - createConstColumn(row_size, "i"))); + for (size_t row_size = 1; row_size < 3; ++row_size) + { + ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); + 
ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, "aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1), + createConstColumn(row_size, "i"))); + } } - std::cout << "test2\n"; // Test: null const { size_t row_size = 2; - std::cout << "test2.1\n"; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( "regexp_instr", createConstColumn>(row_size, {}), createConstColumn(row_size, "123"))); - std::cout << "test2.2\n"; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( "regexp_instr", createConstColumn(row_size, "123"), createConstColumn>(row_size, {}))); - std::cout << "test2.3\n"; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( @@ -2379,7 +2397,6 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, "123"), createConstColumn(row_size, "12."), createConstColumn>(row_size, {}))); - std::cout << "test2.4\n"; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( @@ -2388,7 +2405,6 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, "12."), createConstColumn(row_size, 2), createConstColumn>(row_size, {}))); - std::cout << "test2.5\n"; 
ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( @@ -2398,7 +2414,6 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, 2), createConstColumn(row_size, 2), createConstColumn>(row_size, {}))); - std::cout << "test2.6\n"; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( @@ -2411,20 +2426,19 @@ TEST_F(Regexp, RegexpInstr) createConstColumn>(row_size, {}))); } - std::cout << "test3\n"; + std::vector test_cases; + std::vector results; + std::vector> null_maps; + std::vector exprs; + std::vector patterns; + std::vector positions; + std::vector occurs; + std::vector return_options; + std::vector match_types; + // Test: All columns are pure vector { - std::vector test_cases; - std::vector results; - std::vector exprs{"ttttifl", "tidb_tikv", "aaaaaa", "\n", "", "ab\naB", "pp跑ppのaaa"}; - std::vector patterns{"tifl", "ti(db|kv)", "aa", ".", "^$", "^ab$", "(跑|の|P)"}; - std::vector positions; - std::vector occurs; - std::vector return_options; - std::vector match_types; - // test regexp_instr(vector, vector) - std::cout << "test3.1\n"; test_cases = {{4, "ttttifl", "tifl"}, {1, "tidb_tikv", "ti(db|kv)"}, {1, "aaaaaa", "aa"}, @@ -2440,7 +2454,6 @@ TEST_F(Regexp, RegexpInstr) createColumn(patterns))); // test regexp_instr(vector, vector, vector) - std::cout << "test3.2\n"; test_cases = {{4, "ttttifl", "tifl", 3}, {6, "tidb_tikv", "ti(db|kv)", 2}, {3, "aaaaaa", "aa", 3}, @@ -2457,7 +2470,6 @@ TEST_F(Regexp, RegexpInstr) createColumn(positions))); // test regexp_instr(vector, vector, vector, vector) - std::cout << "test3.3\n"; test_cases = {{4, "ttttifl", "tifl", 3, 1}, {6, "tidb_tikv", "ti(db|kv)", 2, 1}, {5, "aaaaaa", "aa", 3, 2}, @@ -2474,7 +2486,6 @@ TEST_F(Regexp, RegexpInstr) createColumn(occurs))); // test regexp_instr(vector, vector, vector, vector, vector) - std::cout << "test3.4\n"; test_cases = {{8, "ttttifl", "tifl", 3, 1, 1}, {10, "tidb_tikv", "ti(db|kv)", 2, 1, 1}, {7, "aaaaaa", "aa", 3, 2, 1}, @@ -2493,7 +2504,6 @@ 
TEST_F(Regexp, RegexpInstr) createColumn(return_options))); // test regexp_instr(vector, vector, vector, vector, vector, vector) - std::cout << "test3.5\n"; test_cases = {{8, "ttttifl", "tifl", 3, 1, 1, ""}, {10, "tidb_tikv", "ti(db|kv)", 2, 1, 1, ""}, {7, "aaaaaa", "aa", 3, 2, 1, ""}, @@ -2537,8 +2547,115 @@ TEST_F(Regexp, RegexpInstr) } + // Test: Args including nullable columns + { + // test regexp_instr(nullable vector, vector) + test_cases = {{0, {{1, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns))); + + // test regexp_instr(vector, nullable vector) + test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {0, {{0, 1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createNullableVectorColumn(patterns, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, nullable vector) + test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, + {0, {{0, 0, 1, 0, 0, 0}}, "ttttifl", "tifl", 3}}; + RegexpInstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + 
createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, nullable vector) + test_cases = {{6, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, + {0, {{0, 0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; + RegexpInstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createNullableVectorColumn(occurs, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, vector, nullable vector) + test_cases = {{10, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}, + {0, {{0, 0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}}; + RegexpInstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createNullableVectorColumn(return_options, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, vector, vector, nullable vector) + test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "b", "B", 1, 1, 0, "i"}, + {0, {{0, 0, 0, 0, 0, 1}}, "b", "B", 1, 1, 0, "i"}}; + RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + 
createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); + } + + // Test: const, nullable and pure vector columns appear together + { + // test regexp_instr(nullable vector, vector, nullable vector, vector, const vector, vector) + test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{1, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{0, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{1, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}}; + RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + createColumn(occurs), + createConstColumn(test_cases.size(), 0), + createColumn(match_types))); + } + // Test: Invalid parameter handling - {} + { + // test empty pattern + test_cases = {{0, "ttt", ""}}; + RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns)), Exception); + + // test invalid ret_option + test_cases = {{0, "ttt", "t", 1, 1, 2}}; + RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options)), Exception); + + // test invalid match type + test_cases = {{0, "ttt", "t", 1, 1, 1, "p"}}; + 
RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options), createColumn(match_types)), Exception); + } } TEST_F(Regexp, testRegexpReplaceMatchType) From 6cac98ca4f01591d2843cb7fe88cb26ec9764083 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 28 Oct 2022 23:31:18 +0800 Subject: [PATCH 39/87] tweaking --- dbms/src/Functions/FunctionsRegexp.cpp | 44 -------------------------- dbms/src/Functions/FunctionsRegexp.h | 2 +- 2 files changed, 1 insertion(+), 45 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 5d5643fa581..7cb26b34dba 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -21,50 +21,6 @@ namespace DB { - -namespace -{ -const char flag_i = 'i'; -const char flag_c = 'c'; -const char flag_m = 'm'; -const char flag_s = 's'; - -std::set valid_flags{flag_i, flag_c, flag_m, flag_s}; -} // namespace - -// If characters specifying contradictory options are specified -// within match_type, the rightmost one takes precedence. 
-// String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator) -// { -// std::set applied_flags; -// if (collator != nullptr && collator->isCI()) -// applied_flags.insert(flag_i); - -// for (auto flag : match_type) -// { -// auto iter = valid_flags.find(flag); -// if (iter == valid_flags.end()) -// throw Exception(fmt::format("Invalid match type '{}' in regexp function", flag)); - -// // re2 is case-sensitive by default, so we only need to delete 'i' flag -// // to enable the case-sensitive for the regexp -// if (flag == flag_c) -// { -// applied_flags.erase(flag_i); -// continue; -// } - -// applied_flags.insert(flag); -// } - -// // generate match type flag -// String flags; -// for (auto flag : applied_flags) -// flags += flag; - -// return flags; -// } - /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. * 'replacement' could contain substitutions, for example: '\2-\3-\1' */ diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 1b132eade55..9d9d4ea4e27 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -112,7 +112,7 @@ inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, template inline constexpr bool check_int_type() { - return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); + return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); } template From 6ebc3f639067f48f7d074d0da748cc7719ac64f4 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 31 Oct 2022 11:16:41 +0800 Subject: [PATCH 40/87] modify punctuation and format --- dbms/src/Functions/FunctionsRegexp.cpp | 3 +- dbms/src/Functions/FunctionsRegexp.h | 94 ++++++++++----------- 
dbms/src/Functions/FunctionsStringReplace.h | 5 +- dbms/src/Functions/re2Util.h | 3 +- dbms/src/Functions/tests/gtest_regexp.cpp | 8 +- 5 files changed, 54 insertions(+), 59 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index 7cb26b34dba..e4b9c56e88f 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include #include -#include "Columns/ColumnNullable.h" - namespace DB { /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 9d9d4ea4e27..8ba90b9a6e3 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -16,9 +16,17 @@ #include #include +#include #include +#include +#include +#include #include #include +#include +#include +#include +#include #include #include #include @@ -27,24 +35,14 @@ #include #include #include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include +#include + +#include +#include #if USE_RE2_ST #include @@ -374,28 +372,28 @@ class Param // Common method to convert nullable string column // converted column referred by converted_col_name is impossible to be const here -#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ +#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ do \ { \ - if (((converted_col_name)->isColumnNullable())) \ + if (((converted_col_name)->isColumnNullable())) \ { \ - auto nested_ptr = static_cast(*(converted_col_name)).getNestedColumnPtr(); \ + auto nested_ptr = 
static_cast(*(converted_col_name)).getNestedColumnPtr(); \ const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(converted_col_name)).getNullMapData()); \ + const auto * null_map = &(static_cast(*(converted_col_name)).getNullMapData()); \ Param, true>(param_name)(COL_SIZE_VAR_NAME, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ next_convertion; \ } \ else \ { \ /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(converted_col_name))); \ + const auto * tmp = checkAndGetColumn(&(*(converted_col_name))); \ Param, false>(param_name)(COL_SIZE_VAR_NAME, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ next_convertion; \ } \ } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ +#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ do \ { \ auto col_const_data = COL_CONST_VAR_NAME->getDataColumnPtr(); \ @@ -416,14 +414,14 @@ class Param } while (0); // Common method to convert string column -#define CONVERT_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ - do \ - { \ - const auto * COL_CONST_VAR_NAME = typeid_cast(&(*(converted_col_name))); \ - if (COL_CONST_VAR_NAME != nullptr) \ - CONVERT_CONST_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ - else \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ +#define CONVERT_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ + do \ + { \ + const auto * COL_CONST_VAR_NAME = typeid_cast(&(*(converted_col_name))); \ + if (COL_CONST_VAR_NAME != nullptr) \ + CONVERT_CONST_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ + else \ + CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ } while (0); class 
FunctionStringRegexpBase @@ -500,7 +498,7 @@ class FunctionStringRegexpBase const auto * null_type = checkAndGetDataType(arg.get()); if (null_type == nullptr) throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); - + const auto & nested_type = null_type->getNestedType(); // It may be DataTypeNothing if it's not string @@ -529,32 +527,32 @@ class FunctionStringRegexpBase } while (0); // Method to convert match type column -#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ - do \ - { \ - if ((ARG_NUM_VAR_NAME) == 3) \ - CONVERT_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ - else \ - { \ - /* match_type is not provided here */ \ - Param, false> MATCH_TYPE_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, StringRef("", 0)); \ - EXECUTE_REGEXP_LIKE() \ - } \ +#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ + do \ + { \ + if ((ARG_NUM_VAR_NAME) == 3) \ + CONVERT_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ + else \ + { \ + /* match_type is not provided here */ \ + Param, false> MATCH_TYPE_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, StringRef("", 0)); \ + EXECUTE_REGEXP_LIKE() \ + } \ } while (0); // Method to convert pattern column -#define CONVERT_PAT_COL_TO_PARAM() \ - do \ - { \ +#define CONVERT_PAT_COL_TO_PARAM() \ + do \ + { \ CONVERT_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ } while (0); // Method to convert expression column -#define CONVERT_EXPR_COL_TO_PARAM() \ - do \ - { \ - /* Getting column size from expr col */ \ - size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ +#define CONVERT_EXPR_COL_TO_PARAM() \ + do \ + { \ + /* Getting column size from expr col */ \ + size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ CONVERT_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ } while (0); diff --git 
a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 9260f504bba..aba51633471 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -17,14 +17,13 @@ #include #include #include +#include +#include #include #include #include #include -#include "Columns/IColumn.h" -#include "Common/Exception.h" - namespace DB { diff --git a/dbms/src/Functions/re2Util.h b/dbms/src/Functions/re2Util.h index 4a7f44a5ac5..532e7ef163a 100644 --- a/dbms/src/Functions/re2Util.h +++ b/dbms/src/Functions/re2Util.h @@ -14,13 +14,12 @@ #pragma once +#include #include #include #include #include -#include "Common/Exception.h" - #if USE_RE2_ST #include diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 101b28fb50d..7f79fea364f 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -18,15 +18,15 @@ #include /// this is a hack, include the cpp file so we can test MatchImpl directly +#include +#include +#include + #include // NOLINT #include // NOLINT #include #include -#include "Core/ColumnWithTypeAndName.h" -#include "DataTypes/DataTypesNumber.h" -#include "common/types.h" - #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" #include From 08ff36d32b4310c0094ba213e427ac3bca1fdf52 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 31 Oct 2022 14:56:15 +0800 Subject: [PATCH 41/87] resolve comments --- dbms/src/Functions/FunctionsRegexp.h | 38 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 8ba90b9a6e3..ab7a6fcd8d8 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -372,7 +372,7 @@ class Param // Common method to convert nullable string column // converted column referred by converted_col_name is 
impossible to be const here -#define CONVERT_NULL_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ +#define CONVERT_STR_VEC_TO_PARAM(param_name, converted_col_name, next_convertion) \ do \ { \ if (((converted_col_name)->isColumnNullable())) \ @@ -393,7 +393,7 @@ class Param } while (0); // Common method to convert const string column -#define CONVERT_CONST_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ +#define CONVERT_STR_CONST_TO_PARAM(param_name, converted_col_name, next_convertion) \ do \ { \ auto col_const_data = COL_CONST_VAR_NAME->getDataColumnPtr(); \ @@ -419,9 +419,9 @@ class Param { \ const auto * COL_CONST_VAR_NAME = typeid_cast(&(*(converted_col_name))); \ if (COL_CONST_VAR_NAME != nullptr) \ - CONVERT_CONST_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ + CONVERT_STR_CONST_TO_PARAM((param_name), (converted_col_name), next_convertion) \ else \ - CONVERT_NULL_STR_COL_TO_PARAM((param_name), (converted_col_name), next_convertion) \ + CONVERT_STR_VEC_TO_PARAM((param_name), (converted_col_name), next_convertion) \ } while (0); class FunctionStringRegexpBase @@ -442,6 +442,9 @@ class FunctionStringRegexpBase template std::unique_ptr memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { + if (pat_param.isNullAt(0) || match_type_param.isNullAt(0)) + return nullptr; + String final_pattern = pat_param.getString(0); if (unlikely(final_pattern.empty())) throw Exception(EMPTY_PAT_ERR_MSG); @@ -469,8 +472,7 @@ class FunctionStringRegexpBase { *has_nullable_col = true; const auto * null_type = checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + assert(null_type != null_type); const auto & nested_type = null_type->getNestedType(); @@ -496,8 +498,7 @@ class FunctionStringRegexpBase { *has_nullable_col = true; const auto * null_type = 
checkAndGetDataType(arg.get()); - if (null_type == nullptr) - throw Exception("Get unexpected nullptr in FunctionStringRegexpInstr", ErrorCodes::LOGICAL_ERROR); + assert(null_type != nullptr); const auto & nested_type = null_type->getNestedType(); @@ -527,7 +528,7 @@ class FunctionStringRegexpBase } while (0); // Method to convert match type column -#define CONVERT_MATCH_TYPE_COL_TO_PARAM() \ +#define CONVERT_MATCH_TYPE_COL_TO_PARAM_AND_EXECUTE() \ do \ { \ if ((ARG_NUM_VAR_NAME) == 3) \ @@ -541,10 +542,10 @@ class FunctionStringRegexpBase } while (0); // Method to convert pattern column -#define CONVERT_PAT_COL_TO_PARAM() \ - do \ - { \ - CONVERT_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM()})) \ +#define CONVERT_PAT_COL_TO_PARAM() \ + do \ + { \ + CONVERT_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM_AND_EXECUTE()})) \ } while (0); // Method to convert expression column @@ -618,7 +619,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && MatchTypeT::isConst()) { - if (col_size == 0 || expr_param.isNullAt(0) || pat_param.isNullAt(0) || match_type_param.isNullAt(0)) + if (expr_param.isNullAt(0) || pat_param.isNullAt(0) || match_type_param.isNullAt(0)) { res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; @@ -649,6 +650,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if constexpr (canMemorize()) { const auto & regexp = memorize(pat_param, match_type_param, collator); + if (regexp == nullptr) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(col_size, 1); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + return; + } + if constexpr (has_nullable_col) { // expr column must be a nullable column here, so we 
need to check null for each elems From 202f44d943762fe0798eee8a739577e8f178da56 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 31 Oct 2022 15:52:05 +0800 Subject: [PATCH 42/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index ab7a6fcd8d8..c3b3018d5b3 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -83,8 +83,6 @@ struct NameReplaceRegexpAll static constexpr std::string_view regexp_name(NameTiDBRegexp::name); static constexpr std::string_view regexp_like_name(NameRegexpLike::name); -// String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr); - inline int getDefaultFlags() { int flags = 0; @@ -594,7 +592,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if (arg_num < REGEXP_MIN_PARAM_NUM) throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); else if (arg_num > args_max_num) - throw Exception("Too mant arguments", ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); + throw Exception("Too many arguments", ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); bool has_nullable_col = false; bool has_data_type_nothing = false; From 8e231fc7f0e1ef3edf065740100776797b9c7f36 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 31 Oct 2022 17:51:06 +0800 Subject: [PATCH 43/87] solve not all const col --- dbms/src/Functions/FunctionsRegexp.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index c3b3018d5b3..76ea1629815 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -647,14 +647,18 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Start to match if constexpr (canMemorize()) { - const auto & regexp = memorize(pat_param, 
match_type_param, collator); - if (regexp == nullptr) + std::unique_ptr regexp; + if (col_size > 0) { - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(col_size, 1); - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); - return; + regexp = memorize(pat_param, match_type_param, collator); + if (regexp == nullptr) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(col_size, 1); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + return; + } } if constexpr (has_nullable_col) From 515d19f00ae8a83183000e85df773bb69a4f10ae Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 1 Nov 2022 08:51:30 +0800 Subject: [PATCH 44/87] Update dbms/src/Functions/FunctionsRegexp.cpp Co-authored-by: Liqi Geng --- dbms/src/Functions/FunctionsRegexp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index e4b9c56e88f..a85d7850791 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -36,7 +36,7 @@ struct ReplaceRegexpImpl /// Sequence of instructions, describing how to get resulting string. 
/// Each element is either: /// - substitution (in that case first element of pair is their number and second element is empty) - /// - string that need to be inserted (in that case, first element of pair is that string and second element is -1) + /// - string that need to be inserted (in that case, first element of pair is -1 and second element is that string) using Instructions = std::vector>; static const size_t max_captures = 10; From add642fcbb890cb163ca58b79de1872caf502beb Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 1 Nov 2022 09:38:58 +0800 Subject: [PATCH 45/87] resolve comments --- dbms/src/Functions/FunctionsRegexp.h | 41 ++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 76ea1629815..b90e5c212c8 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -108,7 +108,7 @@ inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, template inline constexpr bool check_int_type() { - return static_cast(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); + return std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; } template @@ -267,28 +267,36 @@ class Param : col_size(col_size_) , null_map(nullptr) , data(str_ref) - {} + { + checkNullableLogic(); + } // const nullable string param Param(size_t col_size_, const StringRef & str_ref, ConstNullMapPtr null_map_) : col_size(col_size_) , null_map(null_map_) , data(str_ref) - {} + { + checkNullableLogic(); + } // const int param Param(size_t col_size_, Int64 val) : col_size(col_size_) , null_map(nullptr) , data(val) - {} + { + checkNullableLogic(); + } // const nullable int param Param(size_t col_size_, Int64 val, ConstNullMapPtr null_map_) : 
col_size(col_size_) , null_map(null_map_) , data(val) - {} + { + checkNullableLogic(); + } // pure vector string param // chars_ type: ParamImplType::Chars_t @@ -297,7 +305,9 @@ class Param : col_size(col_size_) , null_map(nullptr) , data(chars_, offsets_) - {} + { + checkNullableLogic(); + } // pure vector int param // int_container_ type: ParamImplType::Container @@ -305,7 +315,9 @@ class Param : col_size(col_size_) , null_map(nullptr) , data(int_container_) - {} + { + checkNullableLogic(); + } // nullable vector string param // chars_ type: ParamImplType::Chars_t @@ -314,7 +326,9 @@ class Param : col_size(col_size_) , null_map(null_map_) , data(chars_, offsets_) - {} + { + checkNullableLogic(); + } // nullable vector int param // int_container_ type: ParamImplType::Container @@ -322,7 +336,9 @@ class Param : col_size(col_size_) , null_map(null_map_) , data(int_container_) - {} + { + checkNullableLogic(); + } Int64 getInt(size_t idx) const { return data.getInt(idx); } void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } @@ -343,6 +359,13 @@ class Param constexpr static bool isConst() { return ParamImplType::isConst(); } private: + // When this is a nullable param, we should ensure the null_map is not nullptr + inline void checkNullableLogic() + { + if (is_nullable && (null_map == nullptr)) + throw Exception("Nullable Param with nullptr null_map"); + } + const size_t col_size; ConstNullMapPtr null_map; ParamImplType data; From e80dc23022c6a607b228426303e283d21d6ce1c7 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 1 Nov 2022 12:57:29 +0800 Subject: [PATCH 46/87] fix bug --- dbms/src/Functions/FunctionsRegexp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index b90e5c212c8..14a0e255198 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -493,7 +493,7 @@ class 
FunctionStringRegexpBase { *has_nullable_col = true; const auto * null_type = checkAndGetDataType(arg.get()); - assert(null_type != null_type); + assert(null_type != nullptr); const auto & nested_type = null_type->getNestedType(); From a4231499b9a2ce3bca90ed4dc0bcf6a3fd8c97a2 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 1 Nov 2022 13:24:02 +0800 Subject: [PATCH 47/87] add match_type ft --- tests/fullstack-test/expr/regexp.test | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index 268ed99fd46..dc6115e2534 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -80,3 +80,17 @@ mysql> set @@tidb_isolation_read_engines='tiflash'; set @@tidb_enforce_mpp=1; se +---------------------+-------------------+ | 1 | 0 | +---------------------+-------------------+ + +mysql> drop table if exists test.t; +mysql> create table test.t (data varchar(30), pattern varchar(30), match_type varchar(30)); +mysql> insert into test.t values ('a', 'A', 'i'), ('\n', '.', 's'), ('ab\nabc', '^abc$', 'm'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_like(data, pattern, match_type) as res from test.t; ++------+ +| res | ++------+ +| 1 | +| 1 | +| 1 | ++------+ From 602b3809b09e9547bbdae92b30fcdb59169f45cb Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 3 Nov 2022 13:41:18 +0800 Subject: [PATCH 48/87] refactor the handling of parms --- dbms/src/Functions/FunctionsRegexp.h | 427 +++++++++++++++++++-------- 1 file changed, 303 insertions(+), 124 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 14a0e255198..867bb967746 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -111,6 +111,19 @@ inline constexpr bool check_int_type() return std::is_same_v || std::is_same_v || 
std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; } +enum class IntType +{ + UInt8 = 0, + UInt16, + UInt32, + UInt64, + UInt128, + Int8, + Int16, + Int32, + Int64 +}; + template class ParamString { @@ -194,17 +207,15 @@ class ParamString const Offsets * offsets; }; -template +template class ParamInt { public: DISALLOW_COPY_AND_MOVE(ParamInt); - // raise error in compile-time when type is incorrect - using Container = typename ColumnVector(), T>>::Container; - explicit ParamInt(Int64 val) : const_int_val(val) + , int_type(IntType::UInt8) , int_container(nullptr) { if constexpr (!is_const) @@ -214,14 +225,16 @@ class ParamInt // For passing compilation explicit ParamInt(const StringRef &) : const_int_val(0) + , int_type(IntType::UInt8) , int_container(nullptr) { throw Exception("Shouldn't call this constructor"); } - explicit ParamInt(const void * int_container_) + explicit ParamInt(const void * int_container_, IntType int_type_) : const_int_val(0) - , int_container(reinterpret_cast(int_container_)) + , int_type(int_type_) + , int_container(int_container_) { if constexpr (is_const) throw Exception("const parm should not call this constructor"); @@ -230,28 +243,39 @@ class ParamInt // For passing compilation ParamInt(const void *, const void *) : const_int_val(0) + , int_type(IntType::UInt8) , int_container(nullptr) { throw Exception("Shouldn't call this constructor"); } + template Int64 getInt(size_t idx) const { if constexpr (is_const) return const_int_val; else - return static_cast((*int_container)[idx]); + { + const auto * tmp = reinterpret_cast(), T>>::Container *>(int_container); + return static_cast((*tmp)[idx]); + } } + void setIntType(IntType int_type_) { int_type = int_type_; } + IntType getIntType() const { return int_type; } String getString(size_t) const { throw Exception("ParamInt not supports this function"); } void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports 
this function"); } constexpr static bool isConst() { return is_const; } + void setContainer(const void * container) { int_container = container; } + const void * getContainer() const { return int_container; } private: Int64 const_int_val; + IntType int_type; // for vector int - const Container * int_container; + // type: ColumnVector::Container + const void * int_container; }; // Columns may be const, nullable or plain vector, we can conveniently handle @@ -371,78 +395,238 @@ class Param ParamImplType data; }; -// Unifying these names is necessary in macros -#define EXPR_COL_PTR_VAR_NAME col_expr -#define PAT_COL_PTR_VAR_NAME col_pat -#define MATCH_TYPE_COL_PTR_VAR_NAME col_match_type +class ParamVariant +{ +public: + // String type + using ParamStringNullableAndNotConst = Param, true>; + using ParamStringNotNullableAndConst = Param, false>; + using ParamStringNotNullableAndNotConst = Param, false>; + using ParamStringNullableAndConst = Param, true>; + + // Int type + using ParamIntNullableAndNotConst = Param, true>; + using ParamIntNotNullableAndConst = Param, false>; + using ParamIntNotNullableAndNotConst = Param, false>; + using ParamIntNullableAndConst = Param, true>; + + enum class ParamType + { + StringNullableAndNotConst, + StringNotNullableAndConst, + StringNotNullableAndNotConst, + StringNullableAndConst, + IntNullableAndNotConst, + IntNotNullableAndConst, + IntNotNullableAndNotConst, + IntNullableAndConst + }; + + // default ParamString's ParamType should be ParamType::StringNotNullAndNotConst + explicit ParamVariant(ColumnPtr col, size_t col_size, const StringRef & default_val) + : col_ptr(col) + , default_str(default_val) + , default_int(0) + , param(nullptr) + { + if (col_ptr != nullptr) + { + setParamStringTypeAndGenerateParam(col_size); + } + else + { + // This param is not provided by user, so we should use default value. 
+ param = new ParamStringNotNullableAndConst(col_size, default_val); + param_type = ParamType::StringNotNullableAndConst; + } + } -#define RES_ARG_VAR_NAME res_arg -#define COL_SIZE_VAR_NAME col_size + // default ParamInt's ParamType should be ParamType::IntNotNullAndNotConst + explicit ParamVariant(ColumnPtr col, size_t col_size [[maybe_unused]], Int64 default_val) + : col_ptr(col) + , default_str("", 0) + , default_int(default_val) + , param(nullptr) + { + // TODO implement it in next pr + throw Exception("Not implemented so far"); + } + + ~ParamVariant() + { + if (param != nullptr) + { + switch (param_type) + { + case ParamType::StringNullableAndNotConst: + delete reinterpret_cast(param); + break; + case ParamType::StringNotNullableAndConst: + delete reinterpret_cast(param); + break; + case ParamType::StringNotNullableAndNotConst: + delete reinterpret_cast(param); + break; + case ParamType::StringNullableAndConst: + delete reinterpret_cast(param); + break; + case ParamType::IntNullableAndNotConst: + delete reinterpret_cast(param); + break; + case ParamType::IntNotNullableAndConst: + delete reinterpret_cast(param); + break; + case ParamType::IntNotNullableAndNotConst: + delete reinterpret_cast(param); + break; + case ParamType::IntNullableAndConst: + delete reinterpret_cast(param); + break; + default: + throw Exception("Unexpected ParamType"); + } + } + } + + ParamType getParamType() const { return param_type; } + + // Return string + ParamStringNullableAndNotConst * getParamStringNullableAndNotConst() const { return reinterpret_cast(param); } + ParamStringNotNullableAndConst * getParamStringNotNullableAndConst() const { return reinterpret_cast(param); } + ParamStringNotNullableAndNotConst * getParamStringNotNullableAndNotConst() const { return reinterpret_cast(param); } + ParamStringNullableAndConst * getParamStringNullableAndConst() const { return reinterpret_cast(param); } + + // Return int + ParamIntNullableAndNotConst * getParamIntNullableAndNotConst() const 
{ return reinterpret_cast(param); } + ParamIntNotNullableAndConst * getParamIntNotNullableAndConst() const { return reinterpret_cast(param); } + ParamIntNotNullableAndNotConst * getParamIntNotNullableAndNotConst() const { return reinterpret_cast(param); } + ParamIntNullableAndConst * getParamIntNullableAndConst() const { return reinterpret_cast(param); } + +private: + void handleStringConstCol(size_t col_size, const ColumnConst * col_const) + { + const auto & col_const_data = col_const->getDataColumnPtr(); + if (col_const_data->isColumnNullable()) + { + Field field; + col_const->get(0, field); + String tmp = field.isNull() ? String("") : field.safeGet(); + const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); + + // Construct actual param + param = new ParamStringNullableAndConst(col_size, StringRef(tmp.data(), tmp.size()), null_map); + param_type = ParamType::StringNullableAndConst; + } + else + { + // Construct actual param + param = new ParamStringNotNullableAndConst(col_size, col_const->getDataAt(0)); + param_type = ParamType::StringNotNullableAndConst; + } + } -#define EXPR_PARAM_VAR_NAME expr_param -#define PAT_PARAM_VAR_NAME pat_param -#define MATCH_TYPE_PARAM_VAR_NAME match_type_param + void handleStringNonConstCol(size_t col_size) + { + if (col_ptr->isColumnNullable()) + { + auto nested_ptr = static_cast(*(col_ptr)).getNestedColumnPtr(); + const auto * tmp = checkAndGetColumn(&(*nested_ptr)); + const auto * null_map = &(static_cast(*(col_ptr)).getNullMapData()); + + // Construct actual param + param = new ParamStringNullableAndNotConst(col_size, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); + param_type = ParamType::StringNullableAndNotConst; + } + else + { + // This is a pure string vector column + const auto * tmp = checkAndGetColumn(&(*(col_ptr))); -#define COL_CONST_VAR_NAME col_const + // Construct actual param + param = new ParamStringNotNullableAndNotConst(col_size, 
static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); + param_type = ParamType::StringNotNullableAndNotConst; + } + } -#define SELF_CLASS_NAME (name) -#define ARG_NUM_VAR_NAME arg_num + void setParamStringTypeAndGenerateParam(size_t col_size) + { + const auto * col_const = typeid_cast(&(*(col_ptr))); + if (col_const != nullptr) + handleStringConstCol(col_size, col_const); + else + handleStringNonConstCol(col_size); + } + + // TODO implement it in next pr + void setParamIntTypeAndGenerateParam(size_t col_size [[maybe_unused]]) + { + throw Exception("Not implemented so far"); + } + + ParamType param_type; + ColumnPtr col_ptr; + StringRef default_str; + Int64 default_int [[maybe_unused]]; + void * param; +}; + +// Unifying these names is necessary in macros +#define EXPR_PV_VAR_NAME expr_pv +#define PAT_PV_VAR_NAME pat_pv +#define MATCH_TYPE_PV_VAR_NAME match_type_pv + +#define EXPR_PARAM_PTR_VAR_NAME expr_param +#define PAT_PARAM_PTR_VAR_NAME pat_param +#define MATCH_TYPE_PARAM_PTR_VAR_NAME match_type_param + +#define RES_ARG_VAR_NAME res_arg // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc -// Common method to convert nullable string column -// converted column referred by converted_col_name is impossible to be const here -#define CONVERT_STR_VEC_TO_PARAM(param_name, converted_col_name, next_convertion) \ - do \ - { \ - if (((converted_col_name)->isColumnNullable())) \ - { \ - auto nested_ptr = static_cast(*(converted_col_name)).getNestedColumnPtr(); \ - const auto * tmp = checkAndGetColumn(&(*nested_ptr)); \ - const auto * null_map = &(static_cast(*(converted_col_name)).getNullMapData()); \ - Param, true>(param_name)(COL_SIZE_VAR_NAME, null_map, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_convertion; \ - } \ - else \ - { \ - /* This is a pure string vector column */ \ - const auto * tmp = checkAndGetColumn(&(*(converted_col_name))); \ - Param, 
false>(param_name)(COL_SIZE_VAR_NAME, static_cast(&(tmp->getChars())), static_cast(&(tmp->getOffsets()))); \ - next_convertion; \ - } \ - } while (0); +// Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, +// as this will generate more useless codes and templates. -// Common method to convert const string column -#define CONVERT_STR_CONST_TO_PARAM(param_name, converted_col_name, next_convertion) \ - do \ - { \ - auto col_const_data = COL_CONST_VAR_NAME->getDataColumnPtr(); \ - Field field; \ - COL_CONST_VAR_NAME->get(0, field); \ - String tmp = field.isNull() ? String("") : field.safeGet(); \ - if (col_const_data->isColumnNullable()) \ - { \ - const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); \ - Param, true>(param_name)(COL_SIZE_VAR_NAME, StringRef(tmp.data(), tmp.size()), null_map); \ - next_convertion; \ - } \ - else \ - { \ - Param, false>(param_name)(COL_SIZE_VAR_NAME, COL_CONST_VAR_NAME->getDataAt(0)); \ - next_convertion; \ - } \ +// Common method to get actual string param +#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ + do \ + { \ + switch ((pv_name).getParamType()) \ + { \ + case ParamVariant::ParamType::StringNullableAndNotConst: \ + { \ + ParamVariant::ParamStringNullableAndNotConst *(param_name) = (pv_name).getParamStringNullableAndNotConst(); \ + next_process; \ + break; \ + } \ + case ParamVariant::ParamType::StringNotNullableAndConst: \ + { \ + ParamVariant::ParamStringNotNullableAndConst *(param_name) = (pv_name).getParamStringNotNullableAndConst(); \ + next_process; \ + break; \ + } \ + case ParamVariant::ParamType::StringNotNullableAndNotConst: \ + { \ + ParamVariant::ParamStringNotNullableAndNotConst *(param_name) = (pv_name).getParamStringNotNullableAndNotConst(); \ + next_process; \ + break; \ + } \ + case ParamVariant::ParamType::StringNullableAndConst: \ + { \ + ParamVariant::ParamStringNullableAndConst *(param_name) = (pv_name).getParamStringNullableAndConst(); \ + 
next_process; \ + break; \ + } \ + default: \ + throw Exception("Unexpected ParamType"); \ + } \ } while (0); -// Common method to convert string column -#define CONVERT_STR_COL_TO_PARAM(param_name, converted_col_name, next_convertion) \ - do \ - { \ - const auto * COL_CONST_VAR_NAME = typeid_cast(&(*(converted_col_name))); \ - if (COL_CONST_VAR_NAME != nullptr) \ - CONVERT_STR_CONST_TO_PARAM((param_name), (converted_col_name), next_convertion) \ - else \ - CONVERT_STR_VEC_TO_PARAM((param_name), (converted_col_name), next_convertion) \ +// Common method to get actual string param +// TODO implement it in next pr +#define GET_ACTUAL_INT_PARAM(pv_name, param_name, next_process) \ + do \ + { \ } while (0); class FunctionStringRegexpBase @@ -542,47 +726,38 @@ class FunctionStringRegexpBase }; // regexp and regexp_like functions are executed in this macro -#define EXECUTE_REGEXP_LIKE() \ - do \ - { \ - REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, EXPR_PARAM_VAR_NAME, PAT_PARAM_VAR_NAME, MATCH_TYPE_PARAM_VAR_NAME); \ +#define EXECUTE_REGEXP_LIKE() \ + do \ + { \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, *(EXPR_PARAM_PTR_VAR_NAME), *(PAT_PARAM_PTR_VAR_NAME), *(MATCH_TYPE_PARAM_PTR_VAR_NAME)); \ } while (0); -// Method to convert match type column -#define CONVERT_MATCH_TYPE_COL_TO_PARAM_AND_EXECUTE() \ - do \ - { \ - if ((ARG_NUM_VAR_NAME) == 3) \ - CONVERT_STR_COL_TO_PARAM(MATCH_TYPE_PARAM_VAR_NAME, MATCH_TYPE_COL_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ - else \ - { \ - /* match_type is not provided here */ \ - Param, false> MATCH_TYPE_PARAM_VAR_NAME(COL_SIZE_VAR_NAME, StringRef("", 0)); \ - EXECUTE_REGEXP_LIKE() \ - } \ +// Method to get actual match type param +#define GET_MATCH_TYPE_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(MATCH_TYPE_PV_VAR_NAME, MATCH_TYPE_PARAM_PTR_VAR_NAME, ({EXECUTE_REGEXP_LIKE()})) \ } while (0); -// Method to convert pattern column -#define CONVERT_PAT_COL_TO_PARAM() \ - do \ - { \ - 
CONVERT_STR_COL_TO_PARAM(PAT_PARAM_VAR_NAME, PAT_COL_PTR_VAR_NAME, ({CONVERT_MATCH_TYPE_COL_TO_PARAM_AND_EXECUTE()})) \ +// Method to get actual pattern param +#define GET_PAT_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(PAT_PV_VAR_NAME, PAT_PARAM_PTR_VAR_NAME, ({GET_MATCH_TYPE_ACTUAL_PARAM()})) \ } while (0); -// Method to convert expression column -#define CONVERT_EXPR_COL_TO_PARAM() \ - do \ - { \ - /* Getting column size from expr col */ \ - size_t COL_SIZE_VAR_NAME = (EXPR_COL_PTR_VAR_NAME)->size(); \ - CONVERT_STR_COL_TO_PARAM(EXPR_PARAM_VAR_NAME, EXPR_COL_PTR_VAR_NAME, ({CONVERT_PAT_COL_TO_PARAM()})) \ +// Method to get actual expression param +#define GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(EXPR_PV_VAR_NAME, EXPR_PARAM_PTR_VAR_NAME, ({GET_PAT_ACTUAL_PARAM()})) \ } while (0); -// The entry to convert columns to params and execute regexp functions -#define CONVERT_COLS_TO_PARAMS_AND_EXECUTE() \ - do \ - { \ - CONVERT_EXPR_COL_TO_PARAM() \ +// The entry to get actual params and execute regexp functions +#define GET_ACTUAL_PARAMS_AND_EXECUTE() \ + do \ + { \ + GET_EXPR_ACTUAL_PARAM() \ } while (0); // Implementation of regexp and regexp_like functions @@ -783,7 +958,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase // Do something related with nullable columns NullPresence null_presence = getNullPresense(block, arguments); - const ColumnPtr & EXPR_COL_PTR_VAR_NAME = block.getByPosition(arguments[0]).column; + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; if (null_presence.has_null_constant) { @@ -791,40 +966,44 @@ class FunctionStringRegexp : public FunctionStringRegexpBase return; } - const ColumnPtr & PAT_COL_PTR_VAR_NAME = block.getByPosition(arguments[1]).column; + const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; - size_t ARG_NUM_VAR_NAME = arguments.size(); + size_t arg_num = arguments.size(); auto & RES_ARG_VAR_NAME = block.getByPosition(result); - ColumnPtr 
MATCH_TYPE_COL_PTR_VAR_NAME; - if ((ARG_NUM_VAR_NAME) == 3) - MATCH_TYPE_COL_PTR_VAR_NAME = block.getByPosition(arguments[2]).column; + ColumnPtr col_match_type; + if (arg_num == 3) + col_match_type = block.getByPosition(arguments[2]).column; + + size_t col_size = col_expr->size(); - CONVERT_COLS_TO_PARAMS_AND_EXECUTE() + ParamVariant EXPR_PV_VAR_NAME(col_expr, col_size, StringRef("", 0)); + ParamVariant PAT_PV_VAR_NAME(col_pat, col_size, StringRef("", 0)); + ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); + + GET_ACTUAL_PARAMS_AND_EXECUTE() } private: TiDB::TiDBCollatorPtr collator = nullptr; }; -#undef CONVERT_COLS_TO_PARAMS_AND_EXECUTE -#undef CONVERT_EXPR_COL_TO_PARAM -#undef CONVERT_PAT_COL_TO_PARAM -#undef CONVERT_MATCH_TYPE_COL_TO_PARAM +#undef GET_ACTUAL_PARAMS_AND_EXECUTE +#undef GET_EXPR_ACTUAL_PARAM +#undef GET_PAT_ACTUAL_PARAM +#undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_LIKE -#undef CONVERT_CONST_STR_COL_TO_PARAM -#undef CONVERT_NULL_STR_COL_TO_PARAM + +#undef GET_ACTUAL_INT_PARAM +#undef GET_ACTUAL_STRING_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME -#undef ARG_NUM_VAR_NAME -#undef SELF_CLASS_NAME -#undef MATCH_TYPE_PARAM_VAR_NAME -#undef PAT_PARAM_VAR_NAME -#undef EXPR_PARAM_VAR_NAME -#undef COL_SIZE_VAR_NAME #undef RES_ARG_VAR_NAME -#undef MATCH_TYPE_COL_PTR_VAR_NAME -#undef PAT_COL_PTR_VAR_NAME -#undef EXPR_COL_PTR_VAR_NAME +#undef MATCH_TYPE_PARAM_PTR_VAR_NAME +#undef PAT_PARAM_PTR_VAR_NAME +#undef EXPR_PARAM_PTR_VAR_NAME +#undef MATCH_TYPE_PV_VAR_NAME +#undef PAT_PV_VAR_NAME +#undef EXPR_PV_VAR_NAME } // namespace DB From 8622c01fb0bba0bd3a06457dcd337db15c826d12 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 3 Nov 2022 15:47:51 +0800 Subject: [PATCH 49/87] compress macros --- dbms/src/Functions/FunctionsRegexp.h | 104 +++++++++++++-------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 
867bb967746..ecf7309357d 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -395,6 +395,18 @@ class Param ParamImplType data; }; +#define APPLY_FOR_PARAM_STRING_VARIANTS(M, pv_name, param_name, next_process) \ + M(StringNullableAndNotConst, pv_name, param_name, next_process) \ + M(StringNotNullableAndConst, pv_name, param_name, next_process) \ + M(StringNotNullableAndNotConst, pv_name, param_name, next_process) \ + M(StringNullableAndConst, pv_name, param_name, next_process) \ + +#define APPLY_FOR_PARAM_INT_VARIANTS(M, pv_name, param_name, next_process) \ + M(IntNullableAndNotConst, pv_name, param_name, next_process) \ + M(IntNotNullableAndConst, pv_name, param_name, next_process) \ + M(IntNotNullableAndNotConst, pv_name, param_name, next_process) \ + M(IntNullableAndConst, pv_name, param_name, next_process) \ + class ParamVariant { public: @@ -458,30 +470,29 @@ class ParamVariant { switch (param_type) { - case ParamType::StringNullableAndNotConst: - delete reinterpret_cast(param); - break; - case ParamType::StringNotNullableAndConst: - delete reinterpret_cast(param); - break; - case ParamType::StringNotNullableAndNotConst: - delete reinterpret_cast(param); - break; - case ParamType::StringNullableAndConst: - delete reinterpret_cast(param); - break; - case ParamType::IntNullableAndNotConst: - delete reinterpret_cast(param); - break; - case ParamType::IntNotNullableAndConst: - delete reinterpret_cast(param); - break; - case ParamType::IntNotNullableAndNotConst: - delete reinterpret_cast(param); - break; - case ParamType::IntNullableAndConst: - delete reinterpret_cast(param); - break; +// Enumerate cases that delete string param pointer +#define M(NAME, pv_name, param_name, next_process) \ + case ParamType::NAME: \ + { \ + delete reinterpret_cast(param); \ + break; \ + } + + // Expand the macro to enumerate string param cases + APPLY_FOR_PARAM_STRING_VARIANTS(M, placeholder1, placeholder2, placeholder3) +#undef M + +// 
Enumerate cases that delete int param pointer +#define M(NAME, pv_name, param_name, next_process) \ + case ParamType::NAME: \ + { \ + delete reinterpret_cast(param); \ + break; \ + } + + // Expand the macro to enumerate int param cases + APPLY_FOR_PARAM_INT_VARIANTS(M, placeholder1, placeholder2, placeholder3) +#undef M default: throw Exception("Unexpected ParamType"); } @@ -587,36 +598,21 @@ class ParamVariant // Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, // as this will generate more useless codes and templates. -// Common method to get actual string param -#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ - do \ - { \ +#define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ + case ParamVariant::ParamType::NAME: \ + { \ + ParamVariant::Param##NAME *(param_name) = (pv_name).getParam##NAME(); \ + next_process; \ + break; \ + } + +#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ + do \ + { \ switch ((pv_name).getParamType()) \ { \ - case ParamVariant::ParamType::StringNullableAndNotConst: \ - { \ - ParamVariant::ParamStringNullableAndNotConst *(param_name) = (pv_name).getParamStringNullableAndNotConst(); \ - next_process; \ - break; \ - } \ - case ParamVariant::ParamType::StringNotNullableAndConst: \ - { \ - ParamVariant::ParamStringNotNullableAndConst *(param_name) = (pv_name).getParamStringNotNullableAndConst(); \ - next_process; \ - break; \ - } \ - case ParamVariant::ParamType::StringNotNullableAndNotConst: \ - { \ - ParamVariant::ParamStringNotNullableAndNotConst *(param_name) = (pv_name).getParamStringNotNullableAndNotConst(); \ - next_process; \ - break; \ - } \ - case ParamVariant::ParamType::StringNullableAndConst: \ - { \ - ParamVariant::ParamStringNullableAndConst *(param_name) = (pv_name).getParamStringNullableAndConst(); \ - next_process; \ - break; \ - } \ + /* Expand this macro to enumerate all string cases */ \ + 
APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, pv_name, param_name, next_process) \ default: \ throw Exception("Unexpected ParamType"); \ } \ @@ -997,6 +993,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM +#undef ENUMERATE_PARAM_VARIANT_CASES #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME #undef RES_ARG_VAR_NAME #undef MATCH_TYPE_PARAM_PTR_VAR_NAME @@ -1006,4 +1003,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef PAT_PV_VAR_NAME #undef EXPR_PV_VAR_NAME +#undef APPLY_FOR_PARAM_INT_VARIANTS +#undef APPLY_FOR_PARAM_STRING_VARIANTS + } // namespace DB From a94293f361d76e1ce444d64b4f5b27a8e6e2639d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 3 Nov 2022 15:48:53 +0800 Subject: [PATCH 50/87] format --- dbms/src/Functions/FunctionsRegexp.h | 74 ++++++++++++++-------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index ecf7309357d..3f9b3559e55 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -396,16 +396,16 @@ class Param }; #define APPLY_FOR_PARAM_STRING_VARIANTS(M, pv_name, param_name, next_process) \ - M(StringNullableAndNotConst, pv_name, param_name, next_process) \ - M(StringNotNullableAndConst, pv_name, param_name, next_process) \ - M(StringNotNullableAndNotConst, pv_name, param_name, next_process) \ - M(StringNullableAndConst, pv_name, param_name, next_process) \ + M(StringNullableAndNotConst, pv_name, param_name, next_process) \ + M(StringNotNullableAndConst, pv_name, param_name, next_process) \ + M(StringNotNullableAndNotConst, pv_name, param_name, next_process) \ + M(StringNullableAndConst, pv_name, param_name, next_process) #define APPLY_FOR_PARAM_INT_VARIANTS(M, pv_name, param_name, next_process) \ - M(IntNullableAndNotConst, pv_name, param_name, next_process) \ - M(IntNotNullableAndConst, pv_name, 
param_name, next_process) \ - M(IntNotNullableAndNotConst, pv_name, param_name, next_process) \ - M(IntNullableAndConst, pv_name, param_name, next_process) \ + M(IntNullableAndNotConst, pv_name, param_name, next_process) \ + M(IntNotNullableAndConst, pv_name, param_name, next_process) \ + M(IntNotNullableAndNotConst, pv_name, param_name, next_process) \ + M(IntNullableAndConst, pv_name, param_name, next_process) class ParamVariant { @@ -471,27 +471,27 @@ class ParamVariant switch (param_type) { // Enumerate cases that delete string param pointer -#define M(NAME, pv_name, param_name, next_process) \ - case ParamType::NAME: \ - { \ +#define M(NAME, pv_name, param_name, next_process) \ + case ParamType::NAME: \ + { \ delete reinterpret_cast(param); \ - break; \ + break; \ } - // Expand the macro to enumerate string param cases - APPLY_FOR_PARAM_STRING_VARIANTS(M, placeholder1, placeholder2, placeholder3) + // Expand the macro to enumerate string param cases + APPLY_FOR_PARAM_STRING_VARIANTS(M, placeholder1, placeholder2, placeholder3) #undef M // Enumerate cases that delete int param pointer -#define M(NAME, pv_name, param_name, next_process) \ - case ParamType::NAME: \ - { \ +#define M(NAME, pv_name, param_name, next_process) \ + case ParamType::NAME: \ + { \ delete reinterpret_cast(param); \ - break; \ + break; \ } - // Expand the macro to enumerate int param cases - APPLY_FOR_PARAM_INT_VARIANTS(M, placeholder1, placeholder2, placeholder3) + // Expand the macro to enumerate int param cases + APPLY_FOR_PARAM_INT_VARIANTS(M, placeholder1, placeholder2, placeholder3) #undef M default: throw Exception("Unexpected ParamType"); @@ -599,23 +599,23 @@ class ParamVariant // as this will generate more useless codes and templates. 
#define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ - case ParamVariant::ParamType::NAME: \ - { \ - ParamVariant::Param##NAME *(param_name) = (pv_name).getParam##NAME(); \ - next_process; \ - break; \ - } - -#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ - do \ - { \ - switch ((pv_name).getParamType()) \ - { \ - /* Expand this macro to enumerate all string cases */ \ - APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, pv_name, param_name, next_process) \ - default: \ - throw Exception("Unexpected ParamType"); \ - } \ + case ParamVariant::ParamType::NAME: \ + { \ + ParamVariant::Param##NAME *(param_name) = (pv_name).getParam##NAME(); \ + next_process; \ + break; \ + } + +#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ + do \ + { \ + switch ((pv_name).getParamType()) \ + { \ + /* Expand this macro to enumerate all string cases */ \ + APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, pv_name, param_name, next_process) \ + default: \ + throw Exception("Unexpected ParamType"); \ + } \ } while (0); // Common method to get actual string param From cb93deacf256b7a9b1ce1852cd2539a006ba0003 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 3 Nov 2022 15:50:00 +0800 Subject: [PATCH 51/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 3f9b3559e55..aff3757fb64 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -595,9 +595,6 @@ class ParamVariant // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc -// Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, -// as this will generate more useless codes and templates. 
- #define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ case ParamVariant::ParamType::NAME: \ { \ @@ -606,6 +603,9 @@ class ParamVariant break; \ } +// Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, +// as this will generate more useless codes and templates. + #define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ do \ { \ From 27ff20ec44aa9822a74e4cd0419cd5a2268e7d7c Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 4 Nov 2022 09:23:16 +0800 Subject: [PATCH 52/87] add tests --- dbms/src/Functions/FunctionsRegexp.h | 6 +++--- dbms/src/Functions/tests/gtest_regexp.cpp | 25 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index aff3757fb64..c5c9a7166ef 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -231,7 +231,7 @@ class ParamInt throw Exception("Shouldn't call this constructor"); } - explicit ParamInt(const void * int_container_, IntType int_type_) + ParamInt(const void * int_container_, IntType int_type_) : const_int_val(0) , int_type(int_type_) , int_container(int_container_) @@ -435,7 +435,7 @@ class ParamVariant }; // default ParamString's ParamType should be ParamType::StringNotNullAndNotConst - explicit ParamVariant(ColumnPtr col, size_t col_size, const StringRef & default_val) + ParamVariant(ColumnPtr col, size_t col_size, const StringRef & default_val) : col_ptr(col) , default_str(default_val) , default_int(0) @@ -454,7 +454,7 @@ class ParamVariant } // default ParamInt's ParamType should be ParamType::IntNotNullAndNotConst - explicit ParamVariant(ColumnPtr col, size_t col_size [[maybe_unused]], Int64 default_val) + ParamVariant(ColumnPtr col, size_t col_size [[maybe_unused]], Int64 default_val) : col_ptr(col) , default_str("", 0) , default_int(default_val) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp 
b/dbms/src/Functions/tests/gtest_regexp.cpp index 7f79fea364f..e423ef3c2da 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2067,6 +2067,31 @@ TEST_F(Regexp, RegexpLike) createNullableVectorColumn(match_types, match_type_nulls))); } + std::cout << "case 9" << std::endl; + // case 9 test empty columns + { + ASSERT_COLUMN_EQ(createColumn({}), + executeFunction( + "regexp_like", + createColumn({}), + createColumn({}), + createColumn({}))); + + ASSERT_COLUMN_EQ(createOnlyNullColumnConst(0), + executeFunction( + "regexp_like", + createOnlyNullColumnConst(0), + createColumn({}), + createColumn({}))); + + ASSERT_COLUMN_EQ(createColumn({}), + executeFunction( + "regexp_like", + createConstColumn(0, ""), + createColumn({}), + createColumn({}))); + } + // empty pattern is not allowed ASSERT_THROW(executeFunction("regexp_like", createColumn(std::vector{"1"}), createConstColumn(row_size, "")), Exception); ASSERT_THROW(executeFunction("regexp_like", createConstColumn(row_size, ""), createConstColumn(row_size, "")), Exception); From 444cb5e6588261a61a60551e7d6d47dcaa8713cc Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 4 Nov 2022 12:39:32 +0800 Subject: [PATCH 53/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index c5c9a7166ef..932e37576af 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -501,18 +501,6 @@ class ParamVariant ParamType getParamType() const { return param_type; } - // Return string - ParamStringNullableAndNotConst * getParamStringNullableAndNotConst() const { return reinterpret_cast(param); } - ParamStringNotNullableAndConst * getParamStringNotNullableAndConst() const { return reinterpret_cast(param); } - ParamStringNotNullableAndNotConst * getParamStringNotNullableAndNotConst() 
const { return reinterpret_cast(param); } - ParamStringNullableAndConst * getParamStringNullableAndConst() const { return reinterpret_cast(param); } - - // Return int - ParamIntNullableAndNotConst * getParamIntNullableAndNotConst() const { return reinterpret_cast(param); } - ParamIntNotNullableAndConst * getParamIntNotNullableAndConst() const { return reinterpret_cast(param); } - ParamIntNotNullableAndNotConst * getParamIntNotNullableAndNotConst() const { return reinterpret_cast(param); } - ParamIntNullableAndConst * getParamIntNullableAndConst() const { return reinterpret_cast(param); } - private: void handleStringConstCol(size_t col_size, const ColumnConst * col_const) { @@ -578,6 +566,10 @@ class ParamVariant ColumnPtr col_ptr; StringRef default_str; Int64 default_int [[maybe_unused]]; + +public: + // This variable should be reinterpret_cast to specific type before used + // macro GET_ACTUAL_PARAM_PTR may be helpful void * param; }; @@ -595,10 +587,14 @@ class ParamVariant // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc +#define ACTUAL_PARAM_TYPE(NAME) ParamVariant::Param##NAME + +#define GET_ACTUAL_PARAM_PTR(NAME, ptr) (reinterpret_cast(ptr)) + #define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ case ParamVariant::ParamType::NAME: \ { \ - ParamVariant::Param##NAME *(param_name) = (pv_name).getParam##NAME(); \ + auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, (pv_name).param); \ next_process; \ break; \ } @@ -994,6 +990,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM #undef ENUMERATE_PARAM_VARIANT_CASES +#undef GET_ACTUAL_PARAM_PTR +#undef ACTUAL_PARAM_TYPE #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME #undef RES_ARG_VAR_NAME #undef MATCH_TYPE_PARAM_PTR_VAR_NAME From a2d4bae5250e7d31641db88bc585d979de47a410 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 4 Nov 2022 15:07:16 +0800 
Subject: [PATCH 54/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 29 +++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 932e37576af..7560eaaecfe 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -407,6 +407,10 @@ class Param M(IntNotNullableAndNotConst, pv_name, param_name, next_process) \ M(IntNullableAndConst, pv_name, param_name, next_process) +#define PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME param + +#define GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name) (pv_name).PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME + class ParamVariant { public: @@ -439,7 +443,7 @@ class ParamVariant : col_ptr(col) , default_str(default_val) , default_int(0) - , param(nullptr) + , PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME(nullptr) { if (col_ptr != nullptr) { @@ -458,7 +462,7 @@ class ParamVariant : col_ptr(col) , default_str("", 0) , default_int(default_val) - , param(nullptr) + , PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME(nullptr) { // TODO implement it in next pr throw Exception("Not implemented so far"); @@ -466,7 +470,7 @@ class ParamVariant ~ParamVariant() { - if (param != nullptr) + if (PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME != nullptr) { switch (param_type) { @@ -474,21 +478,12 @@ class ParamVariant #define M(NAME, pv_name, param_name, next_process) \ case ParamType::NAME: \ { \ - delete reinterpret_cast(param); \ + delete reinterpret_cast(PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME); \ break; \ } // Expand the macro to enumerate string param cases APPLY_FOR_PARAM_STRING_VARIANTS(M, placeholder1, placeholder2, placeholder3) -#undef M - -// Enumerate cases that delete int param pointer -#define M(NAME, pv_name, param_name, next_process) \ - case ParamType::NAME: \ - { \ - delete reinterpret_cast(param); \ - break; \ - } // Expand the macro to enumerate int param cases APPLY_FOR_PARAM_INT_VARIANTS(M, placeholder1, placeholder2, placeholder3) 
@@ -570,7 +565,7 @@ class ParamVariant public: // This variable should be reinterpret_cast to specific type before used // macro GET_ACTUAL_PARAM_PTR may be helpful - void * param; + void * PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME; }; // Unifying these names is necessary in macros @@ -589,12 +584,12 @@ class ParamVariant #define ACTUAL_PARAM_TYPE(NAME) ParamVariant::Param##NAME -#define GET_ACTUAL_PARAM_PTR(NAME, ptr) (reinterpret_cast(ptr)) +#define GET_ACTUAL_PARAM_PTR(NAME, param_ptr_name) (reinterpret_cast(param_ptr_name)) #define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ case ParamVariant::ParamType::NAME: \ { \ - auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, (pv_name).param); \ + auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name)); \ next_process; \ break; \ } @@ -1001,6 +996,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef PAT_PV_VAR_NAME #undef EXPR_PV_VAR_NAME +#undef GET_PARAM_PTR_FROM_PARAM_VARIANT +#undef PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME #undef APPLY_FOR_PARAM_INT_VARIANTS #undef APPLY_FOR_PARAM_STRING_VARIANTS From 31ed79eb5659408111a8a9cdbb007aef471e69e7 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 4 Nov 2022 15:25:28 +0800 Subject: [PATCH 55/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 7560eaaecfe..7b096abfb05 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -474,12 +474,12 @@ class ParamVariant { switch (param_type) { -// Enumerate cases that delete string param pointer -#define M(NAME, pv_name, param_name, next_process) \ - case ParamType::NAME: \ - { \ +// Enumerate cases that delete param pointer +#define M(NAME, pv_name, param_name, next_process) \ + case ParamType::NAME: \ + { \ delete 
reinterpret_cast(PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME); \ - break; \ + break; \ } // Expand the macro to enumerate string param cases @@ -586,12 +586,12 @@ class ParamVariant #define GET_ACTUAL_PARAM_PTR(NAME, param_ptr_name) (reinterpret_cast(param_ptr_name)) -#define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ - case ParamVariant::ParamType::NAME: \ - { \ - auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name)); \ - next_process; \ - break; \ +#define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ + case ParamVariant::ParamType::NAME: \ + { \ + auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name)); \ + next_process; \ + break; \ } // Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, From 8cdf3ae2e2543fc00f6ba987686622b6d4ed3b3d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 4 Nov 2022 16:22:22 +0800 Subject: [PATCH 56/87] refine --- dbms/src/Functions/FunctionsRegexp.h | 76 ++++++++++++---------------- 1 file changed, 31 insertions(+), 45 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 7b096abfb05..4c075c78edd 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -407,9 +407,17 @@ class Param M(IntNotNullableAndNotConst, pv_name, param_name, next_process) \ M(IntNullableAndConst, pv_name, param_name, next_process) -#define PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME param +#define ACTUAL_PARAM_TYPE(NAME) ParamVariant::Param##NAME -#define GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name) (pv_name).PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME +#define GET_ACTUAL_PARAM_PTR(NAME, param_ptr_name) (reinterpret_cast(param_ptr_name)) + +#define ENUMERATE_PARAM_VARIANT_CASES(NAME, param_ptr_name, actual_param_ptr_name, next_process) \ + case ParamVariant::ParamType::NAME: \ + { \ + auto *(actual_param_ptr_name) = 
GET_ACTUAL_PARAM_PTR(NAME, param_ptr_name); \ + next_process; \ + break; \ + } class ParamVariant { @@ -443,7 +451,7 @@ class ParamVariant : col_ptr(col) , default_str(default_val) , default_int(0) - , PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME(nullptr) + , param(nullptr) { if (col_ptr != nullptr) { @@ -462,7 +470,7 @@ class ParamVariant : col_ptr(col) , default_str("", 0) , default_int(default_val) - , PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME(nullptr) + , param(nullptr) { // TODO implement it in next pr throw Exception("Not implemented so far"); @@ -470,24 +478,15 @@ class ParamVariant ~ParamVariant() { - if (PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME != nullptr) + if (param != nullptr) { switch (param_type) { -// Enumerate cases that delete param pointer -#define M(NAME, pv_name, param_name, next_process) \ - case ParamType::NAME: \ - { \ - delete reinterpret_cast(PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME); \ - break; \ - } - // Expand the macro to enumerate string param cases - APPLY_FOR_PARAM_STRING_VARIANTS(M, placeholder1, placeholder2, placeholder3) + APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, param, actual_param_ptr, ({ delete actual_param_ptr; })) // Expand the macro to enumerate int param cases - APPLY_FOR_PARAM_INT_VARIANTS(M, placeholder1, placeholder2, placeholder3) -#undef M + APPLY_FOR_PARAM_INT_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, param, actual_param_ptr, ({ delete actual_param_ptr; })) default: throw Exception("Unexpected ParamType"); } @@ -563,9 +562,10 @@ class ParamVariant Int64 default_int [[maybe_unused]]; public: + // ATTENTION! 
Be careful to change this variable's name as many macros use it + // // This variable should be reinterpret_cast to specific type before used - // macro GET_ACTUAL_PARAM_PTR may be helpful - void * PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME; + void * param; }; // Unifying these names is necessary in macros @@ -582,31 +582,19 @@ class ParamVariant // Unify the name of functions that actually execute regexp #define REGEXP_CLASS_MEM_FUNC_IMPL_NAME executeRegexpFunc -#define ACTUAL_PARAM_TYPE(NAME) ParamVariant::Param##NAME - -#define GET_ACTUAL_PARAM_PTR(NAME, param_ptr_name) (reinterpret_cast(param_ptr_name)) - -#define ENUMERATE_PARAM_VARIANT_CASES(NAME, pv_name, param_name, next_process) \ - case ParamVariant::ParamType::NAME: \ - { \ - auto *(param_name) = GET_ACTUAL_PARAM_PTR(NAME, GET_PARAM_PTR_FROM_PARAM_VARIANT(pv_name)); \ - next_process; \ - break; \ - } - // Do not merge GET_ACTUAL_STRING_PARAM and GET_ACTUAL_INT_PARAM together, // as this will generate more useless codes and templates. 
-#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ - do \ - { \ - switch ((pv_name).getParamType()) \ - { \ - /* Expand this macro to enumerate all string cases */ \ - APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, pv_name, param_name, next_process) \ - default: \ - throw Exception("Unexpected ParamType"); \ - } \ +#define GET_ACTUAL_STRING_PARAM(pv_name, param_name, next_process) \ + do \ + { \ + switch ((pv_name).getParamType()) \ + { \ + /* Expand this macro to enumerate all string cases */ \ + APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, (pv_name).param, param_name, next_process) \ + default: \ + throw Exception("Unexpected ParamType"); \ + } \ } while (0); // Common method to get actual string param @@ -984,9 +972,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM -#undef ENUMERATE_PARAM_VARIANT_CASES -#undef GET_ACTUAL_PARAM_PTR -#undef ACTUAL_PARAM_TYPE #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME #undef RES_ARG_VAR_NAME #undef MATCH_TYPE_PARAM_PTR_VAR_NAME @@ -996,8 +981,9 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef PAT_PV_VAR_NAME #undef EXPR_PV_VAR_NAME -#undef GET_PARAM_PTR_FROM_PARAM_VARIANT -#undef PARAM_VARIANT_PARAM_PTR_MEM_VAR_NAME +#undef ACTUAL_PARAM_TYPE +#undef GET_ACTUAL_PARAM_PTR +#undef ENUMERATE_PARAM_VARIANT_CASES #undef APPLY_FOR_PARAM_INT_VARIANTS #undef APPLY_FOR_PARAM_STRING_VARIANTS From 8ad4307e41f24636388c2932296b9bac8319afb4 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 7 Nov 2022 11:04:41 +0800 Subject: [PATCH 57/87] resolve comment --- dbms/src/Functions/FunctionsRegexp.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 4c075c78edd..3e68086610f 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -215,7 +215,7 @@ class 
ParamInt explicit ParamInt(Int64 val) : const_int_val(val) - , int_type(IntType::UInt8) + , int_type(IntType::Int64) , int_container(nullptr) { if constexpr (!is_const) @@ -261,14 +261,18 @@ class ParamInt } } - void setIntType(IntType int_type_) { int_type = int_type_; } IntType getIntType() const { return int_type; } String getString(size_t) const { throw Exception("ParamInt not supports this function"); } void getStringRef(size_t, StringRef &) const { throw Exception("ParamInt not supports this function"); } constexpr static bool isConst() { return is_const; } - void setContainer(const void * container) { int_container = container; } const void * getContainer() const { return int_container; } + void setIntTypeAndContainer(IntType type, const void * container) + { + int_type = type; + int_container = container; + } + private: Int64 const_int_val; IntType int_type; From 6db90ee35d46b4f9f2b42e9d65716c524fe627c2 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 7 Nov 2022 18:20:59 +0800 Subject: [PATCH 58/87] fix ut --- dbms/src/Functions/FunctionsRegexp.h | 31 +++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index ffcd8e57a9c..f855252cf4b 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -649,13 +649,12 @@ class ParamVariant private: void handleIntConstCol(size_t col_size, const ColumnConst * col_const) { + Field field; + col_const->get(0, field); + auto data_int64 = field.isNull() ? -1 : getIntFromField(field); const auto & col_const_data = col_const->getDataColumnPtr(); if (col_const_data->isColumnNullable()) { - Field field; - col_const->get(0, field); - auto data_int64 = field.isNull() ? 
-1 : getIntFromField(field); - auto col_const_data = col_const->getDataColumnPtr(); const auto * null_map = &(static_cast(*(col_const_data)).getNullMapData()); // Construct actual param @@ -665,7 +664,7 @@ class ParamVariant else { // Construct actual param - param = new ParamStringNotNullableAndConst(col_size, col_const->getDataAt(0)); + param = new ParamIntNotNullableAndConst(col_size, data_int64); param_type = ParamType::IntNotNullableAndConst; } } @@ -693,11 +692,11 @@ class ParamVariant #define M(INT_TYPE, col_ptr, null_map, param) \ else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ { \ - param = new ParamIntNullableAndNotConst(col_size, null_map, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ + (param) = new ParamIntNullableAndNotConst(col_size, null_map, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ } if (false) {} - APPLY_FOR_INT_CONTAINER(M, col_ptr, null_map, param) + APPLY_FOR_INT_CONTAINER(M, nested_ptr, null_map, param) else throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -711,7 +710,7 @@ class ParamVariant #define M(INT_TYPE, col_ptr, null_map, param) \ else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ { \ - param = new ParamIntNotNullableAndNotConst(col_size, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ + (param) = new ParamIntNotNullableAndNotConst(col_size, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ } if (false) {} @@ -1224,7 +1223,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } - void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { COLLATOR_VAR_NAME = collator_; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } bool useDefaultImplementationForNulls() const override { return 
false; } size_t getNumberOfArguments() const override { return 0; } @@ -1297,7 +1296,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase Int64 ret_op = RetOpT::isConst() ? ret_op_const_val : get_ret_op_func(ret_op_container, 0); String match_type = match_type_param.getString(0); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, COLLATOR_VAR_NAME), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); ResultType res = regexp.instr(expr.c_str(), expr.size(), pos, occur, ret_op); res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -1349,7 +1348,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { // Codes in this if branch execute instr with memorized regexp - const auto & regexp = memorize(pat_param, match_type_param, COLLATOR_VAR_NAME); + const auto & regexp = memorize(pat_param, match_type_param, collator); if constexpr (has_nullable_col) { // Process nullable columns with memorized regexp @@ -1416,7 +1415,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } @@ -1435,7 +1434,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, COLLATOR_VAR_NAME); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } @@ -1489,14 +1488,14 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase ParamVariant PAT_PV_VAR_NAME(col_pat, col_size, StringRef("", 0)); ParamVariant 
POS_PV_VAR_NAME(col_pos, col_size, 1); ParamVariant OCCUR_PV_VAR_NAME(col_occur, col_size, 1); - ParamVariant RET_OP_PV_VAR_NAME(col_occur, col_size, 0); + ParamVariant RET_OP_PV_VAR_NAME(col_return_option, col_size, 0); ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); GET_ACTUAL_PARAMS_AND_EXECUTE() } private: - TiDB::TiDBCollatorPtr COLLATOR_VAR_NAME = nullptr; + TiDB::TiDBCollatorPtr collator = nullptr; }; #undef GET_ACTUAL_PARAMS_AND_EXECUTE @@ -1508,8 +1507,6 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase #undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_INSTR -#undef COLLATOR_VAR_NAME - #undef GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME From e1d1501277f05388a78dcaa587cbd9927e235453 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 7 Nov 2022 18:25:57 +0800 Subject: [PATCH 59/87] refine header --- dbms/src/Common/OptimizedRegularExpression.inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index a83da92366a..8df03527e6e 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -17,9 +17,9 @@ #include #include -#include "Common/Exception.h" -#include "common/defines.h" -#include "common/types.h" +#include +#include +#include #define MIN_LENGTH_FOR_STRSTR 3 From c23bb9707e6a4d09b3517d02663e838abf627606 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 8 Nov 2022 09:38:34 +0800 Subject: [PATCH 60/87] refine header --- dbms/src/Common/OptimizedRegularExpression.inl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 8df03527e6e..74a2403ffce 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ 
b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include +#include #include - -#include #include #include #include +#include #define MIN_LENGTH_FOR_STRSTR 3 #define MAX_SUBPATTERNS 5 From f74afa9497346080f9a788414f2b00042a3bba36 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 8 Nov 2022 16:46:13 +0800 Subject: [PATCH 61/87] replace getMatchedIndex with find and refine comments --- .../Common/OptimizedRegularExpression.inl.h | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 74a2403ffce..68d385b41f5 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -474,7 +474,8 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject // Convert utf8 position to byte position. // For Example: -// Taking string "ni好a" as an example. utf8 position 4 is corresponding to byte position 6. +// Taking string "ni好a" as an example. +// utf8 position of character 'a' in this string is 4 and byte position is 6. static inline size_t utf8Pos2bytePos(const char * str, size_t utf8_pos) { size_t byte_index = 0; @@ -506,11 +507,11 @@ static inline size_t getMatchedIndex(const char * str, const char * sub_str, siz // sub_str must be in the str, so while loop condition could be true while (true) { - // PRINT("while"); sub_str_offset = 0; start_offset += 1; str_offset = start_offset; + // Check string byte by byte bool is_same = true; while (sub_str_offset < single_checked_num) { @@ -524,6 +525,8 @@ static inline size_t getMatchedIndex(const char * str, const char * sub_str, siz if (!is_same) continue; + // Length of rest of sub_str that needs to be compared can be divided exactly by eight. 
+ // So, we can see 8 bytes as uint64_t and compare 8 byte at a time. while (sub_str_offset < sub_str_size && is_same) { if (static_cast(str[str_offset]) == static_cast(sub_str[str_offset])) @@ -587,21 +590,22 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s size_t matched_str_size = 0; String matched_str; // store the matched substring + // RegexType::FindAndConsume will truncate expr_sp each time it is called. + // expr_sp_before_truncated stores the string before expr_sp is truncated so that + // we can find the matched index of the substr + StringPieceType expr_sp_before_truncated; + while (occur > 0) { + expr_sp_before_truncated = expr_sp; bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); if (!success) return 0; + matched_index = expr_sp_before_truncated.find(matched_str); matched_str_size = matched_str.size(); - - // get the start index of matched string in expr - matched_index = getMatchedIndex(expr, matched_str.c_str(), matched_str_size); byte_offset += matched_index + matched_str_size; - // expr is truncated each time we get a matched string - expr = subject + byte_offset; - --occur; } From 288ac97f09b0deb948609365bd8874583c5180b2 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 8 Nov 2022 18:32:32 +0800 Subject: [PATCH 62/87] refinw --- dbms/src/Common/OptimizedRegularExpression.h | 1 + .../Common/OptimizedRegularExpression.inl.h | 107 ++++++------------ 2 files changed, 38 insertions(+), 70 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 40eec9033a2..46eab4878cd 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -117,6 +117,7 @@ class OptimizedRegularExpressionImpl private: Int64 processEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + Int64 getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, 
Int64 ret_op); bool is_trivial; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 68d385b41f5..588c9ed957d 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -476,9 +476,9 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject // For Example: // Taking string "ni好a" as an example. // utf8 position of character 'a' in this string is 4 and byte position is 6. -static inline size_t utf8Pos2bytePos(const char * str, size_t utf8_pos) +static inline Int64 utf8Pos2bytePos(const char * str, Int64 utf8_pos) { - size_t byte_index = 0; + Int64 byte_index = 0; utf8_pos--; while (utf8_pos > 0) { @@ -488,64 +488,14 @@ static inline size_t utf8Pos2bytePos(const char * str, size_t utf8_pos) return byte_index + 1; } -static inline size_t bytePos2Utf8Pos(const char * str, size_t byte_pos) +static inline Int64 bytePos2Utf8Pos(const char * str, Int64 byte_pos) { // byte_num means the number of byte before this byte_pos - size_t byte_num = byte_pos - 1; - size_t utf8_num = getStringUtf8Len(str, byte_num); + Int64 byte_num = byte_pos - 1; + Int64 utf8_num = getStringUtf8Len(str, byte_num); return utf8_num + 1; } -static inline size_t getMatchedIndex(const char * str, const char * sub_str, size_t sub_str_size) -{ - const size_t stride = sizeof(int64_t); - size_t single_checked_num = sub_str_size >= stride ? 
sub_str_size % stride : sub_str_size; - size_t start_offset = -1; // offset that the head of sub_str in the str - size_t str_offset = 0; - size_t sub_str_offset = 0; - - // sub_str must be in the str, so while loop condition could be true - while (true) - { - sub_str_offset = 0; - start_offset += 1; - str_offset = start_offset; - - // Check string byte by byte - bool is_same = true; - while (sub_str_offset < single_checked_num) - { - if (str[str_offset++] != sub_str[sub_str_offset++]) - { - is_same = false; - break; - } - } - - if (!is_same) - continue; - - // Length of rest of sub_str that needs to be compared can be divided exactly by eight. - // So, we can see 8 bytes as uint64_t and compare 8 byte at a time. - while (sub_str_offset < sub_str_size && is_same) - { - if (static_cast(str[str_offset]) == static_cast(sub_str[str_offset])) - { - is_same = false; - break; - } - - sub_str_offset += stride; - str_offset += stride; - } - - if (sub_str_offset >= sub_str_size) - break; - } - - return start_offset; -} - // We use this function when expr string is empty template Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) @@ -560,27 +510,23 @@ Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const return pos; } -template -Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) +static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) { - int64_t utf8_total_len = getStringUtf8Len(subject, subject_size); - if (unlikely(ret_op != 0 && ret_op != 1)) - throw DB::Exception("Incorrect arguments to regexp_instr: return_option must be 1 or 0"); + throw DB::Exception("Incorrect argument to regexp function: return_option must be 1 or 0"); if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) - throw DB::Exception("Index out of bounds in regular expression search."); - - if 
(occur <= 0) - occur = 1; + throw DB::Exception("Index out of bounds in regular function."); +} - if (unlikely(subject_size == 0)) - { - // Process empty expr in this if branch - return processEmptyStringExpr(subject, subject_size, pos, occur); - } +static inline void makeOccurValid(Int64 & occur) +{ + occur = occur < 0 ? 1 : occur; +} - size_t byte_pos = utf8Pos2bytePos(subject, pos); +template +Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) +{ size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched size_t expr_size = subject_size - byte_offset; @@ -602,6 +548,8 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s if (!success) return 0; + // byte_offset is used for locating the substr's start index in the string + // so we need to update it each time. matched_index = expr_sp_before_truncated.find(matched_str); matched_str_size = matched_str.size(); byte_offset += matched_index + matched_str_size; @@ -613,5 +561,24 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s return ret_op == 0 ? 
bytePos2Utf8Pos(subject, byte_offset + 1) : bytePos2Utf8Pos(subject, byte_offset + matched_str.size() + 1); } +template +Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) +{ + Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); + + checkArgs(utf8_total_len, subject_size, pos, ret_op); + + makeOccurValid(occur); + + if (unlikely(subject_size == 0)) + { + // Process empty expr in this if branch + return processEmptyStringExpr(subject, subject_size, pos, occur); + } + + size_t byte_pos = utf8Pos2bytePos(subject, pos); + return getSubstrMatchedIndex(subject, subject_size, byte_pos, occur, ret_op); +} + #undef MIN_LENGTH_FOR_STRSTR #undef MAX_SUBPATTERNS From 94d5b780785d5e41e5335e37734c0f6465b063c8 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 8 Nov 2022 18:44:12 +0800 Subject: [PATCH 63/87] clean code --- dbms/src/Common/OptimizedRegularExpression.inl.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 588c9ed957d..0eaff4bac4c 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -496,7 +496,6 @@ static inline Int64 bytePos2Utf8Pos(const char * str, Int64 byte_pos) return utf8_num + 1; } -// We use this function when expr string is empty template Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) { @@ -571,10 +570,7 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s makeOccurValid(occur); if (unlikely(subject_size == 0)) - { - // Process empty expr in this if branch return processEmptyStringExpr(subject, subject_size, pos, occur); - } size_t byte_pos = utf8Pos2bytePos(subject, pos); return getSubstrMatchedIndex(subject, subject_size, byte_pos, occur, ret_op); From 8efe7afb931e9a527b085f0aeafc1bdde57c99c4 Mon 
Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 14 Nov 2022 11:00:49 +0800 Subject: [PATCH 64/87] refine check type --- dbms/src/Functions/FunctionsRegexp.h | 77 +++++++++++++++------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index f855252cf4b..2dbadfe4532 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -833,57 +833,62 @@ class FunctionStringRegexpBase static void checkInputArg(const DataTypePtr & arg, bool is_str, bool * has_nullable_col, bool * has_data_type_nothing) { if (is_str) + checkStringTypeArg(arg, has_nullable_col, has_data_type_nothing); + else + checkIntTypeArg(arg, has_nullable_col, has_data_type_nothing); + } + +private: + static void checkStringTypeArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) + { + if (arg->isNullable()) { - // Check string type argument - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - assert(null_type != nullptr); + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + assert(null_type != nullptr); - const auto & nested_type = null_type->getNestedType(); + const auto & nested_type = null_type->getNestedType(); - // It may be DataTypeNothing if it's not string - if (!nested_type->isString()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else + // It may be DataTypeNothing if it's not string + if (!nested_type->isString()) { - if (!arg->isString()) + if (nested_type->getTypeId() != TypeIndex::Nothing) throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + 
*has_data_type_nothing = true; } } else { - // Check int type argument - if (arg->isNullable()) - { - *has_nullable_col = true; - const auto * null_type = checkAndGetDataType(arg.get()); - assert(null_type != nullptr); + if (!arg->isString()) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + } - const auto & nested_type = null_type->getNestedType(); + static void checkIntTypeArg(const DataTypePtr & arg, bool * has_nullable_col, bool * has_data_type_nothing) + { + if (arg->isNullable()) + { + *has_nullable_col = true; + const auto * null_type = checkAndGetDataType(arg.get()); + assert(null_type != nullptr); - // It may be DataTypeNothing if it's not string - if (!nested_type->isInteger()) - { - if (nested_type->getTypeId() != TypeIndex::Nothing) - throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else - *has_data_type_nothing = true; - } - } - else + const auto & nested_type = null_type->getNestedType(); + + // It may be DataTypeNothing if it's not string + if (!nested_type->isInteger()) { - if (!arg->isInteger()) + if (nested_type->getTypeId() != TypeIndex::Nothing) throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + *has_data_type_nothing = true; } } + else + { + if (!arg->isInteger()) + throw Exception(fmt::format("Illegal type {} of argument of regexp function", arg->getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } } }; From 1ead665bf1e72f843b9774f2bbee55aa0c1cd536 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 14 Nov 2022 15:40:15 +0800 Subject: [PATCH 65/87] start --- dbms/src/Common/OptimizedRegularExpression.h | 8 +- .../Common/OptimizedRegularExpression.inl.h | 69 ++- dbms/src/Functions/FunctionsRegexp.h | 334 ++++++++++- dbms/src/Functions/tests/gtest_regexp.cpp | 546 
+++++++++++++++--- 4 files changed, 873 insertions(+), 84 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 46eab4878cd..dc973021610 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -114,11 +115,14 @@ class OptimizedRegularExpressionImpl } Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op); + StringRef substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur); private: - Int64 processEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); - Int64 getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); + Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); + Int64 processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + StringRef substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur); bool is_trivial; bool required_substring_is_prefix; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 0eaff4bac4c..b0b1372340b 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -347,7 +348,7 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (nullptr == pos) - return 0; + return false; } return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); @@ -367,12 +368,12 @@ bool 
OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (pos == nullptr) - return 0; + return false; else { match.offset = pos - subject; match.length = required_substring.size(); - return 1; + return true; } } else @@ -386,18 +387,18 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (nullptr == pos) - return 0; + return false; } StringPieceType piece; if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) - return 0; + return false; else { match.offset = piece.data() - subject; match.length = piece.length(); - return 1; + return true; } } } @@ -497,7 +498,7 @@ static inline Int64 bytePos2Utf8Pos(const char * str, Int64 byte_pos) } template -Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) +Int64 OptimizedRegularExpressionImpl::processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) { if (occur != 1) return 0; @@ -509,7 +510,7 @@ Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const return pos; } -static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) +static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) { if (unlikely(ret_op != 0 && ret_op != 1)) throw DB::Exception("Incorrect argument to regexp function: return_option must be 1 or 0"); @@ -518,13 +519,19 @@ static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 po throw DB::Exception("Index out of bounds in regular function."); } +static inline void checkSubstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos) +{ + if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) + throw DB::Exception("Index out of bounds in regular function."); +} + static inline void makeOccurValid(Int64 & occur) { occur = 
occur < 0 ? 1 : occur; } template -Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) +Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) { size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched @@ -560,20 +567,58 @@ Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const c return ret_op == 0 ? bytePos2Utf8Pos(subject, byte_offset + 1) : bytePos2Utf8Pos(subject, byte_offset + matched_str.size() + 1); } +template +StringRef OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur) +{ + size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 + const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched + size_t expr_size = subject_size - byte_offset; + + StringPieceType expr_sp(expr, expr_size); + StringRef matched_str; // store the matched substring + + while (occur > 0) + { + bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); + if (!success) + return ""; + + --occur; + } + + return matched_str; +} + template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); - checkArgs(utf8_total_len, subject_size, pos, ret_op); + checkInstrArgs(utf8_total_len, subject_size, pos, ret_op); + + makeOccurValid(occur); + + if (unlikely(subject_size == 0)) + return processInstrEmptyStringExpr(subject, subject_size, pos, occur); + + size_t byte_pos = utf8Pos2bytePos(subject, pos); + return instrImpl(subject, subject_size, byte_pos, occur, ret_op); +} + +template +StringRef 
OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur) +{ + Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); + + checkSubstrArgs(utf8_total_len, subject_size, pos); makeOccurValid(occur); if (unlikely(subject_size == 0)) - return processEmptyStringExpr(subject, subject_size, pos, occur); + return processSubstrEmptyStringExpr(subject, subject_size, pos, occur); size_t byte_pos = utf8Pos2bytePos(subject, pos); - return getSubstrMatchedIndex(subject, subject_size, byte_pos, occur, ret_op); + return substrImpl(subject, subject_size, byte_pos, occur); } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 2dbadfe4532..3464be2aa68 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1148,7 +1148,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase ParamVariant PAT_PV_VAR_NAME(col_pat, col_size, StringRef("", 0)); ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); - GET_ACTUAL_PARAMS_AND_EXECUTE() + // GET_ACTUAL_PARAMS_AND_EXECUTE() } private: @@ -1512,6 +1512,338 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase #undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_INSTR +#define EXECUTE_REGEXP_SUBSTR() \ + do \ + { \ + REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, *(EXPR_PARAM_PTR_VAR_NAME), *(PAT_PARAM_PTR_VAR_NAME), *(POS_PARAM_PTR_VAR_NAME), *(OCCUR_PARAM_PTR_VAR_NAME), *(MATCH_TYPE_PARAM_PTR_VAR_NAME)); \ + } while (0); + +// Method to get actual match type param +#define GET_MATCH_TYPE_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(MATCH_TYPE_PV_VAR_NAME, MATCH_TYPE_PARAM_PTR_VAR_NAME, ({EXECUTE_REGEXP_SUBSTR()})) \ + } while (0); + +// Method to get actual occur param +#define GET_OCCUR_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_INT_PARAM(OCCUR_PV_VAR_NAME, OCCUR_PARAM_PTR_VAR_NAME, 
({GET_MATCH_TYPE_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual position param +#define GET_POS_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_INT_PARAM(POS_PV_VAR_NAME, POS_PARAM_PTR_VAR_NAME, ({GET_OCCUR_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual pattern param +#define GET_PAT_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(PAT_PV_VAR_NAME, PAT_PARAM_PTR_VAR_NAME, ({GET_POS_ACTUAL_PARAM()})) \ + } while (0); + +// Method to get actual expression param +#define GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ + GET_ACTUAL_STRING_PARAM(EXPR_PV_VAR_NAME, EXPR_PARAM_PTR_VAR_NAME, ({GET_PAT_ACTUAL_PARAM()})) \ + } while (0); + +// The entry to get actual params and execute regexp functions +#define GET_ACTUAL_PARAMS_AND_EXECUTE() \ + do \ + { \ + GET_EXPR_ACTUAL_PARAM() \ + } while (0); + +// Implementation of regexp_substr function +template +class FunctionStringRegexpSubstr : public FunctionStringRegexpBase + , public IFunction +{ +public: + using ResultType = String; + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + void setCollator(const TiDB::TiDBCollatorPtr & collator_) override { collator = collator_; } + bool useDefaultImplementationForNulls() const override { return false; } + size_t getNumberOfArguments() const override { return 0; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + size_t arg_num = arguments.size(); + if (arg_num < REGEXP_MIN_PARAM_NUM) + throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); + else if (arg_num > REGEXP_SUBSTR_MAX_PARAM_NUM) + throw Exception("Too many arguments", ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION); + + bool has_nullable_col = false; + bool has_data_type_nothing = false; + bool is_str_arg; + + // Check type of arguments + for (size_t i = 0; i < arg_num; 
++i) + { + // Index at 0, 1 and 4 arguments should be string type, otherwise int type. + is_str_arg = (i <= 1 || i == 4); + checkInputArg(arguments[i], is_str_arg, &has_nullable_col, &has_data_type_nothing); + } + + if (has_data_type_nothing) + return std::make_shared(std::make_shared()); + + if (has_nullable_col) + return std::make_shared(std::make_shared()); + else + return std::make_shared(); + } + + template + void REGEXP_CLASS_MEM_FUNC_IMPL_NAME(ColumnWithTypeAndName & res_arg, const ExprT & expr_param, const PatT & pat_param, const PosT & pos_param, const OccurT & occur_param, const MatchTypeT & match_type_param) const + { + size_t col_size = expr_param.getDataNum(); + + // Get function pointers to process the specific int type + GetIntFuncPointerType get_pos_func = getGetIntFuncPointer(pos_param.getIntType()); + GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); + + // Container will not be used when parm is const + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); + + // Const value will not be used when param is not const + Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; + Int64 occur_const_val = OccurT::isConst() ? occur_param.template getInt(0) : -1; + + // Check if args are all const columns + if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && MatchTypeT::isConst()) + { + if (col_size == 0 || expr_param.isNullAt(0) || pat_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || match_type_param.isNullAt(0)) + { + res_arg.column = res_arg.type->createColumnConst(col_size, Null()); + return; + } + + int flags = getDefaultFlags(); + String expr = expr_param.getString(0); + String pat = pat_param.getString(0); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + Int64 pos = PosT::isConst() ? 
pos_const_val : get_pos_func(pos_container, 0); + Int64 occur = OccurT::isConst() ? occur_const_val : get_occur_func(occur_container, 0); + String match_type = match_type_param.getString(0); + + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + StringRef res = regexp.substr(expr.c_str(), expr.size(), pos, occur); + res_arg.column = res_arg.type->createColumnConst(col_size, toField(String(res))); + return; + } + + // Initialize result column + auto col_res = ColumnString::create(); + + constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || MatchTypeT::isNullableCol(); + +#define GET_POS_VALUE(idx) \ + do \ + { \ + if constexpr (PosT::isConst()) \ + pos = pos_const_val; \ + else \ + pos = get_pos_func(pos_container, idx); \ + } while (0); + +#define GET_OCCUR_VALUE(idx) \ + do \ + { \ + if constexpr (OccurT::isConst()) \ + occur = occur_const_val; \ + else \ + occur = get_occur_func(occur_container, idx); \ + } while (0); + + StringRef expr_ref; + String pat; + Int64 pos; + Int64 occur; + String match_type; + StringRef res_ref; + + // Start to execute instr + if (canMemorize()) + { + // Codes in this if branch execute instr with memorized regexp + + const auto & regexp = memorize(pat_param, match_type_param, collator); + if constexpr (has_nullable_col) + { + // Process nullable columns with memorized regexp + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); + + for (size_t i = 0; i < col_size; ++i) + { + if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) + { + null_map[i] = 1; + continue; + } + + null_map[i] = 0; + + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + + res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); + col_res->insertData(res_ref.data, res_ref.size); + } + 
res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + // Process pure vector columns with memorized regexp. + // columns are impossible to be a nullable column here. + + for (size_t i = 0; i < col_size; ++i) + { + expr_param.getStringRef(i, expr_ref); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + + res_ref = regexp->instr(expr_ref.data, expr_ref.size, pos, occur); + col_res->insertData(res_ref.data, res_ref.size); + } + res_arg.column = std::move(col_res); + } + } + else + { + // Codes in this if branch execute instr without memorized regexp + + if constexpr (has_nullable_col) + { + // Process nullable columns without memorized regexp + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); + + for (size_t i = 0; i < col_size; ++i) + { + if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) + { + null_map[i] = 1; + continue; + } + + null_map[i] = 0; + + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + match_type = match_type_param.getString(i); + + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); + col_res->insertData(res_ref.data, res_ref.size); + } + + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + } + else + { + // Process pure vector columns without memorized regexp + for (size_t i = 0; i < col_size; ++i) + { + expr_param.getStringRef(i, expr_ref); + pat = pat_param.getString(i); + if (unlikely(pat.empty())) + throw Exception(EMPTY_PAT_ERR_MSG); + GET_POS_VALUE(i) + GET_OCCUR_VALUE(i) + match_type = match_type_param.getString(i); + + auto regexp = createRegexpWithMatchType(pat, match_type, 
collator); + res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); + col_res->insertData(res_ref.data, res_ref.size); + } + + res_arg.column = std::move(col_res); + } + } + +#undef GET_OCCUR_VALUE +#undef GET_POS_VALUE + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + // Do something related with nullable columns + NullPresence null_presence = getNullPresense(block, arguments); + + if (null_presence.has_null_constant) + { + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(block.rows(), Null()); + return; + } + + const ColumnPtr & col_expr = block.getByPosition(arguments[0]).column; + const ColumnPtr & col_pat = block.getByPosition(arguments[1]).column; + + size_t arg_num = arguments.size(); + auto & RES_ARG_VAR_NAME = block.getByPosition(result); + + ColumnPtr col_pos; + ColumnPtr col_occur; + ColumnPtr col_match_type; + + // Go through cases to get arguments + switch(arg_num) + { + case REGEXP_INSTR_MAX_PARAM_NUM: + col_match_type = block.getByPosition(arguments[4]).column; + case REGEXP_MIN_PARAM_NUM + 2: + col_occur = block.getByPosition(arguments[3]).column; + case REGEXP_MIN_PARAM_NUM + 1: + col_pos = block.getByPosition(arguments[2]).column; + }; + + size_t col_size = col_expr->size(); + + ParamVariant EXPR_PV_VAR_NAME(col_expr, col_size, StringRef("", 0)); + ParamVariant PAT_PV_VAR_NAME(col_pat, col_size, StringRef("", 0)); + ParamVariant POS_PV_VAR_NAME(col_pos, col_size, 1); + ParamVariant OCCUR_PV_VAR_NAME(col_occur, col_size, 1); + ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); + + GET_ACTUAL_PARAMS_AND_EXECUTE() + } + +private: + TiDB::TiDBCollatorPtr collator = nullptr; +}; + +#undef GET_ACTUAL_PARAMS_AND_EXECUTE +#undef GET_EXPR_ACTUAL_PARAM +#undef GET_PAT_ACTUAL_PARAM +#undef GET_POS_ACTUAL_PARAM +#undef GET_OCCUR_ACTUAL_PARAM +#undef GET_MATCH_TYPE_ACTUAL_PARAM +#undef EXECUTE_REGEXP_SUBSTR + #undef 
GET_ACTUAL_INT_PARAM #undef GET_ACTUAL_STRING_PARAM #undef REGEXP_CLASS_MEM_FUNC_IMPL_NAME diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index db3b19cd761..791f3495562 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2210,6 +2210,86 @@ TEST_F(Regexp, testRegexpCustomerCases) } } +namespace +{ +template +std::vector getResultVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.result); + + return vecs; +} + +template +std::vector getExprVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.expression); + + return vecs; +} + +template +std::vector getPatVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.pattern); + + return vecs; +} + +template +std::vector getPosVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.position); + + return vecs; +} + +template +std::vector getOccurVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.occurrence); + + return vecs; +} + +template +std::vector getRetOpVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.return_option); + + return vecs; +} + +template +std::vector getMatchTypeVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.match_type); + + return vecs; +} +} + struct RegexpInstrCase { RegexpInstrCase(Int64 
res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, Int64 ret_op = 0, const String & mt = "") @@ -2233,84 +2313,415 @@ struct RegexpInstrCase , match_type(mt) {} - static std::vector getResultVec(const std::vector & test_cases) + static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.result); - - return vecs; + results = getResultVec(test_cases); + switch (param_num) + { + case 6: + match_types = getMatchTypeVec(test_cases); + case 5: + ret_ops = getRetOpVec(test_cases); + case 4: + occurs = getOccurVec(test_cases); + case 3: + positions = getPosVec(test_cases); + case 2: + pats = getPatVec(test_cases); + exprs = getExprVec(test_cases); + } } - static std::vector getExprVec(const std::vector & test_cases) + static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { - std::vector vecs; - vecs.reserve(test_cases.size()); + null_map.clear(); + null_map.resize(REGEXP_INSTR_MAX_PARAM_NUM); for (const auto & elem : test_cases) - vecs.push_back(elem.expression); - - return vecs; + { + null_map[EXPR_NULL_MAP_IDX].push_back(elem.null_map[EXPR_NULL_MAP_IDX]); + null_map[PAT_NULL_MAP_IDX].push_back(elem.null_map[PAT_NULL_MAP_IDX]); + null_map[POS_NULL_MAP_IDX].push_back(elem.null_map[POS_NULL_MAP_IDX]); + null_map[OCCUR_NULL_MAP_IDX].push_back(elem.null_map[OCCUR_NULL_MAP_IDX]); + null_map[RET_OP_NULL_MAP_IDX].push_back(elem.null_map[RET_OP_NULL_MAP_IDX]); + null_map[MATCH_TYPE_NULL_MAP_IDX].push_back(elem.null_map[MATCH_TYPE_NULL_MAP_IDX]); + } + + 
setVecsWithoutNullMap(param_num, test_cases, results, exprs, pats, positions, occurs, ret_ops, match_types); } - static std::vector getPatVec(const std::vector & test_cases) + const static UInt8 REGEXP_INSTR_MAX_PARAM_NUM = 6; + const static UInt8 EXPR_NULL_MAP_IDX = 0; + const static UInt8 PAT_NULL_MAP_IDX = 1; + const static UInt8 POS_NULL_MAP_IDX = 2; + const static UInt8 OCCUR_NULL_MAP_IDX = 3; + const static UInt8 RET_OP_NULL_MAP_IDX = 4; + const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 5; + + Int64 result; + std::vector null_map; + String expression; + String pattern; + Int64 position; + Int64 occurrence; + Int64 return_option; + String match_type; +}; + +// TODO add empty column test +TEST_F(Regexp, RegexpInstr) +{ + // Test: All columns are const { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.pattern); - - return vecs; + for (size_t row_size = 1; row_size < 3; ++row_size) + { + ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1))); + ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, 
"aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1), + createConstColumn(row_size, "i"))); + } } - static std::vector getPosVec(const std::vector & test_cases) + // Test: null const { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.position); + size_t row_size = 2; + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); - return vecs; + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); } - static std::vector getOccurVec(const std::vector & test_cases) + std::vector test_cases; + std::vector results; + std::vector> null_maps; + std::vector exprs; + std::vector patterns; + std::vector positions; + std::vector occurs; 
+ std::vector return_options; + std::vector match_types; + + // Test: All columns are pure vector { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.occurrence); - - return vecs; + // test regexp_instr(vector, vector) + test_cases = {{4, "ttttifl", "tifl"}, + {1, "tidb_tikv", "ti(db|kv)"}, + {1, "aaaaaa", "aa"}, + {0, "\n", "."}, + {1, "", "^$"}, + {0, "ab\naB", "^ab$"}, + {3, "pp跑ppのaaa", "(跑|の|P)"}}; + RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns))); + + // test regexp_instr(vector, vector, vector) + test_cases = {{4, "ttttifl", "tifl", 3}, + {6, "tidb_tikv", "ti(db|kv)", 2}, + {3, "aaaaaa", "aa", 3}, + {0, "\n", ".", 1}, + {3, "", "^$", 3}, + {0, "ab\naB", "^ab$", 1}, + {3, "pp跑ppのaaa", "(跑|の|P)", 2}}; + RegexpInstrCase::setVecsWithoutNullMap(3, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions))); + + // test regexp_instr(vector, vector, vector, vector) + test_cases = {{4, "ttttifl", "tifl", 3, 1}, + {6, "tidb_tikv", "ti(db|kv)", 2, 1}, + {5, "aaaaaa", "aa", 3, 2}, + {0, "\n", ".", 1, 1}, {0, "", "^$", 3, 2}, + {0, "ab\naB", "^ab$", 1, 1}, + {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; + RegexpInstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs))); + + // test regexp_instr(vector, vector, vector, vector, vector) + test_cases = {{8, "ttttifl", "tifl", 3, 1, 1}, + {10, 
"tidb_tikv", "ti(db|kv)", 2, 1, 1}, + {7, "aaaaaa", "aa", 3, 2, 1}, + {0, "\n", ".", 1, 1, 1}, + {0, "", "^$", 3, 2, 1}, + {0, "ab\naB", "^ab$", 1, 1, 1}, + {7, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1}}; + RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options))); + + // test regexp_instr(vector, vector, vector, vector, vector, vector) + test_cases = {{8, "ttttifl", "tifl", 3, 1, 1, ""}, + {10, "tidb_tikv", "ti(db|kv)", 2, 1, 1, ""}, + {7, "aaaaaa", "aa", 3, 2, 1, ""}, + {2, "\n", ".", 1, 1, 1, "s"}, + {0, "", "^$", 3, 2, 1, ""}, + {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, + {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; + RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createColumn(match_types))); + + // test collation + const auto * utf8mb4_general_ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + test_cases = {{2, "ttiFl", "tifl", 1, 1, 0, ""}, + {0, "ttiFl", "tifl", 1, 1, 0, "c"}, + {2, "ttiFl", "tifl", 1, 1, 0, "i"}, + {2, "ttiFl", "tifl", 1, 1, 0, "ci"}, + {0, "ttiFl", "tifl", 1, 1, 0, "ic"}, + {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, + {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; + RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createColumn(results), + executeFunction( + "regexp_instr", + 
{createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createColumn(match_types)}, + utf8mb4_general_ci_collator)); + } - static std::vector getRetOpVec(const std::vector & test_cases) + // Test: Args include nullable columns { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.return_option); - - return vecs; + // test regexp_instr(nullable vector, vector) + test_cases = {{0, {{1, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns))); + + // test regexp_instr(vector, nullable vector) + test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {0, {{0, 1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createNullableVectorColumn(patterns, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, nullable vector) + test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, + {0, {{0, 0, 1, 0, 0, 0}}, "ttttifl", "tifl", 3}}; + RegexpInstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + 
executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, nullable vector) + test_cases = {{6, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, + {0, {{0, 0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; + RegexpInstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createNullableVectorColumn(occurs, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, vector, nullable vector) + test_cases = {{10, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}, + {0, {{0, 0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}}; + RegexpInstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]), + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createNullableVectorColumn(return_options, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]))); + + // test regexp_instr(vector, vector, vector, vector, vector, nullable vector) + test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "b", "B", 1, 1, 0, "i"}, + {0, {{0, 0, 0, 0, 0, 1}}, "b", "B", 1, 1, 0, "i"}}; + RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]), + executeFunction( + 
"regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); } - static std::vector getMatchTypeVec(const std::vector & test_cases) + // Test: const, nullable and pure vector columns appear together { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.match_type); - - return vecs; + // test regexp_instr(nullable vector, vector, nullable vector, vector, const vector, vector) + test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{1, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{0, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, + {0, {{1, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}}; + RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + createColumn(occurs), + createConstColumn(test_cases.size(), 0), + createColumn(match_types))); } - static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) + // Test: Invalid parameter handling + { + // test empty pattern + test_cases = {{0, "ttt", ""}}; + RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", 
createColumn(exprs), createColumn(patterns)), Exception); + + // test invalid ret_option + test_cases = {{0, "ttt", "t", 1, 1, 2}}; + RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options)), Exception); + + // test invalid match type + test_cases = {{0, "ttt", "t", 1, 1, 1, "p"}}; + RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); + ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options), createColumn(match_types)), Exception); + } +} + +struct RegexpSubstrCase +{ + RegexpSubstrCase(Int64 res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + : result(res) + , expression(expr) + , pattern(pat) + , position(pos) + , occurrence(occur) + , match_type(mt) + {} + + RegexpSubstrCase(Int64 res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + : result(res) + , null_map(null_map_) + , expression(expr) + , pattern(pat) + , position(pos) + , occurrence(occur) + , match_type(mt) + {} + + static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) { results = getResultVec(test_cases); - switch (param_num) { - case 6: - match_types = getMatchTypeVec(test_cases); + switch (param_num) + { case 5: - ret_ops = getRetOpVec(test_cases); + match_types = getMatchTypeVec(test_cases); case 4: occurs = getOccurVec(test_cases); case 3: @@ -2321,30 +2732,28 @@ struct RegexpInstrCase } 
} - static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) + static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) { null_map.clear(); - null_map.resize(REGEXP_INSTR_MAX_PARAM_NUM); + null_map.resize(REGEXP_SUBSTR_MAX_PARAM_NUM); for (const auto & elem : test_cases) { null_map[EXPR_NULL_MAP_IDX].push_back(elem.null_map[EXPR_NULL_MAP_IDX]); null_map[PAT_NULL_MAP_IDX].push_back(elem.null_map[PAT_NULL_MAP_IDX]); null_map[POS_NULL_MAP_IDX].push_back(elem.null_map[POS_NULL_MAP_IDX]); null_map[OCCUR_NULL_MAP_IDX].push_back(elem.null_map[OCCUR_NULL_MAP_IDX]); - null_map[RET_OP_NULL_MAP_IDX].push_back(elem.null_map[RET_OP_NULL_MAP_IDX]); null_map[MATCH_TYPE_NULL_MAP_IDX].push_back(elem.null_map[MATCH_TYPE_NULL_MAP_IDX]); } - setVecsWithoutNullMap(param_num, test_cases, results, exprs, pats, positions, occurs, ret_ops, match_types); + setVecsWithoutNullMap(param_num, test_cases, results, exprs, pats, positions, occurs, match_types); } - const static UInt8 REGEXP_INSTR_MAX_PARAM_NUM = 6; + const static UInt8 REGEXP_SUBSTR_MAX_PARAM_NUM = 5; const static UInt8 EXPR_NULL_MAP_IDX = 0; const static UInt8 PAT_NULL_MAP_IDX = 1; const static UInt8 POS_NULL_MAP_IDX = 2; const static UInt8 OCCUR_NULL_MAP_IDX = 3; - const static UInt8 RET_OP_NULL_MAP_IDX = 4; - const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 5; + const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 4; Int64 result; std::vector null_map; @@ -2352,38 +2761,37 @@ struct RegexpInstrCase String pattern; Int64 position; Int64 occurrence; - Int64 return_option; String match_type; }; // TODO add empty column test -TEST_F(Regexp, RegexpInstr) 
+TEST_F(Regexp, RegexpSubstr) { // Test: All columns are const { for (size_t row_size = 1; row_size < 3; ++row_size) { - ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), + ASSERT_COLUMN_EQ(createConstColumn(row_size, "123"), executeFunction( - "regexp_instr", + "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), executeFunction( - "regexp_instr", + "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."), createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), executeFunction( - "regexp_instr", + "regexp_substr", createConstColumn(row_size, "11212"), createConstColumn(row_size, "12"), createConstColumn(row_size, 2), createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), executeFunction( - "regexp_instr", + "regexp_substr", createConstColumn(row_size, "11212"), createConstColumn(row_size, "12"), createConstColumn(row_size, 2), @@ -2391,7 +2799,7 @@ TEST_F(Regexp, RegexpInstr) createConstColumn(row_size, 1))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), executeFunction( - "regexp_instr", + "regexp_substr", createConstColumn(row_size, "aabab"), createConstColumn(row_size, "aB"), createConstColumn(row_size, 2), @@ -2537,7 +2945,7 @@ TEST_F(Regexp, RegexpInstr) {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = RegexpInstrCase::getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2558,7 +2966,7 @@ TEST_F(Regexp, RegexpInstr) {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results 
= RegexpInstrCase::getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", From af469bda6532133adb670ef12eb039ac467272b4 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 14 Nov 2022 16:01:45 +0800 Subject: [PATCH 66/87] refine substr impl --- .../Common/OptimizedRegularExpression.inl.h | 30 +--- dbms/src/Functions/tests/gtest_regexp.cpp | 154 ++++++++++-------- 2 files changed, 90 insertions(+), 94 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 0eaff4bac4c..f8f09dc2756 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -347,7 +347,7 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (nullptr == pos) - return 0; + return false; } return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); @@ -367,12 +367,12 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (pos == nullptr) - return 0; + return false; else { match.offset = pos - subject; match.length = required_substring.size(); - return 1; + return true; } } else @@ -386,18 +386,18 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si pos = strstr(subject, required_substring.data()); if (nullptr == pos) - return 0; + return false; } StringPieceType piece; if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) - return 0; + return false; else { match.offset = piece.data() - subject; match.length = piece.length(); - return 1; + return true; } } } @@ -530,33 +530,19 @@ Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const c const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched 
size_t expr_size = subject_size - byte_offset; - size_t matched_index = 0; StringPieceType expr_sp(expr, expr_size); - size_t matched_str_size = 0; - String matched_str; // store the matched substring - - // RegexType::FindAndConsume will truncate expr_sp each time it is called. - // expr_sp_before_truncated stores the string before expr_sp is truncated so that - // we can find the matched index of the substr - StringPieceType expr_sp_before_truncated; + StringPieceType matched_str; while (occur > 0) { - expr_sp_before_truncated = expr_sp; bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); if (!success) return 0; - // byte_offset is used for locating the substr's start index in the string - // so we need to update it each time. - matched_index = expr_sp_before_truncated.find(matched_str); - matched_str_size = matched_str.size(); - byte_offset += matched_index + matched_str_size; - --occur; } - byte_offset -= matched_str_size; + byte_offset = matched_str.data() - subject; return ret_op == 0 ? 
bytePos2Utf8Pos(subject, byte_offset + 1) : bytePos2Utf8Pos(subject, byte_offset + matched_str.size() + 1); } diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index db3b19cd761..5497733d147 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2210,6 +2210,86 @@ TEST_F(Regexp, testRegexpCustomerCases) } } +namespace +{ +template +std::vector getResultVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.result); + + return vecs; +} + +template +std::vector getExprVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.expression); + + return vecs; +} + +template +std::vector getPatVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.pattern); + + return vecs; +} + +template +std::vector getPosVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.position); + + return vecs; +} + +template +std::vector getOccurVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.occurrence); + + return vecs; +} + +template +std::vector getRetOpVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.return_option); + + return vecs; +} + +template +std::vector getMatchTypeVec(const std::vector & test_cases) +{ + std::vector vecs; + vecs.reserve(test_cases.size()); + for (const auto & elem : test_cases) + vecs.push_back(elem.match_type); + + return vecs; +} +} + struct RegexpInstrCase { 
RegexpInstrCase(Int64 res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, Int64 ret_op = 0, const String & mt = "") @@ -2233,76 +2313,6 @@ struct RegexpInstrCase , match_type(mt) {} - static std::vector getResultVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.result); - - return vecs; - } - - static std::vector getExprVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.expression); - - return vecs; - } - - static std::vector getPatVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.pattern); - - return vecs; - } - - static std::vector getPosVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.position); - - return vecs; - } - - static std::vector getOccurVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.occurrence); - - return vecs; - } - - static std::vector getRetOpVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.return_option); - - return vecs; - } - - static std::vector getMatchTypeVec(const std::vector & test_cases) - { - std::vector vecs; - vecs.reserve(test_cases.size()); - for (const auto & elem : test_cases) - vecs.push_back(elem.match_type); - - return vecs; - } - static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { results 
= getResultVec(test_cases); @@ -2537,7 +2547,7 @@ TEST_F(Regexp, RegexpInstr) {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = RegexpInstrCase::getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2558,7 +2568,7 @@ TEST_F(Regexp, RegexpInstr) {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = RegexpInstrCase::getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", From d710184e1b75c4b2c12e38000e6bd9b78508abe2 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 15 Nov 2022 09:52:32 +0800 Subject: [PATCH 67/87] save --- dbms/src/Common/OptimizedRegularExpression.h | 4 +- .../Common/OptimizedRegularExpression.inl.h | 18 ++-- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- dbms/src/Functions/FunctionsRegexp.cpp | 2 + dbms/src/Functions/FunctionsRegexp.h | 94 ++++++++++--------- dbms/src/Functions/tests/gtest_regexp.cpp | 19 ++-- 6 files changed, 70 insertions(+), 69 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index dc973021610..f0e1db6d532 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -115,14 +115,14 @@ class OptimizedRegularExpressionImpl } Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op); - StringRef substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur); + bool substr(const char * subject, size_t subject_size, StringRef & res, Int64 pos, Int64 occur); private: 
Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); Int64 processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); - StringRef substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur); + bool substrImpl(const char * subject, size_t subject_size, StringRef & res, Int64 byte_pos, Int64 occur); bool is_trivial; bool required_substring_is_prefix; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index b0b1372340b..809cfce1005 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -568,34 +568,30 @@ Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subjec } template -StringRef OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur) +bool OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, StringRef & res, Int64 byte_pos, Int64 occur) { size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched size_t expr_size = subject_size - byte_offset; StringPieceType expr_sp(expr, expr_size); - StringRef matched_str; // store the matched substring - while (occur > 0) { - bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); + bool success = RegexType::FindAndConsume(&expr_sp, *re2, res); if (!success) - return ""; + return false; --occur; } - return matched_str; + return true; } template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); - 
checkInstrArgs(utf8_total_len, subject_size, pos, ret_op); - makeOccurValid(occur); if (unlikely(subject_size == 0)) @@ -606,19 +602,17 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s } template -StringRef OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur) +bool OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, StringRef & res, Int64 pos, Int64 occur) { Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); - checkSubstrArgs(utf8_total_len, subject_size, pos); - makeOccurValid(occur); if (unlikely(subject_size == 0)) return processSubstrEmptyStringExpr(subject, subject_size, pos, occur); size_t byte_pos = utf8Pos2bytePos(subject, pos); - return substrImpl(subject, subject_size, byte_pos, occur); + return substrImpl(subject, subject_size, res, byte_pos, occur); } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index f1ff6141bee..08f17dbc795 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -436,7 +436,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"}, {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"}, // {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"}, - // {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, + {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"}, //{tipb::ScalarFuncSig::JsonExtractSig, "cast"}, //{tipb::ScalarFuncSig::JsonUnquoteSig, "cast"}, diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index b268f28c193..3d885916fc2 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -287,6 +287,7 @@ struct ReplaceRegexpImpl using FunctionTiDBRegexp = FunctionStringRegexp; using FunctionRegexpLike = FunctionStringRegexp; using FunctionRegexpInstr = 
FunctionStringRegexpInstr; +using FunctionRegexpSubstr = FunctionStringRegexpSubstr; using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; @@ -297,6 +298,7 @@ void registerFunctionsRegexp(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 3464be2aa68..10d8b41bc46 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -75,6 +75,10 @@ struct NameRegexpInstr { static constexpr auto name = "regexp_instr"; }; +struct NameRegexpSubstr +{ + static constexpr auto name = "regexp_substr"; +}; struct NameReplaceRegexpOne { static constexpr auto name = "replaceRegexpOne"; @@ -1599,10 +1603,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase if (has_data_type_nothing) return std::make_shared(std::make_shared()); - if (has_nullable_col) - return std::make_shared(std::make_shared()); - else - return std::make_shared(); + return std::make_shared(std::make_shared()); } template @@ -1642,8 +1643,12 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase String match_type = match_type_param.getString(0); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); - StringRef res = regexp.substr(expr.c_str(), expr.size(), pos, occur); - res_arg.column = res_arg.type->createColumnConst(col_size, toField(String(res))); + StringRef res_ref; + bool success = regexp.substr(expr.c_str(), expr.size(), res_ref, pos, occur); + if (success) + res_arg.column = res_arg.type->createColumnConst(col_size, toField(res_ref)); + else + res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; } @@ -1677,19 +1682,16 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase 
String match_type; StringRef res_ref; + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); + // Start to execute instr if (canMemorize()) { - // Codes in this if branch execute instr with memorized regexp - const auto & regexp = memorize(pat_param, match_type_param, collator); if constexpr (has_nullable_col) { - // Process nullable columns with memorized regexp - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & null_map = nullmap_col->getData(); - null_map.resize(col_size); - for (size_t i = 0; i < col_size; ++i) { if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) @@ -1698,45 +1700,32 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase continue; } - null_map[i] = 0; - expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) - res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); - col_res->insertData(res_ref.data, res_ref.size); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { - // Process pure vector columns with memorized regexp. - // columns are impossible to be a nullable column here. 
- for (size_t i = 0; i < col_size; ++i) { expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) - res_ref = regexp->instr(expr_ref.data, expr_ref.size, pos, occur); - col_res->insertData(res_ref.data, res_ref.size); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } - res_arg.column = std::move(col_res); } } else { - // Codes in this if branch execute instr without memorized regexp - + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + null_map.resize(col_size); if constexpr (has_nullable_col) { - // Process nullable columns without memorized regexp - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & null_map = nullmap_col->getData(); - null_map.resize(col_size); - for (size_t i = 0; i < col_size; ++i) { if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) @@ -1745,48 +1734,40 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase continue; } - null_map[i] = 0; - - expr_param.getStringRef(i, expr_ref); pat = pat_param.getString(i); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); + expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) match_type = match_type_param.getString(i); auto regexp = createRegexpWithMatchType(pat, match_type, collator); - res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); - col_res->insertData(res_ref.data, res_ref.size); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } - - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } else { - // Process pure vector columns without memorized regexp for (size_t i = 0; i < col_size; ++i) { - expr_param.getStringRef(i, expr_ref); pat = pat_param.getString(i); if (unlikely(pat.empty())) throw 
Exception(EMPTY_PAT_ERR_MSG); + + expr_param.getStringRef(i, expr_ref); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) match_type = match_type_param.getString(i); auto regexp = createRegexpWithMatchType(pat, match_type, collator); - res_ref = regexp->substr(expr_ref.data, expr_ref.size, pos, occur); - col_res->insertData(res_ref.data, res_ref.size); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } - - res_arg.column = std::move(col_res); } } - #undef GET_OCCUR_VALUE #undef GET_POS_VALUE + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -1833,6 +1814,29 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase } private: + void executeAndSetResult( + Regexps::Pool::Pointer & regexp, + ColumnString::MutablePtr & col_res, + typename ColumnUInt8::Container & null_map, + size_t idx, + const char * subject, + size_t subject_size, + StringRef & res_ref, + Int64 pos, + Int64 occur) const + { + bool success = regexp->substr(subject, subject_size, res_ref, pos, occur); + if (success) + { + col_res->insertData(res_ref.data, res_ref.size); + null_map[idx] = 1; + } + else + { + null_map[idx] = 0; + } + } + TiDB::TiDBCollatorPtr collator = nullptr; }; diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 791f3495562..fbb48c9eed0 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2767,6 +2767,7 @@ struct RegexpSubstrCase // TODO add empty column test TEST_F(Regexp, RegexpSubstr) { + std::cout << "test 1\n"; // Test: All columns are const { for (size_t row_size = 1; row_size < 3; ++row_size) @@ -2776,20 +2777,20 @@ TEST_F(Regexp, RegexpSubstr) "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), + 
ASSERT_COLUMN_EQ(createOnlyNullColumnConst(row_size), executeFunction( "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."), createConstColumn(row_size, 2))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), + ASSERT_COLUMN_EQ(createConstColumn(row_size, "12"), executeFunction( "regexp_substr", createConstColumn(row_size, "11212"), createConstColumn(row_size, "12"), createConstColumn(row_size, 2), createConstColumn(row_size, 2))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + ASSERT_COLUMN_EQ(createConstColumn(row_size, "12"), executeFunction( "regexp_substr", createConstColumn(row_size, "11212"), @@ -2797,7 +2798,7 @@ TEST_F(Regexp, RegexpSubstr) createConstColumn(row_size, 2), createConstColumn(row_size, 2), createConstColumn(row_size, 1))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), + ASSERT_COLUMN_EQ(createConstColumn(row_size, "ab"), executeFunction( "regexp_substr", createConstColumn(row_size, "aabab"), @@ -2808,7 +2809,7 @@ TEST_F(Regexp, RegexpSubstr) createConstColumn(row_size, "i"))); } } - +std::cout << "test 2\n"; // Test: null const { size_t row_size = 2; @@ -2868,7 +2869,7 @@ TEST_F(Regexp, RegexpSubstr) std::vector occurs; std::vector return_options; std::vector match_types; - +std::cout << "test 3\n"; // Test: All columns are pure vector { // test regexp_instr(vector, vector) @@ -2979,7 +2980,7 @@ TEST_F(Regexp, RegexpSubstr) utf8mb4_general_ci_collator)); } - +std::cout << "test 4\n"; // Test: Args include nullable columns { // test regexp_instr(nullable vector, vector) @@ -3052,7 +3053,7 @@ TEST_F(Regexp, RegexpSubstr) createColumn(return_options), createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); } - +std::cout << "test 5\n"; // Test: const, nullable and pure vector columns appear together { // test regexp_instr(nullable vector, vector, nullable vector, vector, const vector, vector) @@ -3071,7 +3072,7 @@ TEST_F(Regexp, RegexpSubstr) 
createConstColumn(test_cases.size(), 0), createColumn(match_types))); } - +std::cout << "test 6\n"; // Test: Invalid parameter handling { // test empty pattern From 5a59a9352f48c0d83e7574e51e7605bae5331b65 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 15 Nov 2022 10:03:57 +0800 Subject: [PATCH 68/87] replace with template --- dbms/src/Functions/FunctionsRegexp.h | 71 +++++----------------------- 1 file changed, 12 insertions(+), 59 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 2dbadfe4532..a05549d4513 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -129,57 +129,10 @@ Int64 getIntFromField(Field & field) enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64 }; -Int64 getUInt8(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getUInt16(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getUInt32(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getUInt64(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getUInt128(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getInt8(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getInt16(const void * container, size_t idx) -{ - const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getInt32(const void * container, size_t idx) -{ - 
const auto * tmp = reinterpret_cast::Container *>(container); - return static_cast((*tmp)[idx]); -} - -Int64 getInt64(const void * container, size_t idx) +template +Int64 getInt(const void * container, size_t idx) { - const auto * tmp = reinterpret_cast::Container *>(container); + const auto * tmp = reinterpret_cast::Container *>(container); return static_cast((*tmp)[idx]); } @@ -190,23 +143,23 @@ GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) switch (int_type) { case IntType::UInt8: - return getUInt8; + return &getInt; case IntType::UInt16: - return getUInt16; + return &getInt; case IntType::UInt32: - return getUInt32; + return &getInt; case IntType::UInt64: - return getUInt64; + return &getInt; case IntType::UInt128: - return getUInt128; + return &getInt; case IntType::Int8: - return getInt8; + return &getInt; case IntType::Int16: - return getInt16; + return &getInt; case IntType::Int32: - return getInt32; + return &getInt; case IntType::Int64: - return getInt64; + return &getInt; default: throw Exception("Unexpected int type"); } From dcf998902f38041b2c6eb32d4ae7a43de331a1d0 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 15 Nov 2022 14:27:32 +0800 Subject: [PATCH 69/87] pass some tests --- dbms/src/Common/OptimizedRegularExpression.h | 2 +- .../Common/OptimizedRegularExpression.inl.h | 27 +++++- dbms/src/Functions/FunctionsRegexp.h | 18 ++-- dbms/src/Functions/tests/gtest_regexp.cpp | 95 ++++++++----------- 4 files changed, 71 insertions(+), 71 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index f0e1db6d532..fffa0a13d3a 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -121,7 +121,7 @@ class OptimizedRegularExpressionImpl Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 
occur, Int64 ret_op); - Int64 processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + bool processSubstrEmptyStringExpr(const char * expr, size_t expr_size, StringRef & res, size_t byte_pos, Int64 occur); bool substrImpl(const char * subject, size_t subject_size, StringRef & res, Int64 byte_pos, Int64 occur); bool is_trivial; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index b6b947c8050..e34f8d1cabb 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -510,6 +510,23 @@ Int64 OptimizedRegularExpressionImpl::processInstrEmptyStringExpr(c return pos; } +template +bool OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, StringRef & res, size_t byte_pos, Int64 occur) +{ + if (occur != 1 || byte_pos != 1) + return false; + + StringPieceType expr_sp(expr, expr_size); + StringPieceType matched_str; + bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); + if (!success) + return false; + + res.data = matched_str.data(); + res.size = matched_str.size(); + return true; +} + static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) { if (unlikely(ret_op != 0 && ret_op != 1)) @@ -527,7 +544,7 @@ static inline void checkSubstrArgs(Int64 utf8_total_len, size_t subject_size, In static inline void makeOccurValid(Int64 & occur) { - occur = occur < 0 ? 1 : occur; + occur = occur < 1 ? 
1 : occur; } template @@ -539,7 +556,6 @@ Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subjec StringPieceType expr_sp(expr, expr_size); StringPieceType matched_str; - while (occur > 0) { bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); @@ -561,15 +577,18 @@ bool OptimizedRegularExpressionImpl::substrImpl(const char * subjec size_t expr_size = subject_size - byte_offset; StringPieceType expr_sp(expr, expr_size); + StringPieceType matched_str; while (occur > 0) { - bool success = RegexType::FindAndConsume(&expr_sp, *re2, res); + bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); if (!success) return false; --occur; } + res.data = matched_str.data(); + res.size = matched_str.size(); return true; } @@ -595,7 +614,7 @@ bool OptimizedRegularExpressionImpl::substr(const char * subject, s makeOccurValid(occur); if (unlikely(subject_size == 0)) - return processSubstrEmptyStringExpr(subject, subject_size, pos, occur); + return processSubstrEmptyStringExpr(subject, subject_size, res, pos, occur); size_t byte_pos = utf8Pos2bytePos(subject, pos); return substrImpl(subject, subject_size, res, byte_pos, occur); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index a764d789fc2..c6822575eb4 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -106,10 +106,10 @@ inline String addMatchTypeForPattern(const String & pattern, const String & matc return fmt::format("{}({})", mode, pattern); } -inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) +inline std::unique_ptr createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); - return Regexps::get(final_pattern, getDefaultFlags()); + return std::make_unique(final_pattern, 
getDefaultFlags()); } // Only int types used in ColumnsNumber.h can be valid @@ -1585,21 +1585,19 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase return; } - int flags = getDefaultFlags(); - String expr = expr_param.getString(0); String pat = pat_param.getString(0); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - Int64 pos = PosT::isConst() ? pos_const_val : get_pos_func(pos_container, 0); - Int64 occur = OccurT::isConst() ? occur_const_val : get_occur_func(occur_container, 0); + int flags = getDefaultFlags(); + String expr = expr_param.getString(0); String match_type = match_type_param.getString(0); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); StringRef res_ref; - bool success = regexp.substr(expr.c_str(), expr.size(), res_ref, pos, occur); + bool success = regexp.substr(expr.c_str(), expr.size(), res_ref, pos_const_val, occur_const_val); if (success) - res_arg.column = res_arg.type->createColumnConst(col_size, toField(res_ref)); + res_arg.column = res_arg.type->createColumnConst(col_size, toField(String(res_ref))); else res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; @@ -1747,7 +1745,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase // Go through cases to get arguments switch(arg_num) { - case REGEXP_INSTR_MAX_PARAM_NUM: + case REGEXP_SUBSTR_MAX_PARAM_NUM: col_match_type = block.getByPosition(arguments[4]).column; case REGEXP_MIN_PARAM_NUM + 2: col_occur = block.getByPosition(arguments[3]).column; @@ -1768,7 +1766,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase private: void executeAndSetResult( - Regexps::Pool::Pointer & regexp, + const std::unique_ptr & regexp, ColumnString::MutablePtr & col_res, typename ColumnUInt8::Container & null_map, size_t idx, diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 4ad0b8ff986..efa2936802e 100644 --- 
a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2772,92 +2772,75 @@ TEST_F(Regexp, RegexpSubstr) { for (size_t row_size = 1; row_size < 3; ++row_size) { - ASSERT_COLUMN_EQ(createConstColumn(row_size, "123"), + ASSERT_COLUMN_EQ(createConstColumn>(row_size, "123"), executeFunction( "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."))); - ASSERT_COLUMN_EQ(createOnlyNullColumnConst(row_size), + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), executeFunction( "regexp_substr", createConstColumn(row_size, "123"), createConstColumn(row_size, "12."), createConstColumn(row_size, 2))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, "12"), + ASSERT_COLUMN_EQ(createConstColumn>(row_size, "12"), executeFunction( "regexp_substr", createConstColumn(row_size, "11212"), createConstColumn(row_size, "12"), createConstColumn(row_size, 2), createConstColumn(row_size, 2))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, "12"), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 1))); - ASSERT_COLUMN_EQ(createConstColumn(row_size, "ab"), + ASSERT_COLUMN_EQ(createConstColumn>(row_size, "ab"), executeFunction( "regexp_substr", createConstColumn(row_size, "aabab"), createConstColumn(row_size, "aB"), createConstColumn(row_size, 2), createConstColumn(row_size, 2), - createConstColumn(row_size, 1), createConstColumn(row_size, "i"))); } } std::cout << "test 2\n"; // Test: null const { - size_t row_size = 2; - ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn>(row_size, {}), - createConstColumn(row_size, "123"))); - - ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn>(row_size, {}))); - - 
ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn>(row_size, {}))); + for (size_t row_size = 1; row_size < 3; ++row_size) + { + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); + + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); - ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); - ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); - ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + 
createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); + } } std::vector test_cases; From 2baa414cd1f6907250dd940b45ca9561dd9062fc Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 15 Nov 2022 17:15:04 +0800 Subject: [PATCH 70/87] pass all tests --- dbms/src/Functions/FunctionsRegexp.h | 12 +- dbms/src/Functions/tests/gtest_regexp.cpp | 298 ++++++++++------------ 2 files changed, 135 insertions(+), 175 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index c6822575eb4..e17e178dba1 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1453,7 +1453,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase ParamVariant RET_OP_PV_VAR_NAME(col_return_option, col_size, 0); ParamVariant MATCH_TYPE_PV_VAR_NAME(col_match_type, col_size, StringRef("", 0)); - GET_ACTUAL_PARAMS_AND_EXECUTE() + // GET_ACTUAL_PARAMS_AND_EXECUTE() } private: @@ -1648,6 +1648,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) { null_map[i] = 1; + col_res->insertData("", 0); continue; } @@ -1672,9 +1673,6 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase } else { - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & null_map = nullmap_col->getData(); - null_map.resize(col_size); if constexpr (has_nullable_col) { for (size_t i = 0; i < col_size; ++i) @@ -1682,6 +1680,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) { null_map[i] = 1; + col_res->insertData("", 0); continue; } @@ -1780,11 +1779,12 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase if (success) { 
col_res->insertData(res_ref.data, res_ref.size); - null_map[idx] = 1; + null_map[idx] = 0; } else { - null_map[idx] = 0; + col_res->insertData("", 0); + null_map[idx] = 1; } } diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index efa2936802e..97f307fdc99 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2212,14 +2212,14 @@ TEST_F(Regexp, testRegexpCustomerCases) namespace { -template -std::vector getResultVec(const std::vector & test_cases) +template +std::vector getResultVec(const std::vector & test_cases) { - std::vector vecs; + std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.result); - + return vecs; } @@ -2315,7 +2315,7 @@ struct RegexpInstrCase static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { - results = getResultVec(test_cases); + results = getResultVec(test_cases); switch (param_num) { case 6: @@ -2329,6 +2329,8 @@ struct RegexpInstrCase case 2: pats = getPatVec(test_cases); exprs = getExprVec(test_cases); + default: + throw DB::Exception("Invalid param_num"); } } @@ -2548,7 +2550,7 @@ TEST_F(Regexp, RegexpInstr) {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2569,7 +2571,7 @@ TEST_F(Regexp, RegexpInstr) {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, 
return_options, match_types); - results = getResultVec(test_cases); + results = getResultVec(test_cases); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( "regexp_instr", @@ -2696,7 +2698,7 @@ TEST_F(Regexp, RegexpInstr) struct RegexpSubstrCase { - RegexpSubstrCase(Int64 res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + RegexpSubstrCase(const String & res, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") : result(res) , expression(expr) , pattern(pat) @@ -2705,7 +2707,7 @@ struct RegexpSubstrCase , match_type(mt) {} - RegexpSubstrCase(Int64 res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") + RegexpSubstrCase(const String & res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") : result(res) , null_map(null_map_) , expression(expr) @@ -2715,9 +2717,9 @@ struct RegexpSubstrCase , match_type(mt) {} - static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) + static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) { - results = getResultVec(test_cases); + results = getResultVec(test_cases); switch (param_num) { case 5: @@ -2729,10 +2731,13 @@ struct RegexpSubstrCase case 2: pats = getPatVec(test_cases); exprs = getExprVec(test_cases); + break; + default: + throw DB::Exception("Invalid param_num"); } } - static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & 
positions, std::vector & occurs, std::vector & match_types) + static void setVecsWithNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector> & null_map, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) { null_map.clear(); null_map.resize(REGEXP_SUBSTR_MAX_PARAM_NUM); @@ -2755,7 +2760,7 @@ struct RegexpSubstrCase const static UInt8 OCCUR_NULL_MAP_IDX = 3; const static UInt8 MATCH_TYPE_NULL_MAP_IDX = 4; - Int64 result; + String result; std::vector null_map; String expression; String pattern; @@ -2767,7 +2772,6 @@ struct RegexpSubstrCase // TODO add empty column test TEST_F(Regexp, RegexpSubstr) { - std::cout << "test 1\n"; // Test: All columns are const { for (size_t row_size = 1; row_size < 3; ++row_size) @@ -2800,7 +2804,7 @@ TEST_F(Regexp, RegexpSubstr) createConstColumn(row_size, "i"))); } } -std::cout << "test 2\n"; + // Test: null const { for (size_t row_size = 1; row_size < 3; ++row_size) @@ -2843,235 +2847,191 @@ std::cout << "test 2\n"; } } - std::vector test_cases; - std::vector results; + std::vector test_cases; + std::vector results; std::vector> null_maps; std::vector exprs; std::vector patterns; std::vector positions; std::vector occurs; - std::vector return_options; std::vector match_types; -std::cout << "test 3\n"; + // Test: All columns are pure vector { - // test regexp_instr(vector, vector) - test_cases = {{4, "ttttifl", "tifl"}, - {1, "tidb_tikv", "ti(db|kv)"}, - {1, "aaaaaa", "aa"}, - {0, "\n", "."}, - {1, "", "^$"}, - {0, "ab\naB", "^ab$"}, - {3, "pp跑ppのaaa", "(跑|の|P)"}}; - RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createColumn(results), + // test regexp_substr(vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl"}, + {"tidb", "tidb_tikv", "ti(db|kv)"}, + {"aa", "aaaaaa", "a."}, + {"", "\n", "."}, + {"", "", "^$"}, + {"", "ab\naB", 
"^ab$"}, + {"跑", "pp跑ppのaaa", "(跑|の|P)"}}; + RegexpSubstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 0, 1, 0}), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns))); - // test regexp_instr(vector, vector, vector) - test_cases = {{4, "ttttifl", "tifl", 3}, - {6, "tidb_tikv", "ti(db|kv)", 2}, - {3, "aaaaaa", "aa", 3}, - {0, "\n", ".", 1}, - {3, "", "^$", 3}, - {0, "ab\naB", "^ab$", 1}, - {3, "pp跑ppのaaa", "(跑|の|P)", 2}}; - RegexpInstrCase::setVecsWithoutNullMap(3, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createColumn(results), + // test regexp_substr(vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2}, + {"aa", "aaaaaa", "aa", 3}, + {"", "\n", ".", 1}, + {"", "ab\naB", "^ab$", 1}, + {"跑", "pp跑ppのaaa", "(跑|の|P)", 2}}; + RegexpSubstrCase::setVecsWithoutNullMap(3, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 1, 0}), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns), createColumn(positions))); - // test regexp_instr(vector, vector, vector, vector) - test_cases = {{4, "ttttifl", "tifl", 3, 1}, - {6, "tidb_tikv", "ti(db|kv)", 2, 1}, - {5, "aaaaaa", "aa", 3, 2}, - {0, "\n", ".", 1, 1}, {0, "", "^$", 3, 2}, - {0, "ab\naB", "^ab$", 1, 1}, - {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; - RegexpInstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createColumn(results), + // test regexp_substr(vector, vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3, 1}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2, 1}, + {"aa", "aaaaaa", "aa", 3, 2}, + {"", "\n", ".", 1, 
1}, + {"", "ab\naB", "^ab$", 1, 1}, + {"の", "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; + RegexpSubstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 1, 0}), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs))); - // test regexp_instr(vector, vector, vector, vector, vector) - test_cases = {{8, "ttttifl", "tifl", 3, 1, 1}, - {10, "tidb_tikv", "ti(db|kv)", 2, 1, 1}, - {7, "aaaaaa", "aa", 3, 2, 1}, - {0, "\n", ".", 1, 1, 1}, - {0, "", "^$", 3, 2, 1}, - {0, "ab\naB", "^ab$", 1, 1, 1}, - {7, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1}}; - RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createColumn(results), + // test regexp_substr(vector, vector, vector, vector, vector) + test_cases = {{"tifl", "ttttifl", "tifl", 3, 1, ""}, + {"tikv", "tidb_tikv", "ti(db|kv)", 2, 1, ""}, + {"aa", "aaaaaa", "aa", 3, 2, ""}, + {"\n", "\n", ".", 1, 1, "s"}, + {"aB", "ab\naB", "^ab$", 3, 1, "mi"}, + {"跑", "pp跑ppのaaa", "(跑|の|P)", 2, 2, "i"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 0, 0, 0}), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), - createColumn(return_options))); - - // test regexp_instr(vector, vector, vector, vector, vector, vector) - test_cases = {{8, "ttttifl", "tifl", 3, 1, 1, ""}, - {10, "tidb_tikv", "ti(db|kv)", 2, 1, 1, ""}, - {7, "aaaaaa", "aa", 3, 2, 1, ""}, - {2, "\n", ".", 1, 1, 1, "s"}, - {0, "", "^$", 3, 2, 1, ""}, - {6, "ab\naB", "^ab$", 3, 1, 1, "mi"}, - {4, "pp跑ppのaaa", "(跑|の|P)", 2, 2, 1, "i"}}; - 
RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = getResultVec(test_cases); - ASSERT_COLUMN_EQ(createColumn(results), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createColumn(return_options), createColumn(match_types))); // test collation const auto * utf8mb4_general_ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); - test_cases = {{2, "ttiFl", "tifl", 1, 1, 0, ""}, - {0, "ttiFl", "tifl", 1, 1, 0, "c"}, - {2, "ttiFl", "tifl", 1, 1, 0, "i"}, - {2, "ttiFl", "tifl", 1, 1, 0, "ci"}, - {0, "ttiFl", "tifl", 1, 1, 0, "ic"}, - {0, "ttiFl", "tifl", 1, 1, 0, "iccc"}, - {0, "ttiFl", "tifl", 1, 1, 0, "icic"}}; - RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - results = getResultVec(test_cases); - ASSERT_COLUMN_EQ(createColumn(results), + test_cases = {{"tiFl", "ttiFl", "tifl", 1, 1, ""}, + {"", "ttiFl", "tifl", 1, 1, "c"}, + {"tiFl", "ttiFl", "tifl", 1, 1, "i"}, + {"tiFl", "ttiFl", "tifl", 1, 1, "ci"}, + {"", "ttiFl", "tifl", 1, 1, "ic"}, + {"", "ttiFl", "tifl", 1, 1, "iccc"}, + {"", "ttiFl", "tifl", 1, 1, "icic"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + results = getResultVec(test_cases); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 0, 0, 1, 1, 1}), executeFunction( - "regexp_instr", + "regexp_substr", {createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), - createColumn(return_options), createColumn(match_types)}, utf8mb4_general_ci_collator)); - } -std::cout << "test 4\n"; + // Test: Args include nullable columns { - // test regexp_instr(nullable vector, vector) - test_cases = {{0, {{1, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, - {1, {{0, 0, 0, 
0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; - RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + // test regexp_substr(nullable vector, vector) + test_cases = {{"", {{1, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {"tidb", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), executeFunction( - "regexp_instr", - createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), createColumn(patterns))); - // test regexp_instr(vector, nullable vector) - test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, - {0, {{0, 1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; - RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]), + // test regexp_substr(vector, nullable vector) + test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, + {"", {{0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; + RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), - createNullableVectorColumn(patterns, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]))); + createNullableVectorColumn(patterns, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]))); - // test 
regexp_instr(vector, vector, nullable vector) - test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, - {0, {{0, 0, 1, 0, 0, 0}}, "ttttifl", "tifl", 3}}; - RegexpInstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + // test regexp_substr(vector, vector, nullable vector) + test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, + {"", {{0, 0, 1, 0, 0}}, "ttttifl", "tifl", 3}}; + RegexpSubstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]))); + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]))); - // test regexp_instr(vector, vector, vector, nullable vector) - test_cases = {{6, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, - {0, {{0, 0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; - RegexpInstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]), + // test regexp_substr(vector, vector, vector, nullable vector) + test_cases = {{"tikv", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, + {"", {{0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; + RegexpSubstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]), executeFunction( - "regexp_instr", + 
"regexp_substr", createColumn(exprs), createColumn(patterns), createColumn(positions), - createNullableVectorColumn(occurs, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]))); + createNullableVectorColumn(occurs, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]))); - // test regexp_instr(vector, vector, vector, vector, nullable vector) - test_cases = {{10, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}, - {0, {{0, 0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}}; - RegexpInstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]), + // test regexp_substr(vector, vector, vector, vector, nullable vector) + test_cases = {{"b", {{0, 0, 0, 0, 0}}, "b", "B", 1, 1, "i"}, + {"", {{0, 0, 0, 0, 1}}, "b", "B", 1, 1, "i"}}; + RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]), executeFunction( - "regexp_instr", + "regexp_substr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), - createNullableVectorColumn(return_options, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]))); - - // test regexp_instr(vector, vector, vector, vector, vector, nullable vector) - test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "b", "B", 1, 1, 0, "i"}, - {0, {{0, 0, 0, 0, 0, 1}}, "b", "B", 1, 1, 0, "i"}}; - RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - 
createColumn(return_options), - createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); + createNullableVectorColumn(match_types, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]))); } -std::cout << "test 5\n"; + // Test: const, nullable and pure vector columns appear together { - // test regexp_instr(nullable vector, vector, nullable vector, vector, const vector, vector) - test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, - {0, {{1, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, - {0, {{0, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}, - {0, {{1, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}}; - RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), + // test regexp_substr(nullable vector, vector, nullable vector, vector, const vector, vector) + test_cases = {{"tidb", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}, + {"", {{1, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}}; + RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); + ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), executeFunction( - "regexp_instr", - createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), createColumn(occurs), - createConstColumn(test_cases.size(), 0), createColumn(match_types))); 
} -std::cout << "test 6\n"; + // Test: Invalid parameter handling { // test empty pattern - test_cases = {{0, "ttt", ""}}; - RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns)), Exception); - - // test invalid ret_option - test_cases = {{0, "ttt", "t", 1, 1, 2}}; - RegexpInstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options)), Exception); + test_cases = {{"", "ttt", ""}}; + RegexpSubstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_THROW(executeFunction("regexp_substr", createNullableVectorColumn(exprs, {0}), createColumn(patterns)), Exception); // test invalid match type - test_cases = {{0, "ttt", "t", 1, 1, 1, "p"}}; - RegexpInstrCase::setVecsWithoutNullMap(6, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); - ASSERT_THROW(executeFunction("regexp_instr", createColumn(exprs), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(return_options), createColumn(match_types)), Exception); + test_cases = {{"", "ttt", "t", 1, 1, "p"}}; + RegexpSubstrCase::setVecsWithoutNullMap(5, test_cases, results, exprs, patterns, positions, occurs, match_types); + ASSERT_THROW(executeFunction("regexp_substr", createNullableVectorColumn(exprs, {0}), createColumn(patterns), createColumn(positions), createColumn(occurs), createColumn(match_types)), Exception); } } From 377b83dfd2f28bcbd659d9e587a5645ce675b0a3 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 17 Nov 2022 13:24:40 +0800 Subject: [PATCH 71/87] resolve comments --- 
.../Common/OptimizedRegularExpression.inl.h | 31 ++----------- dbms/src/Common/StringUtils/StringUtils.h | 29 ------------ dbms/src/Common/UTF8Helpers.h | 24 ++++++++++ dbms/src/Functions/FunctionsRegexp.h | 45 +++++++++++++------ 4 files changed, 60 insertions(+), 69 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index f8f09dc2756..b8e86bb0646 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -472,30 +473,6 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject } } -// Convert utf8 position to byte position. -// For Example: -// Taking string "ni好a" as an example. -// utf8 position of character 'a' in this string is 4 and byte position is 6. -static inline Int64 utf8Pos2bytePos(const char * str, Int64 utf8_pos) -{ - Int64 byte_index = 0; - utf8_pos--; - while (utf8_pos > 0) - { - byte_index += getUtf8Len(str[byte_index]); - utf8_pos--; - } - return byte_index + 1; -} - -static inline Int64 bytePos2Utf8Pos(const char * str, Int64 byte_pos) -{ - // byte_num means the number of byte before this byte_pos - Int64 byte_num = byte_pos - 1; - Int64 utf8_num = getStringUtf8Len(str, byte_num); - return utf8_num + 1; -} - template Int64 OptimizedRegularExpressionImpl::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur) { @@ -543,13 +520,13 @@ Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const c } byte_offset = matched_str.data() - subject; - return ret_op == 0 ? bytePos2Utf8Pos(subject, byte_offset + 1) : bytePos2Utf8Pos(subject, byte_offset + matched_str.size() + 1); + return ret_op == 0 ? 
DB::UTF8::bytePos2Utf8Pos(reinterpret_cast(subject), byte_offset + 1) : DB::UTF8::bytePos2Utf8Pos(reinterpret_cast(subject), byte_offset + matched_str.size() + 1); } template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { - Int64 utf8_total_len = getStringUtf8Len(subject, subject_size); + Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size); checkArgs(utf8_total_len, subject_size, pos, ret_op); @@ -558,7 +535,7 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s if (unlikely(subject_size == 0)) return processEmptyStringExpr(subject, subject_size, pos, occur); - size_t byte_pos = utf8Pos2bytePos(subject, pos); + size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast(subject), pos); return getSubstrMatchedIndex(subject, subject_size, byte_pos, occur, ret_op); } diff --git a/dbms/src/Common/StringUtils/StringUtils.h b/dbms/src/Common/StringUtils/StringUtils.h index bc865b9fdba..61b85f0912b 100644 --- a/dbms/src/Common/StringUtils/StringUtils.h +++ b/dbms/src/Common/StringUtils/StringUtils.h @@ -149,32 +149,3 @@ inline bool equalsCaseInsensitive(char a, char b) { return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b); } - -// Get how many bytes this utf8 character needs. -// Input must be the first byte of a utf8 character. -inline size_t getUtf8Len(uint8_t utf8_first_byte) -{ - uint8_t flag = 128; - size_t len = 0; - while (flag & utf8_first_byte) - { - len++; - flag >>= 1; - } - - return flag == 128 ? 
1 : len; -} - -inline size_t getStringUtf8Len(const char * str, size_t total_len) -{ - size_t len = 0; - size_t utf8_len; - - for (size_t i = 0; i < total_len; i += utf8_len) - { - utf8_len = getUtf8Len(str[i]); - len++; - } - - return len; -} diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h index 143f923ee8e..cf848b258ae 100644 --- a/dbms/src/Common/UTF8Helpers.h +++ b/dbms/src/Common/UTF8Helpers.h @@ -83,6 +83,30 @@ inline size_t countCodePoints(const UInt8 * data, size_t size) return res; } +// Convert utf8 position to byte position. +// For Example: +// Taking string "ni好a" as an example. +// utf8 position of character 'a' in this string is 4 and byte position is 6. +static inline Int64 utf8Pos2bytePos(const UInt8 * str, Int64 utf8_pos) +{ + Int64 byte_index = 0; + utf8_pos--; + while (utf8_pos > 0) + { + byte_index += seqLength(str[byte_index]); + utf8_pos--; + } + return byte_index + 1; +} + +static inline Int64 bytePos2Utf8Pos(const UInt8 * str, Int64 byte_pos) +{ + // byte_num means the number of byte before this byte_pos + Int64 byte_num = byte_pos - 1; + Int64 utf8_num = countCodePoints(str, byte_num); + return utf8_num + 1; +} + } // namespace UTF8 diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index a05549d4513..d7f254961de 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -127,7 +127,7 @@ Int64 getIntFromField(Field & field) } } -enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64 }; +enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 }; template Int64 getInt(const void * container, size_t idx) @@ -150,8 +150,6 @@ GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) return &getInt; case IntType::UInt64: return &getInt; - case IntType::UInt128: - return &getInt; case IntType::Int8: return &getInt; case IntType::Int16: @@ -165,6 +163,20 @@ GetIntFuncPointerType 
getGetIntFuncPointer(IntType int_type) } } +template +void setResultColumnValuesNull(ColumnWithTypeAndName & res_arg, size_t col_size) +{ + // Initialize result column + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(col_size, 0); + + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(col_size, 1); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); +} + template class ParamString { @@ -1237,7 +1249,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Check if args are all const columns if constexpr (ExprT::isConst() && PatT::isConst() && PosT::isConst() && OccurT::isConst() && RetOpT::isConst() && MatchTypeT::isConst()) { - if (col_size == 0 || expr_param.isNullAt(0) || pat_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || ret_op_param.isNullAt(0) || match_type_param.isNullAt(0)) + if (expr_param.isNullAt(0) || pat_param.isNullAt(0) || pos_param.isNullAt(0) || occur_param.isNullAt(0) || ret_op_param.isNullAt(0) || match_type_param.isNullAt(0)) { res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; @@ -1245,17 +1257,13 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase int flags = getDefaultFlags(); String expr = expr_param.getString(0); + String match_type = match_type_param.getString(0); String pat = pat_param.getString(0); if (unlikely(pat.empty())) - throw Exception(EMPTY_PAT_ERR_MSG); - - Int64 pos = PosT::isConst() ? pos_const_val : get_pos_func(pos_container, 0); - Int64 occur = OccurT::isConst() ? occur_const_val : get_occur_func(occur_container, 0); - Int64 ret_op = RetOpT::isConst() ? 
ret_op_const_val : get_ret_op_func(ret_op_container, 0); - String match_type = match_type_param.getString(0); + throw Exception(EMPTY_PAT_ERR_MSG); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); - ResultType res = regexp.instr(expr.c_str(), expr.size(), pos, occur, ret_op); + ResultType res = regexp.instr(expr.c_str(), expr.size(), pos_const_val, occur_const_val, ret_op_const_val); res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; } @@ -1304,9 +1312,20 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase // Start to execute instr if (canMemorize()) { - // Codes in this if branch execute instr with memorized regexp + std::unique_ptr regexp; + if (col_size > 0) + { + regexp = memorize(pat_param, match_type_param, collator); + if (regexp == nullptr) + { + auto nullmap_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); + nullmap.resize(col_size, 1); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + return; + } + } - const auto & regexp = memorize(pat_param, match_type_param, collator); if constexpr (has_nullable_col) { // Process nullable columns with memorized regexp From be4334f369a52796539a2e22a991e88b336fac2a Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 21 Nov 2022 14:12:11 +0800 Subject: [PATCH 72/87] resolve comments --- dbms/src/Common/OptimizedRegularExpression.h | 2 +- .../Common/OptimizedRegularExpression.inl.h | 6 +- dbms/src/Functions/FunctionsRegexp.h | 243 ++++++++------- dbms/src/Functions/tests/gtest_regexp.cpp | 281 +++++++++--------- 4 files changed, 282 insertions(+), 250 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 46eab4878cd..c9b00b653fc 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -15,8 +15,8 @@ #pragma once #include 
-#include #include +#include #include #include diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index b8e86bb0646..ba2b234296c 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include #include #include -#include #include +#include #include #include @@ -504,7 +504,7 @@ template Int64 OptimizedRegularExpressionImpl::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op) { size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 - const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched + const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched size_t expr_size = subject_size - byte_offset; StringPieceType expr_sp(expr, expr_size); diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index d7f254961de..57a584c1a9b 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -93,19 +93,29 @@ inline int getDefaultFlags() return flags; } -// add '()' outside of the pattern to get the matched substr +template inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String mode = re2Util::getRE2ModeModifiers(match_type, collator); if (mode.empty()) - return fmt::format("({})",pattern); - return fmt::format("{}({})", mode, pattern); + { + if constexpr (need_subpattern) + return fmt::format("({})", pattern); + else + return pattern; + } + + if constexpr (need_subpattern) + return fmt::format("{}({})", mode, pattern); + else + return fmt::format("{}{}", mode, pattern); } -inline Regexps::Pool::Pointer 
createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) +template +inline Regexps::Regexp createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { - String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); - return Regexps::get(final_pattern, getDefaultFlags()); + String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); + return Regexps::createRegexp(final_pattern, getDefaultFlags()); } // Only int types used in ColumnsNumber.h can be valid @@ -117,7 +127,8 @@ inline constexpr bool check_int_type() Int64 getIntFromField(Field & field) { - switch (field.getType()) { + switch (field.getType()) + { case Field::Types::Which::Int64: return field.safeGet(); case Field::Types::Which::UInt64: @@ -127,7 +138,17 @@ Int64 getIntFromField(Field & field) } } -enum class IntType { UInt8 = 0, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 }; +enum class IntType +{ + UInt8 = 0, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64 +}; template Int64 getInt(const void * container, size_t idx) @@ -225,7 +246,10 @@ class ParamString static IntType getIntType() { throw Exception("ParamString not supports this function"); } template - Int64 getInt(size_t) const { throw Exception("ParamString not supports this function"); } + Int64 getInt(size_t) const + { + throw Exception("ParamString not supports this function"); + } String getString(size_t idx) const { @@ -421,7 +445,10 @@ class Param } template - Int64 getInt(size_t idx) const { return data.template getInt(idx); } + Int64 getInt(size_t idx) const + { + return data.template getInt(idx); + } void getStringRef(size_t idx, StringRef & dst) const { return data.getStringRef(idx, dst); } String getString(size_t idx) const { return data.getString(idx); } @@ -507,7 +534,8 @@ class ParamVariant // default ParamString's ParamType should be ParamType::StringNotNullAndNotConst 
ParamVariant(ColumnPtr col, size_t col_size, const StringRef & default_val) - : col_ptr(col), param(nullptr) + : col_ptr(col) + , param(nullptr) { if (col_ptr != nullptr) { @@ -523,7 +551,8 @@ class ParamVariant // default ParamInt's ParamType should be ParamType::IntNotNullAndNotConst ParamVariant(ColumnPtr col, size_t col_size, Int64 default_val) - : col_ptr(col), param(nullptr) + : col_ptr(col) + , param(nullptr) { if (col_ptr != nullptr) { @@ -635,14 +664,14 @@ class ParamVariant } #define APPLY_FOR_INT_CONTAINER(M, col_ptr, null_map, param) \ - M(UInt8, col_ptr, null_map, param) \ - M(UInt16, col_ptr, null_map, param) \ - M(UInt32, col_ptr, null_map, param) \ - M(UInt64, col_ptr, null_map, param) \ - M(Int8, col_ptr, null_map, param) \ - M(Int16, col_ptr, null_map, param) \ - M(Int32, col_ptr, null_map, param) \ - M(Int64, col_ptr, null_map, param) \ + M(UInt8, col_ptr, null_map, param) \ + M(UInt16, col_ptr, null_map, param) \ + M(UInt32, col_ptr, null_map, param) \ + M(UInt64, col_ptr, null_map, param) \ + M(Int8, col_ptr, null_map, param) \ + M(Int16, col_ptr, null_map, param) \ + M(Int32, col_ptr, null_map, param) \ + M(Int64, col_ptr, null_map, param) void handleIntNonConstCol(size_t col_size) { @@ -654,16 +683,17 @@ class ParamVariant // Construct actual param param_type = ParamType::IntNullableAndNotConst; -#define M(INT_TYPE, col_ptr, null_map, param) \ - else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ - { \ +#define M(INT_TYPE, col_ptr, null_map, param) \ + else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ + { \ (param) = new ParamIntNullableAndNotConst(col_size, null_map, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ } - if (false) {} + if (false) + { + } APPLY_FOR_INT_CONTAINER(M, nested_ptr, null_map, param) - else - throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); #undef 
M } @@ -672,16 +702,17 @@ class ParamVariant // Construct actual param param_type = ParamType::IntNotNullableAndNotConst; -#define M(INT_TYPE, col_ptr, null_map, param) \ - else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ - { \ +#define M(INT_TYPE, col_ptr, null_map, param) \ + else if (const auto * ptr = typeid_cast(&(*(col_ptr)))) \ + { \ (param) = new ParamIntNotNullableAndNotConst(col_size, reinterpret_cast(&(ptr->getData())), IntType::INT_TYPE); \ } - if (false) {} + if (false) + { + } APPLY_FOR_INT_CONTAINER(M, col_ptr, null_map, param) - else - throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else throw Exception("Invalid int type int regexp function", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); #undef M } @@ -736,24 +767,24 @@ class ParamVariant { \ switch ((pv_name).getParamType()) \ { \ - /* Expand this macro to enumerate all string cases */ \ - APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, (pv_name).param, param_name, next_process) \ + /* Expand this macro to enumerate all string cases */ \ + APPLY_FOR_PARAM_STRING_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, (pv_name).param, param_name, next_process) \ default: \ throw Exception("Unexpected ParamType"); \ } \ } while (0); // Common method to get actual int param -#define GET_ACTUAL_INT_PARAM(pv_name, param_name, next_process) \ - do \ - { \ - switch ((pv_name).getParamType()) \ - { \ - /* Expand this macro to enumerate all int cases */ \ - APPLY_FOR_PARAM_INT_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, (pv_name).param, param_name, next_process) \ - default: \ - throw Exception("Unexpected ParamType"); \ - } \ +#define GET_ACTUAL_INT_PARAM(pv_name, param_name, next_process) \ + do \ + { \ + switch ((pv_name).getParamType()) \ + { \ + /* Expand this macro to enumerate all int cases */ \ + APPLY_FOR_PARAM_INT_VARIANTS(ENUMERATE_PARAM_VARIANT_CASES, (pv_name).param, param_name, next_process) \ + default: \ + throw Exception("Unexpected ParamType"); 
\ + } \ } while (0); class FunctionStringRegexpBase @@ -771,7 +802,7 @@ class FunctionStringRegexpBase // We should pre compile the regular expression when: // - only pattern column is provided and it's a constant column // - pattern and match type columns are provided and they are both constant columns - template + template std::unique_ptr memorize(const ExprT & pat_param, const MatchTypeT & match_type_param, TiDB::TiDBCollatorPtr collator) const { if (pat_param.isNullAt(0) || match_type_param.isNullAt(0)) @@ -782,7 +813,7 @@ class FunctionStringRegexpBase throw Exception(EMPTY_PAT_ERR_MSG); String match_type = match_type_param.getString(0); - final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); + final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); int flags = getDefaultFlags(); return std::make_unique(final_pattern, flags); @@ -961,7 +992,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase String match_type = match_type_param.getString(0); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); ResultType res{regexp.match(expr)}; res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -980,7 +1011,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase std::unique_ptr regexp; if (col_size > 0) { - regexp = memorize(pat_param, match_type_param, collator); + regexp = memorize(pat_param, match_type_param, collator); if (regexp == nullptr) { auto nullmap_col = ColumnUInt8::create(); @@ -1056,8 +1087,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); - vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = 
regexp.match(expr_ref.data, expr_ref.size); // match } res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); @@ -1076,8 +1107,8 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); - vec_res[i] = regexp->match(expr_ref.data, expr_ref.size); // match + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = regexp.match(expr_ref.data, expr_ref.size); // match } res_arg.column = std::move(col_res); @@ -1126,59 +1157,59 @@ class FunctionStringRegexp : public FunctionStringRegexpBase #undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_LIKE -#define EXECUTE_REGEXP_INSTR() \ - do \ - { \ +#define EXECUTE_REGEXP_INSTR() \ + do \ + { \ REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, *(EXPR_PARAM_PTR_VAR_NAME), *(PAT_PARAM_PTR_VAR_NAME), *(POS_PARAM_PTR_VAR_NAME), *(OCCUR_PARAM_PTR_VAR_NAME), *(RET_OP_PARAM_PTR_VAR_NAME), *(MATCH_TYPE_PARAM_PTR_VAR_NAME)); \ } while (0); // Method to get actual match type param -#define GET_MATCH_TYPE_ACTUAL_PARAM() \ - do \ - { \ +#define GET_MATCH_TYPE_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(MATCH_TYPE_PV_VAR_NAME, MATCH_TYPE_PARAM_PTR_VAR_NAME, ({EXECUTE_REGEXP_INSTR()})) \ } while (0); // Method to get actual return option param -#define GET_RET_OP_ACTUAL_PARAM() \ - do \ - { \ +#define GET_RET_OP_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_INT_PARAM(RET_OP_PV_VAR_NAME, RET_OP_PARAM_PTR_VAR_NAME, ({GET_MATCH_TYPE_ACTUAL_PARAM()})) \ } while (0); // Method to get actual occur param -#define GET_OCCUR_ACTUAL_PARAM() \ - do \ - { \ +#define GET_OCCUR_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_INT_PARAM(OCCUR_PV_VAR_NAME, OCCUR_PARAM_PTR_VAR_NAME, ({GET_RET_OP_ACTUAL_PARAM()})) \ } while (0); // Method to get actual position param -#define GET_POS_ACTUAL_PARAM() \ - do \ - { \ +#define GET_POS_ACTUAL_PARAM() \ + do \ + { \ 
GET_ACTUAL_INT_PARAM(POS_PV_VAR_NAME, POS_PARAM_PTR_VAR_NAME, ({GET_OCCUR_ACTUAL_PARAM()})) \ } while (0); // Method to get actual pattern param -#define GET_PAT_ACTUAL_PARAM() \ - do \ - { \ +#define GET_PAT_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(PAT_PV_VAR_NAME, PAT_PARAM_PTR_VAR_NAME, ({GET_POS_ACTUAL_PARAM()})) \ } while (0); // Method to get actual expression param -#define GET_EXPR_ACTUAL_PARAM() \ - do \ - { \ +#define GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(EXPR_PV_VAR_NAME, EXPR_PARAM_PTR_VAR_NAME, ({GET_PAT_ACTUAL_PARAM()})) \ } while (0); // The entry to get actual params and execute regexp functions #define GET_ACTUAL_PARAMS_AND_EXECUTE() \ - do \ - { \ - GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ + GET_EXPR_ACTUAL_PARAM() \ } while (0); // Implementation of regexp_instr function @@ -1237,9 +1268,9 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GetIntFuncPointerType get_ret_op_func = getGetIntFuncPointer(ret_op_param.getIntType()); // Container will not be used when parm is const - const void * pos_container = pos_param.getContainer(); - const void * occur_container = occur_param.getContainer(); - const void * ret_op_container = ret_op_param.getContainer(); + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); + const void * ret_op_container = ret_op_param.getContainer(); // Const value will not be used when param is not const Int64 pos_const_val = PosT::isConst() ? 
pos_param.template getInt(0) : -1; @@ -1254,15 +1285,15 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; } - + int flags = getDefaultFlags(); String expr = expr_param.getString(0); String match_type = match_type_param.getString(0); String pat = pat_param.getString(0); if (unlikely(pat.empty())) - throw Exception(EMPTY_PAT_ERR_MSG); + throw Exception(EMPTY_PAT_ERR_MSG); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); ResultType res = regexp.instr(expr.c_str(), expr.size(), pos_const_val, occur_const_val, ret_op_const_val); res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -1275,30 +1306,30 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || RetOpT::isNullableCol() || MatchTypeT::isNullableCol(); -#define GET_POS_VALUE(idx) \ - do \ - { \ - if constexpr (PosT::isConst()) \ - pos = pos_const_val; \ - else \ +#define GET_POS_VALUE(idx) \ + do \ + { \ + if constexpr (PosT::isConst()) \ + pos = pos_const_val; \ + else \ pos = get_pos_func(pos_container, idx); \ } while (0); -#define GET_OCCUR_VALUE(idx) \ - do \ - { \ - if constexpr (OccurT::isConst()) \ - occur = occur_const_val; \ - else \ +#define GET_OCCUR_VALUE(idx) \ + do \ + { \ + if constexpr (OccurT::isConst()) \ + occur = occur_const_val; \ + else \ occur = get_occur_func(occur_container, idx); \ } while (0); -#define GET_RET_OP_VALUE(idx) \ - do \ - { \ - if constexpr (RetOpT::isConst()) \ - ret_op = ret_op_const_val; \ - else \ +#define GET_RET_OP_VALUE(idx) \ + do \ + { \ + if constexpr (RetOpT::isConst()) \ + ret_op = ret_op_const_val; \ + else \ ret_op = get_ret_op_func(ret_op_container, idx); \ } while (0); 
@@ -1315,7 +1346,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase std::unique_ptr regexp; if (col_size > 0) { - regexp = memorize(pat_param, match_type_param, collator); + regexp = memorize(pat_param, match_type_param, collator); if (regexp == nullptr) { auto nullmap_col = ColumnUInt8::create(); @@ -1392,8 +1423,8 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); - vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = regexp.instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); @@ -1411,8 +1442,8 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); - vec_res[i] = regexp->instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); + vec_res[i] = regexp.instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } res_arg.column = std::move(col_res); @@ -1447,7 +1478,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase ColumnPtr col_match_type; // Go through cases to get arguments - switch(arg_num) + switch (arg_num) { case REGEXP_INSTR_MAX_PARAM_NUM: col_match_type = block.getByPosition(arguments[5]).column; diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 5497733d147..c1396f36261 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // 
limitations under the License. +#include +#include #include #include +#include #include #include -#include #include -#include -#include + #include // NOLINT #include // NOLINT - #include #include @@ -2212,83 +2212,83 @@ TEST_F(Regexp, testRegexpCustomerCases) namespace { -template +template std::vector getResultVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.result); - + return vecs; } -template +template std::vector getExprVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.expression); - + return vecs; } -template +template std::vector getPatVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.pattern); - + return vecs; } -template +template std::vector getPosVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.position); - + return vecs; } -template +template std::vector getOccurVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.occurrence); - + return vecs; } -template +template std::vector getRetOpVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.return_option); - + return vecs; } -template +template std::vector getMatchTypeVec(const std::vector & test_cases) { std::vector vecs; vecs.reserve(test_cases.size()); for (const auto & elem : test_cases) vecs.push_back(elem.match_type); - + return vecs; } -} +} // namespace struct RegexpInstrCase { @@ -2300,7 +2300,7 @@ struct RegexpInstrCase , occurrence(occur) , return_option(ret_op) , match_type(mt) - {} + {} RegexpInstrCase(Int64 res, const std::vector & 
null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, Int64 ret_op = 0, const String & mt = "") : result(res) @@ -2311,12 +2311,13 @@ struct RegexpInstrCase , occurrence(occur) , return_option(ret_op) , match_type(mt) - {} + {} static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & ret_ops, std::vector & match_types) { results = getResultVec(test_cases); - switch (param_num) { + switch (param_num) + { case 6: match_types = getMatchTypeVec(test_cases); case 5: @@ -2374,40 +2375,40 @@ TEST_F(Regexp, RegexpInstr) for (size_t row_size = 1; row_size < 3; ++row_size) { ASSERT_COLUMN_EQ(createConstColumn(row_size, 1), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 0), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 4), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - 
createConstColumn(row_size, 2), - createConstColumn(row_size, 1))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1))); ASSERT_COLUMN_EQ(createConstColumn(row_size, 6), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "aabab"), - createConstColumn(row_size, "aB"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 1), - createConstColumn(row_size, "i"))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, "aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 1), + createConstColumn(row_size, "i"))); } } @@ -2415,50 +2416,50 @@ TEST_F(Regexp, RegexpInstr) { size_t row_size = 2; ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn>(row_size, {}), - createConstColumn(row_size, "123"))); - + executeFunction( + "regexp_instr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - 
createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_instr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_instr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); } std::vector test_cases; @@ -2483,10 +2484,10 @@ TEST_F(Regexp, RegexpInstr) {3, "pp跑ppのaaa", "(跑|の|P)"}}; RegexpInstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns))); + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns))); // test regexp_instr(vector, vector, vector) test_cases = {{4, "ttttifl", "tifl", 3}, @@ -2508,7 +2509,8 @@ TEST_F(Regexp, RegexpInstr) test_cases = {{4, "ttttifl", "tifl", 3, 1}, {6, "tidb_tikv", "ti(db|kv)", 2, 1}, {5, "aaaaaa", "aa", 3, 2}, - 
{0, "\n", ".", 1, 1}, {0, "", "^$", 3, 2}, + {0, "\n", ".", 1, 1}, + {0, "", "^$", 3, 2}, {0, "ab\naB", "^ab$", 1, 1}, {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; RegexpInstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); @@ -2573,13 +2575,12 @@ TEST_F(Regexp, RegexpInstr) executeFunction( "regexp_instr", {createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createColumn(return_options), - createColumn(match_types)}, + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createColumn(match_types)}, utf8mb4_general_ci_collator)); - } // Test: Args include nullable columns @@ -2589,70 +2590,70 @@ TEST_F(Regexp, RegexpInstr) {1, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), - createColumn(patterns))); + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns))); // test regexp_instr(vector, nullable vector) test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, {0, {{0, 1, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; RegexpInstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createNullableVectorColumn(patterns, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]))); + executeFunction( + "regexp_instr", + createColumn(exprs), + 
createNullableVectorColumn(patterns, null_maps[RegexpInstrCase::PAT_NULL_MAP_IDX]))); // test regexp_instr(vector, vector, nullable vector) test_cases = {{4, {{0, 0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, {0, {{0, 0, 1, 0, 0, 0}}, "ttttifl", "tifl", 3}}; RegexpInstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]))); + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]))); // test regexp_instr(vector, vector, vector, nullable vector) test_cases = {{6, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, {0, {{0, 0, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; RegexpInstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createNullableVectorColumn(occurs, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]))); + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createNullableVectorColumn(occurs, null_maps[RegexpInstrCase::OCCUR_NULL_MAP_IDX]))); // test regexp_instr(vector, vector, vector, vector, nullable vector) test_cases = {{10, {{0, 0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}, {0, {{0, 0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2, 1}}; RegexpInstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, 
return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createNullableVectorColumn(return_options, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]))); + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createNullableVectorColumn(return_options, null_maps[RegexpInstrCase::RET_OP_NULL_MAP_IDX]))); // test regexp_instr(vector, vector, vector, vector, vector, nullable vector) test_cases = {{1, {{0, 0, 0, 0, 0, 0}}, "b", "B", 1, 1, 0, "i"}, {0, {{0, 0, 0, 0, 0, 1}}, "b", "B", 1, 1, 0, "i"}}; RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]), - executeFunction( - "regexp_instr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createColumn(return_options), - createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); + executeFunction( + "regexp_instr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(return_options), + createNullableVectorColumn(match_types, null_maps[RegexpInstrCase::MATCH_TYPE_NULL_MAP_IDX]))); } // Test: const, nullable and pure vector columns appear together @@ -2664,14 +2665,14 @@ TEST_F(Regexp, RegexpInstr) {0, {{1, 0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, 0, "i"}}; RegexpInstrCase::setVecsWithNullMap(6, test_cases, results, null_maps, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), - executeFunction( - 
"regexp_instr", - createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), - createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), - createColumn(occurs), - createConstColumn(test_cases.size(), 0), - createColumn(match_types))); + executeFunction( + "regexp_instr", + createNullableVectorColumn(exprs, null_maps[RegexpInstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpInstrCase::POS_NULL_MAP_IDX]), + createColumn(occurs), + createConstColumn(test_cases.size(), 0), + createColumn(match_types))); } // Test: Invalid parameter handling From 9eced0e088ea9111bc99675632f6e25e3a9f7599 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 21 Nov 2022 19:38:45 +0800 Subject: [PATCH 73/87] add test case --- dbms/src/Functions/tests/gtest_regexp.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index c1396f36261..8030f9a0b41 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2512,7 +2512,8 @@ TEST_F(Regexp, RegexpInstr) {0, "\n", ".", 1, 1}, {0, "", "^$", 3, 2}, {0, "ab\naB", "^ab$", 1, 1}, - {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}}; + {6, "pp跑ppのaaa", "(跑|の|P)", 2, 2}, + {0, "pp跑ppのaaa", "(跑|の|P)", 2, 10}}; RegexpInstrCase::setVecsWithoutNullMap(4, test_cases, results, exprs, patterns, positions, occurs, return_options, match_types); ASSERT_COLUMN_EQ(createColumn(results), executeFunction( From 96087c67132421455ce2205555e740fa92af6468 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 22 Nov 2022 10:28:53 +0800 Subject: [PATCH 74/87] resolve comment --- dbms/src/Functions/FunctionsRegexp.h | 36 +++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 
57a584c1a9b..35b2e14cfb0 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -93,28 +93,18 @@ inline int getDefaultFlags() return flags; } -template inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { String mode = re2Util::getRE2ModeModifiers(match_type, collator); if (mode.empty()) - { - if constexpr (need_subpattern) - return fmt::format("({})", pattern); - else - return pattern; - } + return pattern; - if constexpr (need_subpattern) - return fmt::format("{}({})", mode, pattern); - else - return fmt::format("{}{}", mode, pattern); + return fmt::format("{}{}", mode, pattern); } -template inline Regexps::Regexp createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator) { - String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); + String final_pattern = addMatchTypeForPattern(pattern, match_type, collator); return Regexps::createRegexp(final_pattern, getDefaultFlags()); } @@ -812,8 +802,11 @@ class FunctionStringRegexpBase if (unlikely(final_pattern.empty())) throw Exception(EMPTY_PAT_ERR_MSG); + if (need_subpattern) + final_pattern = fmt::format("({})", final_pattern); + String match_type = match_type_param.getString(0); - final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); + final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator); int flags = getDefaultFlags(); return std::make_unique(final_pattern, flags); @@ -992,7 +985,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase String match_type = match_type_param.getString(0); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); ResultType res{regexp.match(expr)}; res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -1087,7 +1080,7 
@@ class FunctionStringRegexp : public FunctionStringRegexpBase if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp.match(expr_ref.data, expr_ref.size); // match } @@ -1107,7 +1100,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp.match(expr_ref.data, expr_ref.size); // match } @@ -1293,7 +1286,8 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + pat = fmt::format("({})", pat); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); ResultType res = regexp.instr(expr.c_str(), expr.size(), pos_const_val, occur_const_val, ret_op_const_val); res_arg.column = res_arg.type->createColumnConst(col_size, toField(res)); return; @@ -1419,11 +1413,12 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase pat = pat_param.getString(i); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); + pat = fmt::format("({})", pat); GET_POS_VALUE(i) GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp.instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } @@ -1438,11 +1433,12 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase pat = pat_param.getString(i); if (unlikely(pat.empty())) throw Exception(EMPTY_PAT_ERR_MSG); + pat = fmt::format("({})", pat); 
GET_POS_VALUE(i) GET_OCCUR_VALUE(i) GET_RET_OP_VALUE(i) match_type = match_type_param.getString(i); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); vec_res[i] = regexp.instr(expr_ref.data, expr_ref.size, pos, occur, ret_op); } From 6b667a85f447c776699a5232fd8fe48bc1597cc0 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 22 Nov 2022 14:08:19 +0800 Subject: [PATCH 75/87] add some tests --- dbms/src/Functions/FunctionsRegexp.cpp | 2 +- dbms/src/Functions/FunctionsRegexp.h | 14 --------- dbms/src/Functions/tests/gtest_regexp.cpp | 35 +++++++++++++++++++++-- tests/fullstack-test/expr/regexp.test | 6 ++++ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.cpp b/dbms/src/Functions/FunctionsRegexp.cpp index b268f28c193..18bd60c17fa 100644 --- a/dbms/src/Functions/FunctionsRegexp.cpp +++ b/dbms/src/Functions/FunctionsRegexp.cpp @@ -113,7 +113,7 @@ struct ReplaceRegexpImpl if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) { match_occ++; - /// if occ > 0, it will replace all the match expr, otherwise it only replace the occ-th match + /// if occ == 0, it will replace all the match expr, otherwise it only replace the occ-th match if (occ == 0 || match_occ == occ) { const auto & match = matches[0]; diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 35b2e14cfb0..3f8f1cfb5dc 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -174,20 +174,6 @@ GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) } } -template -void setResultColumnValuesNull(ColumnWithTypeAndName & res_arg, size_t col_size) -{ - // Initialize result column - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(col_size, 0); - - auto nullmap_col = 
ColumnUInt8::create(); - typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(col_size, 1); - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); -} - template class ParamString { diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 8030f9a0b41..6ba45d95aef 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -1790,7 +1790,6 @@ TEST_F(Regexp, testRegexpTiDBCase) ASSERT_ANY_THROW((DB::MatchImpl::constantConstant("", "\\", '\\', "", nullptr, res))); } -// TODO test empty columns // We can only test regexp_like function as regexp is the subset of regexp_like TEST_F(Regexp, RegexpLike) { @@ -2367,7 +2366,6 @@ struct RegexpInstrCase String match_type; }; -// TODO add empty column test TEST_F(Regexp, RegexpInstr) { // Test: All columns are const @@ -2676,6 +2674,39 @@ TEST_F(Regexp, RegexpInstr) createColumn(match_types))); } + // Test: empty column tests + { + ASSERT_COLUMN_EQ(createConstColumn(0, 1), + executeFunction( + "regexp_instr", + createConstColumn(0, "m"), + createConstColumn(0, "m"), + createConstColumn(0, 1), + createConstColumn(0, 1), + createConstColumn(0, 1), + createConstColumn(0, "m"))); + + ASSERT_COLUMN_EQ(createColumn({}), + executeFunction( + "regexp_instr", + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}))); + + ASSERT_COLUMN_EQ(createColumn({}), + executeFunction( + "regexp_instr", + createColumn({}), + createColumn({}), + createConstColumn(0, 1), + createColumn({}), + createColumn({}), + createConstColumn(0, ""))); + } + // Test: Invalid parameter handling { // test empty pattern diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index dc6115e2534..808fb77e8ad 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -94,3 +94,9 @@ 
mysql> set tidb_enforce_mpp=1; select regexp_like(data, pattern, match_type) as | 1 | | 1 | +------+ + +mysql> drop table if exists test.t; +mysql> create table test.t (data varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "1", pos, occur, 1, match_type) as res from test.t; From e2dbec5c8062580c5fd4c1890b8d80a0bae9f3f8 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 23 Nov 2022 11:44:36 +0800 Subject: [PATCH 76/87] add tests --- dbms/src/Functions/FunctionsRegexp.h | 6 +++--- tests/fullstack-test/expr/regexp.test | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 3f8f1cfb5dc..2bd6c475b3a 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1464,11 +1464,11 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { case REGEXP_INSTR_MAX_PARAM_NUM: col_match_type = block.getByPosition(arguments[5]).column; - case REGEXP_MIN_PARAM_NUM + 3: + case REGEXP_INSTR_MAX_PARAM_NUM -1: col_return_option = block.getByPosition(arguments[4]).column; - case REGEXP_MIN_PARAM_NUM + 2: + case REGEXP_INSTR_MAX_PARAM_NUM - 2: col_occur = block.getByPosition(arguments[3]).column; - case REGEXP_MIN_PARAM_NUM + 1: + case REGEXP_INSTR_MAX_PARAM_NUM - 3: col_pos = block.getByPosition(arguments[2]).column; }; diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index 808fb77e8ad..1e8cd423266 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -100,3 +100,18 @@ mysql> create table test.t (data varchar(30), pattern varchar(30), pos int, occu mysql> alter table test.t set tiflash replica 1; func> wait_table test t mysql> set tidb_enforce_mpp=1; select 
regexp_instr("1", "1", pos, occur, 1, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "", pos, occur, 1, match_type) as res from test.t; + +mysql> drop table if exists test.t; +mysql> create table test.t (data varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); +mysql> insert into test.t values ('123', '12.', 1, 1, 0, ''), ('aBb', "bb", 1, 1, 0, 'i'), ('ab\nabc', '^abc$', 1, 1, 0, 'm'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match_type) as res from test.t; ++------+ +| res | ++------+ +| 1 | +| 2 | +| 4 | ++------+ From 2f38660700b0ea770f08a37ef15935aa483bd8e9 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 23 Nov 2022 13:08:07 +0800 Subject: [PATCH 77/87] fix ut --- dbms/src/Functions/FunctionsRegexp.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 47138e578b0..87935839fea 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1624,8 +1624,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase int flags = getDefaultFlags(); String expr = expr_param.getString(0); String match_type = match_type_param.getString(0); + pat = fmt::format("({})", pat); - Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); + Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); StringRef res_ref; bool success = regexp.substr(expr.c_str(), expr.size(), res_ref, pos_const_val, occur_const_val); if (success) @@ -1737,8 +1738,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase GET_POS_VALUE(i) GET_OCCUR_VALUE(i) match_type = match_type_param.getString(i); + pat = fmt::format("({})", pat); - auto regexp = createRegexpWithMatchType(pat, match_type, 
collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } } @@ -1754,8 +1756,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase GET_POS_VALUE(i) GET_OCCUR_VALUE(i) match_type = match_type_param.getString(i); + pat = fmt::format("({})", pat); - auto regexp = createRegexpWithMatchType(pat, match_type, collator); + auto regexp = createRegexpWithMatchType(pat, match_type, collator); executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); } } From 298611ba9c2e12a60361d66788c943f29dd53643 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 23 Nov 2022 13:09:42 +0800 Subject: [PATCH 78/87] resolve comment --- tests/fullstack-test/expr/regexp.test | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index 1e8cd423266..e4315f9b1fb 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -96,14 +96,12 @@ mysql> set tidb_enforce_mpp=1; select regexp_like(data, pattern, match_type) as +------+ mysql> drop table if exists test.t; -mysql> create table test.t (data varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); +mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); mysql> alter table test.t set tiflash replica 1; func> wait_table test t mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "1", pos, occur, 1, match_type) as res from test.t; mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "", pos, occur, 1, match_type) as res from test.t; -mysql> drop table if exists test.t; -mysql> create table test.t (data varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); mysql> insert into test.t values 
('123', '12.', 1, 1, 0, ''), ('aBb', "bb", 1, 1, 0, 'i'), ('ab\nabc', '^abc$', 1, 1, 0, 'm'); mysql> alter table test.t set tiflash replica 1; func> wait_table test t From de7fd31fb767bbedacd713e61d92463423ac05f3 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 24 Nov 2022 13:25:35 +0800 Subject: [PATCH 79/87] fix critical --- dbms/src/Functions/FunctionsRegexp.h | 34 +++++++++++++++++------ dbms/src/Functions/tests/gtest_regexp.cpp | 1 - 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 2bd6c475b3a..47f36853539 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -41,6 +41,7 @@ #include #include +#include #include #include @@ -185,7 +186,8 @@ class ParamString // For passing compilation explicit ParamString(Int64) - : const_string(nullptr, 0) + : const_string_data(nullptr) + , const_string_data_size(0) , chars(nullptr) , offsets(nullptr) { @@ -193,17 +195,26 @@ class ParamString } explicit ParamString(const StringRef & str_ref) - : const_string(str_ref) + : const_string_data(nullptr) + , const_string_data_size(0) , chars(nullptr) , offsets(nullptr) { + // Deep copy + const_string_data = new char[str_ref.size]; + if (const_string_data == nullptr) + throw Exception("ParamString constructor get a nullptr"); + + memcpy(const_string_data, str_ref.data, str_ref.size); + const_string_data_size = str_ref.size; if constexpr (!is_const) throw Exception("non-const parm should not call this constructor"); } // For passing compilation explicit ParamString(const void *) - : const_string(nullptr, 0) + : const_string_data(nullptr) + , const_string_data_size(0) , chars(nullptr) , offsets(nullptr) { @@ -211,7 +222,8 @@ class ParamString } ParamString(const void * chars_, const void * offsets_) - : const_string(nullptr, 0) + : const_string_data(nullptr) + , const_string_data_size(0) , chars(reinterpret_cast(chars_)) , 
offsets(reinterpret_cast(offsets_)) { @@ -219,6 +231,11 @@ class ParamString throw Exception("const parm should not call this constructor"); } + ~ParamString() + { + delete[] const_string_data; + } + static IntType getIntType() { throw Exception("ParamString not supports this function"); } template @@ -230,7 +247,7 @@ class ParamString String getString(size_t idx) const { if constexpr (is_const) - return String(const_string.data, const_string.size); + return String(const_string_data, const_string_data_size); else return String(reinterpret_cast(&(*chars)[offsetAt(idx)]), sizeAt(idx) - 1); } @@ -239,8 +256,8 @@ class ParamString { if constexpr (is_const) { - dst.data = const_string.data; - dst.size = const_string.size; + dst.data = const_string_data; + dst.size = const_string_data_size; } else { @@ -258,7 +275,8 @@ class ParamString size_t offsetAt(size_t i) const { return i == 0 ? 0 : (*offsets)[i - 1]; } size_t sizeAt(size_t i) const { return i == 0 ? (*offsets)[0] : ((*offsets)[i] - (*offsets)[i - 1]); } - StringRef const_string; + char * const_string_data; + size_t const_string_data_size; // for vector string const Chars_t * chars; diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 6ba45d95aef..9b70839fcee 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2066,7 +2066,6 @@ TEST_F(Regexp, RegexpLike) createNullableVectorColumn(match_types, match_type_nulls))); } - std::cout << "case 9" << std::endl; // case 9 test empty columns { ASSERT_COLUMN_EQ(createColumn({}), From 43f9e2c13c742cc490c59b1eed8903b7fdb0e64e Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 28 Nov 2022 15:22:25 +0800 Subject: [PATCH 80/87] merge master --- .../Common/OptimizedRegularExpression.inl.h | 25 +- dbms/src/Common/UTF8Helpers.h | 6 +- .../CreatingSetsBlockInputStream.cpp | 2 +- .../ExchangeSenderBlockInputStream.h | 1 - .../DataStreams/TiRemoteBlockInputStream.h | 11 + 
.../src/Flash/Coprocessor/CoprocessorReader.h | 13 +- .../Coprocessor/DAGBlockOutputStream.cpp | 1 - dbms/src/Flash/Coprocessor/DAGDriver.cpp | 5 + .../src/Flash/Coprocessor/DAGResponseWriter.h | 1 - .../StreamingDAGResponseWriter.cpp | 8 +- .../Coprocessor/StreamingDAGResponseWriter.h | 1 - .../Coprocessor/UnaryDAGResponseWriter.cpp | 9 +- .../Coprocessor/UnaryDAGResponseWriter.h | 3 +- .../tests/gtest_streaming_writer.cpp | 1 - .../gtest_ti_remote_block_inputstream.cpp | 2 - .../Mpp/BroadcastOrPassThroughWriter.cpp | 8 +- .../Flash/Mpp/BroadcastOrPassThroughWriter.h | 1 - .../Flash/Mpp/FineGrainedShuffleWriter.cpp | 8 +- dbms/src/Flash/Mpp/FineGrainedShuffleWriter.h | 1 - dbms/src/Flash/Mpp/HashPartitionWriter.cpp | 8 +- dbms/src/Flash/Mpp/HashPartitionWriter.h | 1 - .../Mpp/tests/gtest_mpp_exchange_writer.cpp | 4 - dbms/src/Functions/FunctionsRegexp.h | 12 +- dbms/src/Server/Server.cpp | 2 - .../Delta/ColumnFilePersistedSet.cpp | 2 +- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 5 +- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 44 ++-- .../DeltaMerge/tests/gtest_segment.cpp | 230 ++++-------------- .../tests/gtest_segment_test_basic.cpp | 204 ++++++++++++---- .../tests/gtest_segment_test_basic.h | 2 +- .../tests/gtest_segment_test_randomized.cpp | 12 +- .../tests/gtest_sst_files_stream.cpp | 20 +- dbms/src/TestUtils/FunctionTestUtils.h | 10 + dbms/src/TestUtils/InputStreamTestUtils.cpp | 89 ++++--- metrics/grafana/tiflash_summary.json | 4 +- tests/fullstack-test/expr/regexp.test | 6 +- 36 files changed, 367 insertions(+), 395 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index ef9278537c6..3f9da675772 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -481,10 +481,7 @@ Int64 OptimizedRegularExpressionImpl::processInstrEmptyStringExpr(c return 0; StringPieceType expr_sp(expr, expr_size); - bool success = 
RegexType::FindAndConsume(&expr_sp, *re2); - if (!success) - return 0; - return pos; + return RegexType::FindAndConsume(&expr_sp, *re2) ? pos : 0; } template @@ -495,8 +492,7 @@ bool OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(c StringPieceType expr_sp(expr, expr_size); StringPieceType matched_str; - bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); - if (!success) + if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) return false; res.data = matched_str.data(); @@ -506,17 +502,13 @@ bool OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(c static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) { - if (unlikely(ret_op != 0 && ret_op != 1)) - throw DB::Exception("Incorrect argument to regexp function: return_option must be 1 or 0"); - - if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) - throw DB::Exception("Index out of bounds in regular function."); + RUNTIME_CHECK_MSG(!(ret_op != 0 && ret_op != 1), "Incorrect argument to regexp function: return_option must be 1 or 0"); + RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function."); } static inline void checkSubstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos) { - if (unlikely(pos <= 0 || (pos > utf8_total_len && subject_size != 0))) - throw DB::Exception("Index out of bounds in regular function."); + RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function."); } static inline void makeOccurValid(Int64 & occur) @@ -533,10 +525,10 @@ Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subjec StringPieceType expr_sp(expr, expr_size); StringPieceType matched_str; + while (occur > 0) { - bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); - if (!success) + if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) return 0; 
--occur; @@ -557,8 +549,7 @@ bool OptimizedRegularExpressionImpl::substrImpl(const char * subjec StringPieceType matched_str; while (occur > 0) { - bool success = RegexType::FindAndConsume(&expr_sp, *re2, &matched_str); - if (!success) + if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) return false; --occur; diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h index cf848b258ae..47dd6260c28 100644 --- a/dbms/src/Common/UTF8Helpers.h +++ b/dbms/src/Common/UTF8Helpers.h @@ -90,12 +90,8 @@ inline size_t countCodePoints(const UInt8 * data, size_t size) static inline Int64 utf8Pos2bytePos(const UInt8 * str, Int64 utf8_pos) { Int64 byte_index = 0; - utf8_pos--; - while (utf8_pos > 0) - { + while (--utf8_pos > 0) byte_index += seqLength(str[byte_index]); - utf8_pos--; - } return byte_index + 1; } diff --git a/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp b/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp index a7ac8b25225..5ce4a8a799d 100644 --- a/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp +++ b/dbms/src/DataStreams/CreatingSetsBlockInputStream.cpp @@ -85,7 +85,7 @@ Block CreatingSetsBlockInputStream::readImpl() { Block res; - createAll(); + RUNTIME_CHECK(created == true); if (isCancelledOrThrowIfKilled()) return res; diff --git a/dbms/src/DataStreams/ExchangeSenderBlockInputStream.h b/dbms/src/DataStreams/ExchangeSenderBlockInputStream.h index 9044ff293dc..18a1503c71e 100644 --- a/dbms/src/DataStreams/ExchangeSenderBlockInputStream.h +++ b/dbms/src/DataStreams/ExchangeSenderBlockInputStream.h @@ -46,7 +46,6 @@ class ExchangeSenderBlockInputStream : public IProfilingBlockInputStream } void readSuffixImpl() override { - writer->finishWrite(); LOG_DEBUG(log, "finish write with {} rows", total_rows); } diff --git a/dbms/src/DataStreams/TiRemoteBlockInputStream.h b/dbms/src/DataStreams/TiRemoteBlockInputStream.h index 6ba8f05102d..5d523a7d372 100644 --- a/dbms/src/DataStreams/TiRemoteBlockInputStream.h +++ 
b/dbms/src/DataStreams/TiRemoteBlockInputStream.h @@ -195,6 +195,17 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream if (kill) remote_reader->cancel(); } + + void readPrefixImpl() override + { + // for CoprocessorReader, we send Coprocessor requests in readPrefixImpl + if constexpr (std::is_same_v) + { + remote_reader->open(); + } + // note that for ExchangeReceiver, we have sent EstablishMPPConnection requests before we construct the pipeline + } + Block readImpl() override { if (block_queue.empty()) diff --git a/dbms/src/Flash/Coprocessor/CoprocessorReader.h b/dbms/src/Flash/Coprocessor/CoprocessorReader.h index c076c1ded4a..e1e41b8fbab 100644 --- a/dbms/src/Flash/Coprocessor/CoprocessorReader.h +++ b/dbms/src/Flash/Coprocessor/CoprocessorReader.h @@ -88,12 +88,18 @@ class CoprocessorReader , resp_iter(std::move(tasks), cluster, concurrency, &Poco::Logger::get("pingcap/coprocessor")) , collected(false) , concurrency_(concurrency) + {} + + const DAGSchema & getOutputSchema() const { return schema; } + + // `open` will call the resp_iter's `open` to send coprocessor request. + void open() { resp_iter.open(); + opened = true; } - const DAGSchema & getOutputSchema() const { return schema; } - + // `cancel` will call the resp_iter's `cancel` to abort the data receiving and prevent the next retry. void cancel() { resp_iter.cancel(); } @@ -143,6 +149,8 @@ class CoprocessorReader // stream_id, decoder_ptr are only meaningful for ExchagneReceiver. 
CoprocessorReaderResult nextResult(std::queue & block_queue, const Block & header, size_t /*stream_id*/, std::unique_ptr & /*decoder_ptr*/) { + RUNTIME_CHECK(opened == true); + auto && [result, has_next] = resp_iter.next(); if (!result.error.empty()) return {nullptr, true, result.error.message(), false}; @@ -182,5 +190,6 @@ class CoprocessorReader bool collected = false; int concurrency_; + bool opened = false; }; } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGBlockOutputStream.cpp b/dbms/src/Flash/Coprocessor/DAGBlockOutputStream.cpp index b280a352626..8e8440c5e1b 100644 --- a/dbms/src/Flash/Coprocessor/DAGBlockOutputStream.cpp +++ b/dbms/src/Flash/Coprocessor/DAGBlockOutputStream.cpp @@ -35,7 +35,6 @@ void DAGBlockOutputStream::writeSuffix() { // todo error handle response_writer->flush(); - response_writer->finishWrite(); } } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGDriver.cpp b/dbms/src/Flash/Coprocessor/DAGDriver.cpp index 663b270cd21..cfde043963f 100644 --- a/dbms/src/Flash/Coprocessor/DAGDriver.cpp +++ b/dbms/src/Flash/Coprocessor/DAGDriver.cpp @@ -110,6 +110,11 @@ try dag_context); dag_output_stream = std::make_shared(streams.in->getHeader(), std::move(response_writer)); copyData(*streams.in, *dag_output_stream); + if (dag_context.collect_execution_summaries) + { + ExecutionSummaryCollector summary_collector(dag_context); + summary_collector.addExecuteSummaries(*dag_response); + } } else { diff --git a/dbms/src/Flash/Coprocessor/DAGResponseWriter.h b/dbms/src/Flash/Coprocessor/DAGResponseWriter.h index 077652fa51b..f6a87880c4a 100644 --- a/dbms/src/Flash/Coprocessor/DAGResponseWriter.h +++ b/dbms/src/Flash/Coprocessor/DAGResponseWriter.h @@ -30,7 +30,6 @@ class DAGResponseWriter virtual void write(const Block & block) = 0; /// flush cached blocks for batch writer virtual void flush() = 0; - virtual void finishWrite() = 0; virtual ~DAGResponseWriter() = default; const DAGContext & dagContext() const { return dag_context; } 
diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp index 0b02c0b8d65..dd347ca8929 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp @@ -60,12 +60,6 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( : (records_per_chunk - 1); } -template -void StreamingDAGResponseWriter::finishWrite() -{ - assert(0 == rows_in_blocks); -} - template void StreamingDAGResponseWriter::flush() { @@ -80,9 +74,9 @@ void StreamingDAGResponseWriter::write(const Block & block) block.columns() == dag_context.result_field_types.size(), "Output column size mismatch with field type size"); size_t rows = block.rows(); - rows_in_blocks += rows; if (rows > 0) { + rows_in_blocks += rows; blocks.push_back(block); } diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h index e26935a969b..def4f59d01b 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h @@ -37,7 +37,6 @@ class StreamingDAGResponseWriter : public DAGResponseWriter DAGContext & dag_context_); void write(const Block & block) override; void flush() override; - void finishWrite() override; private: void encodeThenWriteBlocks(); diff --git a/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.cpp index 2cb0eddd089..b245c353c49 100644 --- a/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.cpp @@ -71,19 +71,14 @@ void UnaryDAGResponseWriter::appendWarningsToDAGResponse() dag_response->set_warning_count(dag_context.getWarningCount()); } -void UnaryDAGResponseWriter::finishWrite() +void UnaryDAGResponseWriter::flush() { if (current_records_num > 0) { encodeChunkToDAGResponse(); } + // TODO separate from 
UnaryDAGResponseWriter and support mpp/batchCop. appendWarningsToDAGResponse(); - - if (dag_context.collect_execution_summaries) - { - ExecutionSummaryCollector summary_collector(dag_context); - summary_collector.addExecuteSummaries(*dag_response); - } } void UnaryDAGResponseWriter::write(const Block & block) diff --git a/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.h b/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.h index 7eb493ba3d6..b1b590d0179 100644 --- a/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.h +++ b/dbms/src/Flash/Coprocessor/UnaryDAGResponseWriter.h @@ -38,8 +38,7 @@ class UnaryDAGResponseWriter : public DAGResponseWriter DAGContext & dag_context_); void write(const Block & block) override; - void flush() override {} - void finishWrite() override; + void flush() override; void encodeChunkToDAGResponse(); void appendWarningsToDAGResponse(); diff --git a/dbms/src/Flash/Coprocessor/tests/gtest_streaming_writer.cpp b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_writer.cpp index 57cf544c9e3..5442cd02d29 100644 --- a/dbms/src/Flash/Coprocessor/tests/gtest_streaming_writer.cpp +++ b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_writer.cpp @@ -145,7 +145,6 @@ try for (const auto & block : blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 4. Start to check write_report. size_t expect_rows = block_rows * block_num; diff --git a/dbms/src/Flash/Coprocessor/tests/gtest_ti_remote_block_inputstream.cpp b/dbms/src/Flash/Coprocessor/tests/gtest_ti_remote_block_inputstream.cpp index 798aff6c842..1c01e5c36df 100644 --- a/dbms/src/Flash/Coprocessor/tests/gtest_ti_remote_block_inputstream.cpp +++ b/dbms/src/Flash/Coprocessor/tests/gtest_ti_remote_block_inputstream.cpp @@ -306,7 +306,6 @@ class TestTiRemoteBlockInputStream : public testing::Test for (const auto & block : source_blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 3. 
send execution summary writer->add_summary = true; @@ -333,7 +332,6 @@ class TestTiRemoteBlockInputStream : public testing::Test for (const auto & block : source_blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 3. send execution summary writer->add_summary = true; diff --git a/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.cpp b/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.cpp index 289511a35eb..9303b4ad8a4 100644 --- a/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.cpp +++ b/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.cpp @@ -33,12 +33,6 @@ BroadcastOrPassThroughWriter::BroadcastOrPassThroughWriter( chunk_codec_stream = std::make_unique()->newCodecStream(dag_context.result_field_types); } -template -void BroadcastOrPassThroughWriter::finishWrite() -{ - assert(0 == rows_in_blocks); -} - template void BroadcastOrPassThroughWriter::flush() { @@ -53,9 +47,9 @@ void BroadcastOrPassThroughWriter::write(const Block & block) block.columns() == dag_context.result_field_types.size(), "Output column size mismatch with field type size"); size_t rows = block.rows(); - rows_in_blocks += rows; if (rows > 0) { + rows_in_blocks += rows; blocks.push_back(block); } diff --git a/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.h b/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.h index 37ed2980db7..47c46307ee2 100644 --- a/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.h +++ b/dbms/src/Flash/Mpp/BroadcastOrPassThroughWriter.h @@ -32,7 +32,6 @@ class BroadcastOrPassThroughWriter : public DAGResponseWriter DAGContext & dag_context_); void write(const Block & block) override; void flush() override; - void finishWrite() override; private: void encodeThenWriteBlocks(); diff --git a/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.cpp b/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.cpp index 947bfbafae5..cc6dba5631e 100644 --- a/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.cpp +++ b/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.cpp @@ -44,12 +44,6 @@ 
FineGrainedShuffleWriter::FineGrainedShuffleWriter( chunk_codec_stream = std::make_unique()->newCodecStream(dag_context.result_field_types); } -template -void FineGrainedShuffleWriter::finishWrite() -{ - assert(0 == rows_in_blocks); -} - template void FineGrainedShuffleWriter::prepare(const Block & sample_block) { @@ -85,9 +79,9 @@ void FineGrainedShuffleWriter::write(const Block & block) "Output column size mismatch with field type size"); size_t rows = block.rows(); - rows_in_blocks += rows; if (rows > 0) { + rows_in_blocks += rows; blocks.push_back(block); } diff --git a/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.h b/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.h index 49a88a4653f..3c91518cea4 100644 --- a/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.h +++ b/dbms/src/Flash/Mpp/FineGrainedShuffleWriter.h @@ -36,7 +36,6 @@ class FineGrainedShuffleWriter : public DAGResponseWriter void prepare(const Block & sample_block) override; void write(const Block & block) override; void flush() override; - void finishWrite() override; private: void batchWriteFineGrainedShuffle(); diff --git a/dbms/src/Flash/Mpp/HashPartitionWriter.cpp b/dbms/src/Flash/Mpp/HashPartitionWriter.cpp index 692f05bea11..5f7286833f9 100644 --- a/dbms/src/Flash/Mpp/HashPartitionWriter.cpp +++ b/dbms/src/Flash/Mpp/HashPartitionWriter.cpp @@ -40,12 +40,6 @@ HashPartitionWriter::HashPartitionWriter( chunk_codec_stream = std::make_unique()->newCodecStream(dag_context.result_field_types); } -template -void HashPartitionWriter::finishWrite() -{ - assert(0 == rows_in_blocks); -} - template void HashPartitionWriter::flush() { @@ -60,9 +54,9 @@ void HashPartitionWriter::write(const Block & block) block.columns() == dag_context.result_field_types.size(), "Output column size mismatch with field type size"); size_t rows = block.rows(); - rows_in_blocks += rows; if (rows > 0) { + rows_in_blocks += rows; blocks.push_back(block); } diff --git a/dbms/src/Flash/Mpp/HashPartitionWriter.h 
b/dbms/src/Flash/Mpp/HashPartitionWriter.h index b0565a1b64a..eb82c54592e 100644 --- a/dbms/src/Flash/Mpp/HashPartitionWriter.h +++ b/dbms/src/Flash/Mpp/HashPartitionWriter.h @@ -34,7 +34,6 @@ class HashPartitionWriter : public DAGResponseWriter DAGContext & dag_context_); void write(const Block & block) override; void flush() override; - void finishWrite() override; private: void partitionAndEncodeThenWriteBlocks(); diff --git a/dbms/src/Flash/Mpp/tests/gtest_mpp_exchange_writer.cpp b/dbms/src/Flash/Mpp/tests/gtest_mpp_exchange_writer.cpp index 51b07ebdf1e..fa3456d96eb 100644 --- a/dbms/src/Flash/Mpp/tests/gtest_mpp_exchange_writer.cpp +++ b/dbms/src/Flash/Mpp/tests/gtest_mpp_exchange_writer.cpp @@ -172,7 +172,6 @@ try dag_writer->prepare(block.cloneEmpty()); dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 4. Start to check write_report. std::vector decoded_blocks; @@ -233,7 +232,6 @@ try for (const auto & block : blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 4. Start to check write_report. size_t per_part_rows = block_rows * block_num / part_num; @@ -294,7 +292,6 @@ try for (const auto & block : blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 4. Start to check write_report. size_t per_part_rows = block_rows * block_num / part_num; @@ -347,7 +344,6 @@ try for (const auto & block : blocks) dag_writer->write(block); dag_writer->flush(); - dag_writer->finishWrite(); // 4. Start to check write_report. 
size_t expect_rows = block_rows * block_num; diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 8a71a678ae4..6bfa35c9169 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -124,9 +124,9 @@ Int64 getIntFromField(Field & field) { switch (field.getType()) { - case Field::Types::Which::Int64: + case Field::Types::Int64: return field.safeGet(); - case Field::Types::Which::UInt64: + case Field::Types::UInt64: return field.safeGet(); default: throw Exception("Unexpected int type"); @@ -206,9 +206,6 @@ class ParamString { // Deep copy const_string_data = new char[str_ref.size]; - if (const_string_data == nullptr) - throw Exception("ParamString constructor get a nullptr"); - memcpy(const_string_data, str_ref.data, str_ref.size); const_string_data_size = str_ref.size; if constexpr (!is_const) @@ -1486,7 +1483,7 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase { case REGEXP_INSTR_MAX_PARAM_NUM: col_match_type = block.getByPosition(arguments[5]).column; - case REGEXP_INSTR_MAX_PARAM_NUM -1: + case REGEXP_INSTR_MAX_PARAM_NUM - 1: col_return_option = block.getByPosition(arguments[4]).column; case REGEXP_INSTR_MAX_PARAM_NUM - 2: col_occur = block.getByPosition(arguments[3]).column; @@ -1841,8 +1838,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase Int64 pos, Int64 occur) const { - bool success = regexp.substr(subject, subject_size, res_ref, pos, occur); - if (success) + if (regexp.substr(subject, subject_size, res_ref, pos, occur)) { col_res->insertData(res_ref.data, res_ref.size); null_map[idx] = 0; diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 5a1409da2c5..7a0368d0beb 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -1284,8 +1284,6 @@ int Server::main(const std::vector & /*args*/) main_config_reloader.reset(); users_config_reloader.reset(); - - DynamicThreadPool::global_instance.reset(); }); 
/// This object will periodically calculate some metrics. diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index cf053cc3a92..72770e79541 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -324,6 +324,7 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con auto compaction = std::make_shared(next_compaction_level, minor_compaction_version); auto & level = persisted_files_levels[next_compaction_level]; + next_compaction_level++; if (!level.empty()) { bool is_all_trivial_move = true; @@ -365,7 +366,6 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con if (!is_all_trivial_move) return compaction; } - next_compaction_level++; } return nullptr; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index d9ed87a74f9..43c5c2cd9b0 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -401,7 +401,10 @@ bool DeltaValueSpace::compact(DMContext & context) LOG_DEBUG(log, "Compact stop because structure got updated, delta={}", simpleInfo()); return false; } - + // Reset to 0 if the minor compaction succeed, + // and it may trigger another minor compaction if there is still too many column files. + // This process will stop when there is no more minor compaction to be done. 
+ last_try_compact_column_files.store(0); LOG_DEBUG(log, "{} delta={}", compaction_task->info(), info()); } wbs.writeRemoves(); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index a7640b3d874..9280f3f516a 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -901,10 +901,11 @@ BlockInputStreams DeltaMergeStore::readRaw(const Context & db_context, auto after_segment_read = [&](const DMContextPtr & dm_context_, const SegmentPtr & segment_) { this->checkSegmentUpdate(dm_context_, segment_, ThreadType::Read); }; - size_t final_num_stream = std::min(num_streams, tasks.size()); String req_info; if (db_context.getDAGContext() != nullptr && db_context.getDAGContext()->isMPPTask()) req_info = db_context.getDAGContext()->getMPPTaskId().toString(); + // We can use num_streams as parallelism when read thread is enabled. + size_t final_num_stream = enable_read_thread ? 
num_streams : std::min(num_streams, tasks.size()); auto read_task_pool = std::make_shared( physical_table_id, dm_context, @@ -919,21 +920,23 @@ BlockInputStreams DeltaMergeStore::readRaw(const Context & db_context, enable_read_thread); BlockInputStreams res; - for (size_t i = 0; i < final_num_stream; ++i) + if (enable_read_thread) { - BlockInputStreamPtr stream; - if (enable_read_thread) + for (size_t i = 0; i < final_num_stream; ++i) { - stream = std::make_shared( + res.emplace_back(std::make_shared( read_task_pool, columns_to_read, extra_table_id_index, physical_table_id, - req_info); + req_info)); } - else + } + else + { + for (size_t i = 0; i < final_num_stream; ++i) { - stream = std::make_shared( + res.emplace_back(std::make_shared( dm_context, read_task_pool, after_segment_read, @@ -944,9 +947,8 @@ BlockInputStreams DeltaMergeStore::readRaw(const Context & db_context, /* read_mode */ ReadMode::Raw, extra_table_id_index, physical_table_id, - req_info); + req_info)); } - res.push_back(stream); } return res; } @@ -987,7 +989,8 @@ BlockInputStreams DeltaMergeStore::read(const Context & db_context, }; GET_METRIC(tiflash_storage_read_tasks_count).Increment(tasks.size()); - size_t final_num_stream = std::max(1, std::min(num_streams, tasks.size())); + // We can use num_streams as parallelism when read thread is enabled. + size_t final_num_stream = enable_read_thread ? 
std::max(1, num_streams) : std::max(1, std::min(num_streams, tasks.size())); auto read_task_pool = std::make_shared( physical_table_id, dm_context, @@ -1002,21 +1005,23 @@ BlockInputStreams DeltaMergeStore::read(const Context & db_context, enable_read_thread); BlockInputStreams res; - for (size_t i = 0; i < final_num_stream; ++i) + if (enable_read_thread) { - BlockInputStreamPtr stream; - if (enable_read_thread) + for (size_t i = 0; i < final_num_stream; ++i) { - stream = std::make_shared( + res.emplace_back(std::make_shared( read_task_pool, columns_to_read, extra_table_id_index, physical_table_id, - log_tracing_id); + log_tracing_id)); } - else + } + else + { + for (size_t i = 0; i < final_num_stream; ++i) { - stream = std::make_shared( + res.emplace_back(std::make_shared( dm_context, read_task_pool, after_segment_read, @@ -1027,9 +1032,8 @@ BlockInputStreams DeltaMergeStore::read(const Context & db_context, /* read_mode = */ is_fast_scan ? ReadMode::Fast : ReadMode::Normal, extra_table_id_index, physical_table_id, - log_tracing_id); + log_tracing_id)); } - res.push_back(stream); } LOG_DEBUG(tracing_logger, "Read create stream done"); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp index bffd450b3ef..f21d572ee48 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -53,119 +54,6 @@ bool shouldCompactStableWithTooMuchDataOutOfSegmentRange(const DMContext & conte } namespace tests { -class SegmentFrameworkTest : public SegmentTestBasic -{ -}; - -TEST_F(SegmentFrameworkTest, PrepareWriteBlock) -try -{ - reloadWithOptions(SegmentTestOptions{.is_common_handle = false}); - - auto s1_id = splitSegmentAt(DELTA_MERGE_FIRST_SEGMENT_ID, 10); - ASSERT_TRUE(s1_id.has_value()); - auto s2_id = splitSegmentAt(*s1_id, 20); - ASSERT_TRUE(s2_id.has_value()); 
- - // s1 has range [10, 20) - { - auto [begin, end] = getSegmentKeyRange(*s1_id); - ASSERT_EQ(10, begin); - ASSERT_EQ(20, end); - } - - { - // write_rows == segment_rows, start_key not specified - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 10); - ASSERT_EQ(1, blocks.size()); - auto handle_column = blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({10, 11, 12, 13, 14, 15, 16, 17, 18, 19}), handle_data); - } - { - // write_rows > segment_rows, start_key not specified - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 13); - ASSERT_EQ(2, blocks.size()); - { - auto handle_column = blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({10, 11, 12, 13, 14, 15, 16, 17, 18, 19}), handle_data); - } - { - auto handle_column = blocks[1].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({10, 11, 12}), handle_data); - } - } - { - // start_key specified, end_key - start_key < write_rows - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 2, /* at */ 16); - ASSERT_EQ(1, blocks.size()); - const auto & handle_column = blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17}), handle_data); - } - { - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 4, /* at */ 16); - ASSERT_EQ(1, blocks.size()); - const auto & handle_column = blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17, 18, 19}), handle_data); - } - { - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 5, /* at */ 16); - ASSERT_EQ(2, blocks.size()); - { - const auto & handle_column = 
blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17, 18, 19}), handle_data); - } - { - const auto & handle_column = blocks[1].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16}), handle_data); - } - } - { - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 10, /* at */ 16); - ASSERT_EQ(3, blocks.size()); - { - const auto & handle_column = blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17, 18, 19}), handle_data); - } - { - const auto & handle_column = blocks[1].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17, 18, 19}), handle_data); - } - { - const auto & handle_column = blocks[2].getByName(EXTRA_HANDLE_COLUMN_NAME).column; - const auto & handle_data = typeid_cast &>(*handle_column).getData(); - ASSERT_EQ(PaddedPODArray({16, 17}), handle_data); - } - } - { - // write rows < segment rows, start key not specified, should choose a random start. - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 3); - ASSERT_EQ(1, blocks.size()); - ASSERT_EQ(3, blocks[0].rows()); - } - { - // Let's check whether the generated handles will be starting from 12, for at least once. - auto start_from_12 = 0; - for (size_t i = 0; i < 100; i++) - { - auto blocks = prepareWriteBlocksInSegmentRange(*s1_id, 3); - if (blocks[0].getByName(EXTRA_HANDLE_COLUMN_NAME).column->getInt(0) == 12) - start_from_12++; - } - ASSERT_TRUE(start_from_12 > 0); // We should hit at least 1 times in 100 iters. 
- ASSERT_TRUE(start_from_12 < 50); // We should not hit 50 times in 100 iters :) - } -} -CATCH - class SegmentOperationTest : public SegmentTestBasic { @@ -524,12 +412,11 @@ CATCH TEST_F(SegmentOperationTest, SegmentLogicalSplit) try { - { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); - } + reloadWithOptions( + {.db_settings = { + .dt_segment_stable_pack_rows = 100, + .dt_enable_logical_split = true, + }}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 400, /* at */ 0); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); @@ -560,12 +447,8 @@ CATCH TEST_F(SegmentOperationTest, Issue5570) try { - { - SegmentTestOptions options; - // a smaller pack rows for logical split - options.db_settings.dt_segment_stable_pack_rows = 100; - reloadWithOptions(options); - } + // a smaller pack rows for logical split + reloadWithOptions({.db_settings = {.dt_segment_stable_pack_rows = 100}}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 200); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); @@ -622,13 +505,12 @@ CATCH TEST_F(SegmentOperationTest, DeltaPagesAfterDeltaMerge) try { - { - SegmentTestOptions options; - // a smaller pack rows for logical split - options.db_settings.dt_segment_stable_pack_rows = 100; - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); - } + // a smaller pack rows for logical split + reloadWithOptions( + {.db_settings = { + .dt_segment_stable_pack_rows = 100, + .dt_enable_logical_split = true, + }}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); @@ -716,10 +598,11 @@ class SegmentEnableLogicalSplitTest : public SegmentOperationTest void SetUp() override { SegmentOperationTest::SetUp(); - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); + 
reloadWithOptions( + {.db_settings = { + .dt_segment_stable_pack_rows = 100, + .dt_enable_logical_split = true, + }}); ASSERT_TRUE(dm_context->enable_logical_split); } }; @@ -792,9 +675,7 @@ class SegmentSplitTest : public SegmentTestBasic TEST_F(SegmentSplitTest, AutoModePhycialSplitByDefault) try { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_segment_stable_pack_rows = 100}}); ASSERT_FALSE(dm_context->enable_logical_split); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 1000); @@ -811,11 +692,12 @@ CATCH TEST_F(SegmentSplitTest, PhysicalSplitMode) try { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; // Even if we explicitly set enable_logical_split, we will still do physical split in SplitMode::Physical. - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); + reloadWithOptions( + {.db_settings = { + .dt_segment_stable_pack_rows = 100, + .dt_enable_logical_split = true, + }}); ASSERT_TRUE(dm_context->enable_logical_split); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 1000); @@ -870,9 +752,7 @@ CATCH TEST_F(SegmentSplitTest, LogicalSplitModeDoesLogicalSplit) try { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_segment_stable_pack_rows = 100}}); // Logical split will be performed if we use logical split mode, even when enable_logical_split is false. 
ASSERT_FALSE(dm_context->enable_logical_split); @@ -912,9 +792,7 @@ CATCH TEST_F(SegmentSplitTest, LogicalSplitModeOnePackInStable) try { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_segment_stable_pack_rows = 100}}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 50); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); @@ -932,9 +810,7 @@ CATCH TEST_F(SegmentSplitTest, LogicalSplitModeOnePackWithHoleInStable) try { - SegmentTestOptions options; - options.db_settings.dt_segment_stable_pack_rows = 100; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_segment_stable_pack_rows = 100}}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 10, /* at */ 0); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 10, /* at */ 90); @@ -1008,9 +884,7 @@ CATCH TEST_F(SegmentSplitAtTest, AutoModeEnableLogicalSplit) try { - SegmentTestOptions options; - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_enable_logical_split = true}}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100, /* at */ 0); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); @@ -1047,9 +921,7 @@ CATCH TEST_F(SegmentSplitAtTest, PhysicalSplitMode) try { - SegmentTestOptions options; - options.db_settings.dt_enable_logical_split = true; - reloadWithOptions(options); + reloadWithOptions({.db_settings = {.dt_enable_logical_split = true}}); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100, /* at */ 0); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); @@ -1284,10 +1156,10 @@ class IsEmptyTest : public SegmentTestBasic TEST_F(IsEmptyTest, Basic) try { - auto fast_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptyFastPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + 
}); ASSERT_EQ(0, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); - ASSERT_EQ(fast_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath)); writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100, /* at */ 0); ASSERT_FALSE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); @@ -1347,17 +1219,17 @@ try // We will consider it to be empty after compaction. mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); - auto fast_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptyFastPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + }); ASSERT_EQ(0, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); - ASSERT_EQ(fast_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath)); // For empty segment, delete range will not cause it to be "not empty". deleteRangeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); - fast_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptyFastPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(DELTA_MERGE_FIRST_SEGMENT_ID)); + }); ASSERT_EQ(0, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); - ASSERT_EQ(fast_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath)); } CATCH @@ -1373,10 +1245,10 @@ try ASSERT_EQ(100, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); // This is the slow path, because ColumnFileInMemory exists for both left and right segments after logical split. 
- auto slow_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptySlowPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + }); ASSERT_EQ(0, getSegmentRowNum(*right_seg)); - ASSERT_EQ(slow_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath)); } CATCH @@ -1393,10 +1265,10 @@ try ASSERT_EQ(100, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); // This is the slow path, because ColumnFileTiny exists for both left and right segments after logical split. - auto slow_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptySlowPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + }); ASSERT_EQ(0, getSegmentRowNum(*right_seg)); - ASSERT_EQ(slow_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath)); } CATCH @@ -1414,10 +1286,10 @@ try ASSERT_EQ(100, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); // This goes into the fast path thanks to pack filter. - auto fast_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptyFastPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(*right_seg)); + }); ASSERT_EQ(0, getSegmentRowNum(*right_seg)); - ASSERT_EQ(fast_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptyFastPath)); } CATCH @@ -1439,10 +1311,10 @@ try ASSERT_EQ(100, getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID)); // This is the slow path, because pack filter will not work. 
- auto slow_count = ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath); - ASSERT_TRUE(isSegmentDefinitelyEmpty(*seg_2)); + ASSERT_PROFILE_EVENT(ProfileEvents::DMSegmentIsEmptySlowPath, +1, { + ASSERT_TRUE(isSegmentDefinitelyEmpty(*seg_2)); + }); ASSERT_EQ(0, getSegmentRowNum(*seg_2)); - ASSERT_EQ(slow_count + 1, ProfileEvents::get(ProfileEvents::DMSegmentIsEmptySlowPath)); ASSERT_FALSE(isSegmentDefinitelyEmpty(*seg_3)); ASSERT_EQ(42, getSegmentRowNum(*seg_3)); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 1e7c18af560..c5f9199f241 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -301,7 +301,34 @@ Block SegmentTestBasic::prepareWriteBlock(Int64 start_key, Int64 end_key, bool i is_deleted); } -std::vector SegmentTestBasic::prepareWriteBlocksInSegmentRange(PageId segment_id, UInt64 total_write_rows, std::optional write_start_key, bool is_deleted) +Block mergeBlocks(std::vector && blocks) +{ + auto accumulated_block = std::move(blocks[0]); + + for (size_t block_idx = 1; block_idx < blocks.size(); ++block_idx) + { + auto block = std::move(blocks[block_idx]); + + size_t columns = block.columns(); + size_t rows = block.rows(); + + for (size_t i = 0; i < columns; ++i) + { + MutableColumnPtr mutable_column = (*std::move(accumulated_block.getByPosition(i).column)).mutate(); + mutable_column->insertRangeFrom(*block.getByPosition(i).column, 0, rows); + accumulated_block.getByPosition(i).column = std::move(mutable_column); + } + } + + SortDescription sort; + sort.emplace_back(EXTRA_HANDLE_COLUMN_NAME, 1, 0); + sort.emplace_back(VERSION_COLUMN_NAME, 1, 0); + stableSortBlock(accumulated_block, sort); + + return accumulated_block; +} + +Block SegmentTestBasic::prepareWriteBlockInSegmentRange(PageId segment_id, UInt64 total_write_rows, std::optional write_start_key, bool 
is_deleted) { RUNTIME_CHECK(total_write_rows < std::numeric_limits::max()); @@ -364,7 +391,7 @@ std::vector SegmentTestBasic::prepareWriteBlocksInSegmentRange(PageId seg remaining_rows); } - return blocks; + return mergeBlocks(std::move(blocks)); } void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows, std::optional start_at) @@ -380,11 +407,8 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows, std::o auto [start_key, end_key] = getSegmentKeyRange(segment_id); LOG_DEBUG(logger, "write to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); - auto blocks = prepareWriteBlocksInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false); - for (const auto & block : blocks) - { - segment->write(*dm_context, block, false); - } + auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false); + segment->write(*dm_context, block, false); EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); operation_statistics["write"]++; @@ -399,46 +423,35 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r RUNTIME_CHECK(segments.find(segment_id) != segments.end()); - auto ingest_data = [&](SegmentPtr segment, const Block & block) { + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); + auto [start_key, end_key] = getSegmentKeyRange(segment_id); + LOG_DEBUG(logger, "ingest to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); + + { + auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false); WriteBatches ingest_wbs(dm_context->storage_pool, dm_context->getWriteLimiter()); auto delegator = storage_path_pool->getStableDiskDelegator(); auto parent_path = delegator.choosePath(); auto file_id = 
storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); auto input_stream = std::make_shared(block); DMFileBlockOutputStream::Flags flags; - auto dm_file = writeIntoNewDMFile( - *dm_context, - table_columns, - input_stream, - file_id, - parent_path, - flags); + auto dm_file = writeIntoNewDMFile(*dm_context, table_columns, input_stream, file_id, parent_path, flags); ingest_wbs.data.putExternal(file_id, /* tag */ 0); ingest_wbs.writeLogAndData(); delegator.addDTFile(file_id, dm_file->getBytesOnDisk(), parent_path); - { - WriteBatches wbs(dm_context->storage_pool, dm_context->getWriteLimiter()); - auto ref_id = storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); - wbs.data.putRefPage(ref_id, dm_file->pageId()); - auto ref_file = DMFile::restore(dm_context->db_context.getFileProvider(), file_id, ref_id, parent_path, DMFile::ReadMetaMode::all()); - wbs.writeLogAndData(); - auto column_file = std::make_shared(*dm_context, ref_file, segment->getRowKeyRange()); - ColumnFiles column_files; - column_files.push_back(column_file); - ASSERT_TRUE(segment->ingestColumnFiles(*dm_context, segment->getRowKeyRange(), column_files, /* clear_data_in_range */ true)); - } - ingest_wbs.rollbackWrittenLogAndData(); - }; - auto segment = segments[segment_id]; - size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); - auto [start_key, end_key] = getSegmentKeyRange(segment_id); - LOG_DEBUG(logger, "ingest to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); + WriteBatches wbs(dm_context->storage_pool, dm_context->getWriteLimiter()); + auto ref_id = storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); + wbs.data.putRefPage(ref_id, dm_file->pageId()); + auto ref_file = DMFile::restore(dm_context->db_context.getFileProvider(), file_id, ref_id, parent_path, DMFile::ReadMetaMode::all()); + wbs.writeLogAndData(); + auto column_file = std::make_shared(*dm_context, ref_file, 
segment->getRowKeyRange()); + ColumnFiles column_files; + column_files.push_back(column_file); + ASSERT_TRUE(segment->ingestColumnFiles(*dm_context, segment->getRowKeyRange(), column_files, /* clear_data_in_range */ true)); - auto blocks = prepareWriteBlocksInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false); - for (const auto & block : blocks) - { - ingest_data(segment, block); + ingest_wbs.rollbackWrittenLogAndData(); } EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); @@ -458,11 +471,8 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id, UInt64 wri auto [start_key, end_key] = getSegmentKeyRange(segment_id); LOG_DEBUG(logger, "write deleted pack to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); - auto blocks = prepareWriteBlocksInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ true); - for (const auto & block : blocks) - { - segment->write(*dm_context, block, false); - } + auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ true); + segment->write(*dm_context, block, false); EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); operation_statistics["writeDelete"]++; @@ -490,13 +500,7 @@ void SegmentTestBasic::replaceSegmentData(const std::vector & segments_i auto file_id = storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); auto input_stream = std::make_shared(block); - auto dm_file = writeIntoNewDMFile( - *dm_context, - table_columns, - input_stream, - file_id, - parent_path, - {}); + auto dm_file = writeIntoNewDMFile(*dm_context, table_columns, input_stream, file_id, parent_path, {}); ingest_wbs.data.putExternal(file_id, /* tag */ 0); ingest_wbs.writeLogAndData(); @@ -592,6 +596,110 @@ void SegmentTestBasic::printFinishedOperations() const LOG_INFO(logger, "======= End Finished Operations Statistics ======="); } 
+class SegmentFrameworkTest : public SegmentTestBasic +{ +}; + +TEST_F(SegmentFrameworkTest, PrepareWriteBlock) +try +{ + reloadWithOptions({.is_common_handle = false}); + + auto s1_id = splitSegmentAt(DELTA_MERGE_FIRST_SEGMENT_ID, 10); + ASSERT_TRUE(s1_id.has_value()); + auto s2_id = splitSegmentAt(*s1_id, 20); + ASSERT_TRUE(s2_id.has_value()); + + // s1 has range [10, 20) + { + auto [begin, end] = getSegmentKeyRange(*s1_id); + ASSERT_EQ(10, begin); + ASSERT_EQ(20, end); + } + + { + // write_rows == segment_rows, start_key not specified + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 10); + ASSERT_COLUMN_EQ( + block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({10, 11, 12, 13, 14, 15, 16, 17, 18, 19})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 1, 1, 1, 1, 1, 1, 1, 1, 1})); + } + { + // write_rows > segment_rows, start_key not specified + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 13); + ASSERT_COLUMN_EQ( + block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({10, 10, 11, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1})); + } + { + // start_key specified, end_key - start_key < write_rows + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 2, /* at */ 16); + ASSERT_COLUMN_EQ( + block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({16, 17})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 1})); + } + { + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 4, /* at */ 16); + ASSERT_COLUMN_EQ( + block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({16, 17, 18, 19})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 1, 1, 1})); + } + { + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 5, /* at */ 16); + ASSERT_COLUMN_EQ( + 
block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({16, 16, 17, 18, 19})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 2, 1, 1, 1})); + } + { + version = 0; + auto block = prepareWriteBlockInSegmentRange(*s1_id, 10, /* at */ 16); + ASSERT_COLUMN_EQ( + block.getByName(EXTRA_HANDLE_COLUMN_NAME), + createColumn({16, 16, 16, 17, 17, 17, 18, 18, 19, 19})); + ASSERT_COLUMN_EQ( + block.getByName(VERSION_COLUMN_NAME), + createColumn({1, 2, 3, 1, 2, 3, 1, 2, 1, 2})); + } + { + // write rows < segment rows, start key not specified, should choose a random start. + auto block = prepareWriteBlockInSegmentRange(*s1_id, 3); + ASSERT_EQ(3, block.rows()); + } + { + // Let's check whether the generated handles will be starting from 12, for at least once. + auto start_from_12 = 0; + for (size_t i = 0; i < 100; i++) + { + auto block = prepareWriteBlockInSegmentRange(*s1_id, 3); + if (block.getByName(EXTRA_HANDLE_COLUMN_NAME).column->getInt(0) == 12) + start_from_12++; + } + ASSERT_TRUE(start_from_12 > 0); // We should hit at least 1 times in 100 iters. 
+ ASSERT_TRUE(start_from_12 < 50); // We should not hit 50 times in 100 iters :) + } +} +CATCH + } // namespace tests } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h index da68b4e5adc..bc3e1da5cde 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h @@ -74,7 +74,7 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic void replaceSegmentData(const std::vector & segments_id, const Block & block); Block prepareWriteBlock(Int64 start_key, Int64 end_key, bool is_deleted = false); - std::vector prepareWriteBlocksInSegmentRange(PageId segment_id, UInt64 total_write_rows, std::optional write_start_key = std::nullopt, bool is_deleted = false); + Block prepareWriteBlockInSegmentRange(PageId segment_id, UInt64 total_write_rows, std::optional write_start_key = std::nullopt, bool is_deleted = false); size_t getSegmentRowNumWithoutMVCC(PageId segment_id); size_t getSegmentRowNum(PageId segment_id); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_randomized.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_randomized.cpp index 16ef0ee7287..eb2126f107b 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_randomized.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_randomized.cpp @@ -317,9 +317,7 @@ class SegmentRandomizedTest : public SegmentTestBasic TEST_F(SegmentRandomizedTest, FastCommonHandle) try { - SegmentTestOptions options; - options.is_common_handle = true; - reloadWithOptions(options); + reloadWithOptions({.is_common_handle = true}); run(/* n */ 500, /* min key */ -50000, /* max key */ 50000); } CATCH @@ -328,9 +326,7 @@ CATCH TEST_F(SegmentRandomizedTest, FastIntHandle) try { - SegmentTestOptions options; - options.is_common_handle = false; - reloadWithOptions(options); + 
reloadWithOptions({.is_common_handle = false}); run(/* n */ 500, /* min key */ -50000, /* max key */ 50000); } CATCH @@ -340,9 +336,7 @@ CATCH TEST_F(SegmentRandomizedTest, DISABLED_ForCI) try { - SegmentTestOptions options; - options.is_common_handle = true; - reloadWithOptions(options); + reloadWithOptions({.is_common_handle = true}); run(50000, /* min key */ -50000, /* max key */ 50000); } CATCH diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_sst_files_stream.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_sst_files_stream.cpp index 94949bdcf01..25c2a55dd59 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_sst_files_stream.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_sst_files_stream.cpp @@ -111,7 +111,7 @@ try ASSERT_EQ(blocks.size(), 1); auto block = blocks[0]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4})); } { auto blocks = prepareBlocks(1, 14, 3); @@ -119,27 +119,27 @@ try { auto block = blocks[0]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({1, 2, 3}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({1, 2, 3})); } { auto block = blocks[1]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({4, 5, 6}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({4, 5, 6})); } { auto block = blocks[2]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({7, 8, 9}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({7, 8, 9})); } { auto block = blocks[3]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({10, 11, 12}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({10, 11, 12})); } { auto block = blocks[4]; auto col = 
block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({13}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({13})); } } { @@ -148,17 +148,17 @@ try { auto block = blocks[0]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({1}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({1})); } { auto block = blocks[1]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({2}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({2})); } { auto block = blocks[2]; auto col = block.getByName(MutableSupport::tidb_pk_column_name); - ASSERT_COLUMN_EQ(col, createColumn({3}, "col")); + ASSERT_COLUMN_EQ(col, createColumn({3})); } } } @@ -430,4 +430,4 @@ CATCH } // namespace tests } // namespace DM -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/TestUtils/FunctionTestUtils.h b/dbms/src/TestUtils/FunctionTestUtils.h index a6d7049420c..37e0f9783ca 100644 --- a/dbms/src/TestUtils/FunctionTestUtils.h +++ b/dbms/src/TestUtils/FunctionTestUtils.h @@ -841,5 +841,15 @@ class FunctionTest : public ::testing::Test #define ASSERT_COLUMNS_EQ_R(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), true)) /// unrestrictly checking columns equality, only checking data set equality #define ASSERT_COLUMNS_EQ_UR(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), false)) + +/// Check the profile event change after the body. +#define ASSERT_PROFILE_EVENT(event, diff_expr, ...) 
\ + do \ + { \ + auto profile_event_count = ProfileEvents::get(event); \ + {__VA_ARGS__}; \ + ASSERT_EQ(profile_event_count diff_expr, ProfileEvents::get(event)); \ + } while (false); + } // namespace tests } // namespace DB diff --git a/dbms/src/TestUtils/InputStreamTestUtils.cpp b/dbms/src/TestUtils/InputStreamTestUtils.cpp index 1e08fdf88c0..be521f0ab3a 100644 --- a/dbms/src/TestUtils/InputStreamTestUtils.cpp +++ b/dbms/src/TestUtils/InputStreamTestUtils.cpp @@ -16,8 +16,10 @@ #include #include #include +#include #include #include +#include namespace DB { @@ -386,12 +388,14 @@ ::testing::AssertionResult UnorderedInputStreamVSBlockUnrestrictlyCompareColumns Block expect_block(columns); expect_block.checkNumberOfRows(); // check the input - // Blocks can be unordered when read-thread-pool enabled. - // So read all blocks and sort them by handle column or column at position 0. + // Blocks can be unordered when read-thread-pool or fast-scan or bitmap-filter(currently not supported) is enabled. + // Especially, when fast-scan or bitmap-filter is enabled, it's not just disorder between blocks, but also rows inside the block are unordered. + // So read all blocks and sort them by handle before the comparison. size_t num_rows_expect = expect_block.rows(); size_t num_rows_read = 0; std::vector blocks; stream->readPrefix(); + // Read all blocks. while (Block read_block = stream->read()) { num_rows_read += read_block.rows(); @@ -413,52 +417,67 @@ ::testing::AssertionResult UnorderedInputStreamVSBlockUnrestrictlyCompareColumns blocks.emplace_back(std::move(read_block)); } stream->readSuffix(); + Block blk; + // Sort rows by handle. + if (!blocks.empty()) + { + blk = blocks.front().cloneEmpty(); + // First, merge all blocks into one. 
+ auto mut_cols = blk.cloneEmptyColumns(); + for (const auto & b : blocks) + { + for (size_t i = 0; i < b.columns(); i++) + { + const auto & col = *b.getByPosition(i).column; + mut_cols[i]->insertRangeFrom(col, 0, col.size()); + } + } - auto cmp_blk = [](const Block & a, const Block & b) { - const auto & col_a = a.has(EXTRA_HANDLE_COLUMN_NAME) ? a.getByName(EXTRA_HANDLE_COLUMN_NAME) : a.getByPosition(0); - const auto & col_b = b.has(EXTRA_HANDLE_COLUMN_NAME) ? b.getByName(EXTRA_HANDLE_COLUMN_NAME) : b.getByPosition(0); - if (col_a.column->empty() || col_b.column->empty()) + // Sort all columns by handle. Assume position 0 is hanle column in these tests. + auto & handle_col = mut_cols[0]; + std::vector ids; + for (size_t i = 0; i < handle_col->size(); i++) { - return false; + ids.push_back(i); } - const auto & field_a = (*col_a.column)[0]; - const auto & field_b = (*col_b.column)[0]; - return field_a < field_b; - }; - std::sort(blocks.begin(), blocks.end(), cmp_blk); - - size_t start_offset = 0; - for (const auto & read_block : blocks) - { - for (size_t col_idx = 0; col_idx < colnames.size(); ++col_idx) + std::sort(ids.begin(), ids.end(), [&](size_t a, size_t b) { + return handle_col->getDataAt(a) < handle_col->getDataAt(b); + }); + auto sorted_cols = blk.cloneEmptyColumns(); + for (size_t pos = 0; pos < sorted_cols.size(); pos++) { - const auto & col_name = colnames[col_idx]; - // Copy the [start_offset, read_block.rows()) of `expect_block` - const auto & expect_full_col = expect_block.getByPosition(col_idx); - auto expect_col = expect_full_col.cloneEmpty(); - auto column_data = expect_col.type->createColumn(); - column_data->insertRangeFrom(*expect_full_col.column, start_offset, read_block.rows()); - expect_col.column = std::move(column_data); - - const auto & actual_col = read_block.getByName(col_name); - if (auto res = columnEqual(expect_col, actual_col); !res) + auto & sorted_col = sorted_cols[pos]; + ColumnPtr col = std::move(mut_cols[pos]); + for (auto id : 
ids) { - auto expect_expr = fmt::format("expect block: {}", getColumnsContent(expect_block.getColumnsWithTypeAndName(), start_offset, start_offset + read_block.rows())); - Block actual_block_to_cmp; - for (const auto & col_name : colnames) - actual_block_to_cmp.insert(read_block.getByName(col_name)); - auto actual_expr = fmt::format("actual block: {}", getColumnsContent(actual_block_to_cmp.getColumnsWithTypeAndName())); - return res << fmt::format("\n details: [column={}] [prev_nrows={}] [cur_nrows={}]:\n {}\n {}", col_name, start_offset, start_offset + read_block.rows(), expect_expr, actual_expr); + sorted_col->insertRangeFrom(*col, id, 1); } } + blk.setColumns(std::move(sorted_cols)); + } - start_offset += read_block.rows(); + for (size_t col_idx = 0; col_idx < colnames.size(); ++col_idx) + { + const auto & col_name = colnames[col_idx]; + const auto & expect_full_col = expect_block.getByPosition(col_idx); + const auto & actual_col = blk.getByName(col_name); + if (auto res = columnEqual(expect_full_col, actual_col); !res) + { + auto expect_expr = fmt::format("expect block: {}", getColumnsContent(expect_block.getColumnsWithTypeAndName(), 0, blk.rows())); + Block actual_block_to_cmp; + for (const auto & name : colnames) + { + actual_block_to_cmp.insert(blk.getByName(name)); + } + auto actual_expr = fmt::format("actual block: {}", getColumnsContent(actual_block_to_cmp.getColumnsWithTypeAndName())); + return res << fmt::format("\n details: [column={}] [prev_nrows={}] [cur_nrows={}]:\n {}\n {}", col_name, 0, blk.rows(), expect_expr, actual_expr); + } } if (num_rows_expect == num_rows_read) return ::testing::AssertionSuccess(); - // Less rows than expected + // Fewer rows than expected auto reason = fmt::format(R"r( ({}).read() return num of rows Which is: {} the num rows of ({}) diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index 9819d0827ea..9d69c8ec6d8 100644 --- a/metrics/grafana/tiflash_summary.json +++ 
b/metrics/grafana/tiflash_summary.json @@ -8202,7 +8202,7 @@ "y": 24 }, "hiddenSeries": false, - "id": 159, + "id": 163, "legend": { "alignAsTable": false, "avg": false, @@ -8427,7 +8427,7 @@ "y": 32 }, "hiddenSeries": false, - "id": 161, + "id": 164, "legend": { "alignAsTable": true, "avg": false, diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index e4315f9b1fb..89df50a8c56 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -99,10 +99,10 @@ mysql> drop table if exists test.t; mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); mysql> alter table test.t set tiflash replica 1; func> wait_table test t -mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "1", pos, occur, 1, match_type) as res from test.t; -mysql> set tidb_enforce_mpp=1; select regexp_instr("1", "", pos, occur, 1, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_instr(_utf8mb4'1', _utf8mb4'1', pos, occur, 1, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_instr(_utf8mb4'1', _utf8mb4'', pos, occur, 1, match_type) as res from test.t; -mysql> insert into test.t values ('123', '12.', 1, 1, 0, ''), ('aBb', "bb", 1, 1, 0, 'i'), ('ab\nabc', '^abc$', 1, 1, 0, 'm'); +mysql> insert into test.t values (_utf8mb4'123', _utf8mb4'12.', 1, 1, 0, _utf8mb4''), (_utf8mb4'aBb', _utf8mb4'bb', 1, 1, 0, _utf8mb4'i'), (_utf8mb4'ab\nabc', _utf8mb4'^abc$', 1, 1, 0, _utf8mb4'm'); mysql> alter table test.t set tiflash replica 1; func> wait_table test t mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match_type) as res from test.t; From b8a3474220a3ff2ba161edf54fedd903abcdf06d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 29 Nov 2022 16:03:17 +0800 Subject: [PATCH 81/87] resolve comments --- dbms/src/Common/OptimizedRegularExpression.h | 7 +- 
.../Common/OptimizedRegularExpression.inl.h | 34 ++-- dbms/src/Functions/FunctionsRegexp.h | 141 +++++++------ dbms/src/Functions/tests/gtest_regexp.cpp | 188 +++++++++--------- 4 files changed, 190 insertions(+), 180 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 997233add8a..83b03aa3851 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #if USE_RE2_ST @@ -114,14 +115,14 @@ class OptimizedRegularExpressionImpl } Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op); - bool substr(const char * subject, size_t subject_size, StringRef & res, Int64 pos, Int64 occur); + std::optional substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur); private: Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op); - bool processSubstrEmptyStringExpr(const char * expr, size_t expr_size, StringRef & res, size_t byte_pos, Int64 occur); - bool substrImpl(const char * subject, size_t subject_size, StringRef & res, Int64 byte_pos, Int64 occur); + std::optional processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur); + std::optional substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur); bool is_trivial; bool required_substring_is_prefix; diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 3f9da675772..5835eb276aa 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -17,11 +17,12 @@ #include #include #include +#include #include #include -#include #include +#include #define 
MIN_LENGTH_FOR_STRSTR 3 #define MAX_SUBPATTERNS 5 @@ -485,19 +486,17 @@ Int64 OptimizedRegularExpressionImpl::processInstrEmptyStringExpr(c } template -bool OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, StringRef & res, size_t byte_pos, Int64 occur) +std::optional OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur) { if (occur != 1 || byte_pos != 1) - return false; - + return std::optional(); + StringPieceType expr_sp(expr, expr_size); StringPieceType matched_str; if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) - return false; + return std::optional(); - res.data = matched_str.data(); - res.size = matched_str.size(); - return true; + return std::optional(StringRef(matched_str.data(), matched_str.size())); } static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op) @@ -539,10 +538,10 @@ Int64 OptimizedRegularExpressionImpl::instrImpl(const char * subjec } template -bool OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, StringRef & res, Int64 byte_pos, Int64 occur) +std::optional OptimizedRegularExpressionImpl::substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur) { size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8 - const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched + const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched size_t expr_size = subject_size - byte_offset; StringPieceType expr_sp(expr, expr_size); @@ -550,20 +549,19 @@ bool OptimizedRegularExpressionImpl::substrImpl(const char * subjec while (occur > 0) { if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) - return false; + return std::optional(); --occur; } - res.data = matched_str.data(); - res.size = 
matched_str.size(); - return true; + return std::optional(StringRef(matched_str.data(), matched_str.size())); } template Int64 OptimizedRegularExpressionImpl::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op) { - Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size);; + Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size); + ; checkInstrArgs(utf8_total_len, subject_size, pos, ret_op); makeOccurValid(occur); @@ -575,17 +573,17 @@ Int64 OptimizedRegularExpressionImpl::instr(const char * subject, s } template -bool OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, StringRef & res, Int64 pos, Int64 occur) +std::optional OptimizedRegularExpressionImpl::substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur) { Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast(subject), subject_size); checkSubstrArgs(utf8_total_len, subject_size, pos); makeOccurValid(occur); if (unlikely(subject_size == 0)) - return processSubstrEmptyStringExpr(subject, subject_size, res, pos, occur); + return processSubstrEmptyStringExpr(subject, subject_size, pos, occur); size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast(subject), pos); - return substrImpl(subject, subject_size, res, byte_pos, occur); + return substrImpl(subject, subject_size, byte_pos, occur); } #undef MIN_LENGTH_FOR_STRSTR diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 6bfa35c9169..63f2564ff0e 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -179,6 +179,19 @@ GetIntFuncPointerType getGetIntFuncPointer(IntType int_type) } } +// We need to fill something into StringColumn when all elements are null +inline void fillColumnStringWhenAllNull(decltype(ColumnString::create()) & col_res, size_t size) +{ + auto & col_res_data = col_res->getChars(); + auto & 
col_res_offsets = col_res->getOffsets(); + size_t offset = 0; + for (size_t i = 0; i < size; ++i) + { + col_res_data[offset++] = 0; + col_res_offsets[i] = offset; + } +} + template class ParamString { @@ -1516,52 +1529,52 @@ class FunctionStringRegexpInstr : public FunctionStringRegexpBase #undef GET_MATCH_TYPE_ACTUAL_PARAM #undef EXECUTE_REGEXP_INSTR -#define EXECUTE_REGEXP_SUBSTR() \ - do \ - { \ +#define EXECUTE_REGEXP_SUBSTR() \ + do \ + { \ REGEXP_CLASS_MEM_FUNC_IMPL_NAME(RES_ARG_VAR_NAME, *(EXPR_PARAM_PTR_VAR_NAME), *(PAT_PARAM_PTR_VAR_NAME), *(POS_PARAM_PTR_VAR_NAME), *(OCCUR_PARAM_PTR_VAR_NAME), *(MATCH_TYPE_PARAM_PTR_VAR_NAME)); \ } while (0); // Method to get actual match type param -#define GET_MATCH_TYPE_ACTUAL_PARAM() \ - do \ - { \ +#define GET_MATCH_TYPE_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(MATCH_TYPE_PV_VAR_NAME, MATCH_TYPE_PARAM_PTR_VAR_NAME, ({EXECUTE_REGEXP_SUBSTR()})) \ } while (0); // Method to get actual occur param -#define GET_OCCUR_ACTUAL_PARAM() \ - do \ - { \ +#define GET_OCCUR_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_INT_PARAM(OCCUR_PV_VAR_NAME, OCCUR_PARAM_PTR_VAR_NAME, ({GET_MATCH_TYPE_ACTUAL_PARAM()})) \ } while (0); // Method to get actual position param -#define GET_POS_ACTUAL_PARAM() \ - do \ - { \ +#define GET_POS_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_INT_PARAM(POS_PV_VAR_NAME, POS_PARAM_PTR_VAR_NAME, ({GET_OCCUR_ACTUAL_PARAM()})) \ } while (0); // Method to get actual pattern param -#define GET_PAT_ACTUAL_PARAM() \ - do \ - { \ +#define GET_PAT_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(PAT_PV_VAR_NAME, PAT_PARAM_PTR_VAR_NAME, ({GET_POS_ACTUAL_PARAM()})) \ } while (0); // Method to get actual expression param -#define GET_EXPR_ACTUAL_PARAM() \ - do \ - { \ +#define GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ GET_ACTUAL_STRING_PARAM(EXPR_PV_VAR_NAME, EXPR_PARAM_PTR_VAR_NAME, ({GET_PAT_ACTUAL_PARAM()})) \ } while (0); // The entry to get actual params and execute regexp functions #define 
GET_ACTUAL_PARAMS_AND_EXECUTE() \ - do \ - { \ - GET_EXPR_ACTUAL_PARAM() \ + do \ + { \ + GET_EXPR_ACTUAL_PARAM() \ } while (0); // Implementation of regexp_substr function @@ -1616,8 +1629,8 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase GetIntFuncPointerType get_occur_func = getGetIntFuncPointer(occur_param.getIntType()); // Container will not be used when parm is const - const void * pos_container = pos_param.getContainer(); - const void * occur_container = occur_param.getContainer(); + const void * pos_container = pos_param.getContainer(); + const void * occur_container = occur_param.getContainer(); // Const value will not be used when param is not const Int64 pos_const_val = PosT::isConst() ? pos_param.template getInt(0) : -1; @@ -1642,10 +1655,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase pat = fmt::format("({})", pat); Regexps::Regexp regexp(addMatchTypeForPattern(pat, match_type, collator), flags); - StringRef res_ref; - bool success = regexp.substr(expr.c_str(), expr.size(), res_ref, pos_const_val, occur_const_val); - if (success) - res_arg.column = res_arg.type->createColumnConst(col_size, toField(String(res_ref))); + auto res = regexp.substr(expr.c_str(), expr.size(), pos_const_val, occur_const_val); + if (res) + res_arg.column = res_arg.type->createColumnConst(col_size, toField(res.value().toString())); else res_arg.column = res_arg.type->createColumnConst(col_size, Null()); return; @@ -1653,24 +1665,25 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase // Initialize result column auto col_res = ColumnString::create(); + col_res->reserve(col_size * 15); constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || MatchTypeT::isNullableCol(); -#define GET_POS_VALUE(idx) \ - do \ - { \ - if constexpr (PosT::isConst()) \ - pos = pos_const_val; \ - else \ +#define GET_POS_VALUE(idx) \ + do \ + { \ + if constexpr 
(PosT::isConst()) \ + pos = pos_const_val; \ + else \ pos = get_pos_func(pos_container, idx); \ } while (0); -#define GET_OCCUR_VALUE(idx) \ - do \ - { \ - if constexpr (OccurT::isConst()) \ - occur = occur_const_val; \ - else \ +#define GET_OCCUR_VALUE(idx) \ + do \ + { \ + if constexpr (OccurT::isConst()) \ + occur = occur_const_val; \ + else \ occur = get_occur_func(occur_container, idx); \ } while (0); @@ -1679,10 +1692,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase Int64 pos; Int64 occur; String match_type; - StringRef res_ref; - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & null_map = nullmap_col->getData(); + auto null_map_col = ColumnUInt8::create(); + typename ColumnUInt8::Container & null_map = null_map_col->getData(); null_map.resize(col_size); // Start to execute instr @@ -1694,10 +1706,9 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase regexp = memorize(pat_param, match_type_param, collator); if (regexp == nullptr) { - auto nullmap_col = ColumnUInt8::create(); - typename ColumnUInt8::Container & nullmap = nullmap_col->getData(); - nullmap.resize(col_size, 1); - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + null_map.resize(col_size, 1); + fillColumnStringWhenAllNull(col_res, col_size); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(null_map_col)); return; } } @@ -1717,7 +1728,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase GET_POS_VALUE(i) GET_OCCUR_VALUE(i) - executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); + executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); } } else @@ -1728,7 +1739,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase GET_POS_VALUE(i) GET_OCCUR_VALUE(i) - executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, 
occur); + executeAndSetResult(*regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); } } } @@ -1756,7 +1767,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase pat = fmt::format("({})", pat); auto regexp = createRegexpWithMatchType(pat, match_type, collator); - executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); } } else @@ -1774,13 +1785,13 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase pat = fmt::format("({})", pat); auto regexp = createRegexpWithMatchType(pat, match_type, collator); - executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, res_ref, pos, occur); + executeAndSetResult(regexp, col_res, null_map, i, expr_ref.data, expr_ref.size, pos, occur); } } } #undef GET_OCCUR_VALUE #undef GET_POS_VALUE - res_arg.column = ColumnNullable::create(std::move(col_res), std::move(nullmap_col)); + res_arg.column = ColumnNullable::create(std::move(col_res), std::move(null_map_col)); } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override @@ -1805,13 +1816,13 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase ColumnPtr col_match_type; // Go through cases to get arguments - switch(arg_num) + switch (arg_num) { case REGEXP_SUBSTR_MAX_PARAM_NUM: col_match_type = block.getByPosition(arguments[4]).column; - case REGEXP_MIN_PARAM_NUM + 2: + case REGEXP_SUBSTR_MAX_PARAM_NUM - 1: col_occur = block.getByPosition(arguments[3]).column; - case REGEXP_MIN_PARAM_NUM + 1: + case REGEXP_SUBSTR_MAX_PARAM_NUM - 2: col_pos = block.getByPosition(arguments[2]).column; }; @@ -1828,19 +1839,19 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase private: void executeAndSetResult( - Regexps::Regexp & regexp, - ColumnString::MutablePtr & col_res, - typename ColumnUInt8::Container & null_map, - 
size_t idx, - const char * subject, - size_t subject_size, - StringRef & res_ref, - Int64 pos, - Int64 occur) const + Regexps::Regexp & regexp, + ColumnString::MutablePtr & col_res, + typename ColumnUInt8::Container & null_map, + size_t idx, + const char * subject, + size_t subject_size, + Int64 pos, + Int64 occur) const { - if (regexp.substr(subject, subject_size, res_ref, pos, occur)) + auto res = regexp.substr(subject, subject_size, pos, occur); + if (res) { - col_res->insertData(res_ref.data, res_ref.size); + col_res->insertData(res.value().data, res.value().size); null_map[idx] = 0; } else diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index f803c901978..c0bf746a3ba 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2210,7 +2210,7 @@ TEST_F(Regexp, testRegexpCustomerCases) namespace { -template +template std::vector getResultVec(const std::vector & test_cases) { std::vector vecs; @@ -2696,7 +2696,7 @@ TEST_F(Regexp, RegexpInstr) createColumn({}), createColumn({}), createColumn({}))); - + ASSERT_COLUMN_EQ(createColumn({}), executeFunction( "regexp_instr", @@ -2736,7 +2736,7 @@ struct RegexpSubstrCase , position(pos) , occurrence(occur) , match_type(mt) - {} + {} RegexpSubstrCase(const String & res, const std::vector & null_map_, const String & expr, const String & pat, Int64 pos = 1, Int64 occur = 1, const String & mt = "") : result(res) @@ -2746,7 +2746,7 @@ struct RegexpSubstrCase , position(pos) , occurrence(occur) , match_type(mt) - {} + {} static void setVecsWithoutNullMap(int param_num, const std::vector test_cases, std::vector & results, std::vector & exprs, std::vector & pats, std::vector & positions, std::vector & occurs, std::vector & match_types) { @@ -2808,31 +2808,31 @@ TEST_F(Regexp, RegexpSubstr) for (size_t row_size = 1; row_size < 3; ++row_size) { ASSERT_COLUMN_EQ(createConstColumn>(row_size, "123"), - executeFunction( - "regexp_substr", 
- createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, "12"), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "11212"), - createConstColumn(row_size, "12"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "11212"), + createConstColumn(row_size, "12"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, "ab"), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "aabab"), - createConstColumn(row_size, "aB"), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn(row_size, "i"))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "aabab"), + createConstColumn(row_size, "aB"), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn(row_size, "i"))); } } @@ -2841,40 +2841,40 @@ TEST_F(Regexp, RegexpSubstr) for (size_t row_size = 1; row_size < 3; ++row_size) { ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn>(row_size, {}), - createConstColumn(row_size, "123"))); - + executeFunction( + "regexp_substr", + createConstColumn>(row_size, {}), + createConstColumn(row_size, "123"))); + ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "123"), - 
createConstColumn>(row_size, {}))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); ASSERT_COLUMN_EQ(createConstColumn>(row_size, {}), - executeFunction( - "regexp_substr", - createConstColumn(row_size, "123"), - createConstColumn(row_size, "12."), - createConstColumn(row_size, 2), - createConstColumn(row_size, 2), - createConstColumn>(row_size, {}))); + executeFunction( + "regexp_substr", + createConstColumn(row_size, "123"), + createConstColumn(row_size, "12."), + createConstColumn(row_size, 2), + createConstColumn(row_size, 2), + createConstColumn>(row_size, {}))); } } @@ -2899,10 +2899,10 @@ TEST_F(Regexp, RegexpSubstr) {"跑", "pp跑ppのaaa", "(跑|の|P)"}}; RegexpSubstrCase::setVecsWithoutNullMap(2, test_cases, results, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 0, 0, 1, 0, 1, 0}), - executeFunction( - "regexp_substr", - createColumn(exprs), - createColumn(patterns))); + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns))); // test regexp_substr(vector, vector, vector) test_cases = {{"tifl", "ttttifl", "tifl", 3}, @@ -2968,10 +2968,10 @@ 
TEST_F(Regexp, RegexpSubstr) executeFunction( "regexp_substr", {createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createColumn(match_types)}, + createColumn(patterns), + createColumn(positions), + createColumn(occurs), + createColumn(match_types)}, utf8mb4_general_ci_collator)); } @@ -2982,56 +2982,56 @@ TEST_F(Regexp, RegexpSubstr) {"tidb", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), - executeFunction( - "regexp_substr", - createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), - createColumn(patterns))); + executeFunction( + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns))); // test regexp_substr(vector, nullable vector) test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl"}, {"", {{0, 1, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)"}}; RegexpSubstrCase::setVecsWithNullMap(2, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]), - executeFunction( - "regexp_substr", - createColumn(exprs), - createNullableVectorColumn(patterns, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]))); + executeFunction( + "regexp_substr", + createColumn(exprs), + createNullableVectorColumn(patterns, null_maps[RegexpSubstrCase::PAT_NULL_MAP_IDX]))); // test regexp_substr(vector, vector, nullable vector) test_cases = {{"tifl", {{0, 0, 0, 0, 0}}, "ttttifl", "tifl", 3}, {"", {{0, 0, 1, 0, 0}}, "ttttifl", "tifl", 3}}; RegexpSubstrCase::setVecsWithNullMap(3, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, 
null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), - executeFunction( - "regexp_substr", - createColumn(exprs), - createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]))); + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]))); // test regexp_substr(vector, vector, vector, nullable vector) test_cases = {{"tikv", {{0, 0, 0, 0, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}, {"", {{0, 0, 0, 1, 0}}, "tidb_tikv", "ti(db|kv)", 1, 2}}; RegexpSubstrCase::setVecsWithNullMap(4, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]), - executeFunction( - "regexp_substr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createNullableVectorColumn(occurs, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]))); + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + createNullableVectorColumn(occurs, null_maps[RegexpSubstrCase::OCCUR_NULL_MAP_IDX]))); // test regexp_substr(vector, vector, vector, vector, nullable vector) test_cases = {{"b", {{0, 0, 0, 0, 0}}, "b", "B", 1, 1, "i"}, {"", {{0, 0, 0, 0, 1}}, "b", "B", 1, 1, "i"}}; RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]), - executeFunction( - "regexp_substr", - createColumn(exprs), - createColumn(patterns), - createColumn(positions), - createColumn(occurs), - createNullableVectorColumn(match_types, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]))); + executeFunction( + "regexp_substr", + createColumn(exprs), + createColumn(patterns), + createColumn(positions), + 
createColumn(occurs), + createNullableVectorColumn(match_types, null_maps[RegexpSubstrCase::MATCH_TYPE_NULL_MAP_IDX]))); } // Test: const, nullable and pure vector columns appear together @@ -3043,13 +3043,13 @@ TEST_F(Regexp, RegexpSubstr) {"", {{1, 0, 1, 0, 0}}, "tidb_tikv", "ti(db|Kv)", 1, 1, "i"}}; RegexpSubstrCase::setVecsWithNullMap(5, test_cases, results, null_maps, exprs, patterns, positions, occurs, match_types); ASSERT_COLUMN_EQ(createNullableVectorColumn(results, {0, 1, 1, 1}), - executeFunction( - "regexp_substr", - createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), - createColumn(patterns), - createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), - createColumn(occurs), - createColumn(match_types))); + executeFunction( + "regexp_substr", + createNullableVectorColumn(exprs, null_maps[RegexpSubstrCase::EXPR_NULL_MAP_IDX]), + createColumn(patterns), + createNullableVectorColumn(positions, null_maps[RegexpSubstrCase::POS_NULL_MAP_IDX]), + createColumn(occurs), + createColumn(match_types))); } // Test: Invalid parameter handling From 4224a1d13d86e6c17eef4c2f5e59dfd77c87b430 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 30 Nov 2022 10:01:37 +0800 Subject: [PATCH 82/87] add some tests --- dbms/src/Functions/tests/gtest_regexp.cpp | 31 ++++++++++++++++++++++- tests/fullstack-test/expr/regexp.test | 22 ++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index c0bf746a3ba..42411b83f05 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2800,7 +2800,6 @@ struct RegexpSubstrCase String match_type; }; -// TODO add empty column test TEST_F(Regexp, RegexpSubstr) { // Test: All columns are const @@ -3052,6 +3051,36 @@ TEST_F(Regexp, RegexpSubstr) createColumn(match_types))); } + // Test: empty column tests + { + 
ASSERT_COLUMN_EQ(createConstColumn>(0, ""), + executeFunction( + "regexp_substr", + createConstColumn(0, "m"), + createConstColumn(0, "m"), + createConstColumn(0, 1), + createConstColumn(0, 1), + createConstColumn(0, "m"))); + + ASSERT_COLUMN_EQ(createColumn>({}), + executeFunction( + "regexp_substr", + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}), + createColumn({}))); + + ASSERT_COLUMN_EQ(createColumn>({}), + executeFunction( + "regexp_substr", + createColumn({}), + createColumn({}), + createConstColumn(0, 1), + createColumn({}), + createConstColumn(0, ""))); + } + // Test: Invalid parameter handling { // test empty pattern diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index 89df50a8c56..ad41cc8ec49 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+# test regexp and regexp_like mysql> drop table if exists test.t mysql> create table test.t (data varchar(30), data_not_null varchar(30) not null, pattern varchar(30), pattern_not_null varchar(30) not null); mysql> insert into test.t values ('aaaa', 'AAAA', '^a.*', '^A.*'), ('abcd', 'abcd', null, '^a..d$'), (null, 'bbb', 'bb$', 'bb$'),('中文测试','中文测试','中文','^....$'),('中English混合','中English混合','^中English','^..nglish..$'); @@ -95,6 +96,7 @@ mysql> set tidb_enforce_mpp=1; select regexp_like(data, pattern, match_type) as | 1 | +------+ +# test regexp_instr mysql> drop table if exists test.t; mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, ret_op int, match_type varchar(30)); mysql> alter table test.t set tiflash replica 1; @@ -113,3 +115,23 @@ mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match | 2 | | 4 | +------+ + +# test regexp_substr +mysql> drop table if exists test.t; +mysql> create table test.t (expr varchar(30), pattern varchar(30), pos int, occur int, match_type varchar(30)); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'1', pos, occur, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'', pos, occur, match_type) as res from test.t; + +mysql> insert into test.t values (_utf8mb4'123', _utf8mb4'12.', 1, 1, 0, _utf8mb4''), (_utf8mb4'aBb', _utf8mb4'bb', 1, 1, _utf8mb4'i'), (_utf8mb4'ab\nabc', _utf8mb4'^abc$', 1, 1, _utf8mb4'm'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match_type) as res from test.t; ++------+ +| res | ++------+ +| 1 | +| 2 | +| 4 | ++------+ From 524c8d8020a72d5cb19c8ae10e957ccad7e97075 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 30 Nov 2022 10:26:42 +0800 Subject: [PATCH 83/87] resolve comments --- 
dbms/src/Common/OptimizedRegularExpression.h | 1 + dbms/src/Common/OptimizedRegularExpression.inl.h | 6 +++--- dbms/src/Functions/FunctionsRegexp.h | 12 +++++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 83b03aa3851..18f14cb6121 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -16,6 +16,7 @@ #include #include +#include #include #include diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h index 5835eb276aa..a73e78562b5 100644 --- a/dbms/src/Common/OptimizedRegularExpression.inl.h +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -489,12 +489,12 @@ template std::optional OptimizedRegularExpressionImpl::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur) { if (occur != 1 || byte_pos != 1) - return std::optional(); + return std::nullopt; StringPieceType expr_sp(expr, expr_size); StringPieceType matched_str; if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) - return std::optional(); + return std::nullopt; return std::optional(StringRef(matched_str.data(), matched_str.size())); } @@ -549,7 +549,7 @@ std::optional OptimizedRegularExpressionImpl::substrImpl while (occur > 0) { if (!RegexType::FindAndConsume(&expr_sp, *re2, &matched_str)) - return std::optional(); + return std::nullopt; --occur; } diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 63f2564ff0e..cfa3855f140 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -184,6 +184,9 @@ inline void fillColumnStringWhenAllNull(decltype(ColumnString::create()) & col_r { auto & col_res_data = col_res->getChars(); auto & col_res_offsets = col_res->getOffsets(); + col_res_data.resize(size); + col_res_offsets.resize(size); + size_t 
offset = 0; for (size_t i = 0; i < size; ++i) { @@ -1665,7 +1668,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase // Initialize result column auto col_res = ColumnString::create(); - col_res->reserve(col_size * 15); + col_res->reserve(col_size); constexpr bool has_nullable_col = ExprT::isNullableCol() || PatT::isNullableCol() || PosT::isNullableCol() || OccurT::isNullableCol() || MatchTypeT::isNullableCol(); @@ -1695,7 +1698,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase auto null_map_col = ColumnUInt8::create(); typename ColumnUInt8::Container & null_map = null_map_col->getData(); - null_map.resize(col_size); + null_map.resize(col_size, 1); // Start to execute instr if (canMemorize()) @@ -1706,7 +1709,6 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase regexp = memorize(pat_param, match_type_param, collator); if (regexp == nullptr) { - null_map.resize(col_size, 1); fillColumnStringWhenAllNull(col_res, col_size); res_arg.column = ColumnNullable::create(std::move(col_res), std::move(null_map_col)); return; @@ -1719,7 +1721,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase { if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) { - null_map[i] = 1; + // null_map has been set to 1 in the previous col_res->insertData("", 0); continue; } @@ -1751,7 +1753,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase { if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) { - null_map[i] = 1; + // null_map has been set to 1 in the previous col_res->insertData("", 0); continue; } From 2c471df64bcc360209dce40cff8ac2437f3d7d7d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 30 Nov 2022 10:39:32 +0800 Subject: [PATCH 84/87] tweaking --- dbms/src/Functions/FunctionsRegexp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index cfa3855f140..21af57c02f6 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1700,7 +1700,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase typename ColumnUInt8::Container & null_map = null_map_col->getData(); null_map.resize(col_size, 1); - // Start to execute instr + // Start to execute substr if (canMemorize()) { std::unique_ptr regexp; From 28a184197bbb9fb78ad44a0ffa97031d9201efb3 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 30 Nov 2022 11:51:11 +0800 Subject: [PATCH 85/87] fix ft --- tests/fullstack-test/expr/regexp.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/fullstack-test/expr/regexp.test b/tests/fullstack-test/expr/regexp.test index ad41cc8ec49..e4f8e002e15 100644 --- a/tests/fullstack-test/expr/regexp.test +++ b/tests/fullstack-test/expr/regexp.test @@ -124,14 +124,14 @@ func> wait_table test t mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'1', pos, occur, match_type) as res from test.t; mysql> set tidb_enforce_mpp=1; select regexp_substr(_utf8mb4'1', _utf8mb4'', pos, occur, match_type) as res from test.t; -mysql> insert into test.t values (_utf8mb4'123', _utf8mb4'12.', 1, 1, 0, _utf8mb4''), (_utf8mb4'aBb', _utf8mb4'bb', 1, 1, _utf8mb4'i'), (_utf8mb4'ab\nabc', _utf8mb4'^abc$', 1, 1, _utf8mb4'm'); +mysql> insert into test.t values (_utf8mb4'123', _utf8mb4'12.', 1, 1, _utf8mb4''), (_utf8mb4'aBb', _utf8mb4'bb', 1, 1, _utf8mb4'i'), (_utf8mb4'ab\nabc', _utf8mb4'^abc$', 1, 1, _utf8mb4'm'); mysql> alter table test.t set tiflash replica 1; func> wait_table test t -mysql> set tidb_enforce_mpp=1; select regexp_instr(expr, pattern, 1, 1, 0, match_type) as res from test.t; +mysql> set tidb_enforce_mpp=1; select regexp_substr(expr, pattern, 1, 1, match_type) as res from test.t; +------+ | res | +------+ -| 1 | -| 2 | -| 4 | +| 123 | +| Bb 
| +| abc | +------+ From 358a850d9f587e2c9e2d3974672e125b1cf94309 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 1 Dec 2022 11:25:01 +0800 Subject: [PATCH 86/87] format --- dbms/src/Common/OptimizedRegularExpression.h | 2 +- dbms/src/Functions/FunctionsRegexp.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index 18f14cb6121..661b1233cfb 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -15,8 +15,8 @@ #pragma once #include -#include #include +#include #include #include diff --git a/dbms/src/Functions/FunctionsRegexp.h b/dbms/src/Functions/FunctionsRegexp.h index 21af57c02f6..b1b167a4f15 100644 --- a/dbms/src/Functions/FunctionsRegexp.h +++ b/dbms/src/Functions/FunctionsRegexp.h @@ -1721,7 +1721,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase { if (expr_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i)) { - // null_map has been set to 1 in the previous + // null_map has been set to 1 in the previous col_res->insertData("", 0); continue; } @@ -1753,7 +1753,7 @@ class FunctionStringRegexpSubstr : public FunctionStringRegexpBase { if (expr_param.isNullAt(i) || pat_param.isNullAt(i) || pos_param.isNullAt(i) || occur_param.isNullAt(i) || match_type_param.isNullAt(i)) { - // null_map has been set to 1 in the previous + // null_map has been set to 1 in the previous col_res->insertData("", 0); continue; } From 16545860b4609fff41f6605416ce6c4a1cb01a88 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 1 Dec 2022 13:18:46 +0800 Subject: [PATCH 87/87] fix ut --- dbms/src/Functions/tests/gtest_regexp.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 42411b83f05..bdbd61f92a5 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ 
b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -2327,6 +2327,7 @@ struct RegexpInstrCase case 2: pats = getPatVec(test_cases); exprs = getExprVec(test_cases); + break; default: throw DB::Exception("Invalid param_num"); }