Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
138 commits
Select commit Hold shift + click to select a range
8c5768b
save
xzhangxian1008 Sep 23, 2022
8b9c149
save
xzhangxian1008 Sep 26, 2022
f4aeb00
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 26, 2022
c9922bc
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 27, 2022
c98b151
ready to compile
xzhangxian1008 Sep 27, 2022
c126dc5
successfully compile
xzhangxian1008 Sep 27, 2022
2eabd61
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 27, 2022
51ff173
clean up
xzhangxian1008 Sep 28, 2022
74d7096
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 28, 2022
32879c2
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 29, 2022
1adc613
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 30, 2022
573f877
pass tests, for the moment
xzhangxian1008 Sep 30, 2022
08f3305
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 1, 2022
101c4ff
pass gtests
xzhangxian1008 Oct 1, 2022
2be0ea7
ut passed
xzhangxian1008 Oct 7, 2022
7150b5f
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 7, 2022
61baf63
format
xzhangxian1008 Oct 7, 2022
74f090e
fix ut
xzhangxian1008 Oct 7, 2022
46a0499
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 9, 2022
5f1d1f8
save works
xzhangxian1008 Oct 10, 2022
db38cf4
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 10, 2022
1de2747
pass compilation
xzhangxian1008 Oct 10, 2022
fdade39
format
xzhangxian1008 Oct 11, 2022
9a03b7a
add the convertion of int col
xzhangxian1008 Oct 11, 2022
a6ede08
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 12, 2022
c5e6672
undef
xzhangxian1008 Oct 12, 2022
8f44a5d
save works
xzhangxian1008 Oct 12, 2022
291fcc8
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 13, 2022
efb2d73
tweaking
xzhangxian1008 Oct 13, 2022
5a4b64a
resolve
xzhangxian1008 Oct 13, 2022
4561050
tweaking
xzhangxian1008 Oct 13, 2022
35ac32a
save
xzhangxian1008 Oct 13, 2022
b7f9a5d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 13, 2022
910598f
save works
xzhangxian1008 Oct 13, 2022
85074af
save works
xzhangxian1008 Oct 14, 2022
e471028
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 17, 2022
3561f6d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 18, 2022
ceeb1a2
need gtest
xzhangxian1008 Oct 18, 2022
f3e6979
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 18, 2022
0b18c2a
fix integration test
xzhangxian1008 Oct 18, 2022
5474fd0
fix
xzhangxian1008 Oct 19, 2022
6368c9d
workaround
xzhangxian1008 Oct 19, 2022
7b45047
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 21, 2022
6892ea0
add todo
xzhangxian1008 Oct 21, 2022
e08fbcb
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 21, 2022
72877ce
fix
xzhangxian1008 Oct 21, 2022
34c7849
pass const test
xzhangxian1008 Oct 21, 2022
4003f73
pass some gtests
xzhangxian1008 Oct 21, 2022
45debaf
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 24, 2022
f89f4eb
tweaking
xzhangxian1008 Oct 24, 2022
02f56a6
modify the processing of const null etc...
xzhangxian1008 Oct 24, 2022
4fc6944
merge empty_pattern
xzhangxian1008 Oct 24, 2022
8f89448
refine macro name
xzhangxian1008 Oct 24, 2022
c859f64
pass pure vector tests
xzhangxian1008 Oct 25, 2022
28c14a6
pass collation
xzhangxian1008 Oct 26, 2022
71663c8
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 26, 2022
3400eb6
unport replace and make memorization in multi-threads
xzhangxian1008 Oct 26, 2022
640461d
remove ParamDefault
xzhangxian1008 Oct 26, 2022
df0ef3e
merge
xzhangxian1008 Oct 26, 2022
f7e3302
tweaking
xzhangxian1008 Oct 26, 2022
40c1fd2
tweaking
xzhangxian1008 Oct 26, 2022
89667b6
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 27, 2022
bfcf3fb
resolve comments
xzhangxian1008 Oct 27, 2022
c3417d0
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 27, 2022
54aa324
finish
xzhangxian1008 Oct 27, 2022
6cac98c
tweaking
xzhangxian1008 Oct 28, 2022
6ebc3f6
modify punctuation and format
xzhangxian1008 Oct 31, 2022
08ff36d
resolve comments
xzhangxian1008 Oct 31, 2022
202f44d
tweaking
xzhangxian1008 Oct 31, 2022
8e231fc
solve not all const col
xzhangxian1008 Oct 31, 2022
515d19f
Update dbms/src/Functions/FunctionsRegexp.cpp
xzhangxian1008 Nov 1, 2022
a450985
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 1, 2022
add642f
resolve comments
xzhangxian1008 Nov 1, 2022
e80dc23
fix bug
xzhangxian1008 Nov 1, 2022
a423149
add match_type ft
xzhangxian1008 Nov 1, 2022
b3e6627
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 2, 2022
4e9eb84
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 3, 2022
602b380
refactor the handling of parms
xzhangxian1008 Nov 3, 2022
ad0e351
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 3, 2022
8622c01
compress macros
xzhangxian1008 Nov 3, 2022
a94293f
format
xzhangxian1008 Nov 3, 2022
cb93dea
tweaking
xzhangxian1008 Nov 3, 2022
a91a6cf
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 4, 2022
27ff20e
add tests
xzhangxian1008 Nov 4, 2022
444cb5e
tweaking
xzhangxian1008 Nov 4, 2022
a2d4bae
tweaking
xzhangxian1008 Nov 4, 2022
31ed79e
tweaking
xzhangxian1008 Nov 4, 2022
8cdf3ae
refine
xzhangxian1008 Nov 4, 2022
88edd55
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 7, 2022
8ad4307
resolve comment
xzhangxian1008 Nov 7, 2022
8613e89
merge and update instr
xzhangxian1008 Nov 7, 2022
3048031
pass compilation
xzhangxian1008 Nov 7, 2022
38571b8
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 7, 2022
6db90ee
fix ut
xzhangxian1008 Nov 7, 2022
e1d1501
refine header
xzhangxian1008 Nov 7, 2022
3b5806a
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 8, 2022
c23bb97
refine header
xzhangxian1008 Nov 8, 2022
e22079d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 8, 2022
f74afa9
replace getMatchedIndex with find and refine comments
xzhangxian1008 Nov 8, 2022
288ac97
refinw
xzhangxian1008 Nov 8, 2022
94d5b78
clean code
xzhangxian1008 Nov 8, 2022
e94ccd1
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 14, 2022
8efe7af
refine check type
xzhangxian1008 Nov 14, 2022
1ead665
start
xzhangxian1008 Nov 14, 2022
af469bd
refine substr impl
xzhangxian1008 Nov 14, 2022
d710184
save
xzhangxian1008 Nov 15, 2022
5a59a93
replace with template
xzhangxian1008 Nov 15, 2022
880dfe7
merge instr
xzhangxian1008 Nov 15, 2022
dcf9989
pass some tests
xzhangxian1008 Nov 15, 2022
2baa414
pass all tests
xzhangxian1008 Nov 15, 2022
377b83d
resolve comments
xzhangxian1008 Nov 17, 2022
ac59e15
merge and tweaking
xzhangxian1008 Nov 17, 2022
be4334f
resolve comments
xzhangxian1008 Nov 21, 2022
12ddfcb
merge instr
xzhangxian1008 Nov 21, 2022
e8f6b0c
Merge branch 'master' of https://github.com/pingcap/tiflash into substr
xzhangxian1008 Nov 21, 2022
9eced0e
add test case
xzhangxian1008 Nov 21, 2022
96087c6
resolve comment
xzhangxian1008 Nov 22, 2022
6b667a8
add some tests
xzhangxian1008 Nov 22, 2022
368046d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 22, 2022
8522d4b
Merge branch 'instr' into substr
xzhangxian1008 Nov 22, 2022
e2dbec5
add tests
xzhangxian1008 Nov 23, 2022
2f38660
fix ut
xzhangxian1008 Nov 23, 2022
298611b
resolve comment
xzhangxian1008 Nov 23, 2022
5f60c62
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 24, 2022
de7fd31
fix critical
xzhangxian1008 Nov 24, 2022
326664b
Merge branch 'instr' into substr
xzhangxian1008 Nov 24, 2022
43f9e2c
merge master
xzhangxian1008 Nov 28, 2022
e3c6df3
resolve conflict
xzhangxian1008 Nov 28, 2022
9cdb7e3
Merge branch 'master' of https://github.com/pingcap/tiflash into substr
xzhangxian1008 Nov 29, 2022
b8a3474
resolve comments
xzhangxian1008 Nov 29, 2022
4224a1d
add some tests
xzhangxian1008 Nov 30, 2022
524c8d8
resolve comments
xzhangxian1008 Nov 30, 2022
2c471df
tweaking
xzhangxian1008 Nov 30, 2022
28a1841
fix ft
xzhangxian1008 Nov 30, 2022
2c524a2
Merge branch 'master' of https://github.com/pingcap/tiflash into substr
xzhangxian1008 Dec 1, 2022
358a850
format
xzhangxian1008 Dec 1, 2022
1654586
fix ut
xzhangxian1008 Dec 1, 2022
63593ab
Merge branch 'master' into substr
ti-chi-bot Dec 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions dbms/src/Common/OptimizedRegularExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
#pragma once

#include <Common/config.h>
#include <common/StringRef.h>
#include <common/types.h>
#include <re2/re2.h>

#include <memory>
#include <optional>
#include <string>
#include <vector>
#if USE_RE2_ST
Expand Down Expand Up @@ -114,11 +116,14 @@ class OptimizedRegularExpressionImpl
}

Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op);
std::optional<StringRef> substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur);

private:
Int64 processEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur);
Int64 getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op);
Int64 processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur);
Int64 instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op);

std::optional<StringRef> processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur);
std::optional<StringRef> substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur);

bool is_trivial;
bool required_substring_is_prefix;
Expand Down
68 changes: 61 additions & 7 deletions dbms/src/Common/OptimizedRegularExpression.inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <Poco/Exception.h>
#include <common/StringRef.h>
#include <common/defines.h>
#include <common/types.h>

#include <iostream>
#include <optional>

#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
Expand Down Expand Up @@ -474,7 +476,7 @@ unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject
}

template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur)
Int64 OptimizedRegularExpressionImpl<thread_safe>::processInstrEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur)
{
if (occur != 1)
return 0;
Expand All @@ -483,19 +485,38 @@ Int64 OptimizedRegularExpressionImpl<thread_safe>::processEmptyStringExpr(const
return RegexType::FindAndConsume(&expr_sp, *re2) ? pos : 0;
}

static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op)
/// Substr handling for the empty-subject case (substr() routes here when subject_size == 0).
///
/// A match is attempted only for the first occurrence starting at position 1;
/// every other (occur, byte_pos) combination yields std::nullopt without running the regexp.
///
/// \param expr      start of the string handed to the regexp
/// \param expr_size length of `expr` in bytes
/// \param byte_pos  1-based start position requested by the caller
/// \param occur     which occurrence of the match is requested
/// \return the matched range as a StringRef built from the matcher's output, or std::nullopt when nothing matched
template <bool thread_safe>
std::optional<StringRef> OptimizedRegularExpressionImpl<thread_safe>::processSubstrEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur)
{
    const bool wants_first_match_at_start = (occur == 1) && (byte_pos == 1);
    if (!wants_first_match_at_start)
        return std::nullopt;

    StringPieceType input(expr, expr_size);
    StringPieceType hit;
    if (RegexType::FindAndConsume(&input, *re2, &hit))
        return StringRef(hit.data(), hit.size());

    return std::nullopt;
}

/// Validates the `pos` and `ret_op` arguments of regexp instr before any matching is done.
///
/// \param utf8_total_len length of the subject in code points (instr() obtains it from DB::UTF8::countCodePoints)
/// \param subject_size   length of the subject in bytes
/// \param pos            requested start position; compared against utf8_total_len, so it is counted in code points
/// \param ret_op         return option; only 0 and 1 are accepted
///
/// NOTE(review): RUNTIME_CHECK_MSG is defined outside this file; presumably it raises an error carrying the
/// given message when its condition is false, and it may embed the condition text in that error, so the
/// condition expressions are left exactly as written. Confirm against the macro's definition.
static inline void checkInstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op)
{
// ret_op has to be exactly 0 or 1
RUNTIME_CHECK_MSG(!(ret_op != 0 && ret_op != 1), "Incorrect argument to regexp function: return_option must be 1 or 0");
// pos has to be positive; it may exceed the code-point length only when the subject is empty (subject_size == 0)
RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function.");
}

/// Validates the `pos` argument of regexp substr before any matching is done.
///
/// \param utf8_total_len length of the subject in code points (substr() obtains it from DB::UTF8::countCodePoints)
/// \param subject_size   length of the subject in bytes
/// \param pos            requested start position; compared against utf8_total_len, so it is counted in code points
///
/// NOTE(review): RUNTIME_CHECK_MSG is defined outside this file; presumably it raises an error carrying the
/// given message when its condition is false. Confirm against the macro's definition.
static inline void checkSubstrArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos)
{
// pos has to be positive; it may exceed the code-point length only when the subject is empty (subject_size == 0)
RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function.");
}

/// Normalizes the `occur` argument of the regexp functions in place.
///
/// Any value below 1 (zero or negative) is replaced by 1, so callers can always
/// treat `occur` as "the n-th match, n >= 1". Values >= 1 are left untouched.
///
/// \param occur requested occurrence index; overwritten with 1 when it is smaller than 1
static inline void makeOccurValid(Int64 & occur)
{
    // A single clamp at 1 is enough: it also covers the negative values
    // that the former `occur < 0` check handled.
    if (occur < 1)
        occur = 1;
}

template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op)
Int64 OptimizedRegularExpressionImpl<thread_safe>::instrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op)
{
size_t byte_offset = byte_pos - 1; // This is a offset for bytes, not utf8
const char * expr = subject + byte_offset; // expr is the string actually passed into regexp to be matched
Expand All @@ -516,20 +537,53 @@ Int64 OptimizedRegularExpressionImpl<thread_safe>::getSubstrMatchedIndex(const c
return ret_op == 0 ? DB::UTF8::bytePos2Utf8Pos(reinterpret_cast<const UInt8 *>(subject), byte_offset + 1) : DB::UTF8::bytePos2Utf8Pos(reinterpret_cast<const UInt8 *>(subject), byte_offset + matched_str.size() + 1);
}

/// Finds the `occur`-th regexp match in `subject`, searching from a given byte position.
///
/// \param subject      start of the subject string
/// \param subject_size length of the subject in bytes
/// \param byte_pos     1-based start position counted in bytes, not in code points
/// \param occur        which occurrence to return; each loop iteration consumes one match
/// \return a StringRef built from the last match found, or std::nullopt as soon as one of the
///         requested occurrences cannot be found
template <bool thread_safe>
std::optional<StringRef> OptimizedRegularExpressionImpl<thread_safe>::substrImpl(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur)
{
    // Convert the 1-based byte position into the number of leading bytes to skip.
    const size_t skipped_bytes = byte_pos - 1;

    // Only the part of the subject after the skipped bytes is handed to the regexp.
    StringPieceType remaining(subject + skipped_bytes, subject_size - skipped_bytes);
    StringPieceType last_match;

    for (; occur > 0; --occur)
    {
        if (!RegexType::FindAndConsume(&remaining, *re2, &last_match))
            return std::nullopt;
    }

    return StringRef(last_match.data(), last_match.size());
}

/// Entry point of regexp instr: validates the arguments, then locates a match in `subject`.
///
/// \param subject      start of the subject string
/// \param subject_size length of the subject in bytes
/// \param pos          1-based start position counted in code points; validated by checkInstrArgs
/// \param occur        which occurrence is requested; values below 1 are treated as 1
/// \param ret_op       return option, must be 0 or 1; validated by checkInstrArgs
/// \return the value produced by processInstrEmptyStringExpr (empty subject) or instrImpl (otherwise)
template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op)
{
    // `pos` is expressed in code points, so the bounds check needs the code-point length of the subject.
    Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(subject), subject_size);
    checkInstrArgs(utf8_total_len, subject_size, pos, ret_op);
    makeOccurValid(occur);

    // An empty subject has no byte position to convert; it is handled separately.
    if (unlikely(subject_size == 0))
        return processInstrEmptyStringExpr(subject, subject_size, pos, occur);

    // The matcher works on bytes, so translate the code-point position first.
    size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast<const UInt8 *>(subject), pos);
    return instrImpl(subject, subject_size, byte_pos, occur, ret_op);
}

/// Entry point of regexp substr: validates the arguments, then extracts a match from `subject`.
///
/// \param subject      start of the subject string
/// \param subject_size length of the subject in bytes
/// \param pos          1-based start position counted in code points; validated by checkSubstrArgs
/// \param occur        which occurrence is requested; values below 1 are treated as 1
/// \return the value produced by processSubstrEmptyStringExpr (empty subject) or substrImpl (otherwise);
///         both return std::nullopt when no match is found
template <bool thread_safe>
std::optional<StringRef> OptimizedRegularExpressionImpl<thread_safe>::substr(const char * subject, size_t subject_size, Int64 pos, Int64 occur)
{
    // `pos` is expressed in code points, so the bounds check needs the code-point length of the subject.
    Int64 utf8_total_len = DB::UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(subject), subject_size);
    checkSubstrArgs(utf8_total_len, subject_size, pos);
    makeOccurValid(occur);

    // An empty subject has no byte position to convert; it is handled separately.
    if (unlikely(subject_size == 0))
        return processSubstrEmptyStringExpr(subject, subject_size, pos, occur);

    // The matcher works on bytes, so translate the code-point position first.
    size_t byte_pos = DB::UTF8::utf8Pos2bytePos(reinterpret_cast<const UInt8 *>(subject), pos);
    return substrImpl(subject, subject_size, byte_pos, occur);
}

#undef MIN_LENGTH_FOR_STRSTR
Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
{tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"},
{tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"},
// {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"},
// {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"},
{tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"},

//{tipb::ScalarFuncSig::JsonExtractSig, "cast"},
//{tipb::ScalarFuncSig::JsonUnquoteSig, "cast"},
Expand Down
2 changes: 2 additions & 0 deletions dbms/src/Functions/FunctionsRegexp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ struct ReplaceRegexpImpl
// Concrete function types: each alias instantiates one of the regexp/replace function
// templates with its Name* (and, for replace, implementation) type argument.
// The regexp aliases are the types passed to factory.registerFunction in registerFunctionsRegexp.
using FunctionTiDBRegexp = FunctionStringRegexp<NameTiDBRegexp>;
using FunctionRegexpLike = FunctionStringRegexp<NameRegexpLike>;
using FunctionRegexpInstr = FunctionStringRegexpInstr<NameRegexpInstr>;
using FunctionRegexpSubstr = FunctionStringRegexpSubstr<NameRegexpSubstr>;
using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;
using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;

Expand All @@ -297,6 +298,7 @@ void registerFunctionsRegexp(FunctionFactory & factory)
factory.registerFunction<FunctionTiDBRegexp>();
factory.registerFunction<FunctionRegexpLike>();
factory.registerFunction<FunctionRegexpInstr>();
factory.registerFunction<FunctionRegexpSubstr>();
}

} // namespace DB
Loading