diff --git a/benchmarks/src/regex_search.cpp b/benchmarks/src/regex_search.cpp
index 28e88c4b313..bc6a3d12537 100644
--- a/benchmarks/src/regex_search.cpp
+++ b/benchmarks/src/regex_search.cpp
@@ -36,5 +36,11 @@ BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
 BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4);
 BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);
 BENCHMARK_CAPTURE(bm_lorem_search, "(?:bibe)+", "(?:bibe)+")->Arg(2)->Arg(3)->Arg(4);
+// patterns that begin with a word-boundary or lookahead assertion
+BENCHMARK_CAPTURE(bm_lorem_search, R"(\bbibe)", R"(\bbibe)")->Arg(2)->Arg(3)->Arg(4);
+BENCHMARK_CAPTURE(bm_lorem_search, R"(\Bibe)", R"(\Bibe)")->Arg(2)->Arg(3)->Arg(4);
+BENCHMARK_CAPTURE(bm_lorem_search, R"((?=....)bibe)", R"((?=....)bibe)")->Arg(2)->Arg(3)->Arg(4);
+BENCHMARK_CAPTURE(bm_lorem_search, R"((?=bibe)....)", R"((?=bibe)....)")->Arg(2)->Arg(3)->Arg(4);
+BENCHMARK_CAPTURE(bm_lorem_search, R"((?!lorem)bibe)", R"((?!lorem)bibe)")->Arg(2)->Arg(3)->Arg(4);
 
 BENCHMARK_MAIN();
diff --git a/stl/inc/regex b/stl/inc/regex
index 7bc37074893..38d5fdec477 100644
--- a/stl/inc/regex
+++ b/stl/inc/regex
@@ -1781,7 +1781,7 @@ public:
         return true;
     }
 
-    _BidIt _Skip(_BidIt, _BidIt, _Node_base* = nullptr);
+    _BidIt _Skip(_BidIt, _BidIt, _Node_base* = nullptr, unsigned int _Recursion_depth = 0U);
 
 private:
     _Tgt_state_t<_It> _Tgt_state;
@@ -4107,12 +4107,15 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
 }
 
 template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
-_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) {
+_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(
+    _BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg, unsigned int _Recursion_depth) {
     // skip until possible match
     // assumes --_First_arg is valid
     static constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)};
     static constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr),
         static_cast<wchar_t>(_Meta_nl), static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)};
+    // upper bound on _Recursion_depth; the assertion case stops recursing once it is reached
+    constexpr unsigned int _Max_recursion_depth = 50U;
     _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
 
     while (_First_arg != _Last && _Nx) { // check current node
@@ -4227,17 +4230,61 @@ _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg
                 break;
             }
 
+        case _N_assert:
+            {
+                // give up refining the position once the nesting limit is hit
+                if (_Recursion_depth >= _Max_recursion_depth) {
+                    return _First_arg;
+                }
+
+                // every nested call, including the first one into the assertion body, must carry the
+                // incremented depth so that the limit above also bounds assertions nested inside assertions
+                _Node_assert* _Node = static_cast<_Node_assert*>(_Nx);
+                _First_arg = _Skip(_First_arg, _Last, _Node->_Child, _Recursion_depth + 1U);
+                _BidIt _Next;
+                for (;;) {
+                    // alternate between the nodes after the assertion and the assertion body
+                    // until one of them leaves the position unchanged
+                    _Next = _Skip(_First_arg, _Last, _Node->_Next, _Recursion_depth + 1U);
+                    if (_Next == _First_arg) {
+                        return _First_arg;
+                    }
+
+                    _First_arg = _Skip(_Next, _Last, _Node->_Child, _Recursion_depth + 1U);
+                    if (_Next == _First_arg) {
+                        return _First_arg;
+                    }
+                }
+            }
+
+        case _N_neg_assert:
+            // we skip the negated assertion body and continue examining the rest of the regex
+            break;
+
+        case _N_wbound:
+            {
+                // advance until the characters before and after the position differ in word-ness,
+                // or agree in word-ness when _Fl_negate is set
+                bool _Negated = (_Nx->_Flags & _Fl_negate) != 0;
+                bool _Prev_word = _STD _Is_word(*_STD _Prev_iter(_First_arg));
+                for (; _First_arg != _Last; ++_First_arg) {
+                    bool _Next_word = _STD _Is_word(*_First_arg);
+                    if (_Negated == (_Next_word == _Prev_word)) {
+                        break;
+                    }
+                    _Prev_word = _Next_word;
+                }
+                return _First_arg;
+            }
+
         case _N_begin:
+        case _N_endif:
             break;
 
         case _N_end:
         case _N_none:
-        case _N_wbound:
         case _N_dot:
-        case _N_assert:
-        case _N_neg_assert:
         case _N_back:
-        case _N_endif:
         case _N_end_rep:
         default:
             return _First_arg;
diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp
index 92fe235892f..a1e41b84421 100644
--- a/tests/std/tests/VSO_0000000_regex_use/test.cpp
+++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp
@@ -234,8 +234,8 @@ void test_VSO_167760_nested_quantifiers_should_not_infinite_loop() {
 void test_DDB_153116_replacements() {
     g_regexTester.should_replace_to("abc def def ghi", "^", "X", format_default, "Xabc def def ghi");
     g_regexTester.should_replace_to("abc def def ghi", "$", "X", format_default, "abc def def ghiX");
-    g_regexTester.should_replace_to("abc def def ghi", "\\b", "X", format_default, "XabcX XdefX XdefX XghiX");
-    g_regexTester.should_replace_to("abc def def ghi", "\\B", "X", format_default, "aXbXc dXeXf dXeXf gXhXi");
+    g_regexTester.should_replace_to("abc  def def  ghi", "\\b", "X", format_default, "XabcX  XdefX XdefX  XghiX");
+    g_regexTester.should_replace_to("abc  def def  ghi", "\\B", "X", format_default, "aXbXc X dXeXf dXeXf X gXhXi");
     g_regexTester.should_replace_to("abc def def ghi", "(?=ef)", "X", format_default, "abc dXef dXef ghi");
     g_regexTester.should_replace_to("abc def def ghi", "(?!ef)", "X", format_default, "XaXbXcX XdeXfX XdeXfX XgXhXiX");
 }
@@ -2092,6 +2092,17 @@ void test_gh_5509() {
     }
 }
 
+void test_gh_5576() {
+    // GH-5576 sped up searches for regexes that start with assertions
+    // by extending the skip heuristic in the matcher.
+    // We test here that the skip heuristic is correct
+    // for positive and negative lookahead assertions.
+    g_regexTester.should_replace_to("AbGweEfFllLLlffflElF", "(?=[[:lower:]][[:upper:]])[fFlL]{2}", R"(X$&)",
+        match_default, "AbGweEXfFlXlLLlffflEXlF");
+    g_regexTester.should_replace_to("AbGweEfFllLLlffflElF", "(?![[:upper:]]|[[:lower:]]{2})[fFlL]{2}", R"(X$&)",
+        match_default, "AbGweEXfFlXlLLlffflEXlF");
+}
+
 int main() {
     test_dev10_449367_case_insensitivity_should_work();
     test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -2141,6 +2152,7 @@ int main() {
     test_gh_5377();
     test_gh_5490();
     test_gh_5509();
+    test_gh_5576();
 
     return g_regexTester.result();
 }