diff --git a/benchmarks/src/regex_search.cpp b/benchmarks/src/regex_search.cpp index 8a96a98e77f..28e88c4b313 100644 --- a/benchmarks/src/regex_search.cpp +++ b/benchmarks/src/regex_search.cpp @@ -31,6 +31,7 @@ void bm_lorem_search(benchmark::State& state, const char* pattern) { } } +BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4); diff --git a/stl/inc/regex b/stl/inc/regex index 5e622565c3d..88ec4f59821 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -33,6 +33,16 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +// Controls whether LWG-2503 "multiline option should be added to syntax_option_type" is implemented. +// Defining this to 0 requests Standard behavior: +// * For ECMAScript, matching is non-multiline by default, but regex_constants::multiline can be requested. +// * For POSIX grammars, matching is non-multiline, and regex_constants::multiline is ignored (N5008 [tab:re.synopt]). +// Defining this to 1 requests legacy behavior: +// * For all grammars, matching is multiline, and regex_constants::multiline is redundant. 
+#ifndef _REGEX_LEGACY_MULTILINE_MODE +#define _REGEX_LEGACY_MULTILINE_MODE 0 +#endif + #ifndef _REGEX_MAX_COMPLEXITY_COUNT #define _REGEX_MAX_COMPLEXITY_COUNT 10000000L // set to 0 to disable #endif // !defined(_REGEX_MAX_COMPLEXITY_COUNT) @@ -121,10 +131,11 @@ namespace regex_constants { _Gmask = 0x3F, _Any_posix = basic | extended | grep | egrep | awk, - icase = 0x0100, - nosubs = 0x0200, - optimize = 0x0400, - collate = 0x0800 + icase = 0x0100, + nosubs = 0x0200, + optimize = 0x0400, + collate = 0x0800, + multiline = 0x1000 }; _BITMASK_OPS(_EXPORT_STD, syntax_option_type) @@ -1666,6 +1677,15 @@ public: if (_Re->_Flags & _Fl_begin_needs_d) { _Char_class_d = _Lookup_char_class(static_cast<_Elem>('D')); } + +// sanitize multiline mode setting +#if _REGEX_LEGACY_MULTILINE_MODE + _Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars +#else // ^^^ _REGEX_LEGACY_MULTILINE_MODE / !_REGEX_LEGACY_MULTILINE_MODE vvv + if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only + _Sflags &= ~regex_constants::multiline; + } +#endif // ^^^ !_REGEX_LEGACY_MULTILINE_MODE ^^^ } void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags @@ -1920,6 +1940,7 @@ public: static constexpr flag_type awk = regex_constants::awk; static constexpr flag_type grep = regex_constants::grep; static constexpr flag_type egrep = regex_constants::egrep; + static constexpr flag_type multiline = regex_constants::multiline; basic_regex() = default; // construct empty object @@ -3833,6 +3854,11 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0); } +template <class _Elem> +bool _Is_ecmascript_line_terminator(_Elem _Ch) { + return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps; +} + template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc> bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match if
(0 < _Max_stack_count && --_Max_stack_count <= 0) { @@ -3852,18 +3878,19 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N case _N_bol: if ((_Mflags & regex_constants::match_prev_avail) || _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding newline - _Failed = *_Prev_iter(_Tgt_state._Cur) != _Meta_nl; + _Failed = !(_Sflags & regex_constants::multiline) + || !_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_Tgt_state._Cur)); } else { _Failed = (_Mflags & regex_constants::match_not_bol) != 0; } - break; case _N_eol: if (_Tgt_state._Cur == _End) { _Failed = (_Mflags & regex_constants::match_not_eol) != 0; } else { - _Failed = *_Tgt_state._Cur != _Meta_nl; + _Failed = + !(_Sflags & regex_constants::multiline) || !_STD _Is_ecmascript_line_terminator(*_Tgt_state._Cur); } break; @@ -3881,7 +3908,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Ch == _Elem()) { _Failed = true; } - } else if (_Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps) { // ECMAScript + } else if (_STD _Is_ecmascript_line_terminator(_Ch)) { _Failed = true; } @@ -4054,30 +4081,55 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc> _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) { // skip until possible match // assumes --_First_arg is valid - _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep; + static constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)}; + static constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr), + static_cast<wchar_t>(_Meta_nl), static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)}; + _Node_base* _Nx = _Node_arg ?
_Node_arg : _Rep; while (_First_arg != _Last && _Nx) { // check current node switch (_Nx->_Kind) { // handle current node's type case _N_nop: break; - case _N_bol: - { // check for embedded newline - // return iterator to character just after the newline; for input like "\nabc" - // matching "^abc", _First_arg could be pointing at 'a', so we need to check - // --_First_arg for '\n' - if (*_Prev_iter(_First_arg) != _Meta_nl) { - _First_arg = _STD find(_First_arg, _Last, _Meta_nl); + case _N_bol: // check for beginning anchor + if (_Sflags & regex_constants::multiline) { + // multiline mode: check for embedded line terminator + // return iterator to character just after the newline; for input like "\nabc" + // matching "^abc", _First_arg could be pointing at 'a', so we need to check + // --_First_arg for '\n' + if (!_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_First_arg))) { + if constexpr (sizeof(_Elem) == 1) { + _First_arg = _STD find_first_of( + _First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char)); + } else { + _First_arg = _STD find_first_of( + _First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t)); + } + if (_First_arg != _Last) { ++_First_arg; } } return _First_arg; + } else { + // non-multiline mode: never matches because --_First_arg is valid + return _Last; } case _N_eol: - return _STD find(_First_arg, _Last, _Meta_nl); + if (_Sflags & regex_constants::multiline) { + // multiline mode: matches at next line terminator or end of input + if constexpr (sizeof(_Elem) == 1) { + return _STD find_first_of( + _First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char)); + } else { + return _STD find_first_of( + _First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t)); + } + } else { + return _Last; // non-multiline mode: matches at end of input or not at all + } case _N_str: { // check for string match diff --git a/tests/libcxx/expected_results.txt 
b/tests/libcxx/expected_results.txt index ed1781f9b1a..d5f9274662e 100644 --- a/tests/libcxx/expected_results.txt +++ b/tests/libcxx/expected_results.txt @@ -575,12 +575,6 @@ std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp FAIL # *** MISSING LWG ISSUE RESOLUTIONS *** -# LWG-2503 "multiline option should be added to syntax_option_type" -std/re/re.alg/re.alg.search/no_update_pos.pass.cpp FAIL -std/re/re.const/re.matchflag/match_multiline.pass.cpp FAIL -std/re/re.const/re.matchflag/match_not_eol.pass.cpp FAIL -std/re/re.const/re.synopt/syntax_option_type.pass.cpp FAIL - # LWG-2532 "Satisfying a promise at thread exit" (Open) std/thread/futures/futures.promise/set_exception_at_thread_exit.pass.cpp FAIL std/thread/futures/futures.promise/set_lvalue_at_thread_exit.pass.cpp FAIL diff --git a/tests/std/test.lst b/tests/std/test.lst index 4c7c40603d1..46be77e6b8b 100644 --- a/tests/std/test.lst +++ b/tests/std/test.lst @@ -154,6 +154,7 @@ tests\Dev11_1140665_unique_ptr_array_conversions tests\Dev11_1150223_shared_mutex tests\Dev11_1158803_regex_thread_safety tests\Dev11_1180290_filesystem_error_code +tests\GH_000073_regex_multiline_escape_hatch tests\GH_000140_adl_proof_comparison tests\GH_000140_adl_proof_construction tests\GH_000140_adl_proof_views diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst new file mode 100644 index 00000000000..19f025bd0e6 --- /dev/null +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +RUNALL_INCLUDE ..\usual_matrix.lst diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp new file mode 100644 index 00000000000..31968afa26c --- /dev/null +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#define _REGEX_LEGACY_MULTILINE_MODE 1 + +#include +#include +#include +#include + +#include + +using namespace std; +using namespace std::regex_constants; + +regex_fixture g_regexTester; + +void test_VSO_225160_match_bol_flag() { + // Old tests for caret anchor in default multiline mode + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_bol); + emptyAnchor.should_search_match("\n", ""); + emptyAnchor.should_search_match("\n", "", match_not_bol); + + const test_regex beginCd(&g_regexTester, R"(^cd)", syntax); + beginCd.should_search_match("ab\ncdefg", "cd"); + beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol); + + beginCd.should_search_match("cdefg", "cd"); + beginCd.should_search_fail("cdefg", match_not_bol); + beginCd.should_search_match("\ncdefg", "cd"); + beginCd.should_search_match("\ncdefg", "cd", match_not_bol); + + beginCd.should_search_fail("ab\nxcdefg"); + beginCd.should_search_fail("ab\nxcdefg", match_not_bol); + } +} + +void test_VSO_225160_match_eol_flag() { + // Old tests for dollar anchor in default multiline mode + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + const test_regex emptyAnchor(&g_regexTester, R"($)", syntax); + emptyAnchor.should_search_match("", ""); + 
emptyAnchor.should_search_fail("", match_not_eol); + emptyAnchor.should_search_match("\n", ""); + emptyAnchor.should_search_match("\n", "", match_not_eol); + + const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax); + cdEnd.should_search_match("abcd\nefg", "cd"); + cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol); + + cdEnd.should_search_match("abcd", "cd"); + cdEnd.should_search_fail("abcd", match_not_eol); + cdEnd.should_search_match("abcd\n", "cd"); + cdEnd.should_search_match("abcd\n", "cd", match_not_eol); + + cdEnd.should_search_fail("abcdx\nefg"); + cdEnd.should_search_fail("abcdx\nefg", match_not_eol); + } +} + +void test_gh_73() { + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_match("b\na", "a"); + a_anchored_on_both_sides.should_search_match("a\nb", "a"); + a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol); + a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", syntax); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_match("b\na", "a"); + a_anchored_front.should_search_match("\na", "a"); + a_anchored_front.should_search_fail("a", match_not_bol); + a_anchored_front.should_search_match("\na", "a", match_not_bol); + a_anchored_front.should_search_match("b\na", "a", match_not_bol); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", syntax); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_match("a\nb", "a"); + 
a_anchored_back.should_search_match("a\n", "a"); + a_anchored_back.should_search_fail("a", match_not_eol); + a_anchored_back.should_search_match("a\n", "a", match_not_eol); + a_anchored_back.should_search_match("a\nb", "a", match_not_eol); + } + } +} + +int main() { + test_VSO_225160_match_bol_flag(); + test_VSO_225160_match_eol_flag(); + test_gh_73(); + + return g_regexTester.result(); +} diff --git a/tests/std/tests/VSO_0000000_regex_interface/test.cpp b/tests/std/tests/VSO_0000000_regex_interface/test.cpp index c057696eb3a..c05c4aa4173 100644 --- a/tests/std/tests/VSO_0000000_regex_interface/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_interface/test.cpp @@ -387,9 +387,9 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() { } void test_VSO_226914_match_prev_avail() { - // N.B. assumes our nonstandard multiline behavior. See also: LWG-2343, LWG-2503 + // test exercises multiline mode const char bol_haystack[] = {'\n', 'a'}; - const regex bol_anchor(R"(^a)"); + const regex bol_anchor(R"(^a)", regex_constants::multiline); assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor)); assert(!regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_not_bol)); assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_prev_avail)); diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index c927b2ff91c..92fe235892f 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -453,48 +453,97 @@ void test_VSO_208146_regex_smoke_test_rewritten_explicit_quantifier() { } void test_VSO_225160_match_bol_flag() { - // Note that this tests that we are consistent about the ECMAScript "multiline" setting being - // true, but the standard currently appears to mandate that that is false. We don't want to - // break existing customers, but we should at least be consistently multiline. 
- // See also: LWG-2343, LWG-2503 - const test_regex emptyAnchor(&g_regexTester, R"(^)"); - emptyAnchor.should_search_match("", ""); - emptyAnchor.should_search_fail("", match_not_bol); - emptyAnchor.should_search_match("\n", ""); - emptyAnchor.should_search_match("\n", "", match_not_bol); - - const test_regex beginCd(&g_regexTester, R"(^cd)"); - beginCd.should_search_match("ab\ncdefg", "cd"); - beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol); - - beginCd.should_search_match("cdefg", "cd"); - beginCd.should_search_fail("cdefg", match_not_bol); - beginCd.should_search_match("\ncdefg", "cd"); - beginCd.should_search_match("\ncdefg", "cd", match_not_bol); - - beginCd.should_search_fail("ab\nxcdefg"); - beginCd.should_search_fail("ab\nxcdefg", match_not_bol); + // After implementation of LWG-2503/GH-73: These tests make sure that + // we consistently implement the "multiline" option for ECMAScript + // (whether the ECMAScript flag is included or not) + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + for (string line_terminator : {"\n", "\r"}) { + const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_bol); + emptyAnchor.should_search_match(line_terminator, ""); + emptyAnchor.should_search_match(line_terminator, "", match_not_bol); + + const test_regex beginCd(&g_regexTester, R"(^cd)", syntax); + beginCd.should_search_match("ab" + line_terminator + "cdefg", "cd"); + beginCd.should_search_match("ab" + line_terminator + "cdefg", "cd", match_not_bol); + + beginCd.should_search_match("cdefg", "cd"); + beginCd.should_search_fail("cdefg", match_not_bol); + beginCd.should_search_match(line_terminator + "cdefg", "cd"); + beginCd.should_search_match(line_terminator + "cdefg", "cd", match_not_bol); + + beginCd.should_search_fail("ab" + line_terminator + "xcdefg"); + beginCd.should_search_fail("ab" + line_terminator + "xcdefg", match_not_bol); + 
} + + for (wstring line_terminator : + {L"\n", L"\r", L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + const test_wregex emptyAnchor(&g_regexTester, LR"(^)", syntax); + emptyAnchor.should_search_match(L"", L""); + emptyAnchor.should_search_fail(L"", match_not_bol); + emptyAnchor.should_search_match(line_terminator, L""); + emptyAnchor.should_search_match(line_terminator, L"", match_not_bol); + + const test_wregex beginCd(&g_regexTester, LR"(^cd)", syntax); + beginCd.should_search_match(L"ab" + line_terminator + L"cdefg", L"cd"); + beginCd.should_search_match(L"ab" + line_terminator + L"cdefg", L"cd", match_not_bol); + + beginCd.should_search_match(L"cdefg", L"cd"); + beginCd.should_search_fail(L"cdefg", match_not_bol); + beginCd.should_search_match(line_terminator + L"cdefg", L"cd"); + beginCd.should_search_match(line_terminator + L"cdefg", L"cd", match_not_bol); + + beginCd.should_search_fail(L"ab" + line_terminator + L"xcdefg"); + beginCd.should_search_fail(L"ab" + line_terminator + L"xcdefg", match_not_bol); + } + } } void test_VSO_225160_match_eol_flag() { // Ditto multiline comment - const test_regex emptyAnchor(&g_regexTester, R"($)"); - emptyAnchor.should_search_match("", ""); - emptyAnchor.should_search_fail("", match_not_eol); - emptyAnchor.should_search_match("\n", ""); - emptyAnchor.should_search_match("\n", "", match_not_eol); - - const test_regex cdEnd(&g_regexTester, R"(cd$)"); - cdEnd.should_search_match("abcd\nefg", "cd"); - cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol); - - cdEnd.should_search_match("abcd", "cd"); - cdEnd.should_search_fail("abcd", match_not_eol); - cdEnd.should_search_match("abcd\n", "cd"); - cdEnd.should_search_match("abcd\n", "cd", match_not_eol); - - cdEnd.should_search_fail("abcdx\nefg"); - cdEnd.should_search_fail("abcdx\nefg", match_not_eol); + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + for (string line_terminator : {"\n", "\r"}) { + const test_regex 
emptyAnchor(&g_regexTester, R"($)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_eol); + emptyAnchor.should_search_match(line_terminator, ""); + emptyAnchor.should_search_match(line_terminator, "", match_not_eol); + + const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax); + cdEnd.should_search_match("abcd" + line_terminator + "efg", "cd"); + cdEnd.should_search_match("abcd" + line_terminator + "efg", "cd", match_not_eol); + + cdEnd.should_search_match("abcd", "cd"); + cdEnd.should_search_fail("abcd", match_not_eol); + cdEnd.should_search_match("abcd" + line_terminator, "cd"); + cdEnd.should_search_match("abcd" + line_terminator, "cd", match_not_eol); + + cdEnd.should_search_fail("abcdx" + line_terminator + "efg"); + cdEnd.should_search_fail("abcdx" + line_terminator + "efg", match_not_eol); + } + + for (wstring line_terminator : + {L"\n", L"\r", L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + const test_wregex emptyAnchor(&g_regexTester, LR"($)", syntax); + emptyAnchor.should_search_match(L"", L""); + emptyAnchor.should_search_fail(L"", match_not_eol); + emptyAnchor.should_search_match(line_terminator, L""); + emptyAnchor.should_search_match(line_terminator, L"", match_not_eol); + + const test_wregex cdEnd(&g_regexTester, LR"(cd$)", syntax); + cdEnd.should_search_match(L"abcd" + line_terminator + L"efg", L"cd"); + cdEnd.should_search_match(L"abcd" + line_terminator + L"efg", L"cd", match_not_eol); + + cdEnd.should_search_match(L"abcd", L"cd"); + cdEnd.should_search_fail(L"abcd", match_not_eol); + cdEnd.should_search_match(L"abcd" + line_terminator, L"cd"); + cdEnd.should_search_match(L"abcd" + line_terminator, L"cd", match_not_eol); + + cdEnd.should_search_fail(L"abcdx" + line_terminator + L"efg"); + cdEnd.should_search_fail(L"abcdx" + line_terminator + L"efg", match_not_eol); + } + } } void test_VSO_226914_word_boundaries() { @@ -558,8 +607,102 @@ void 
test_construction_from_nullptr_and_zero() { } } +void test_gh_73() { + // GH-73: LWG-2503 multiline option should be added to syntax_option_type + for (syntax_option_type grammar : {basic, grep, extended, egrep, awk}) { + for (syntax_option_type multiline_mode : {syntax_option_type{}, multiline}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", grammar | multiline_mode); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_fail("b\na"); + a_anchored_on_both_sides.should_search_fail("a\nb"); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", grammar | multiline_mode); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_fail("b\na"); + a_anchored_front.should_search_fail("\na"); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", grammar | multiline_mode); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_fail("a\nb"); + a_anchored_back.should_search_fail("a\n"); + } + } + } + + for (syntax_option_type grammar : {syntax_option_type{}, ECMAScript}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", grammar); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_fail("b\na"); + a_anchored_on_both_sides.should_search_fail("a\nb"); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", grammar); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_fail("b\na"); + a_anchored_front.should_search_fail("\na"); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", grammar); + a_anchored_back.should_search_match("a", "a"); 
+ a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_fail("a\nb"); + a_anchored_back.should_search_fail("a\n"); + } + } + + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_match("b\na", "a"); + a_anchored_on_both_sides.should_search_match("a\nb", "a"); + a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol); + a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", syntax); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_match("b\na", "a"); + a_anchored_front.should_search_match("\na", "a"); + a_anchored_front.should_search_fail("a", match_not_bol); + a_anchored_front.should_search_match("\na", "a", match_not_bol); + a_anchored_front.should_search_match("b\na", "a", match_not_bol); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", syntax); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_match("a\nb", "a"); + a_anchored_back.should_search_match("a\n", "a"); + a_anchored_back.should_search_fail("a", match_not_eol); + a_anchored_back.should_search_match("a\n", "a", match_not_eol); + a_anchored_back.should_search_match("a\nb", "a", match_not_eol); + } + } +} + void test_gh_731() { - // GH-731 <regex>: Incorrect behavior for capture groups + // GH-731: <regex>: Incorrect behavior for capture groups // GH-996: regex_search behaves incorrectly when the regex contains R"(\[)" // Several bugs were fixed in ECMAScript (depth-first) and POSIX
(leftmost-longest) matching rules. @@ -1533,7 +1676,7 @@ void test_gh_5362_grep() { { const test_regex middle_nl_with_dollar(&g_regexTester, "a$\nb$", grep); middle_nl_with_dollar.should_search_match("a$\nb", "b"); - middle_nl_with_dollar.should_search_match("a\nb", "a"); + middle_nl_with_dollar.should_search_match("a\nb", "b"); middle_nl_with_dollar.should_search_match("ba", "a"); middle_nl_with_dollar.should_search_match("a", "a"); middle_nl_with_dollar.should_search_match("b", "b"); @@ -1913,16 +2056,28 @@ void test_gh_5509() { } { - test_regex anchored_string_plus_regex(&g_regexTester, "((?:^aw)+)"); - anchored_string_plus_regex.should_search_match_capture_groups( + test_regex anchored_string_plus_regex_multi(&g_regexTester, "((?:^aw)+)", multiline); + anchored_string_plus_regex_multi.should_search_match_capture_groups( "blwerofa\nawaweraf", "aw", match_default, {{9, 11}}); + anchored_string_plus_regex_multi.should_search_fail("blwerof\naerwaf"); + } + + { + test_regex anchored_string_plus_regex(&g_regexTester, "((?:^aw)+)"); + anchored_string_plus_regex.should_search_fail("blwerofa\nawaweraf"); anchored_string_plus_regex.should_search_fail("blwerof\naerwaf"); } { - test_regex anchored_string_plus_regex(&g_regexTester, "((?:$\naw)+)"); - anchored_string_plus_regex.should_search_match_capture_groups( + test_regex anchored_string_plus_regex_multi(&g_regexTester, "((?:$\naw)+)", multiline); + anchored_string_plus_regex_multi.should_search_match_capture_groups( "blwerofa\nawaweraf", "\naw", match_default, {{8, 11}}); + anchored_string_plus_regex_multi.should_search_fail("blwerof\naerwaf"); + } + + { + test_regex anchored_string_plus_regex(&g_regexTester, "((?:$\naw)+)"); + anchored_string_plus_regex.should_search_fail("blwerofa\nawaweraf"); anchored_string_plus_regex.should_search_fail("blwerof\naerwaf"); } @@ -1964,6 +2119,7 @@ int main() { test_VSO_225160_match_eol_flag(); test_VSO_226914_word_boundaries(); test_construction_from_nullptr_and_zero(); + 
test_gh_73(); test_gh_731(); test_gh_992(); test_gh_993(); diff --git a/tests/tr1/tests/regex2/test.cpp b/tests/tr1/tests/regex2/test.cpp index 83706bb66bc..c720774d3c2 100644 --- a/tests/tr1/tests/regex2/test.cpp +++ b/tests/tr1/tests/regex2/test.cpp @@ -132,7 +132,7 @@ static const regex_test tests[] = { {__LINE__, T("a$"), T("ba"), "1 1 2", ALL}, {__LINE__, T("a$"), T("ab"), "0", ALL}, - {__LINE__, T("^a$"), T("b\na"), "1 2 3", ALL}, + {__LINE__, T("^a$"), T("b\na"), "0", ALL}, {__LINE__, T("\\b"), T("a"), "1 0 0", ECMA}, {__LINE__, T("\\b"), T(""), "-1", BASIC | GREP | EXTENDED | EGREP},