From 1ef8a05170294b06a6009d55fbe16bd9f139b9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Fri, 23 May 2025 19:43:26 +0200 Subject: [PATCH 1/6] `<regex>`: Add multiline option and make non-multiline mode the default --- benchmarks/src/regex_search.cpp | 1 + stl/inc/regex | 76 ++++-- tests/libcxx/expected_results.txt | 6 - tests/std/test.lst | 1 + .../env.lst | 4 + .../test.cpp | 107 ++++++++ .../VSO_0000000_regex_interface/test.cpp | 4 +- .../std/tests/VSO_0000000_regex_use/test.cpp | 242 ++++++++++++++---- tests/tr1/tests/regex2/test.cpp | 2 +- 9 files changed, 373 insertions(+), 70 deletions(-) create mode 100644 tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst create mode 100644 tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp diff --git a/benchmarks/src/regex_search.cpp b/benchmarks/src/regex_search.cpp index 8a96a98e77f..28e88c4b313 100644 --- a/benchmarks/src/regex_search.cpp +++ b/benchmarks/src/regex_search.cpp @@ -31,6 +31,7 @@ void bm_lorem_search(benchmark::State& state, const char* pattern) { } } +BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4); BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4); diff --git a/stl/inc/regex b/stl/inc/regex index 4c04da603b8..bb38105d6f6 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -121,10 +121,11 @@ namespace regex_constants { _Gmask = 0x3F, _Any_posix = basic | extended | grep | egrep | awk, - icase = 0x0100, - nosubs = 0x0200, - optimize = 0x0400, - collate = 0x0800 + icase = 0x0100, + nosubs = 0x0200, + optimize = 0x0400, + collate = 0x0800, + multiline = 0x1000 }; _BITMASK_OPS(_EXPORT_STD, syntax_option_type) @@ -1666,6 +1667,15 @@ public: if (_Re->_Flags & _Fl_begin_needs_d) { _Char_class_d = _Lookup_char_class(static_cast<_Elem>('D')); } + +// sanitize 
multiline mode setting +#ifdef _REGEX_MAKE_MULTILINE_MODE_DEFAULT + _Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars +#else // ^^^ defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) / !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) vvv + if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only + _Sflags &= ~regex_constants::multiline; + } +#endif // ^^^ !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) ^^^ } void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags @@ -1920,6 +1930,7 @@ public: static constexpr flag_type awk = regex_constants::awk; static constexpr flag_type grep = regex_constants::grep; static constexpr flag_type egrep = regex_constants::egrep; + static constexpr flag_type multiline = regex_constants::multiline; basic_regex() = default; // construct empty object @@ -3833,6 +3844,11 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0); } +template <class _Elem> +bool _Is_ecmascript_line_terminator(_Elem _Ch) { + return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps; +} + template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc> bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match if (0 < _Max_stack_count && --_Max_stack_count <= 0) { @@ -3852,18 +3868,19 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N case _N_bol: if ((_Mflags & regex_constants::match_prev_avail) || _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding newline - _Failed = *_Prev_iter(_Tgt_state._Cur) != _Meta_nl; + _Failed = !(_Sflags & regex_constants::multiline) + || !_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_Tgt_state._Cur)); } else { _Failed = (_Mflags & regex_constants::match_not_bol) != 0; } - break; case _N_eol: if (_Tgt_state._Cur == _End) { _Failed = (_Mflags & regex_constants::match_not_eol) != 0; } else { 
- _Failed = *_Tgt_state._Cur != _Meta_nl; + _Failed = + !(_Sflags & regex_constants::multiline) || !_STD _Is_ecmascript_line_terminator(*_Tgt_state._Cur); } break; @@ -3881,7 +3898,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Ch == _Elem()) { _Failed = true; } - } else if (_Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps) { // ECMAScript + } else if (_STD _Is_ecmascript_line_terminator(_Ch)) { _Failed = true; } @@ -4054,30 +4071,55 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc> _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) { // skip until possible match // assumes --_First_arg is valid - _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep; + constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)}; + constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr), static_cast<wchar_t>(_Meta_nl), + static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)}; + _Node_base* _Nx = _Node_arg ? 
_Node_arg : _Rep; while (_First_arg != _Last && _Nx) { // check current node switch (_Nx->_Kind) { // handle current node's type case _N_nop: break; - case _N_bol: - { // check for embedded newline - // return iterator to character just after the newline; for input like "\nabc" - // matching "^abc", _First_arg could be pointing at 'a', so we need to check - // --_First_arg for '\n' - if (*_Prev_iter(_First_arg) != _Meta_nl) { - _First_arg = _STD find(_First_arg, _Last, _Meta_nl); + case _N_bol: // check for beginning anchor + if (_Sflags & regex_constants::multiline) { + // multiline mode: check for embedded line terminator + // return iterator to character just after the newline; for input like "\nabc" + // matching "^abc", _First_arg could be pointing at 'a', so we need to check + // --_First_arg for '\n' + if (!_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_First_arg))) { + if constexpr (sizeof(_Elem) == 1) { + _First_arg = _STD find_first_of( + _First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char)); + } else { + _First_arg = _STD find_first_of( + _First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t)); + } + if (_First_arg != _Last) { ++_First_arg; } } return _First_arg; + } else { + // non-multiline mode: never matches because --_First_arg is valid + return _Last; } case _N_eol: - return _STD find(_First_arg, _Last, _Meta_nl); + if (_Sflags & regex_constants::multiline) { + // multiline mode: matches at next line terminator or end of input + if constexpr (sizeof(_Elem) == 1) { + return _STD find_first_of( + _First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char)); + } else { + return _STD find_first_of( + _First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t)); + } + } else { + return _Last; // non-multiline mode: matches at end of input or not at all + } case _N_str: { // check for string match diff --git a/tests/libcxx/expected_results.txt 
b/tests/libcxx/expected_results.txt index ed1781f9b1a..d5f9274662e 100644 --- a/tests/libcxx/expected_results.txt +++ b/tests/libcxx/expected_results.txt @@ -575,12 +575,6 @@ std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp FAIL # *** MISSING LWG ISSUE RESOLUTIONS *** -# LWG-2503 "multiline option should be added to syntax_option_type" -std/re/re.alg/re.alg.search/no_update_pos.pass.cpp FAIL -std/re/re.const/re.matchflag/match_multiline.pass.cpp FAIL -std/re/re.const/re.matchflag/match_not_eol.pass.cpp FAIL -std/re/re.const/re.synopt/syntax_option_type.pass.cpp FAIL - # LWG-2532 "Satisfying a promise at thread exit" (Open) std/thread/futures/futures.promise/set_exception_at_thread_exit.pass.cpp FAIL std/thread/futures/futures.promise/set_lvalue_at_thread_exit.pass.cpp FAIL diff --git a/tests/std/test.lst b/tests/std/test.lst index 4c7c40603d1..46be77e6b8b 100644 --- a/tests/std/test.lst +++ b/tests/std/test.lst @@ -154,6 +154,7 @@ tests\Dev11_1140665_unique_ptr_array_conversions tests\Dev11_1150223_shared_mutex tests\Dev11_1158803_regex_thread_safety tests\Dev11_1180290_filesystem_error_code +tests\GH_000073_regex_multiline_escape_hatch tests\GH_000140_adl_proof_comparison tests\GH_000140_adl_proof_construction tests\GH_000140_adl_proof_views diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst new file mode 100644 index 00000000000..19f025bd0e6 --- /dev/null +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/env.lst @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +RUNALL_INCLUDE ..\usual_matrix.lst diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp new file mode 100644 index 00000000000..ea66b886982 --- /dev/null +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT + +#include +#include +#include +#include + +#include + +using namespace std; +using namespace std::regex_constants; + +regex_fixture g_regexTester; + +void test_VSO_225160_match_bol_flag() { + // Old tests for caret anchor in default multiline mode + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_bol); + emptyAnchor.should_search_match("\n", ""); + emptyAnchor.should_search_match("\n", "", match_not_bol); + + const test_regex beginCd(&g_regexTester, R"(^cd)", syntax); + beginCd.should_search_match("ab\ncdefg", "cd"); + beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol); + + beginCd.should_search_match("cdefg", "cd"); + beginCd.should_search_fail("cdefg", match_not_bol); + beginCd.should_search_match("\ncdefg", "cd"); + beginCd.should_search_match("\ncdefg", "cd", match_not_bol); + + beginCd.should_search_fail("ab\nxcdefg"); + beginCd.should_search_fail("ab\nxcdefg", match_not_bol); + } +} + +void test_VSO_225160_match_eol_flag() { + // Old tests for dollar anchor in default multiline mode + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + const test_regex emptyAnchor(&g_regexTester, R"($)", syntax); + emptyAnchor.should_search_match("", ""); + 
emptyAnchor.should_search_fail("", match_not_eol); + emptyAnchor.should_search_match("\n", ""); + emptyAnchor.should_search_match("\n", "", match_not_eol); + + const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax); + cdEnd.should_search_match("abcd\nefg", "cd"); + cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol); + + cdEnd.should_search_match("abcd", "cd"); + cdEnd.should_search_fail("abcd", match_not_eol); + cdEnd.should_search_match("abcd\n", "cd"); + cdEnd.should_search_match("abcd\n", "cd", match_not_eol); + + cdEnd.should_search_fail("abcdx\nefg"); + cdEnd.should_search_fail("abcdx\nefg", match_not_eol); + } +} + +void test_gh_73() { + for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_match("b\na", "a"); + a_anchored_on_both_sides.should_search_match("a\nb", "a"); + a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol); + a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", syntax); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_match("b\na", "a"); + a_anchored_front.should_search_match("\na", "a"); + a_anchored_front.should_search_fail("a", match_not_bol); + a_anchored_front.should_search_match("\na", "a", match_not_bol); + a_anchored_front.should_search_match("b\na", "a", match_not_bol); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", syntax); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_match("a\nb", "a"); + 
a_anchored_back.should_search_match("a\n", "a"); + a_anchored_back.should_search_fail("a", match_not_eol); + a_anchored_back.should_search_match("a\n", "a", match_not_eol); + a_anchored_back.should_search_match("a\nb", "a", match_not_eol); + } + } +} + +int main() { + test_VSO_225160_match_bol_flag(); + test_VSO_225160_match_eol_flag(); + test_gh_73(); + + return g_regexTester.result(); +} diff --git a/tests/std/tests/VSO_0000000_regex_interface/test.cpp b/tests/std/tests/VSO_0000000_regex_interface/test.cpp index c057696eb3a..3fd281f4b85 100644 --- a/tests/std/tests/VSO_0000000_regex_interface/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_interface/test.cpp @@ -387,9 +387,9 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() { } void test_VSO_226914_match_prev_avail() { - // N.B. assumes our nonstandard multiline behavior. See also: LWG-2343, LWG-2503 + // test assumes multiline mode const char bol_haystack[] = {'\n', 'a'}; - const regex bol_anchor(R"(^a)"); + const regex bol_anchor(R"(^a)", regex_constants::multiline); assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor)); assert(!regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_not_bol)); assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_prev_avail)); diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 7160c9007fc..066a2f957e9 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -453,48 +453,95 @@ void test_VSO_208146_regex_smoke_test_rewritten_explicit_quantifier() { } void test_VSO_225160_match_bol_flag() { - // Note that this tests that we are consistent about the ECMAScript "multiline" setting being - // true, but the standard currently appears to mandate that that is false. We don't want to - // break existing customers, but we should at least be consistently multiline. 
- // See also: LWG-2343, LWG-2503 - const test_regex emptyAnchor(&g_regexTester, R"(^)"); - emptyAnchor.should_search_match("", ""); - emptyAnchor.should_search_fail("", match_not_bol); - emptyAnchor.should_search_match("\n", ""); - emptyAnchor.should_search_match("\n", "", match_not_bol); - - const test_regex beginCd(&g_regexTester, R"(^cd)"); - beginCd.should_search_match("ab\ncdefg", "cd"); - beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol); - - beginCd.should_search_match("cdefg", "cd"); - beginCd.should_search_fail("cdefg", match_not_bol); - beginCd.should_search_match("\ncdefg", "cd"); - beginCd.should_search_match("\ncdefg", "cd", match_not_bol); - - beginCd.should_search_fail("ab\nxcdefg"); - beginCd.should_search_fail("ab\nxcdefg", match_not_bol); + // After implementation of LWG-2503/GH-73: These tests make sure that + // we consistently implement "multiline" option for ECMAScript + // (whether the ECMAScript flag is included or not) + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + for (string line_terminator : {"\n", "\r"}) { + const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_bol); + emptyAnchor.should_search_match(line_terminator, ""); + emptyAnchor.should_search_match(line_terminator, "", match_not_bol); + + const test_regex beginCd(&g_regexTester, R"(^cd)", syntax); + beginCd.should_search_match("ab" + line_terminator + "cdefg", "cd"); + beginCd.should_search_match("ab" + line_terminator + "cdefg", "cd", match_not_bol); + + beginCd.should_search_match("cdefg", "cd"); + beginCd.should_search_fail("cdefg", match_not_bol); + beginCd.should_search_match(line_terminator + "cdefg", "cd"); + beginCd.should_search_match(line_terminator + "cdefg", "cd", match_not_bol); + + beginCd.should_search_fail("ab" + line_terminator + "xcdefg"); + beginCd.should_search_fail("ab" + line_terminator + "xcdefg", match_not_bol); + } + 
+ for (wstring line_terminator : {L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + const test_wregex emptyAnchor(&g_regexTester, LR"(^)", syntax); + emptyAnchor.should_search_match(L"", L""); + emptyAnchor.should_search_fail(L"", match_not_bol); + emptyAnchor.should_search_match(line_terminator, L""); + emptyAnchor.should_search_match(line_terminator, L"", match_not_bol); + + const test_wregex beginCd(&g_regexTester, LR"(^cd)", syntax); + beginCd.should_search_match(L"ab" + line_terminator + L"cdefg", L"cd"); + beginCd.should_search_match(L"ab" + line_terminator + L"cdefg", L"cd", match_not_bol); + + beginCd.should_search_match(L"cdefg", L"cd"); + beginCd.should_search_fail(L"cdefg", match_not_bol); + beginCd.should_search_match(line_terminator + L"cdefg", L"cd"); + beginCd.should_search_match(line_terminator + L"cdefg", L"cd", match_not_bol); + + beginCd.should_search_fail(L"ab" + line_terminator + L"xcdefg"); + beginCd.should_search_fail(L"ab" + line_terminator + L"xcdefg", match_not_bol); + } + } } void test_VSO_225160_match_eol_flag() { // Ditto multiline comment - const test_regex emptyAnchor(&g_regexTester, R"($)"); - emptyAnchor.should_search_match("", ""); - emptyAnchor.should_search_fail("", match_not_eol); - emptyAnchor.should_search_match("\n", ""); - emptyAnchor.should_search_match("\n", "", match_not_eol); - - const test_regex cdEnd(&g_regexTester, R"(cd$)"); - cdEnd.should_search_match("abcd\nefg", "cd"); - cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol); - - cdEnd.should_search_match("abcd", "cd"); - cdEnd.should_search_fail("abcd", match_not_eol); - cdEnd.should_search_match("abcd\n", "cd"); - cdEnd.should_search_match("abcd\n", "cd", match_not_eol); - - cdEnd.should_search_fail("abcdx\nefg"); - cdEnd.should_search_fail("abcdx\nefg", match_not_eol); + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + for (string line_terminator : {"\n", "\r"}) { + const test_regex 
emptyAnchor(&g_regexTester, R"($)", syntax); + emptyAnchor.should_search_match("", ""); + emptyAnchor.should_search_fail("", match_not_eol); + emptyAnchor.should_search_match(line_terminator, ""); + emptyAnchor.should_search_match(line_terminator, "", match_not_eol); + + const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax); + cdEnd.should_search_match("abcd" + line_terminator + "efg", "cd"); + cdEnd.should_search_match("abcd" + line_terminator + "efg", "cd", match_not_eol); + + cdEnd.should_search_match("abcd", "cd"); + cdEnd.should_search_fail("abcd", match_not_eol); + cdEnd.should_search_match("abcd" + line_terminator, "cd"); + cdEnd.should_search_match("abcd" + line_terminator, "cd", match_not_eol); + + cdEnd.should_search_fail("abcdx" + line_terminator + "efg"); + cdEnd.should_search_fail("abcdx" + line_terminator + "efg", match_not_eol); + } + + for (wstring line_terminator : {L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + const test_wregex emptyAnchor(&g_regexTester, LR"($)", syntax); + emptyAnchor.should_search_match(L"", L""); + emptyAnchor.should_search_fail(L"", match_not_eol); + emptyAnchor.should_search_match(line_terminator, L""); + emptyAnchor.should_search_match(line_terminator, L"", match_not_eol); + + const test_wregex cdEnd(&g_regexTester, LR"(cd$)", syntax); + cdEnd.should_search_match(L"abcd" + line_terminator + L"efg", L"cd"); + cdEnd.should_search_match(L"abcd" + line_terminator + L"efg", L"cd", match_not_eol); + + cdEnd.should_search_match(L"abcd", L"cd"); + cdEnd.should_search_fail(L"abcd", match_not_eol); + cdEnd.should_search_match(L"abcd" + line_terminator, L"cd"); + cdEnd.should_search_match(L"abcd" + line_terminator, L"cd", match_not_eol); + + cdEnd.should_search_fail(L"abcdx" + line_terminator + L"efg"); + cdEnd.should_search_fail(L"abcdx" + line_terminator + L"efg", match_not_eol); + } + } } void test_VSO_226914_word_boundaries() { @@ -558,8 +605,102 @@ void 
test_construction_from_nullptr_and_zero() { } } +void test_gh_73() { + // GH-73: LWG-2503 multiline option should be added to syntax_option_type + for (syntax_option_type grammar : {basic, grep, extended, egrep, awk}) { + for (syntax_option_type multiline_mode : {syntax_option_type{}, multiline}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", grammar | multiline_mode); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_fail("b\na"); + a_anchored_on_both_sides.should_search_fail("a\nb"); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", grammar | multiline_mode); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_fail("b\na"); + a_anchored_front.should_search_fail("\na"); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", grammar | multiline_mode); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_fail("a\nb"); + a_anchored_back.should_search_fail("a\n"); + } + } + } + + for (syntax_option_type grammar : {syntax_option_type{}, ECMAScript}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", grammar); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_fail("b\na"); + a_anchored_on_both_sides.should_search_fail("a\nb"); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", grammar); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_fail("b\na"); + a_anchored_front.should_search_fail("\na"); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", grammar); + a_anchored_back.should_search_match("a", "a"); 
+ a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_fail("a\nb"); + a_anchored_back.should_search_fail("a\n"); + } + } + + for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { + { + test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax); + a_anchored_on_both_sides.should_search_match("a", "a"); + a_anchored_on_both_sides.should_search_match("b\na", "a"); + a_anchored_on_both_sides.should_search_match("a\nb", "a"); + a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol); + a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol); + } + + { + test_regex a_anchored_front(&g_regexTester, "^a", syntax); + a_anchored_front.should_search_match("a", "a"); + a_anchored_front.should_search_match("a\n", "a"); + a_anchored_front.should_search_match("a\nb", "a"); + a_anchored_front.should_search_match("b\na", "a"); + a_anchored_front.should_search_match("\na", "a"); + a_anchored_front.should_search_fail("a", match_not_bol); + a_anchored_front.should_search_match("\na", "a", match_not_bol); + a_anchored_front.should_search_match("b\na", "a", match_not_bol); + } + + { + test_regex a_anchored_back(&g_regexTester, "a$", syntax); + a_anchored_back.should_search_match("a", "a"); + a_anchored_back.should_search_match("\na", "a"); + a_anchored_back.should_search_match("b\na", "a"); + a_anchored_back.should_search_match("a\nb", "a"); + a_anchored_back.should_search_match("a\n", "a"); + a_anchored_back.should_search_fail("a", match_not_eol); + a_anchored_back.should_search_match("a\n", "a", match_not_eol); + a_anchored_back.should_search_match("a\nb", "a", match_not_eol); + } + } +} + void test_gh_731() { - // GH-731 : Incorrect behavior for capture groups + // GH-731: : Incorrect behavior for capture groups // GH-996: regex_search behaves incorrectly when the regex contains R"(\[)" // Several bugs were fixed in ECMAScript (depth-first) and POSIX 
(leftmost-longest) matching rules. @@ -1533,7 +1674,7 @@ void test_gh_5362_grep() { { const test_regex middle_nl_with_dollar(&g_regexTester, "a$\nb$", grep); middle_nl_with_dollar.should_search_match("a$\nb", "b"); - middle_nl_with_dollar.should_search_match("a\nb", "a"); + middle_nl_with_dollar.should_search_match("a\nb", "b"); middle_nl_with_dollar.should_search_match("ba", "a"); middle_nl_with_dollar.should_search_match("a", "a"); middle_nl_with_dollar.should_search_match("b", "b"); @@ -1913,16 +2054,28 @@ void test_gh_5509() { } { - test_regex anchored_string_plus_regex(&g_regexTester, "((?:^aw)+)"); - anchored_string_plus_regex.should_search_match_capture_groups( + test_regex anchored_string_plus_regex_multi(&g_regexTester, "((?:^aw)+)", multiline); + anchored_string_plus_regex_multi.should_search_match_capture_groups( "blwerofa\nawaweraf", "aw", match_default, {{9, 11}}); + anchored_string_plus_regex_multi.should_search_fail("blwerof\naerwaf"); + } + + { + test_regex anchored_string_plus_regex(&g_regexTester, "((?:^aw)+)"); + anchored_string_plus_regex.should_search_fail("blwerofa\nawaweraf"); anchored_string_plus_regex.should_search_fail("blwerof\naerwaf"); } { - test_regex anchored_string_plus_regex(&g_regexTester, "((?:$\naw)+)"); - anchored_string_plus_regex.should_search_match_capture_groups( + test_regex anchored_string_plus_regex_multi(&g_regexTester, "((?:$\naw)+)", multiline); + anchored_string_plus_regex_multi.should_search_match_capture_groups( "blwerofa\nawaweraf", "\naw", match_default, {{8, 11}}); + anchored_string_plus_regex_multi.should_search_fail("blwerof\naerwaf"); + } + + { + test_regex anchored_string_plus_regex(&g_regexTester, "((?:$\naw)+)"); + anchored_string_plus_regex.should_search_fail("blwerofa\nawaweraf"); anchored_string_plus_regex.should_search_fail("blwerof\naerwaf"); } @@ -1964,6 +2117,7 @@ int main() { test_VSO_225160_match_eol_flag(); test_VSO_226914_word_boundaries(); test_construction_from_nullptr_and_zero(); + 
test_gh_73(); test_gh_731(); test_gh_992(); test_gh_993(); diff --git a/tests/tr1/tests/regex2/test.cpp b/tests/tr1/tests/regex2/test.cpp index 83706bb66bc..c720774d3c2 100644 --- a/tests/tr1/tests/regex2/test.cpp +++ b/tests/tr1/tests/regex2/test.cpp @@ -132,7 +132,7 @@ static const regex_test tests[] = { {__LINE__, T("a$"), T("ba"), "1 1 2", ALL}, {__LINE__, T("a$"), T("ab"), "0", ALL}, - {__LINE__, T("^a$"), T("b\na"), "1 2 3", ALL}, + {__LINE__, T("^a$"), T("b\na"), "0", ALL}, {__LINE__, T("\\b"), T("a"), "1 0 0", ECMA}, {__LINE__, T("\\b"), T(""), "-1", BASIC | GREP | EXTENDED | EGREP}, From ec646527b5db77ecc476ea3b7ac4943d2a29f700 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Jun 2025 12:58:46 -0700 Subject: [PATCH 2/6] Update comments. --- tests/std/tests/VSO_0000000_regex_interface/test.cpp | 2 +- tests/std/tests/VSO_0000000_regex_use/test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_interface/test.cpp b/tests/std/tests/VSO_0000000_regex_interface/test.cpp index 3fd281f4b85..c05c4aa4173 100644 --- a/tests/std/tests/VSO_0000000_regex_interface/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_interface/test.cpp @@ -387,7 +387,7 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() { } void test_VSO_226914_match_prev_avail() { - // test assumes multiline mode + // test exercises multiline mode const char bol_haystack[] = {'\n', 'a'}; const regex bol_anchor(R"(^a)", regex_constants::multiline); assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor)); diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 31da2d9685b..282c64cbafe 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -454,7 +454,7 @@ void test_VSO_208146_regex_smoke_test_rewritten_explicit_quantifier() { void test_VSO_225160_match_bol_flag() { // After implementation of 
LWG-2503/GH-73: These tests make sure that - // we consistently implement "multiline" option for ECMAScript + // we consistently implement the "multiline" option for ECMAScript // (whether the ECMAScript flag is included or not) for (syntax_option_type syntax : {multiline, ECMAScript | multiline}) { for (string line_terminator : {"\n", "\r"}) { From 34fde65b3956940933a7ea4934aff0ec13222ce0 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Jun 2025 13:16:13 -0700 Subject: [PATCH 3/6] Use static constexpr for arrays. --- stl/inc/regex | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 5cec65f3870..3187a9e45ca 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -4071,10 +4071,10 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc> _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) { // skip until possible match // assumes --_First_arg is valid - constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)}; - constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr), static_cast<wchar_t>(_Meta_nl), - static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)}; - _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep; + static constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)}; + static constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr), + static_cast<wchar_t>(_Meta_nl), static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)}; + _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep; while (_First_arg != _Last && _Nx) { // check current node switch (_Nx->_Kind) { // handle current node's type From 776a7c57c4893f8bc0127d6095bab636ade08515 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Jun 2025 14:20:11 -0700 Subject: [PATCH 4/6] Test wide CR and LF as line terminators. 
--- tests/std/tests/VSO_0000000_regex_use/test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 282c64cbafe..92fe235892f 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -477,7 +477,8 @@ void test_VSO_225160_match_bol_flag() { beginCd.should_search_fail("ab" + line_terminator + "xcdefg", match_not_bol); } - for (wstring line_terminator : {L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + for (wstring line_terminator : + {L"\n", L"\r", L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR const test_wregex emptyAnchor(&g_regexTester, LR"(^)", syntax); emptyAnchor.should_search_match(L"", L""); emptyAnchor.should_search_fail(L"", match_not_bol); @@ -522,7 +523,8 @@ void test_VSO_225160_match_eol_flag() { cdEnd.should_search_fail("abcdx" + line_terminator + "efg", match_not_eol); } - for (wstring line_terminator : {L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR + for (wstring line_terminator : + {L"\n", L"\r", L"\u2028", L"\u2029"}) { // U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR const test_wregex emptyAnchor(&g_regexTester, LR"($)", syntax); emptyAnchor.should_search_match(L"", L""); emptyAnchor.should_search_fail(L"", match_not_eol); From dd1c7a19699c77d01aad7ed007914d529d0a11d2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Jun 2025 15:12:18 -0700 Subject: [PATCH 5/6] Make the option always defined to 0 or 1, add a detailed comment. 
--- stl/inc/regex | 16 +++++++++++++--- .../test.cpp | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 3187a9e45ca..bba27650ded 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -33,6 +33,16 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +// Controls whether LWG-2503 "multiline option should be added to syntax_option_type" is implemented. +// Defining this to 0 requests Standard behavior: +// * For ECMAScript, matching is non-multiline by default, but regex_constants::multiline can be requested. +// * For POSIX grammars, matching is non-multiline, and regex_constants::multiline is ignored (N5008 [tab:re.synopt]). +// Defining this to 1 requests legacy behavior: +// * For all grammars, matching is multiline, and regex_constants::multiline is redundant. +#ifndef _REGEX_MAKE_MULTILINE_MODE_DEFAULT +#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT 0 +#endif + #ifndef _REGEX_MAX_COMPLEXITY_COUNT #define _REGEX_MAX_COMPLEXITY_COUNT 10000000L // set to 0 to disable #endif // !defined(_REGEX_MAX_COMPLEXITY_COUNT) @@ -1669,13 +1679,13 @@ public: } // sanitize multiline mode setting -#ifdef _REGEX_MAKE_MULTILINE_MODE_DEFAULT +#if _REGEX_MAKE_MULTILINE_MODE_DEFAULT _Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars -#else // ^^^ defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) / !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) vvv +#else // ^^^ _REGEX_MAKE_MULTILINE_MODE_DEFAULT / !_REGEX_MAKE_MULTILINE_MODE_DEFAULT vvv if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only _Sflags &= ~regex_constants::multiline; } -#endif // ^^^ !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) ^^^ +#endif // ^^^ !_REGEX_MAKE_MULTILINE_MODE_DEFAULT ^^^ } void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp 
index ea66b886982..b494174dfcc 100644 --- a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT +#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT 1 #include #include From 549d1cedcc84e074cc4d8a39ddd71bb6031e5754 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Jun 2025 15:14:46 -0700 Subject: [PATCH 6/6] Rename to `_REGEX_LEGACY_MULTILINE_MODE`. --- stl/inc/regex | 10 +++++----- .../GH_000073_regex_multiline_escape_hatch/test.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index bba27650ded..88ec4f59821 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -39,8 +39,8 @@ _STL_DISABLE_CLANG_WARNINGS // * For POSIX grammars, matching is non-multiline, and regex_constants::multiline is ignored (N5008 [tab:re.synopt]). // Defining this to 1 requests legacy behavior: // * For all grammars, matching is multiline, and regex_constants::multiline is redundant. 
-#ifndef _REGEX_MAKE_MULTILINE_MODE_DEFAULT -#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT 0 +#ifndef _REGEX_LEGACY_MULTILINE_MODE +#define _REGEX_LEGACY_MULTILINE_MODE 0 #endif #ifndef _REGEX_MAX_COMPLEXITY_COUNT @@ -1679,13 +1679,13 @@ public: } // sanitize multiline mode setting -#if _REGEX_MAKE_MULTILINE_MODE_DEFAULT +#if _REGEX_LEGACY_MULTILINE_MODE _Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars -#else // ^^^ _REGEX_MAKE_MULTILINE_MODE_DEFAULT / !_REGEX_MAKE_MULTILINE_MODE_DEFAULT vvv +#else // ^^^ _REGEX_LEGACY_MULTILINE_MODE / !_REGEX_LEGACY_MULTILINE_MODE vvv if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only _Sflags &= ~regex_constants::multiline; } -#endif // ^^^ !_REGEX_MAKE_MULTILINE_MODE_DEFAULT ^^^ +#endif // ^^^ !_REGEX_LEGACY_MULTILINE_MODE ^^^ } void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags diff --git a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp index b494174dfcc..31968afa26c 100644 --- a/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp +++ b/tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT 1 +#define _REGEX_LEGACY_MULTILINE_MODE 1 #include #include