From 95099b3d5e95f676e3d5d6c35c6feb8037deaed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Fri, 6 Dec 2024 14:09:36 +0100 Subject: [PATCH 1/5] `<regex>`: Limit backreference parsing to single digit for basic regular expressions --- stl/inc/regex | 21 +++++++++++-------- .../std/tests/VSO_0000000_regex_use/test.cpp | 15 +++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 77efc9d32f3..95c54540eb2 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1187,7 +1187,8 @@ _NODISCARD bool operator!=(const match_results<_BidIt, _Alloc>& _Left, const mat } #endif // !_HAS_CXX20 -_INLINE_VAR constexpr unsigned int _BRE_MAX_GRP = 9U; +_INLINE_VAR constexpr unsigned int _BRE_MAX_GRP = 9U; +_INLINE_VAR constexpr int _BRE_MAX_BACKREF_DIGITS = 1; _INLINE_VAR constexpr unsigned int _Bmp_max = 256U; // must fit in an unsigned int _INLINE_VAR constexpr unsigned int _Bmp_shift = 3U; @@ -1705,7 +1706,7 @@ private: // parsing int _Do_digits(int _Base, int _Count, regex_constants::error_type _Error_type); - bool _DecimalDigits(regex_constants::error_type _Error_type); + bool _DecimalDigits2(regex_constants::error_type _Error_type, int _Count = INT_MAX); void _HexDigits(int); bool _OctalDigits(); void _Do_ex_class(_Meta_type); @@ -3950,9 +3951,9 @@ int _Parser<_FwdIt, _Elem, _RxTraits>::_Do_digits( } template -bool _Parser<_FwdIt, _Elem, _RxTraits>::_DecimalDigits( - regex_constants::error_type _Error_type) { // check for decimal value - return _Do_digits(10, INT_MAX, _Error_type) != INT_MAX; +bool _Parser<_FwdIt, _Elem, _RxTraits>::_DecimalDigits2( + const regex_constants::error_type _Error_type, const int _Count /*= INT_MAX */) { // check for decimal value + return _Do_digits(10, _Count, _Error_type) != _Count; } template @@ -4041,7 +4042,7 @@ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassEscape2() { // check for class return _Prs_chr; } else if ((_L_flags & _L_esc_wsd) && _CharacterClassEscape(false)) { 
return _Prs_set; - } else if (_DecimalDigits(regex_constants::error_escape)) { // check for invalid value + } else if (_DecimalDigits2(regex_constants::error_escape)) { // check for invalid value if (_Val != 0) { _Error(regex_constants::error_escape); } @@ -4333,7 +4334,9 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape() { // check for valid template void _Parser<_FwdIt, _Elem, _RxTraits>::_AtomEscape() { // check for valid atom escape - if ((_L_flags & _L_bckr) && _DecimalDigits(regex_constants::error_backref)) { // check for valid back reference + if ((_L_flags & _L_bckr) + && _DecimalDigits2(regex_constants::error_backref, + (_L_flags & _L_lim_bckr) ? _BRE_MAX_BACKREF_DIGITS : INT_MAX)) { // check for valid back reference if (_Val == 0) { // handle \0 if (!(_L_flags & _L_bzr_chr)) { _Error(regex_constants::error_escape); @@ -4365,7 +4368,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Quantifier() { // check for quantifier _Max = 1; } else if (_Mchar == _Meta_lbr) { // check for valid bracketed value _Next(); - if (!_DecimalDigits(regex_constants::error_badbrace)) { + if (!_DecimalDigits2(regex_constants::error_badbrace)) { _Error(regex_constants::error_badbrace); } @@ -4375,7 +4378,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Quantifier() { // check for quantifier } else { // check for decimal constant following comma _Next(); if (_Mchar != _Meta_rbr) { - if (!_DecimalDigits(regex_constants::error_badbrace)) { + if (!_DecimalDigits2(regex_constants::error_badbrace)) { _Error(regex_constants::error_badbrace); } diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index ad2efc0d4a6..6b36d98e20c 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -669,6 +669,20 @@ void test_gh_5160() { neg_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE } +void test_gh_5167() { + // GH-5167: Limit backreference parsing to single 
digit for basic regular expressions + g_regexTester.should_match("abab0", R"x(\(ab*\)\10)x", basic); + g_regexTester.should_match("abab0", R"x(\(ab*\)\10)x", grep); + g_regexTester.should_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\25\18)x", basic); + g_regexTester.should_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\25\18)x", grep); + g_regexTester.should_not_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\15\28)x", basic); + g_regexTester.should_not_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\15\28)x", grep); + g_regexTester.should_throw(R"x(abc\1d)x", error_backref, basic); + g_regexTester.should_throw(R"x(abc\1d)x", error_backref, grep); + g_regexTester.should_throw(R"x(abc\10)x", error_backref, basic); + g_regexTester.should_throw(R"x(abc\10)x", error_backref, grep); +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -699,6 +713,7 @@ int main() { test_gh_4995(); test_gh_5058(); test_gh_5160(); + test_gh_5167(); return g_regexTester.result(); } From 843f0ce46c598bac5b65a94318a53285e2a2a32e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Tue, 10 Dec 2024 17:17:38 +0100 Subject: [PATCH 2/5] remove unnecessary check --- stl/inc/regex | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 95c54540eb2..384c7016b78 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1187,7 +1187,6 @@ _NODISCARD bool operator!=(const match_results<_BidIt, _Alloc>& _Left, const mat } #endif // !_HAS_CXX20 -_INLINE_VAR constexpr unsigned int _BRE_MAX_GRP = 9U; _INLINE_VAR constexpr int _BRE_MAX_BACKREF_DIGITS = 1; _INLINE_VAR constexpr unsigned int _Bmp_max = 256U; // must fit in an unsigned int @@ -4343,8 +4342,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_AtomEscape() { // check for valid atom } else { _Nfa._Add_char(static_cast<_Elem>(_Val)); } - } else if (((_L_flags & _L_lim_bckr) && _BRE_MAX_GRP < static_cast(_Val)) - || 
_Grp_idx < static_cast(_Val) || _Finished_grps.size() <= static_cast(_Val) + } else if (_Grp_idx < static_cast(_Val) || _Finished_grps.size() <= static_cast(_Val) || !_Finished_grps[static_cast(_Val)]) { _Error(regex_constants::error_backref); } else { From 7ce6eae84849a38f4b694cc6ec756b552db580f3 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 11 Jan 2025 12:55:15 -0800 Subject: [PATCH 3/5] Rename to `_Bre_max_backref_digits`, move to point of use. --- stl/inc/regex | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 384c7016b78..94039c36fcb 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1187,8 +1187,6 @@ _NODISCARD bool operator!=(const match_results<_BidIt, _Alloc>& _Left, const mat } #endif // !_HAS_CXX20 -_INLINE_VAR constexpr int _BRE_MAX_BACKREF_DIGITS = 1; - _INLINE_VAR constexpr unsigned int _Bmp_max = 256U; // must fit in an unsigned int _INLINE_VAR constexpr unsigned int _Bmp_shift = 3U; _INLINE_VAR constexpr unsigned int _Bmp_chrs = 1U << _Bmp_shift; // # of bits to be stored in each char @@ -4333,9 +4331,10 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape() { // check for valid template void _Parser<_FwdIt, _Elem, _RxTraits>::_AtomEscape() { // check for valid atom escape + constexpr int _Bre_max_backref_digits = 1; if ((_L_flags & _L_bckr) && _DecimalDigits2(regex_constants::error_backref, - (_L_flags & _L_lim_bckr) ? _BRE_MAX_BACKREF_DIGITS : INT_MAX)) { // check for valid back reference + (_L_flags & _L_lim_bckr) ? _Bre_max_backref_digits : INT_MAX)) { // check for valid back reference if (_Val == 0) { // handle \0 if (!(_L_flags & _L_bzr_chr)) { _Error(regex_constants::error_escape); From 3ca146b64997d294ff19c1b3b89dfda6a5586f48 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 11 Jan 2025 13:02:25 -0800 Subject: [PATCH 4/5] Space. The final frontier. 
--- stl/inc/regex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/regex b/stl/inc/regex index 94039c36fcb..42c7721545b 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3949,7 +3949,7 @@ int _Parser<_FwdIt, _Elem, _RxTraits>::_Do_digits( template bool _Parser<_FwdIt, _Elem, _RxTraits>::_DecimalDigits2( - const regex_constants::error_type _Error_type, const int _Count /*= INT_MAX */) { // check for decimal value + const regex_constants::error_type _Error_type, const int _Count /* = INT_MAX */) { // check for decimal value return _Do_digits(10, _Count, _Error_type) != _Count; } From 2a35fde8306e07cfa344a38700b1c0ca537fbc1d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 11 Jan 2025 13:09:12 -0800 Subject: [PATCH 5/5] `R"x(meow)x"` => `R"(meow)"` --- .../std/tests/VSO_0000000_regex_use/test.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 6b36d98e20c..7f1528039ab 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -671,16 +671,16 @@ void test_gh_5160() { void test_gh_5167() { // GH-5167: Limit backreference parsing to single digit for basic regular expressions - g_regexTester.should_match("abab0", R"x(\(ab*\)\10)x", basic); - g_regexTester.should_match("abab0", R"x(\(ab*\)\10)x", grep); - g_regexTester.should_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\25\18)x", basic); - g_regexTester.should_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\25\18)x", grep); - g_regexTester.should_not_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\15\28)x", basic); - g_regexTester.should_not_match("abbcdccdc5abb8", R"x(\(ab*\)\([cd]*\)\15\28)x", grep); - g_regexTester.should_throw(R"x(abc\1d)x", error_backref, basic); - g_regexTester.should_throw(R"x(abc\1d)x", error_backref, grep); - g_regexTester.should_throw(R"x(abc\10)x", error_backref, 
basic); - g_regexTester.should_throw(R"x(abc\10)x", error_backref, grep); + g_regexTester.should_match("abab0", R"(\(ab*\)\10)", basic); + g_regexTester.should_match("abab0", R"(\(ab*\)\10)", grep); + g_regexTester.should_match("abbcdccdc5abb8", R"(\(ab*\)\([cd]*\)\25\18)", basic); + g_regexTester.should_match("abbcdccdc5abb8", R"(\(ab*\)\([cd]*\)\25\18)", grep); + g_regexTester.should_not_match("abbcdccdc5abb8", R"(\(ab*\)\([cd]*\)\15\28)", basic); + g_regexTester.should_not_match("abbcdccdc5abb8", R"(\(ab*\)\([cd]*\)\15\28)", grep); + g_regexTester.should_throw(R"(abc\1d)", error_backref, basic); + g_regexTester.should_throw(R"(abc\1d)", error_backref, grep); + g_regexTester.should_throw(R"(abc\10)", error_backref, basic); + g_regexTester.should_throw(R"(abc\10)", error_backref, grep); } int main() {