From 071789070f94fa2d13154568e93bb02c8159c443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Wed, 23 Apr 2025 19:14:48 +0200 Subject: [PATCH 1/2] `<locale>`: Improve `std::collate::do_transform()`'s handling of wrongly encoded input --- stl/inc/locale | 8 +- stl/src/xstrxfrm.cpp | 2 +- stl/src/xwcsxfrm.cpp | 4 +- .../tests/GH_005236_collate_facet/test.cpp | 78 +++++++++++++++++++ 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/stl/inc/locale b/stl/inc/locale index 41963efa50..cc08f4d4ee 100644 --- a/stl/inc/locale +++ b/stl/inc/locale @@ -166,10 +166,14 @@ protected: size_t _Count; string_type _Str; - for (_Count = static_cast<size_t>(_Last - _First); 0 < _Count;) { + for (_Count = static_cast<size_t>(_Last - _First); _Str.size() < _Count;) { // grow string if locale-specific strxfrm fails _Str.resize(_Count); - if ((_Count = _LStrxfrm(&_Str[0], &_Str[0] + _Str.size(), _First, _Last, &_Coll)) <= _Str.size()) { + _Count = _LStrxfrm(&_Str[0], &_Str[0] + _Str.size(), _First, _Last, &_Coll); + + if (_Count == static_cast<size_t>(-1)) { + // return empty string in case of error + _Count = 0; break; } } diff --git a/stl/src/xstrxfrm.cpp b/stl/src/xstrxfrm.cpp index 193d957df2..c094a843a4 100644 --- a/stl/src/xstrxfrm.cpp +++ b/stl/src/xstrxfrm.cpp @@ -50,7 +50,7 @@ _EXTERN_C_UNLESS_PURE // string1 array are indeterminate. // // Exceptions: -// Non-standard: if OM/API error, return INT_MAX. +// Non-standard: if OM/API error, return SIZE_MAX. _CRTIMP2_PURE size_t __CLRCALL_PURE_OR_CDECL _Strxfrm(_Out_writes_(end1 - string1) _Post_readable_size_(return) char* string1, _In_z_ char* end1, const char* string2, const char* end2, const _Collvec* ploc) noexcept { diff --git a/stl/src/xwcsxfrm.cpp b/stl/src/xwcsxfrm.cpp index f3a2300dac..631c735d63 100644 --- a/stl/src/xwcsxfrm.cpp +++ b/stl/src/xwcsxfrm.cpp @@ -43,7 +43,7 @@ _EXTERN_C_UNLESS_PURE // string1 array are indeterminate. // // Exceptions: -// Non-standard: if OM/API error, return INT_MAX.
+// Non-standard: if OM/API error, return SIZE_MAX. _CRTIMP2_PURE size_t __CLRCALL_PURE_OR_CDECL _Wcsxfrm(_Out_writes_(end1 - string1) _Post_readable_size_(return) wchar_t* string1, _In_z_ wchar_t* end1, const wchar_t* string2, const wchar_t* end2, const _Collvec* ploc) noexcept { @@ -84,7 +84,7 @@ _CRTIMP2_PURE size_t __CLRCALL_PURE_OR_CDECL _Wcsxfrm(_Out_writes_(end1 - string size = __crtLCMapStringW(locale_name, LCMAP_SORTKEY, string2, static_cast<int>(n2), nullptr, 0); if (size == 0) { - size = INT_MAX; // default error + size = static_cast<size_t>(-1); // default error } } else { // string successfully mapped, convert to wide char diff --git a/tests/std/tests/GH_005236_collate_facet/test.cpp b/tests/std/tests/GH_005236_collate_facet/test.cpp index a54c301585..4a8c4634dd 100644 --- a/tests/std/tests/GH_005236_collate_facet/test.cpp +++ b/tests/std/tests/GH_005236_collate_facet/test.cpp @@ -19,6 +19,83 @@ using namespace std; +// GH-5210 "std::collate<_Elem>::do_transform() should behave appropriately when _LStrxfrm() fails" +void test_gh_5210() { +#ifndef SKIP_COLLATE_TRANSFORM_TESTS + { + locale utf8_locale("en_US.utf8"); + const auto& coll = use_facet<collate<char>>(utf8_locale); + + const string test = "this i\xA0s a very brok\x80n utf-8\xC8string"; + assert(coll.transform(test.data(), test.data() + test.size()) == string{}); + } + + { + locale en_us_locale("en_US"); + const auto& coll = use_facet<collate<char>>(en_us_locale); + + { + const string test1 = "fluffy kittens"; + const string test2 = "fluffy Kittens"; + assert(coll.transform(test1.data(), test1.data() + test1.size()) + < coll.transform(test2.data(), test2.data() + test2.size())); + } + { + const string test1 = "Riddle"; + const string test2 = "middle"; + assert(coll.transform(test1.data(), test1.data() + test1.size()) + > coll.transform(test2.data(), test2.data() + test2.size())); + } + } + + { + locale en_us_locale("en_US"); + const auto& coll = use_facet<collate<wchar_t>>(en_us_locale); + + { + const wstring test1 = L"fluffy kittens"; + const wstring
test2 = L"fluffy Kittens"; + assert(coll.transform(test1.data(), test1.data() + test1.size()) + < coll.transform(test2.data(), test2.data() + test2.size())); + } + { + const wstring test1 = L"Riddle"; + const wstring test2 = L"middle"; + assert(coll.transform(test1.data(), test1.data() + test1.size()) + > coll.transform(test2.data(), test2.data() + test2.size())); + } + } + + { + locale de_DE_phone_locale("de_DE_phoneb"); + const auto& coll = use_facet<collate<wchar_t>>(de_DE_phone_locale); + + { + const wstring test1 = L"Strasse"; + const wstring test2 = L"Stra\u00DFe"; // U+00DF LATIN SMALL LETTER SHARP S + + // sharp s collates like "ss" + assert(coll.transform(test1.data(), test1.data() + test1.size()) + == coll.transform(test2.data(), test2.data() + test2.size())); + } + { + const wstring test1 = L"Kachel"; + const wstring test2 = L"Kaetzchen"; + const wstring test3 = L"K\u00E4tzchen"; // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + const wstring test4 = L"Kater"; + + // umlaut a collates like "ae" + assert(coll.transform(test1.data(), test1.data() + test1.size()) + < coll.transform(test2.data(), test2.data() + test2.size())); + assert(coll.transform(test2.data(), test2.data() + test2.size()) + == coll.transform(test3.data(), test3.data() + test3.size())); + assert(coll.transform(test3.data(), test3.data() + test3.size()) + < coll.transform(test4.data(), test4.data() + test4.size())); + } + } +#endif // !defined(SKIP_COLLATE_TRANSFORM_TESTS) +} + // GH-5236 "std::collate does not respect collation order when compiled with /MD(d) /Zc:wchar_t-" void test_gh_5236() { const wchar_t Ue = L'\u00DC'; // U+00DC LATIN CAPITAL LETTER U WITH DIARESIS @@ -39,5 +116,6 @@ void test_gh_5236() { } int main() { + test_gh_5210(); test_gh_5236(); } From 3eada712b05526b5efdae0ca34cf1336f99ea92b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 24 Apr 2025 15:18:49 -0700 Subject: [PATCH 2/2] Use conventional locale names.
--- tests/std/tests/GH_005236_collate_facet/test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/std/tests/GH_005236_collate_facet/test.cpp b/tests/std/tests/GH_005236_collate_facet/test.cpp index 4a8c4634dd..4938f5cc1a 100644 --- a/tests/std/tests/GH_005236_collate_facet/test.cpp +++ b/tests/std/tests/GH_005236_collate_facet/test.cpp @@ -23,7 +23,7 @@ using namespace std; void test_gh_5210() { #ifndef SKIP_COLLATE_TRANSFORM_TESTS { - locale utf8_locale("en_US.utf8"); + locale utf8_locale("en-US.UTF-8"); const auto& coll = use_facet<collate<char>>(utf8_locale); const string test = "this i\xA0s a very brok\x80n utf-8\xC8string"; @@ -31,7 +31,7 @@ void test_gh_5210() { } { - locale en_us_locale("en_US"); + locale en_us_locale("en-US"); const auto& coll = use_facet<collate<char>>(en_us_locale); { @@ -49,7 +49,7 @@ void test_gh_5210() { } { - locale en_us_locale("en_US"); + locale en_us_locale("en-US"); const auto& coll = use_facet<collate<wchar_t>>(en_us_locale); { @@ -67,7 +67,7 @@ void test_gh_5210() { } { - locale de_DE_phone_locale("de_DE_phoneb"); + locale de_DE_phone_locale("de-DE_phoneb"); const auto& coll = use_facet<collate<wchar_t>>(de_DE_phone_locale); {