From 9f574ff364d6e37bc5cf44e9bb7cef5f7a186cd4 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 20:15:19 +0300 Subject: [PATCH 01/46] test and benchmark --- benchmarks/CMakeLists.txt | 1 + benchmarks/src/bitset_from_string.cpp | 88 +++++++++++++++++++ .../VSO_0000000_vector_algorithms/test.cpp | 36 ++++++++ 3 files changed, 125 insertions(+) create mode 100644 benchmarks/src/bitset_from_string.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 32c6e1925fa..71c692a2dd5 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -108,6 +108,7 @@ function(add_benchmark name) target_compile_definitions(benchmark-${name} PRIVATE BENCHMARK_STATIC_DEFINE) endfunction() +add_benchmark(bitset_from_string src/bitset_from_string.cpp) add_benchmark(bitset_to_string src/bitset_to_string.cpp) add_benchmark(find_and_count src/find_and_count.cpp) add_benchmark(find_first_of src/find_first_of.cpp) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp new file mode 100644 index 00000000000..81bf82b2b57 --- /dev/null +++ b/benchmarks/src/bitset_from_string.cpp @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace { + template + std::basic_string random_digits_init(size_t min_length, const size_t n) { + mt19937_64 rnd{}; + uniform_int_distribution<> dis('0', '1'); + std::basic_string str; + + const size_t number_of_bitsets = (min_length + n - 1) / n; + if (number_of_bitsets == 0) { + std::abort(); + } + const size_t actual_size = number_of_bitsets * n + number_of_bitsets - 1; + + str.resize_and_overwrite(actual_size, [&dis, &rnd, n](charT* dest, size_t len) { + const charT* end = dest + len; + for (;;) { + for (size_t i = 0; i != n; ++i, ++dest) { + *dest = static_cast(dis(rnd)); + } + + if (dest == end) { + break; + } + + *dest = charT{'\0'}; + ++dest; + } + + return len; + }); + return str; + } + + enum class length_type : bool { char_count, null_term }; + + template + void BM_bitset_from_string(benchmark::State& state) { + const auto bit_string = random_digits_init(2048, N); + for (auto _ : state) { + benchmark::DoNotOptimize(bit_string); + const charT* data = bit_string.c_str(); + for (size_t pos = 0, max = bit_string.size(); pos < max; pos += N + 1) { + if constexpr (Length == length_type::char_count) { + bitset bs(data + pos, N); + benchmark::DoNotOptimize(bs); + } else { + bitset bs(data + pos); + benchmark::DoNotOptimize(bs); + } + } + } + } +} // namespace + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK_MAIN(); diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 5a7a1520412..0dcb9e509ac 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1049,6 +1049,15 @@ void test_randomized_bitset_base_count(mt19937_64& gen) { test_randomized_bitset_base(make_index_sequence{}, gen); } +template +void assert_throws_inv(F f) { + try { + f(); + assert(false); + } catch (std::invalid_argument&) { + } +} + void test_bitset(mt19937_64& gen) { assert(bitset<0>(0x0ULL).to_string() == ""); assert(bitset<0>(0xFEDCBA9876543210ULL).to_string() == ""); @@ -1090,6 +1099,33 @@ void test_bitset(mt19937_64& gen) { assert(bitset<75>(0xFEDCBA9876543210ULL).to_string() == U"000000000001111111011011100101110101001100001110110010101000011001000010000"); // not vectorized + assert(bitset<0>("").to_ullong() == 0); + assert(bitset<0>("1").to_ullong() == 0); + assert_throws_inv([] { (void) bitset<0>("x"); }); + + assert(bitset<45>("101110000000111010001011100101001111111111111").to_ullong() == 0x1701D1729FFFULL); + assert(bitset<45>("110101001100001110110010101000011001000010000").to_ullong() == 0x1A9876543210ULL); + assert(bitset<45>("111").to_ullong() == 0x7); + assert_throws_inv([] { (void) bitset<45>("11x11"); }); + + assert(bitset<64>( + "xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", basic_string::npos, 'o', 'x') + .to_ullong() + == 0xFEDCBA9876543210ULL); + assert(bitset<64>(L"xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", basic_string::npos, + L'o', L'x') + .to_ullong() + == 0xFEDCBA9876543210ULL); + +#ifdef __cpp_lib_char8_t + assert(bitset<75>(u8"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); +#endif // __cpp_lib_char8_t + assert(bitset<75>(u"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); + assert(bitset<75>(U"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); // not vectorized + test_randomized_bitset_base_count<512 - 5, 32 + 10>(gen); } From 4318dc5d04cc6b8261c101863c65ebb47d856670 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 20:22:00 +0300 Subject: [PATCH 02/46] null hypothesis --- stl/inc/bitset | 84 ++++++++++++++++++++++------------- stl/src/vector_algorithms.cpp | 48 ++++++++++++++++++++ 2 files changed, 102 insertions(+), 30 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index f7eba3f7c7a..7135ac95148 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -28,6 +28,10 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1( char* _Dest, const void* _Src, size_t _Size_bits, char _Elem0, char _Elem1) noexcept; __declspec(noalias) void __stdcall __std_bitset_to_string_2( wchar_t* _Dest, const void* _Src, size_t _Size_bits, wchar_t _Elem0, wchar_t _Elem1) noexcept; +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept; +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept; } // extern "C" #endif // _USE_STD_VECTOR_ALGORITHMS @@ -115,46 +119,66 @@ public: private: template _CONSTEXPR23 void _Construct(const _Elem* const _Ptr, size_t _Count, const _Elem _Elem0, const _Elem _Elem1) { - if (_Count > _Bits) { - for (size_t _Idx = _Bits; _Idx < _Count; ++_Idx) { - const auto _Ch = _Ptr[_Idx]; - if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { - _Xinv(); +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { + bool _Result; + + if constexpr (sizeof(_Elem) == 1) { + _Result = __std_bitset_from_string_1(_Array, reinterpret_cast(_Ptr), sizeof(_Array), _Bits, + _Count, static_cast(_Elem0), static_cast(_Elem1)); + } else { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Elem) == 2); + _Result = __std_bitset_from_string_2(_Array, reinterpret_cast(_Ptr), sizeof(_Array), + _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); + } + + if (!_Result) { + _Xinv(); + } + } else +#endif // _USE_STD_VECTOR_ALGORITHMS + { + if (_Count > _Bits) { + for (size_t _Idx = _Bits; _Idx < _Count; ++_Idx) { + const auto _Ch = _Ptr[_Idx]; + if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { + _Xinv(); + } } + + _Count = _Bits; } - _Count = _Bits; - } + size_t _Wpos = 0; + if (_Count != 0) { + size_t _Bits_used_in_word = 0; + auto _Last = _Ptr + _Count; + _Ty _This_word = 0; + do { + --_Last; + const auto _Ch = *_Last; + _This_word |= static_cast<_Ty>(_Traits::eq(_Elem1, _Ch)) << _Bits_used_in_word; + if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { + _Xinv(); + } - size_t _Wpos = 0; - if (_Count != 0) { - size_t _Bits_used_in_word = 0; - auto _Last = _Ptr + _Count; - _Ty _This_word = 0; - do { - --_Last; - const auto _Ch = *_Last; - _This_word |= static_cast<_Ty>(_Traits::eq(_Elem1, _Ch)) << _Bits_used_in_word; - if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { - _Xinv(); - } + if (++_Bits_used_in_word == _Bitsperword) { + _Array[_Wpos] = _This_word; + ++_Wpos; + _This_word = 0; + _Bits_used_in_word = 0; + } + } while (_Ptr != _Last); - if (++_Bits_used_in_word == _Bitsperword) { + if (_Bits_used_in_word != 0) { _Array[_Wpos] = _This_word; ++_Wpos; - _This_word = 0; - _Bits_used_in_word = 0; } - } while (_Ptr != _Last); - - if (_Bits_used_in_word != 0) { - _Array[_Wpos] = _This_word; - ++_Wpos; } - } - for (; _Wpos <= _Words; ++_Wpos) { - _Array[_Wpos] = 0; + for (; _Wpos <= _Words; ++_Wpos) { + _Array[_Wpos] = 0; + } } } diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 2abf6bff8e4..132a725f733 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3664,5 +3664,53 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2( } } +} // extern "C" + +namespace { + template + bool __std_bitset_from_string_fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, + const size_t _Size_bits, const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { + const auto _Dest_bytes = static_cast(_Dest); + size_t _Size_convert = _Size_chars; + + if (_Size_chars > _Size_bits) { + _Size_convert = _Size_bits; + + for (size_t _Ix = 0, _Mx = _Size_chars - _Size_bits; _Ix < _Mx; ++_Ix) { + if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + } + } + + _CSTD memset(_Dest, 0, _Size_bytes); + + for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { + const _Elem _Cur = _Src[_Size_chars - _Ix - 1]; + + if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + + _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); + } + + return true; + } + +} // unnamed namespace + +extern "C" { + +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { + return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); +} + +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { + return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); +} + } // extern "C" #endif // defined(_M_IX86) || defined(_M_X64) From 70374d943451bf45928310d542629242b67dd600 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 21:34:23 +0300 Subject: [PATCH 03/46] use threshold --- stl/inc/bitset | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 7135ac95148..0bc1f173710 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -120,7 +120,8 @@ private: template _CONSTEXPR23 void _Construct(const _Elem* const _Ptr, size_t _Count, const _Elem _Elem0, const _Elem _Elem1) { #if _USE_STD_VECTOR_ALGORITHMS - if constexpr (_Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { + if constexpr (_Bits >= _Bitset_vector_threshold + && _Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { bool _Result; if constexpr (sizeof(_Elem) == 1) { @@ -486,7 +487,6 @@ public: _CONSTEXPR23 void _To_string( _Elem* const _Buf, const size_t _Len, const _Elem _Elem0, const _Elem _Elem1) const noexcept { #if _USE_STD_VECTOR_ALGORITHMS - constexpr size_t _Bitset_vector_threshold = 32; if constexpr (_Bits >= _Bitset_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { if (!_Is_constant_evaluated()) { if constexpr (sizeof(_Elem) == 1) { @@ -511,8 +511,9 @@ public: private: friend hash>; - static constexpr ptrdiff_t _Bitsperword = CHAR_BIT * sizeof(_Ty); - static constexpr ptrdiff_t _Words = _Bits == 0 ? 0 : (_Bits - 1) / _Bitsperword; // NB: number of words - 1 + static constexpr size_t _Bitset_vector_threshold = 32; + static constexpr ptrdiff_t _Bitsperword = CHAR_BIT * sizeof(_Ty); + static constexpr ptrdiff_t _Words = _Bits == 0 ? 0 : (_Bits - 1) / _Bitsperword; // NB: number of words - 1 _CONSTEXPR23 void _Trim() noexcept { // clear any trailing bits in last word constexpr bool _Work_to_do = _Bits == 0 || _Bits % _Bitsperword != 0; From 551eef710a319654968980668c734af22cb996c8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 21:35:39 +0300 Subject: [PATCH 04/46] assume bitset word size --- stl/src/vector_algorithms.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 132a725f733..498ca678d7c 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3537,8 +3537,8 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1( if (_Size_bits > 0) { __assume(_Size_bits < 32); - uint32_t _Val = 0; - memcpy(&_Val, _Src, (_Size_bits + 7) / 8); + uint32_t _Val; + memcpy(&_Val, _Src, 4); const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1); char _Tmp[32]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems); @@ -3567,11 +3567,7 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1( if (_Size_bits > 0) { __assume(_Size_bits < 16); uint16_t _Val; - if (_Size_bits > 8) { - memcpy(&_Val, _Src, 2); - } else { - _Val = *reinterpret_cast(_Src); - } + memcpy(&_Val, _Src, 2); const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1); char _Tmp[16]; _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems); @@ -3614,11 +3610,7 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2( if (_Size_bits > 0) { __assume(_Size_bits < 16); uint16_t _Val; - if (_Size_bits > 8) { - memcpy(&_Val, _Src, 2); - } else { - _Val = *reinterpret_cast(_Src); - } + memcpy(&_Val, _Src, 2); const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1); wchar_t _Tmp[16]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems); From a38fe936c4e7af25e447562dbd21df405b3b508d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 22:50:16 +0300 Subject: [PATCH 05/46] Revert "use threshold" This reverts commit 70374d943451bf45928310d542629242b67dd600. --- stl/inc/bitset | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 0bc1f173710..7135ac95148 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -120,8 +120,7 @@ private: template _CONSTEXPR23 void _Construct(const _Elem* const _Ptr, size_t _Count, const _Elem _Elem0, const _Elem _Elem1) { #if _USE_STD_VECTOR_ALGORITHMS - if constexpr (_Bits >= _Bitset_vector_threshold - && _Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { + if constexpr (_Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { bool _Result; if constexpr (sizeof(_Elem) == 1) { @@ -487,6 +486,7 @@ public: _CONSTEXPR23 void _To_string( _Elem* const _Buf, const size_t _Len, const _Elem _Elem0, const _Elem _Elem1) const noexcept { #if _USE_STD_VECTOR_ALGORITHMS + constexpr size_t _Bitset_vector_threshold = 32; if constexpr (_Bits >= _Bitset_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { if (!_Is_constant_evaluated()) { if constexpr (sizeof(_Elem) == 1) { @@ -511,9 +511,8 @@ public: private: friend hash>; - static constexpr size_t _Bitset_vector_threshold = 32; - static constexpr ptrdiff_t _Bitsperword = CHAR_BIT * sizeof(_Ty); - static constexpr ptrdiff_t _Words = _Bits == 0 ? 0 : (_Bits - 1) / _Bitsperword; // NB: number of words - 1 + static constexpr ptrdiff_t _Bitsperword = CHAR_BIT * sizeof(_Ty); + static constexpr ptrdiff_t _Words = _Bits == 0 ? 0 : (_Bits - 1) / _Bitsperword; // NB: number of words - 1 _CONSTEXPR23 void _Trim() noexcept { // clear any trailing bits in last word constexpr bool _Work_to_do = _Bits == 0 || _Bits % _Bitsperword != 0; From 2b64c6e353ef44523cc59e9b8e6acb60cc71a89e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Jul 2024 23:28:21 +0300 Subject: [PATCH 06/46] SSE4.2 algorithm --- stl/src/vector_algorithms.cpp | 100 +++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 498ca678d7c..1229b5d9877 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3696,12 +3696,108 @@ extern "C" { __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { - return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + if (_Use_sse42()) { + const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); + const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); + + const char* _Src_end = _Src + _Size_chars; + + uint16_t* _Dst_words = reinterpret_cast(_Dest); + uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + + const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + for (;;) { + __m128i _Val = _mm_undefined_si128(); + + if (const size_t _Left = _Src_end - _Src; _Left > 16) { + _Src_end -= 16; + _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint16_t)); + } + break; + } else { + _Src_end = _Src; + char _Tmp[16]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); + char* const _Tmpd = _Tmp + (16 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left); + _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); + } + + const __m128i _Ex1 = _mm_cmpeq_epi8(_Val, _Dx1); + const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); + + if (!_mm_testc_si128(_Ex1, _Ex0)) { + return false; + } + + if (_Dst_words != _Dst_words_end) { + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; + } + } + + return true; + } else { + return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } } __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { - return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + if (_Use_sse42()) { + const __m128i _Dx0 = _mm_set1_epi16(_Elem0); + const __m128i _Dx1 = _mm_set1_epi16(_Elem1); + + const wchar_t* _Src_end = _Src + _Size_chars; + + uint8_t* _Dst_words = reinterpret_cast(_Dest); + uint8_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint8_t); + + const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + + for (;;) { + __m128i _Val = _mm_undefined_si128(); + + if (const size_t _Left = _Src_end - _Src; _Left > 8) { + _Src_end -= 8; + _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint8_t)); + } + break; + } else { + _Src_end = _Src; + wchar_t _Tmp[8]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); + wchar_t* const _Tmpd = _Tmp + (8 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); + _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); + } + + const __m128i _Ex1 = _mm_cmpeq_epi16(_Val, _Dx1); + const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); + + if (!_mm_testc_si128(_Ex1, _Ex0)) { + return false; + } + + if (_Dst_words != _Dst_words_end) { + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; + } + } + + return true; + } else { + return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } } } // extern "C" From 73097bc5e1bdda4d0f65c5c8eccbf3611d071e44 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 14:33:03 +0300 Subject: [PATCH 07/46] more cases --- benchmarks/src/bitset_from_string.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 81bf82b2b57..8c40d2ab0c0 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -65,22 +65,30 @@ namespace { } } // namespace +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); From afaa90dabaa6eab7481551b0d0c3bb2a7593e043 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 15:30:47 +0300 Subject: [PATCH 08/46] use threshold, now separate one --- stl/inc/bitset | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 7135ac95148..51e3e153d21 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -120,7 +120,9 @@ private: template _CONSTEXPR23 void _Construct(const _Elem* const _Ptr, size_t _Count, const _Elem _Elem0, const _Elem _Elem1) { #if _USE_STD_VECTOR_ALGORITHMS - if constexpr (_Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { + constexpr size_t _Bitset_from_string_vector_threshold = 16; + if constexpr (_Bits >= _Bitset_from_string_vector_threshold + && _Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { bool _Result; if constexpr (sizeof(_Elem) == 1) { @@ -486,8 +488,8 @@ public: _CONSTEXPR23 void _To_string( _Elem* const _Buf, const size_t _Len, const _Elem _Elem0, const _Elem _Elem1) const noexcept { #if _USE_STD_VECTOR_ALGORITHMS - constexpr size_t _Bitset_vector_threshold = 32; - if constexpr (_Bits >= _Bitset_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { + constexpr size_t _Bitset_to_string_vector_threshold = 32; + if constexpr (_Bits >= _Bitset_to_string_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { if (!_Is_constant_evaluated()) { if constexpr (sizeof(_Elem) == 1) { __std_bitset_to_string_1(reinterpret_cast(_Buf), _Array, _Len, static_cast(_Elem0), From 2e3a816f9e0b833235e2813ca18ee0abb8637755 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 15:55:43 +0300 Subject: [PATCH 09/46] AVX2 algorithm --- stl/src/vector_algorithms.cpp | 104 +++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1229b5d9877..e3a82a167df 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3696,7 +3696,56 @@ extern "C" { __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { - if (_Use_sse42()) { + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const __m256i _Dx0 = _mm256_set1_epi8(_Elem0); + const __m256i _Dx1 = _mm256_set1_epi8(_Elem1); + + const char* _Src_end = _Src + _Size_chars; + + uint32_t* _Dst_words = reinterpret_cast(_Dest); + uint32_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint32_t); + + const __m256i _Shuf = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + for (;;) { + __m256i _Val = _mm256_undefined_si256(); + + if (const size_t _Left = _Src_end - _Src; _Left > 32) { + _Src_end -= 32; + _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint32_t)); + } + break; + } else { + _Src_end = _Src; + char _Tmp[32]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); + char* const _Tmpd = _Tmp + (32 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left); + _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + } + + const __m256i _Ex1 = _mm256_cmpeq_epi8(_Val, _Dx1); + const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); + + if (!_mm256_testc_si256(_Ex1, _Ex0)) { + return false; + } + + if (_Dst_words != _Dst_words_end) { + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = _rotr(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + ++_Dst_words; + } + } + + return true; + } else if (_Use_sse42()) { const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); @@ -3749,7 +3798,58 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { - if (_Use_sse42()) { + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const __m256i _Dx0 = _mm256_set1_epi16(_Elem0); + const __m256i _Dx1 = _mm256_set1_epi16(_Elem1); + + const wchar_t* _Src_end = _Src + _Size_chars; + + uint16_t* _Dst_words = reinterpret_cast(_Dest); + uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + + const __m256i _Shuf = _mm256_set_epi8( // + -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14, // + -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + + for (;;) { + __m256i _Val = _mm256_undefined_si256(); + + if (const size_t _Left = _Src_end - _Src; _Left > 16) { + _Src_end -= 16; + _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint16_t)); + } + break; + } else { + _Src_end = _Src; + wchar_t _Tmp[16]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); + wchar_t* const _Tmpd = _Tmp + (16 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); + _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + } + + const __m256i _Ex1 = _mm256_cmpeq_epi16(_Val, _Dx1); + const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); + + if (!_mm256_testc_si256(_Ex1, _Ex0)) { + return false; + } + + if (_Dst_words != _Dst_words_end) { + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); + *_Dst_words = _rotr16(static_cast(_Tmp | (_Tmp >> 8)), 8); + ++_Dst_words; + } + } + + return true; + } else if (_Use_sse42()) { const __m128i _Dx0 = _mm_set1_epi16(_Elem0); const __m128i _Dx1 = _mm_set1_epi16(_Elem1); From 22e5bae16ab6e210d4a9d5462e85a97eac7d9011 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 16:11:19 +0300 Subject: [PATCH 10/46] mighty ARM --- stl/src/vector_algorithms.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e3a82a167df..1299c7b8da8 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3696,6 +3696,7 @@ extern "C" { __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { +#ifndef _M_ARM64EC if (_Use_avx2() && _Size_bits >= 256) { _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 @@ -3791,13 +3792,16 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const } return true; - } else { + } else +#endif // !defined(_M_ARM64EC) + { return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } } __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { +#ifndef _M_ARM64EC if (_Use_avx2() && _Size_bits >= 256) { _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 @@ -3895,7 +3899,9 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const } return true; - } else { + } +#endif // !defined(_M_ARM64EC) + else { return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } } From 36108d195ca2e50afa6abfd3a9e7c58b4ff36194 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 16:13:42 +0300 Subject: [PATCH 11/46] assumption-shmassumption --- stl/inc/bitset | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/stl/inc/bitset b/stl/inc/bitset index 51e3e153d21..7d2997dbdfc 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -23,6 +23,10 @@ _STL_DISABLE_CLANG_WARNINGS #endif // !defined(_STD_BITSET_TO_STREAM_STACK_RESERVATION) #if _USE_STD_VECTOR_ALGORITHMS +// These bitset functions sometimes assume bit array has zero padding to multiple of 2 or 4 bytes +// The assumptions hold true even for vNext suggestion to use smaller types for small bitsets +// due to vectorization thresholds. + extern "C" { __declspec(noalias) void __stdcall __std_bitset_to_string_1( char* _Dest, const void* _Src, size_t _Size_bits, char _Elem0, char _Elem1) noexcept; From adfde23f2df58683464c952857ce829af160e2dd Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 16:16:56 +0300 Subject: [PATCH 12/46] more useful row --- benchmarks/src/bitset_from_string.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 8c40d2ab0c0..a412edd6a31 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -65,29 +65,29 @@ namespace { } } // namespace -BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); -BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); -BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); -BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); BENCHMARK(BM_bitset_from_string); From 4eeb157f2d334d96f407e5787ac42c577d4af3f0 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 17:23:49 +0300 Subject: [PATCH 13/46] fewer instructions for AVX2 wchar_t case --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1299c7b8da8..205c3c4b03e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3740,7 +3740,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const if (_Dst_words != _Dst_words_end) { const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = _rotr(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); ++_Dst_words; } } @@ -3814,8 +3814,8 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); const __m256i _Shuf = _mm256_set_epi8( // - -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14, // - -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); for (;;) { __m256i _Val = _mm256_undefined_si256(); @@ -3847,7 +3847,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const if (_Dst_words != _Dst_words_end) { const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); - *_Dst_words = _rotr16(static_cast(_Tmp | (_Tmp >> 8)), 8); + *_Dst_words = static_cast(_rotl(_Tmp, 8)); ++_Dst_words; } } From c360e88dcdc78f4faba10e786795495be717a7e1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 17:25:35 +0300 Subject: [PATCH 14/46] constexpr --- stl/inc/bitset | 90 +++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 7d2997dbdfc..417f5958e54 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -127,65 +127,65 @@ private: constexpr size_t _Bitset_from_string_vector_threshold = 16; if constexpr (_Bits >= _Bitset_from_string_vector_threshold && _Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { - bool _Result; - - if constexpr (sizeof(_Elem) == 1) { - _Result = __std_bitset_from_string_1(_Array, reinterpret_cast(_Ptr), sizeof(_Array), _Bits, - _Count, static_cast(_Elem0), static_cast(_Elem1)); - } else { - _STL_INTERNAL_STATIC_ASSERT(sizeof(_Elem) == 2); - _Result = __std_bitset_from_string_2(_Array, reinterpret_cast(_Ptr), sizeof(_Array), - _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); - } + if (!_STD _Is_constant_evaluated()) { + bool _Result; + + if constexpr (sizeof(_Elem) == 1) { + _Result = __std_bitset_from_string_1(_Array, reinterpret_cast(_Ptr), sizeof(_Array), + _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); + } else { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Elem) == 2); + _Result = __std_bitset_from_string_2(_Array, reinterpret_cast(_Ptr), sizeof(_Array), + _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); + } - if (!_Result) { - _Xinv(); + if (!_Result) { + _Xinv(); + } } - } else + } #endif // _USE_STD_VECTOR_ALGORITHMS - { - if (_Count > _Bits) { - for (size_t _Idx = _Bits; _Idx < _Count; ++_Idx) { - const auto _Ch = _Ptr[_Idx]; - if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { - _Xinv(); - } + if (_Count > _Bits) { + for (size_t _Idx = _Bits; _Idx < _Count; ++_Idx) { + const auto _Ch = _Ptr[_Idx]; + if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { + _Xinv(); } - - _Count = _Bits; } - size_t _Wpos = 0; - if (_Count != 0) { - size_t _Bits_used_in_word = 0; - auto _Last = _Ptr + _Count; - _Ty _This_word = 0; - do { - --_Last; - const auto _Ch = *_Last; - _This_word |= static_cast<_Ty>(_Traits::eq(_Elem1, _Ch)) << _Bits_used_in_word; - if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { - _Xinv(); - } + _Count = _Bits; + } - if (++_Bits_used_in_word == _Bitsperword) { - _Array[_Wpos] = _This_word; - ++_Wpos; - _This_word = 0; - _Bits_used_in_word = 0; - } - } while (_Ptr != _Last); + size_t _Wpos = 0; + if (_Count != 0) { + size_t _Bits_used_in_word = 0; + auto _Last = _Ptr + _Count; + _Ty _This_word = 0; + do { + --_Last; + const auto _Ch = *_Last; + _This_word |= static_cast<_Ty>(_Traits::eq(_Elem1, _Ch)) << _Bits_used_in_word; + if (!_Traits::eq(_Elem1, _Ch) && !_Traits::eq(_Elem0, _Ch)) { + _Xinv(); + } - if (_Bits_used_in_word != 0) { + if (++_Bits_used_in_word == _Bitsperword) { _Array[_Wpos] = _This_word; ++_Wpos; + _This_word = 0; + _Bits_used_in_word = 0; } - } + } while (_Ptr != _Last); - for (; _Wpos <= _Words; ++_Wpos) { - _Array[_Wpos] = 0; + if (_Bits_used_in_word != 0) { + _Array[_Wpos] = _This_word; + ++_Wpos; } } + + for (; _Wpos <= _Words; ++_Wpos) { + _Array[_Wpos] = 0; + } } public: From 9fe1973ee9e0b0174757a6f0eb106852f2d90cf1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 17:26:53 +0300 Subject: [PATCH 15/46] early return --- stl/inc/bitset | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stl/inc/bitset b/stl/inc/bitset index 417f5958e54..2774c185863 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -142,6 +142,8 @@ private: if (!_Result) { _Xinv(); } + + return; } } #endif // _USE_STD_VECTOR_ALGORITHMS From 89f9806976d6c1c80134b3ada39c4e7d95903318 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 18:01:26 +0300 Subject: [PATCH 16/46] pick correct range of too long string --- stl/src/vector_algorithms.cpp | 297 ++++++++++++++++++++++------------ 1 file changed, 195 insertions(+), 102 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 205c3c4b03e..8ea400cddf0 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3690,27 +3690,9 @@ namespace { return true; } -} // unnamed namespace - -extern "C" { - -__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, - size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - const __m256i _Dx0 = _mm256_set1_epi8(_Elem0); - const __m256i _Dx1 = _mm256_set1_epi8(_Elem1); - - const char* _Src_end = _Src + _Size_chars; - - uint32_t* _Dst_words = reinterpret_cast(_Dest); - uint32_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint32_t); - - const __m256i _Shuf = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - + template + bool __std_bitset_from_string_1_avx_loop( + const char* const _Src, const char* _Src_end, const __m256i _Dx0, const __m256i _Dx1, _OutFn _Out) { for (;;) { __m256i _Val = _mm256_undefined_si256(); @@ -3718,10 +3700,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const _Src_end -= 32; _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); } else if (_Left == 0) { - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint32_t)); - } - break; + return true; } else { _Src_end = _Src; char _Tmp[32]; @@ -3738,25 +3717,13 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const return false; } - if (_Dst_words != _Dst_words_end) { - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); - ++_Dst_words; - } + _Out(_Ex1); } + } - return true; - } else if (_Use_sse42()) { - const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); - const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); - - const char* _Src_end = _Src + _Size_chars; - - uint16_t* _Dst_words = reinterpret_cast(_Dest); - uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); - - const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - + template + bool __std_bitset_from_string_1_sse_loop( + const char* const _Src, const char* _Src_end, const __m128i _Dx0, const __m128i _Dx1, _OutFn _Out) { for (;;) { __m128i _Val = _mm_undefined_si128(); @@ -3764,10 +3731,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const _Src_end -= 16; _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); } else if (_Left == 0) { - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint16_t)); - } - break; + return true; } else { _Src_end = _Src; char _Tmp[16]; @@ -3784,39 +3748,13 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const return false; } - if (_Dst_words != _Dst_words_end) { - const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; - } + _Out(_Ex1); } - - return true; - } else -#endif // !defined(_M_ARM64EC) - { - return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } -} - -__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, - size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - const __m256i _Dx0 = _mm256_set1_epi16(_Elem0); - const __m256i _Dx1 = _mm256_set1_epi16(_Elem1); - - const wchar_t* _Src_end = _Src + _Size_chars; - - uint16_t* _Dst_words = reinterpret_cast(_Dest); - uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); - - const __m256i _Shuf = _mm256_set_epi8( // - +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); + template + bool __std_bitset_from_string_2_avx_loop( + const wchar_t* const _Src, const wchar_t* _Src_end, const __m256i _Dx0, const __m256i _Dx1, _OutFn _Out) { for (;;) { __m256i _Val = _mm256_undefined_si256(); @@ -3824,10 +3762,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const _Src_end -= 16; _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); } else if (_Left == 0) { - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint16_t)); - } - break; + return true; } else { _Src_end = _Src; wchar_t _Tmp[16]; @@ -3844,26 +3779,13 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const return false; } - if (_Dst_words != _Dst_words_end) { - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); - *_Dst_words = static_cast(_rotl(_Tmp, 8)); - ++_Dst_words; - } + _Out(_Ex1); } + } - return true; - } else if (_Use_sse42()) { - const __m128i _Dx0 = _mm_set1_epi16(_Elem0); - const __m128i _Dx1 = _mm_set1_epi16(_Elem1); - - const wchar_t* _Src_end = _Src + _Size_chars; - - uint8_t* _Dst_words = reinterpret_cast(_Dest); - uint8_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint8_t); - - const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); - + template + bool __std_bitset_from_string_2_sse_loop( + const wchar_t* const _Src, const wchar_t* _Src_end, const __m128i _Dx0, const __m128i _Dx1, _OutFn _Out) { for (;;) { __m128i _Val = _mm_undefined_si128(); @@ -3871,10 +3793,7 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const _Src_end -= 8; _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); } else if (_Left == 0) { - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, (_Dst_words_end - _Dst_words) * sizeof(uint8_t)); - } - break; + return true; } else { _Src_end = _Src; wchar_t _Tmp[8]; @@ -3891,11 +3810,185 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const return false; } + _Out(_Ex1); + } + } + +} // unnamed namespace + +extern "C" { + +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const __m256i _Dx0 = _mm256_set1_epi8(_Elem0); + const __m256i _Dx1 = _mm256_set1_epi8(_Elem1); + + uint32_t* _Dst_words = reinterpret_cast(_Dest); + uint32_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint32_t); + + auto _Out = [&_Dst_words, _Dst_words_end](const __m256i _Ex1) { + const __m256i _Shuf = _mm256_set_epi8( // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + if (_Dst_words != _Dst_words_end) { + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + ++_Dst_words; + } + }; + + if (_Size_chars <= _Size_bits) { + if (!__std_bitset_from_string_1_avx_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { + return false; + } + } else { + const auto _Discard = [](__m256i) {}; + + if (!__std_bitset_from_string_1_avx_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { + return false; + } + + if (!__std_bitset_from_string_1_avx_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { + return false; + } + } + + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); + } + + return true; + } else if (_Use_sse42()) { + const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); + const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); + + uint16_t* _Dst_words = reinterpret_cast(_Dest); + uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + + auto _Out = [&_Dst_words, _Dst_words_end](const __m128i _Ex1) { + const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + if (_Dst_words != _Dst_words_end) { + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; + } + }; + + if (_Size_chars <= _Size_bits) { + if (!__std_bitset_from_string_1_sse_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { + return false; + } + } else { + const auto _Discard = [](__m128i) {}; + + if (!__std_bitset_from_string_1_sse_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { + return false; + } + + if (!__std_bitset_from_string_1_sse_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { + return false; + } + } + + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); + } + + return true; + } else +#endif // !defined(_M_ARM64EC) + { + return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } +} + +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const __m256i _Dx0 = _mm256_set1_epi16(_Elem0); + const __m256i _Dx1 = _mm256_set1_epi16(_Elem1); + + uint16_t* _Dst_words = reinterpret_cast(_Dest); + uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + + auto _Out = [&_Dst_words, _Dst_words_end](const __m256i _Ex1) { + const __m256i _Shuf = _mm256_set_epi8( // + +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); + + if (_Dst_words != _Dst_words_end) { + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); + *_Dst_words = static_cast(_rotl(_Tmp, 8)); + ++_Dst_words; + } + }; + + if (_Size_chars <= _Size_bits) { + if (!__std_bitset_from_string_2_avx_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { + return false; + } + } else { + const auto _Discard = [](__m256i) {}; + + if (!__std_bitset_from_string_2_avx_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { + return false; + } + + if (!__std_bitset_from_string_2_avx_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { + return false; + } + } + + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); + } + + return true; + } else if (_Use_sse42()) { + const __m128i _Dx0 = _mm_set1_epi16(_Elem0); + const __m128i _Dx1 = _mm_set1_epi16(_Elem1); + + uint8_t* _Dst_words = reinterpret_cast(_Dest); + uint8_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint8_t); + + auto _Out = [&_Dst_words, _Dst_words_end](const __m128i _Ex1) { + const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + if (_Dst_words != _Dst_words_end) { const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); ++_Dst_words; } + }; + + if (_Size_chars <= _Size_bits) { + if (!__std_bitset_from_string_2_sse_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { + return false; + } + } else { + const auto _Discard = [](__m128i) {}; + + if (!__std_bitset_from_string_2_sse_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { + return false; + } + + if (!__std_bitset_from_string_2_sse_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { + return false; + } + } + + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); } return true; From bc993ccbb77dc296f605c421b68e575e4cf9b6c7 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 19:23:25 +0300 Subject: [PATCH 17/46] this check is unnecessary --- stl/src/vector_algorithms.cpp | 64 ++++++++++++++++------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 8ea400cddf0..1a0a6e91ddd 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3827,19 +3827,18 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const const __m256i _Dx0 = _mm256_set1_epi8(_Elem0); const __m256i _Dx1 = _mm256_set1_epi8(_Elem1); - uint32_t* _Dst_words = reinterpret_cast(_Dest); - uint32_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint32_t); + uint32_t* _Dst_words = reinterpret_cast(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words, _Dst_words_end](const __m256i _Ex1) { + auto _Out = [&_Dst_words](const __m256i _Ex1) { const __m256i _Shuf = _mm256_set_epi8( // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - if (_Dst_words != _Dst_words_end) { - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); - ++_Dst_words; - } + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + ++_Dst_words; }; if (_Size_chars <= _Size_bits) { @@ -3867,17 +3866,15 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); - uint16_t* _Dst_words = reinterpret_cast(_Dest); - uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + uint16_t* _Dst_words = reinterpret_cast(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words, _Dst_words_end](const __m128i _Ex1) { + auto _Out = [&_Dst_words](const __m128i _Ex1) { const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - - if (_Dst_words != _Dst_words_end) { - const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; - } + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; }; if (_Size_chars <= _Size_bits) { @@ -3917,20 +3914,19 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const const __m256i _Dx0 = _mm256_set1_epi16(_Elem0); const __m256i _Dx1 = _mm256_set1_epi16(_Elem1); - uint16_t* _Dst_words = reinterpret_cast(_Dest); - uint16_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint16_t); + uint16_t* _Dst_words = reinterpret_cast(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words, _Dst_words_end](const __m256i _Ex1) { + auto _Out = [&_Dst_words](const __m256i _Ex1) { const __m256i _Shuf = _mm256_set_epi8( // +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); - if (_Dst_words != _Dst_words_end) { - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); - *_Dst_words = static_cast(_rotl(_Tmp, 8)); - ++_Dst_words; - } + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); + *_Dst_words = static_cast(_rotl(_Tmp, 8)); + ++_Dst_words; }; if (_Size_chars <= _Size_bits) { @@ -3958,17 +3954,15 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const const __m128i _Dx0 = _mm_set1_epi16(_Elem0); const __m128i _Dx1 = _mm_set1_epi16(_Elem1); - uint8_t* _Dst_words = reinterpret_cast(_Dest); - uint8_t* const _Dst_words_end = _Dst_words + _Size_bytes / sizeof(uint8_t); + uint8_t* _Dst_words = reinterpret_cast(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words, _Dst_words_end](const __m128i _Ex1) { + auto _Out = [&_Dst_words](const __m128i _Ex1) { const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); - - if (_Dst_words != _Dst_words_end) { - const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; - } + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; }; if (_Size_chars <= _Size_bits) { From fce8222352d1d00c2fac64ae5a46fb85cc59a35e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 22:10:38 +0300 Subject: [PATCH 18/46] TMP! --- stl/src/vector_algorithms.cpp | 488 ++++++++++++++++------------------ 1 file changed, 230 insertions(+), 258 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1a0a6e91ddd..affbf80b8f8 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3659,337 +3659,309 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2( } // extern "C" namespace { - template - bool __std_bitset_from_string_fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, - const size_t _Size_bits, const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { - const auto _Dest_bytes = static_cast(_Dest); - size_t _Size_convert = _Size_chars; - if (_Size_chars > _Size_bits) { - _Size_convert = _Size_bits; + namespace __std_bitset_from_string { - for (size_t _Ix = 0, _Mx = _Size_chars - _Size_bits; _Ix < _Mx; ++_Ix) { - if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { - return false; + template + bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, + const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { + const auto _Dest_bytes = static_cast(_Dest); + size_t _Size_convert = _Size_chars; + + if (_Size_chars > _Size_bits) { + _Size_convert = _Size_bits; + + for (size_t _Ix = 0, _Mx = _Size_chars - _Size_bits; _Ix < _Mx; ++_Ix) { + if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } } } - } - _CSTD memset(_Dest, 0, _Size_bytes); + _CSTD memset(_Dest, 0, _Size_bytes); + + for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { + const _Elem _Cur = _Src[_Size_chars - _Ix - 1]; - for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { - const _Elem _Cur = _Src[_Size_chars - _Ix - 1]; + if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } - if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { - return false; + _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); } - _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); + return true; } - return true; - } - - template - bool __std_bitset_from_string_1_avx_loop( - const char* const _Src, const char* _Src_end, const __m256i _Dx0, const __m256i _Dx1, _OutFn _Out) { - for (;;) { - __m256i _Val = _mm256_undefined_si256(); + struct _Traits_1_avx { + using _Vec = __m256i; + using _Word = uint32_t; - if (const size_t _Left = _Src_end - _Src; _Left > 32) { - _Src_end -= 32; - _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - char _Tmp[32]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); - char* const _Tmpd = _Tmp + (32 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left); - _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + static __m256i _Set(const char _Val) noexcept { + return _mm256_set1_epi8(_Val); } - const __m256i _Ex1 = _mm256_cmpeq_epi8(_Val, _Dx1); - const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); + static void _Out(uint32_t*& _Dst_words, const __m256i _Ex1) noexcept { + const __m256i _Shuf = _mm256_set_epi8( // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - if (!_mm256_testc_si256(_Ex1, _Ex0)) { - return false; + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + ++_Dst_words; } - _Out(_Ex1); - } - } + template + static bool _Loop(const char* const _Src, const char* _Src_end, const __m256i _Dx0, const __m256i _Dx1, + _OutFn _Out) noexcept { + for (;;) { + __m256i _Val = _mm256_undefined_si256(); - template - bool __std_bitset_from_string_1_sse_loop( - const char* const _Src, const char* _Src_end, const __m128i _Dx0, const __m128i _Dx1, _OutFn _Out) { - for (;;) { - __m128i _Val = _mm_undefined_si128(); + if (const size_t _Left = _Src_end - _Src; _Left > 32) { + _Src_end -= 32; + _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + char _Tmp[32]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); + char* const _Tmpd = _Tmp + (32 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left); + _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + } - if (const size_t _Left = _Src_end - _Src; _Left > 16) { - _Src_end -= 16; - _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - char _Tmp[16]; - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); - char* const _Tmpd = _Tmp + (16 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left); - _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); - } + const __m256i _Ex1 = _mm256_cmpeq_epi8(_Val, _Dx1); + const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); - const __m128i _Ex1 = _mm_cmpeq_epi8(_Val, _Dx1); - const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); + if (!_mm256_testc_si256(_Ex1, _Ex0)) { + return false; + } - if (!_mm_testc_si128(_Ex1, _Ex0)) { - return false; + _Out(_Ex1); + } } + }; - _Out(_Ex1); - } - } + struct _Traits_1_sse { + using _Vec = __m128i; + using _Word = uint16_t; - template - bool __std_bitset_from_string_2_avx_loop( - const wchar_t* const _Src, const wchar_t* _Src_end, const __m256i _Dx0, const __m256i _Dx1, _OutFn _Out) { - for (;;) { - __m256i _Val = _mm256_undefined_si256(); + static __m128i _Set(const char _Val) noexcept { + return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); + } - if (const size_t _Left = _Src_end - _Src; _Left > 16) { - _Src_end -= 16; - _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - wchar_t _Tmp[16]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); - wchar_t* const _Tmpd = _Tmp + (16 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); - _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + static void _Out(uint16_t*& _Dst_words, const __m128i _Ex1) noexcept { + const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; } - const __m256i _Ex1 = _mm256_cmpeq_epi16(_Val, _Dx1); - const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); + template + static bool _Loop(const char* const _Src, const char* _Src_end, const __m128i _Dx0, const __m128i _Dx1, + _OutFn _Out) noexcept { + for (;;) { + __m128i _Val = _mm_undefined_si128(); - if (!_mm256_testc_si256(_Ex1, _Ex0)) { - return false; - } + if (const size_t _Left = _Src_end - _Src; _Left > 16) { + _Src_end -= 16; + _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + char _Tmp[16]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); + char* const _Tmpd = _Tmp + (16 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left); + _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); + } - _Out(_Ex1); - } - } + const __m128i _Ex1 = _mm_cmpeq_epi8(_Val, _Dx1); + const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); - template - bool __std_bitset_from_string_2_sse_loop( - const wchar_t* const _Src, const wchar_t* _Src_end, const __m128i _Dx0, const __m128i _Dx1, _OutFn _Out) { - for (;;) { - __m128i _Val = _mm_undefined_si128(); + if (!_mm_testc_si128(_Ex1, _Ex0)) { + return false; + } - if (const size_t _Left = _Src_end - _Src; _Left > 8) { - _Src_end -= 8; - _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - wchar_t _Tmp[8]; - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); - wchar_t* const _Tmpd = _Tmp + (8 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); - _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); + _Out(_Ex1); + } } + }; - const __m128i _Ex1 = _mm_cmpeq_epi16(_Val, _Dx1); - const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); + struct _Traits_2_avx { + using _Vec = __m256i; + using _Word = uint16_t; - if (!_mm_testc_si128(_Ex1, _Ex0)) { - return false; + static __m256i _Set(const wchar_t _Val) noexcept { + return _mm256_set1_epi16(_Val); } - _Out(_Ex1); - } - } + static void _Out(uint16_t*& _Dst_words, const __m256i _Ex1) noexcept { + const __m256i _Shuf = _mm256_set_epi8( // + +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); -} // unnamed namespace + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); + *_Dst_words = static_cast(_rotl(_Tmp, 8)); + ++_Dst_words; + } -extern "C" { + template + static bool _Loop(const wchar_t* const _Src, const wchar_t* _Src_end, const __m256i _Dx0, + const __m256i _Dx1, _OutFn _Out) noexcept { + for (;;) { + __m256i _Val = _mm256_undefined_si256(); -__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, - size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - const __m256i _Dx0 = _mm256_set1_epi8(_Elem0); - const __m256i _Dx1 = _mm256_set1_epi8(_Elem1); + if (const size_t _Left = _Src_end - _Src; _Left > 16) { + _Src_end -= 16; + _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + wchar_t _Tmp[16]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); + wchar_t* const _Tmpd = _Tmp + (16 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); + _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); + } - uint32_t* _Dst_words = reinterpret_cast(_Dest); - void* _Dst_words_end = _Dst_words; - _Advance_bytes(_Dst_words_end, _Size_bytes); + const __m256i _Ex1 = _mm256_cmpeq_epi16(_Val, _Dx1); + const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); - auto _Out = [&_Dst_words](const __m256i _Ex1) { - const __m256i _Shuf = _mm256_set_epi8( // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + if (!_mm256_testc_si256(_Ex1, _Ex0)) { + return false; + } - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); - ++_Dst_words; + _Out(_Ex1); + } + } }; - if (_Size_chars <= _Size_bits) { - if (!__std_bitset_from_string_1_avx_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { - return false; - } - } else { - const auto _Discard = [](__m256i) {}; + struct _Traits_2_sse { + using _Vec = __m128i; + using _Word = uint8_t; - if (!__std_bitset_from_string_1_avx_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { - return false; + static __m128i _Set(const wchar_t _Val) noexcept { + return _mm_set1_epi16(_Val); } - if (!__std_bitset_from_string_1_avx_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { - return false; + static void _Out(uint8_t*& _Dst_words, const __m128i _Ex1) noexcept { + const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); + ++_Dst_words; } - } - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); - } + template + static bool _Loop(const wchar_t* const _Src, const wchar_t* _Src_end, const __m128i _Dx0, + const __m128i _Dx1, _OutFn _Out) noexcept { + for (;;) { + __m128i _Val = _mm_undefined_si128(); - return true; - } else if (_Use_sse42()) { - const __m128i _Dx0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128()); - const __m128i _Dx1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128()); - - uint16_t* _Dst_words = reinterpret_cast(_Dest); - void* _Dst_words_end = _Dst_words; - _Advance_bytes(_Dst_words_end, _Size_bytes); - - auto _Out = [&_Dst_words](const __m128i _Ex1) { - const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; - }; + if (const size_t _Left = _Src_end - _Src; _Left > 8) { + _Src_end -= 8; + _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + wchar_t _Tmp[8]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); + wchar_t* const _Tmpd = _Tmp + (8 - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); + _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); + } - if (_Size_chars <= _Size_bits) { - if (!__std_bitset_from_string_1_sse_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { - return false; - } - } else { - const auto _Discard = [](__m128i) {}; + const __m128i _Ex1 = _mm_cmpeq_epi16(_Val, _Dx1); + const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); - if (!__std_bitset_from_string_1_sse_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { - return false; - } + if (!_mm_testc_si128(_Ex1, _Ex0)) { + return false; + } - if (!__std_bitset_from_string_1_sse_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { - return false; + _Out(_Ex1); + } } - } - - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); - } - - return true; - } else -#endif // !defined(_M_ARM64EC) - { - return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } -} - -__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, - size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + }; - const __m256i _Dx0 = _mm256_set1_epi16(_Elem0); - const __m256i _Dx1 = _mm256_set1_epi16(_Elem1); + template + static bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, + _Elem _Elem0, _Elem _Elem1) noexcept { + const auto _Dx0 = _Traits::_Set(_Elem0); + const auto _Dx1 = _Traits::_Set(_Elem1); - uint16_t* _Dst_words = reinterpret_cast(_Dest); - void* _Dst_words_end = _Dst_words; - _Advance_bytes(_Dst_words_end, _Size_bytes); + auto _Dst_words = reinterpret_cast<_Traits::_Word*>(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words](const __m256i _Ex1) { - const __m256i _Shuf = _mm256_set_epi8( // - +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); + auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { _Traits::_Out(_Dst_words, _Ex1); }; - const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); - *_Dst_words = static_cast(_rotl(_Tmp, 8)); - ++_Dst_words; - }; + if (_Size_chars <= _Size_bits) { + if (!_Traits::_Loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { + return false; + } + } else { + const auto _Discard = [](_Traits::_Vec) {}; - if (_Size_chars <= _Size_bits) { - if (!__std_bitset_from_string_2_avx_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { - return false; - } - } else { - const auto _Discard = [](__m256i) {}; + if (!_Traits::_Loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { + return false; + } - if (!__std_bitset_from_string_2_avx_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { - return false; + if (!_Traits::_Loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { + return false; + } } - if (!__std_bitset_from_string_2_avx_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { - return false; + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); } - } - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); + return true; } - return true; - } else if (_Use_sse42()) { - const __m128i _Dx0 = _mm_set1_epi16(_Elem0); - const __m128i _Dx1 = _mm_set1_epi16(_Elem1); - - uint8_t* _Dst_words = reinterpret_cast(_Dest); - void* _Dst_words_end = _Dst_words; - _Advance_bytes(_Dst_words_end, _Size_bytes); - - auto _Out = [&_Dst_words](const __m128i _Ex1) { - const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); - const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; - }; + } // namespace __std_bitset_from_string - if (_Size_chars <= _Size_bits) { - if (!__std_bitset_from_string_2_sse_loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { - return false; - } - } else { - const auto _Discard = [](__m128i) {}; +} // unnamed namespace - if (!__std_bitset_from_string_2_sse_loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { - return false; - } +extern "C" { - if (!__std_bitset_from_string_2_sse_loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { - return false; - } - } +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { + using namespace __std_bitset_from_string; - if (_Dst_words != _Dst_words_end) { - _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); - } +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - return true; + return _Impl<_Traits_1_avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else if (_Use_sse42()) { + return _Impl<_Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else +#endif // !defined(_M_ARM64EC) + { + return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } +} + +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { + using namespace __std_bitset_from_string; + +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + return _Impl<_Traits_2_avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else if (_Use_sse42()) { + return _Impl<_Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else #endif // !defined(_M_ARM64EC) - else { - return __std_bitset_from_string_fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + { + return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } } From 692dd765677fb818020138d330a1842b142e17a9 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 22:16:43 +0300 Subject: [PATCH 19/46] clarify flow --- stl/src/vector_algorithms.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index affbf80b8f8..cee517a8156 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3898,24 +3898,21 @@ namespace { void* _Dst_words_end = _Dst_words; _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { _Traits::_Out(_Dst_words, _Ex1); }; + auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { _Traits::_Out(_Dst_words, _Ex1); }; + const size_t _Size_convert = (_Size_chars <= _Size_bits) ? _Size_chars : _Size_bits; - if (_Size_chars <= _Size_bits) { - if (!_Traits::_Loop(_Src, _Src + _Size_chars, _Dx0, _Dx1, _Out)) { - return false; - } - } else { - const auto _Discard = [](_Traits::_Vec) {}; - - if (!_Traits::_Loop(_Src, _Src + _Size_bits, _Dx0, _Dx1, _Out)) { - return false; - } + // Convert characters to bits + if (!_Traits::_Loop(_Src, _Src + _Size_convert, _Dx0, _Dx1, _Out)) { + return false; + } - if (!_Traits::_Loop(_Src + _Size_bits, _Src + _Size_chars, _Dx0, _Dx1, _Discard)) { - return false; - } + // Verify remaining characters, if any + if (_Size_convert != _Size_chars + && !_Traits::_Loop(_Src + _Size_convert, _Src + _Size_chars, _Dx0, _Dx1, [](_Traits::_Vec) {})) { + return false; } + // Trim tail (may be natural tail, or too short string, or both) if (_Dst_words != _Dst_words_end) { _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); } From 8c04d1aa06ce101d755a58ffc8465eb80b549f82 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 22:22:53 +0300 Subject: [PATCH 20/46] more TMP! --- stl/src/vector_algorithms.cpp | 43 ++++++++++++++++------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index cee517a8156..5aaaed4d8b1 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3920,6 +3920,23 @@ namespace { return true; } + template + bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, + _Elem _Elem0, _Elem _Elem1) { +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + return _Impl<_Avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else if (_Use_sse42()) { + return _Impl<_Sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else +#endif // !defined(_M_ARM64EC) + { + return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } + } + } // namespace __std_bitset_from_string } // unnamed namespace @@ -3930,36 +3947,14 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { using namespace __std_bitset_from_string; -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - return _Impl<_Traits_1_avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } else if (_Use_sse42()) { - return _Impl<_Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } else -#endif // !defined(_M_ARM64EC) - { - return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } + return _Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } __declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { using namespace __std_bitset_from_string; -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bits >= 256) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - return _Impl<_Traits_2_avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } else if (_Use_sse42()) { - return _Impl<_Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } else -#endif // !defined(_M_ARM64EC) - { - return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); - } + return _Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); } } // extern "C" From db18ee2c649e157eeb8ec25ade85eef4209d0a3d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 13 Jul 2024 22:26:07 +0300 Subject: [PATCH 21/46] mighty ARM again --- stl/src/vector_algorithms.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 5aaaed4d8b1..385a16a7d17 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3694,6 +3694,7 @@ namespace { } struct _Traits_1_avx { +#ifndef _M_ARM64EC using _Vec = __m256i; using _Word = uint32_t; @@ -3741,9 +3742,11 @@ namespace { _Out(_Ex1); } } +#endif // !defined(_M_ARM64EC) }; struct _Traits_1_sse { +#ifndef _M_ARM64EC using _Vec = __m128i; using _Word = uint16_t; @@ -3788,9 +3791,11 @@ namespace { _Out(_Ex1); } } +#endif // !defined(_M_ARM64EC) }; struct _Traits_2_avx { +#ifndef _M_ARM64EC using _Vec = __m256i; using _Word = uint16_t; @@ -3839,9 +3844,11 @@ namespace { _Out(_Ex1); } } +#endif // !defined(_M_ARM64EC) }; struct _Traits_2_sse { +#ifndef _M_ARM64EC using _Vec = __m128i; using _Word = uint8_t; @@ -3886,8 +3893,10 @@ namespace { _Out(_Ex1); } } +#endif // !defined(_M_ARM64EC) }; +#ifndef _M_ARM64EC template static bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, _Elem _Elem0, _Elem _Elem1) noexcept { @@ -3919,6 +3928,7 @@ namespace { return true; } +#endif // !defined(_M_ARM64EC) template bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, From dc191d0031149210d4262fe710933b854bc7a423 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 07:39:57 +0300 Subject: [PATCH 22/46] fallback fixup --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 385a16a7d17..79f2d0ef544 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3671,7 +3671,7 @@ namespace { if (_Size_chars > _Size_bits) { _Size_convert = _Size_bits; - for (size_t _Ix = 0, _Mx = _Size_chars - _Size_bits; _Ix < _Mx; ++_Ix) { + for (size_t _Ix = _Size_bits, _Mx = _Size_chars; _Ix < _Mx; ++_Ix) { if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { return false; } @@ -3681,7 +3681,7 @@ namespace { _CSTD memset(_Dest, 0, _Size_bytes); for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { - const _Elem _Cur = _Src[_Size_chars - _Ix - 1]; + const _Elem _Cur = _Src[_Size_convert - _Ix - 1]; if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { return false; From 93abc9e20354c7163305631c3acc199c8b130950 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 07:40:41 +0300 Subject: [PATCH 23/46] fallback move --- stl/src/vector_algorithms.cpp | 62 +++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 79f2d0ef544..df666c680bb 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3662,37 +3662,6 @@ namespace { namespace __std_bitset_from_string { - template - bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, - const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { - const auto _Dest_bytes = static_cast(_Dest); - size_t _Size_convert = _Size_chars; - - if (_Size_chars > _Size_bits) { - _Size_convert = _Size_bits; - - for (size_t _Ix = _Size_bits, _Mx = _Size_chars; _Ix < _Mx; ++_Ix) { - if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { - return false; - } - } - } - - _CSTD memset(_Dest, 0, _Size_bytes); - - for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { - const _Elem _Cur = _Src[_Size_convert - _Ix - 1]; - - if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { - return false; - } - - _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); - } - - return true; - } - struct _Traits_1_avx { #ifndef _M_ARM64EC using _Vec = __m256i; @@ -3930,6 +3899,37 @@ namespace { } #endif // !defined(_M_ARM64EC) + template + bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, + const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { + const auto _Dest_bytes = static_cast(_Dest); + size_t _Size_convert = _Size_chars; + + if (_Size_chars > _Size_bits) { + _Size_convert = _Size_bits; + + for (size_t _Ix = _Size_bits, _Mx = _Size_chars; _Ix < _Mx; ++_Ix) { + if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + } + } + + _CSTD memset(_Dest, 0, _Size_bytes); + + for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { + const _Elem _Cur = _Src[_Size_convert - _Ix - 1]; + + if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + + _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); + } + + return true; + } + template bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, _Elem _Elem0, _Elem _Elem1) { From b48b553959d074c376e5b2ec3dee7b186a8eff28 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 10:00:54 +0300 Subject: [PATCH 24/46] array --- benchmarks/src/bitset_from_string.cpp | 45 +++++++++++++-------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index a412edd6a31..8bce84ceca5 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include #include #include @@ -12,47 +13,43 @@ using namespace std; namespace { - template - std::basic_string random_digits_init(size_t min_length, const size_t n) { + template + const auto random_digits_init() { mt19937_64 rnd{}; uniform_int_distribution<> dis('0', '1'); std::basic_string str; - const size_t number_of_bitsets = (min_length + n - 1) / n; - if (number_of_bitsets == 0) { - std::abort(); - } - const size_t actual_size = number_of_bitsets * n + number_of_bitsets - 1; + constexpr size_t number_of_bitsets = (Min_length + N - 1) / N; + static_assert(number_of_bitsets != 0); - str.resize_and_overwrite(actual_size, [&dis, &rnd, n](charT* dest, size_t len) { - const charT* end = dest + len; - for (;;) { - for (size_t i = 0; i != n; ++i, ++dest) { - *dest = static_cast(dis(rnd)); - } + constexpr size_t actual_size = number_of_bitsets * (N + 1); // +1 for \0 - if (dest == end) { - break; - } + std::array result; - *dest = charT{'\0'}; - ++dest; + for (auto dest = result.begin(); dest != result.end();) { + for (size_t i = 0; i != N; ++i, ++dest) { + *dest = static_cast(dis(rnd)); } - return len; - }); - return str; + *dest = charT{'\0'}; + ++dest; + } + + return result; } enum class length_type : bool { char_count, null_term }; + template + const auto random_digits = random_digits_init(); + template void BM_bitset_from_string(benchmark::State& state) { - const auto bit_string = random_digits_init(2048, N); + const auto& bit_string = random_digits; for (auto _ : state) { benchmark::DoNotOptimize(bit_string); - const charT* data = bit_string.c_str(); - for (size_t pos = 0, max = bit_string.size(); pos < max; pos += N + 1) { + const charT* const data = bit_string.data(); + for (size_t pos = 0, max = bit_string.size(); pos != max; pos += N + 1) { if constexpr (Length == length_type::char_count) { bitset bs(data + pos, N); benchmark::DoNotOptimize(bs); From 3529d5c9b5c0f07cbc29634ae43949c4bddf6fab Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 10:02:23 +0300 Subject: [PATCH 25/46] padding --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index df666c680bb..11dcfeb86a1 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3890,7 +3890,7 @@ namespace { return false; } - // Trim tail (may be natural tail, or too short string, or both) + // Trim tail (may be padding tail, or too short string, or both) if (_Dst_words != _Dst_words_end) { _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); } From 3d075c60cf8f09b331c20be3cbf1cd29ca93f721 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 11:40:37 +0300 Subject: [PATCH 26/46] common word increment --- stl/src/vector_algorithms.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 11dcfeb86a1..af0b03ccd61 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3671,14 +3671,13 @@ namespace { return _mm256_set1_epi8(_Val); } - static void _Out(uint32_t*& _Dst_words, const __m256i _Ex1) noexcept { + static uint32_t _To_bits(const __m256i _Ex1) noexcept { const __m256i _Shuf = _mm256_set_epi8( // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); - ++_Dst_words; + return _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); } template @@ -3723,11 +3722,10 @@ namespace { return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); } - static void _Out(uint16_t*& _Dst_words, const __m128i _Ex1) noexcept { + static uint16_t _To_bits(const __m128i _Ex1) noexcept { const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; + return static_cast(_mm_movemask_epi8(_Ex2)); } template @@ -3772,15 +3770,13 @@ namespace { return _mm256_set1_epi16(_Val); } - static void _Out(uint16_t*& _Dst_words, const __m256i _Ex1) noexcept { + static uint16_t _To_bits(const __m256i _Ex1) noexcept { const __m256i _Shuf = _mm256_set_epi8( // +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); - const auto _Tmp = static_cast(_mm256_movemask_epi8(_Ex2)); - *_Dst_words = static_cast(_rotl(_Tmp, 8)); - ++_Dst_words; + return static_cast(_rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 8)); } template @@ -3825,11 +3821,10 @@ namespace { return _mm_set1_epi16(_Val); } - static void _Out(uint8_t*& _Dst_words, const __m128i _Ex1) noexcept { + static uint8_t _To_bits(const __m128i _Ex1) noexcept { const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); - *_Dst_words = static_cast(_mm_movemask_epi8(_Ex2)); - ++_Dst_words; + return static_cast(_mm_movemask_epi8(_Ex2)); } template @@ -3876,7 +3871,11 @@ namespace { void* _Dst_words_end = _Dst_words; _Advance_bytes(_Dst_words_end, _Size_bytes); - auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { _Traits::_Out(_Dst_words, _Ex1); }; + auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { + *_Dst_words = _Traits::_To_bits(_Ex1); + ++_Dst_words; + }; + const size_t _Size_convert = (_Size_chars <= _Size_bits) ? _Size_chars : _Size_bits; // Convert characters to bits From e63ab56e624a02d2c3d7dba2171440d4b8c07607 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 14 Jul 2024 12:21:17 +0300 Subject: [PATCH 27/46] TMP loop --- stl/src/vector_algorithms.cpp | 212 +++++++++++++--------------------- 1 file changed, 80 insertions(+), 132 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index af0b03ccd61..3b1b330cda7 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3662,9 +3662,45 @@ namespace { namespace __std_bitset_from_string { - struct _Traits_1_avx { -#ifndef _M_ARM64EC - using _Vec = __m256i; +#ifdef _M_ARM64EC + using _Traits_1_avx = void; + using _Traits_1_sse = void; + using _Traits_2_avx = void; + using _Traits_2_sse = void; +#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv + struct _Traits_avx { + using _Vec = __m256i; + + static __m256i _Load(const void* _Src) noexcept { + return _mm256_loadu_si256(reinterpret_cast(_Src)); + } + + static void _Store(void* _Dest, const __m256i _Val) noexcept { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Val); + } + + static bool _Check(const __m256i _Val, const __m256i _Ex1, const __m256i _Dx0) noexcept { + return _mm256_testc_si256(_Ex1, _mm256_xor_si256(_Val, _Dx0)); + } + }; + + struct _Traits_sse { + using _Vec = __m128i; + + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + + static void _Store(void* _Dest, const __m128i _Val) noexcept { + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Val); + } + + static bool _Check(const __m128i _Val, const __m128i _Ex1, const __m128i _Dx0) noexcept { + return _mm_testc_si128(_Ex1, _mm_xor_si128(_Val, _Dx0)); + } + }; + + struct _Traits_1_avx : _Traits_avx { using _Word = uint32_t; static __m256i _Set(const char _Val) noexcept { @@ -3680,42 +3716,12 @@ namespace { return _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); } - template - static bool _Loop(const char* const _Src, const char* _Src_end, const __m256i _Dx0, const __m256i _Dx1, - _OutFn _Out) noexcept { - for (;;) { - __m256i _Val = _mm256_undefined_si256(); - - if (const size_t _Left = _Src_end - _Src; _Left > 32) { - _Src_end -= 32; - _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - char _Tmp[32]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); - char* const _Tmpd = _Tmp + (32 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left); - _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); - } - - const __m256i _Ex1 = _mm256_cmpeq_epi8(_Val, _Dx1); - const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); - - if (!_mm256_testc_si256(_Ex1, _Ex0)) { - return false; - } - - _Out(_Ex1); - } + static __m256i _Cmp(const __m256i _Val, const __m256i _Dx1) noexcept { + return _mm256_cmpeq_epi8(_Val, _Dx1); } -#endif // !defined(_M_ARM64EC) }; - struct _Traits_1_sse { -#ifndef _M_ARM64EC - using _Vec = __m128i; + struct _Traits_1_sse : _Traits_sse { using _Word = uint16_t; static __m128i _Set(const char _Val) noexcept { @@ -3728,42 +3734,12 @@ namespace { return static_cast(_mm_movemask_epi8(_Ex2)); } - template - static bool _Loop(const char* const _Src, const char* _Src_end, const __m128i _Dx0, const __m128i _Dx1, - _OutFn _Out) noexcept { - for (;;) { - __m128i _Val = _mm_undefined_si128(); - - if (const size_t _Left = _Src_end - _Src; _Left > 16) { - _Src_end -= 16; - _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - char _Tmp[16]; - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); - char* const _Tmpd = _Tmp + (16 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left); - _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); - } - - const __m128i _Ex1 = _mm_cmpeq_epi8(_Val, _Dx1); - const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); - - if (!_mm_testc_si128(_Ex1, _Ex0)) { - return false; - } - - _Out(_Ex1); - } + static __m128i _Cmp(const __m128i _Val, const __m128i _Dx1) noexcept { + return _mm_cmpeq_epi8(_Val, _Dx1); } -#endif // !defined(_M_ARM64EC) }; - struct _Traits_2_avx { -#ifndef _M_ARM64EC - using _Vec = __m256i; + struct _Traits_2_avx : _Traits_avx { using _Word = uint16_t; static __m256i _Set(const wchar_t _Val) noexcept { @@ -3779,42 +3755,12 @@ namespace { return static_cast(_rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 8)); } - template - static bool _Loop(const wchar_t* const _Src, const wchar_t* _Src_end, const __m256i _Dx0, - const __m256i _Dx1, _OutFn _Out) noexcept { - for (;;) { - __m256i _Val = _mm256_undefined_si256(); - - if (const size_t _Left = _Src_end - _Src; _Left > 16) { - _Src_end -= 16; - _Val = _mm256_loadu_si256(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - wchar_t _Tmp[16]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Dx0); - wchar_t* const _Tmpd = _Tmp + (16 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); - _Val = _mm256_loadu_si256(reinterpret_cast(_Tmp)); - } - - const __m256i _Ex1 = _mm256_cmpeq_epi16(_Val, _Dx1); - const __m256i _Ex0 = _mm256_xor_si256(_Val, _Dx0); - - if (!_mm256_testc_si256(_Ex1, _Ex0)) { - return false; - } - - _Out(_Ex1); - } + static __m256i _Cmp(const __m256i _Val, const __m256i _Dx1) noexcept { + return _mm256_cmpeq_epi16(_Val, _Dx1); } -#endif // !defined(_M_ARM64EC) }; - struct _Traits_2_sse { -#ifndef _M_ARM64EC - using _Vec = __m128i; + struct _Traits_2_sse : _Traits_sse { using _Word = uint8_t; static __m128i _Set(const wchar_t _Val) noexcept { @@ -3827,40 +3773,42 @@ namespace { return static_cast(_mm_movemask_epi8(_Ex2)); } - template - static bool _Loop(const wchar_t* const _Src, const wchar_t* _Src_end, const __m128i _Dx0, - const __m128i _Dx1, _OutFn _Out) noexcept { - for (;;) { - __m128i _Val = _mm_undefined_si128(); + static __m128i _Cmp(const __m128i _Val, const __m128i _Dx1) noexcept { + return _mm_cmpeq_epi16(_Val, _Dx1); + } + }; - if (const size_t _Left = _Src_end - _Src; _Left > 8) { - _Src_end -= 8; - _Val = _mm_loadu_si128(reinterpret_cast(_Src_end)); - } else if (_Left == 0) { - return true; - } else { - _Src_end = _Src; - wchar_t _Tmp[8]; - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Dx0); - wchar_t* const _Tmpd = _Tmp + (8 - _Left); - _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(wchar_t)); - _Val = _mm_loadu_si128(reinterpret_cast(_Tmp)); - } + template + bool _Loop(const _Elem* const _Src, const _Elem* _Src_end, const typename _Traits::_Vec _Dx0, + const typename _Traits::_Vec _Dx1, _OutFn _Out) noexcept { + for (;;) { + typename _Traits::_Vec _Val; + constexpr size_t _Per_vec = sizeof(_Val) / sizeof(_Elem); + + if (const size_t _Left = _Src_end - _Src; _Left >= _Per_vec) { + _Src_end -= _Per_vec; + _Val = _Traits::_Load(_Src_end); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + _Elem _Tmp[_Per_vec]; + _Traits::_Store(_Tmp, _Dx0); + _Elem* const _Tmpd = _Tmp + (_Per_vec - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(_Elem)); + _Val = _Traits::_Load(_Tmp); + } - const __m128i _Ex1 = _mm_cmpeq_epi16(_Val, _Dx1); - const __m128i _Ex0 = _mm_xor_si128(_Val, _Dx0); + const auto _Ex1 = _Traits::_Cmp(_Val, _Dx1); - if (!_mm_testc_si128(_Ex1, _Ex0)) { - return false; - } - - _Out(_Ex1); + if (!_Traits::_Check(_Val, _Ex1, _Dx0)) { + return false; } + + _Out(_Ex1); } -#endif // !defined(_M_ARM64EC) - }; + } -#ifndef _M_ARM64EC template static bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, _Elem _Elem0, _Elem _Elem1) noexcept { @@ -3879,13 +3827,13 @@ namespace { const size_t _Size_convert = (_Size_chars <= _Size_bits) ? _Size_chars : _Size_bits; // Convert characters to bits - if (!_Traits::_Loop(_Src, _Src + _Size_convert, _Dx0, _Dx1, _Out)) { + if (!_Loop<_Traits>(_Src, _Src + _Size_convert, _Dx0, _Dx1, _Out)) { return false; } // Verify remaining characters, if any if (_Size_convert != _Size_chars - && !_Traits::_Loop(_Src + _Size_convert, _Src + _Size_chars, _Dx0, _Dx1, [](_Traits::_Vec) {})) { + && !_Loop<_Traits>(_Src + _Size_convert, _Src + _Size_chars, _Dx0, _Dx1, [](_Traits::_Vec) {})) { return false; } From c5931a1d76bbce19b09696c111e6e6a8effbc825 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 14:38:35 -0700 Subject: [PATCH 28/46] Include what you use. --- benchmarks/src/bitset_from_string.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 8bce84ceca5..96531c0c56f 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -4,10 +4,7 @@ #include #include #include -#include #include -#include -#include #include using namespace std; From 30bf2d583e3be4419243352d0587e239d63f2a3d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 14:30:02 -0700 Subject: [PATCH 29/46] `typename` => `class` --- benchmarks/src/bitset_from_string.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 96531c0c56f..ae58394bfdf 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -10,7 +10,7 @@ using namespace std; namespace { - template + template const auto random_digits_init() { mt19937_64 rnd{}; uniform_int_distribution<> dis('0', '1'); @@ -37,7 +37,7 @@ namespace { enum class length_type : bool { char_count, null_term }; - template + template const auto random_digits = random_digits_init(); template From 1a6625d775a675c517c84bc7b4fc8b1e8acbbdef Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 15:02:54 -0700 Subject: [PATCH 30/46] `random_digits_init()` shouldn't return a `const` prvalue. --- benchmarks/src/bitset_from_string.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index ae58394bfdf..81dec6f696a 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -11,7 +11,7 @@ using namespace std; namespace { template - const auto random_digits_init() { + auto random_digits_init() { mt19937_64 rnd{}; uniform_int_distribution<> dis('0', '1'); std::basic_string str; From d2ed6cc0efbd3ec57950a1bbed739cc47f18bad3 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 15:08:23 -0700 Subject: [PATCH 31/46] Drop unused `basic_string`. --- benchmarks/src/bitset_from_string.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 81dec6f696a..01d4b55727b 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -14,7 +14,6 @@ namespace { auto random_digits_init() { mt19937_64 rnd{}; uniform_int_distribution<> dis('0', '1'); - std::basic_string str; constexpr size_t number_of_bitsets = (Min_length + N - 1) / N; static_assert(number_of_bitsets != 0); From 6fedf3e1b3e3311f6d8f93cd84c5413ee5f61f44 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 15:07:39 -0700 Subject: [PATCH 32/46] Drop unnecessary `std::`. --- benchmarks/src/bitset_from_string.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 01d4b55727b..974686b7915 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -20,7 +20,7 @@ namespace { constexpr size_t actual_size = number_of_bitsets * (N + 1); // +1 for \0 - std::array result; + array result; for (auto dest = result.begin(); dest != result.end();) { for (size_t i = 0; i != N; ++i, ++dest) { From c61c4543f6c303ec106071644b6cad69a78227d1 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 14:58:50 -0700 Subject: [PATCH 33/46] `bit_string/data/max` => `digit_array/arr_data/arr_size` and extract `arr_size`. --- benchmarks/src/bitset_from_string.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 974686b7915..2743cdfd3be 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -41,16 +41,17 @@ namespace { template void BM_bitset_from_string(benchmark::State& state) { - const auto& bit_string = random_digits; + const auto& digit_array = random_digits; for (auto _ : state) { - benchmark::DoNotOptimize(bit_string); - const charT* const data = bit_string.data(); - for (size_t pos = 0, max = bit_string.size(); pos != max; pos += N + 1) { + benchmark::DoNotOptimize(digit_array); + const auto arr_data = digit_array.data(); + const auto arr_size = digit_array.size(); + for (size_t pos = 0; pos != arr_size; pos += N + 1) { if constexpr (Length == length_type::char_count) { - bitset bs(data + pos, N); + bitset bs(arr_data + pos, N); benchmark::DoNotOptimize(bs); } else { - bitset bs(data + pos); + bitset bs(arr_data + pos); benchmark::DoNotOptimize(bs); } } From 9cd368794dc54bdb94d4d3036d4ccd97f1ecaa8a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 15:26:39 -0700 Subject: [PATCH 34/46] Replace nested loops with separate loops. --- benchmarks/src/bitset_from_string.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 2743cdfd3be..223cd6716a6 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -22,13 +22,12 @@ namespace { array result; - for (auto dest = result.begin(); dest != result.end();) { - for (size_t i = 0; i != N; ++i, ++dest) { - *dest = static_cast(dis(rnd)); - } + for (auto& elem : result) { + elem = static_cast(dis(rnd)); // fill random digits + } - *dest = charT{'\0'}; - ++dest; + for (size_t bitset_idx = 0; bitset_idx < number_of_bitsets; ++bitset_idx) { + result[bitset_idx * (N + 1) + N] = charT{'\0'}; // write null terminators } return result; From 467e01ab83ec081e841564be96d3ece640fc3cdc Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 17:07:13 -0700 Subject: [PATCH 35/46] Consistent param order for `random_digits_init()`. --- benchmarks/src/bitset_from_string.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 223cd6716a6..6a769786ae1 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -10,7 +10,7 @@ using namespace std; namespace { - template + template auto random_digits_init() { mt19937_64 rnd{}; uniform_int_distribution<> dis('0', '1'); @@ -36,7 +36,7 @@ namespace { enum class length_type : bool { char_count, null_term }; template - const auto random_digits = random_digits_init(); + const auto random_digits = random_digits_init(); template void BM_bitset_from_string(benchmark::State& state) { From 1f4869393fc0474e6a4a91cc03e70d3a9590ea96 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Sep 2024 20:31:41 -0700 Subject: [PATCH 36/46] Drop unnecessary `static`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 38de8ca80f1..604539afb69 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3967,7 +3967,7 @@ namespace { } template - static bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, + bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, _Elem _Elem0, _Elem _Elem1) noexcept { const auto _Dx0 = _Traits::_Set(_Elem0); const auto _Dx1 = _Traits::_Set(_Elem1); From 1bb6faf6a382b69aca8b3119be0eafdff5f409fb Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 11 Sep 2024 20:45:02 -0700 Subject: [PATCH 37/46] FizzBuzz! --- benchmarks/src/bitset_from_string.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp index 6a769786ae1..78faf10a373 100644 --- a/benchmarks/src/bitset_from_string.cpp +++ b/benchmarks/src/bitset_from_string.cpp @@ -22,12 +22,12 @@ namespace { array result; - for (auto& elem : result) { - elem = static_cast(dis(rnd)); // fill random digits - } - - for (size_t bitset_idx = 0; bitset_idx < number_of_bitsets; ++bitset_idx) { - result[bitset_idx * (N + 1) + N] = charT{'\0'}; // write null terminators + for (size_t i = 0; i < actual_size; ++i) { + if (i % (N + 1) == N) { + result[i] = charT{'\0'}; // write null terminators + } else { + result[i] = static_cast(dis(rnd)); // fill random digits + } } return result; From dac65c3ecd44b5e8538ddaac5d1bf3dc586cbddc Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 20 Sep 2024 07:44:47 +0300 Subject: [PATCH 38/46] unicorns --- stl/inc/bitset | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 2774c185863..9ae2cbc686b 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -126,7 +126,7 @@ private: #if _USE_STD_VECTOR_ALGORITHMS constexpr size_t _Bitset_from_string_vector_threshold = 16; if constexpr (_Bits >= _Bitset_from_string_vector_threshold - && _Is_specialization_v<_Traits, char_traits> && sizeof(_Elem) <= 2) { + && _Is_implementation_handled_char_traits<_Traits> && sizeof(_Elem) <= 2) { if (!_STD _Is_constant_evaluated()) { bool _Result; From 238c8dda373ac6467336582751db45cf761df78e Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 10:09:52 -0700 Subject: [PATCH 39/46] Add `const`, drop `std::`. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index ee914c110c1..323ed848eed 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1002,7 +1002,7 @@ void assert_throws_inv(F f) { try { f(); assert(false); - } catch (std::invalid_argument&) { + } catch (const invalid_argument&) { } } From e738b17edcd7c8d0e909aafa0b0028e496ed8b2a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 10:42:30 -0700 Subject: [PATCH 40/46] Add `const`. --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 090ca1165e9..b999c59d882 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3976,8 +3976,8 @@ namespace { } template - bool _Impl(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, - _Elem _Elem0, _Elem _Elem1) noexcept { + bool _Impl(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, + const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { const auto _Dx0 = _Traits::_Set(_Elem0); const auto _Dx1 = _Traits::_Set(_Elem1); From e30ce6be8f7c32669e2b81085bf9d359cc5e61df Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 11:18:36 -0700 Subject: [PATCH 41/46] Drop unnecessary `_Mx`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b999c59d882..6f40420c1c7 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -4021,7 +4021,7 @@ namespace { if (_Size_chars > _Size_bits) { _Size_convert = _Size_bits; - for (size_t _Ix = _Size_bits, _Mx = _Size_chars; _Ix < _Mx; ++_Ix) { + for (size_t _Ix = _Size_bits; _Ix < _Size_chars; ++_Ix) { if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { return false; } From 15782879c1bc4c2778bda2c1931f7d81efd09125 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 11:24:37 -0700 Subject: [PATCH 42/46] Mark `_Dispatch` as `noexcept`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 6f40420c1c7..226b43a04d0 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -4045,7 +4045,7 @@ namespace { template bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, - _Elem _Elem0, _Elem _Elem1) { + _Elem _Elem0, _Elem _Elem1) noexcept { #ifndef _M_ARM64EC if (_Use_avx2() && _Size_bits >= 256) { _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 From 0e1fe076be4c548b7bcf459219e423bc9d310baa Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 11:52:26 -0700 Subject: [PATCH 43/46] Improve comment grammar, add citation. --- stl/inc/bitset | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/bitset b/stl/inc/bitset index 1cf662c2afc..0fcff9b0297 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -23,8 +23,8 @@ _STL_DISABLE_CLANG_WARNINGS #endif // !defined(_STD_BITSET_TO_STREAM_STACK_RESERVATION) #if _USE_STD_VECTOR_ALGORITHMS -// These bitset functions sometimes assume bit array has zero padding to multiple of 2 or 4 bytes -// The assumptions hold true even for vNext suggestion to use smaller types for small bitsets +// These bitset functions sometimes assume that the bit array has zero padding to a multiple of 2 or 4 bytes. +// The assumptions hold true even for the vNext suggestion to use smaller types for small bitsets (see GH-1498) // due to vectorization thresholds. extern "C" { From ac71a7e81a45a6efa3d14e163113f080d8e0cde7 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 12:54:51 -0700 Subject: [PATCH 44/46] Use `string::npos` and `wstring::npos`. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 323ed848eed..cf2886bef66 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1056,12 +1056,10 @@ void test_bitset(mt19937_64& gen) { assert(bitset<45>("111").to_ullong() == 0x7); assert_throws_inv([] { (void) bitset<45>("11x11"); }); - assert(bitset<64>( - "xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", basic_string::npos, 'o', 'x') + assert(bitset<64>("xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", string::npos, 'o', 'x') .to_ullong() == 0xFEDCBA9876543210ULL); - assert(bitset<64>(L"xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", basic_string::npos, - L'o', L'x') + assert(bitset<64>(L"xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", wstring::npos, L'o', L'x') .to_ullong() == 0xFEDCBA9876543210ULL); From 7b54b0cdae217315941b4ff46d7e3c6778ec1738 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 12:55:59 -0700 Subject: [PATCH 45/46] Include `` for `invalid_argument`. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index cf2886bef66..e72278e70b4 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include From ec1e4588eadb98b358ec7af42f745f03878ec14e Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 7 Oct 2024 13:23:37 -0700 Subject: [PATCH 46/46] Add more test coverage for `invalid_argument`. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index e72278e70b4..8efd56dd026 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1056,6 +1056,8 @@ void test_bitset(mt19937_64& gen) { assert(bitset<45>("110101001100001110110010101000011001000010000").to_ullong() == 0x1A9876543210ULL); assert(bitset<45>("111").to_ullong() == 0x7); assert_throws_inv([] { (void) bitset<45>("11x11"); }); + assert_throws_inv([] { (void) bitset<45>("111111111111111111111111111111111111111111111x"); }); + assert_throws_inv([] { (void) bitset<45>("x111111111111111111111111111111111111111111111"); }); assert(bitset<64>("xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", string::npos, 'o', 'x') .to_ullong()