diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index 41b2089e4ca..bf6a8585f02 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -5,10 +5,7 @@ #include #include #include -#include #include -#include -#include #include #include #include @@ -17,9 +14,15 @@ using namespace std; -enum class AlgType { std_func, str_member_first, str_member_last }; +enum class AlgType { + std_func, + str_member_first, + str_member_last, + str_member_first_not, + str_member_last_not, +}; -template +template void bm(benchmark::State& state) { const size_t Pos = static_cast(state.range(0)); const size_t NSize = static_cast(state.range(1)); @@ -29,24 +32,37 @@ void bm(benchmark::State& state) { using container = conditional_t>, basic_string, not_highly_aligned_allocator>>; - constexpr T HaystackFiller{' '}; - static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler."); + constexpr size_t IncrementCap = 16; - container h(HSize, HaystackFiller); + constexpr T HaystackFillerBase = T{' '}; + static_assert( + NeedleFillerBase + IncrementCap <= HaystackFillerBase || HaystackFillerBase + IncrementCap <= NeedleFillerBase, + "Would match where it shouldn't"); + + container h(HSize, T{0}); container n(NSize, T{0}); - if (NSize - 1 > static_cast(numeric_limits::max()) - static_cast(Start)) { - puts("ERROR: The following iota() would overflow."); - abort(); + for (size_t i = 0; i != NSize; ++i) { + n[i] = NeedleFillerBase + i % IncrementCap; } - iota(n.begin(), n.end(), Start); - if (Pos >= HSize || Which >= NSize) { abort(); } - h[Pos] = n[Which]; + if constexpr (Alg == AlgType::str_member_first_not || Alg == AlgType::str_member_last_not) { + for (size_t i = 0; i != HSize; ++i) { + h[i] = n[(i + Which) % NSize]; + } + + h[Pos] = HaystackFillerBase; + } else { + for (size_t i = 0; i != HSize; ++i) { + h[i] = HaystackFillerBase + i % IncrementCap; + } + + h[Pos] = n[Which]; + } for (auto _ : state) { benchmark::DoNotOptimize(h); @@ -55,6 +71,10 @@ void bm(benchmark::State& state) { benchmark::DoNotOptimize(h.find_first_of(n)); } else if constexpr (Alg == AlgType::str_member_last) { benchmark::DoNotOptimize(h.find_last_of(n)); + } else if constexpr (Alg == AlgType::str_member_first_not) { + benchmark::DoNotOptimize(h.find_first_not_of(n)); + } else if constexpr (Alg == AlgType::str_member_last_not) { + benchmark::DoNotOptimize(h.find_last_not_of(n)); } else { benchmark::DoNotOptimize(find_first_of(h.begin(), h.end(), n.begin(), n.end())); } @@ -82,4 +102,12 @@ BENCHMARK(bm)->Apply(common_args); BENCHMARK(bm)->Apply(common_args); BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + BENCHMARK_MAIN(); diff --git a/stl/inc/__msvc_string_view.hpp b/stl/inc/__msvc_string_view.hpp index eba67768432..d6e37a6d46f 100644 --- a/stl/inc/__msvc_string_view.hpp +++ b/stl/inc/__msvc_string_view.hpp @@ -57,6 +57,16 @@ __declspec(noalias) size_t __stdcall __std_find_last_not_ch_pos_4( __declspec(noalias) size_t __stdcall __std_find_last_not_ch_pos_8( const void* _First, const void* _Last, uint64_t _Val) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_not_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_not_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; + +__declspec(noalias) size_t __stdcall __std_find_last_not_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_last_not_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; + } // extern "C" _STD_BEGIN @@ -120,6 +130,32 @@ size_t _Find_last_not_ch_pos_vectorized(const _Ty* const _First, const _Ty* cons _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size } } +template +size_t _Find_first_not_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, + const _Ty2* const _Needle, const size_t _Needle_length) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return ::__std_find_first_not_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 2) { + return ::__std_find_first_not_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + +template +size_t _Find_last_not_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, + const _Ty2* const _Needle, const size_t _Needle_length) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return ::__std_find_last_not_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 2) { + return ::__std_find_last_not_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + _STD_END #endif // _USE_STD_VECTOR_ALGORITHMS @@ -1004,6 +1040,21 @@ constexpr size_t _Traits_find_first_not_of(_In_reads_(_Hay_size) const _Traits_p if constexpr (_Is_implementation_handled_char_traits<_Traits>) { using _Elem = typename _Traits::char_type; +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (sizeof(_Elem) <= 2) { + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_size - _Start_at; + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { + size_t _Pos = _Find_first_not_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size); + if (_Pos != static_cast(-1)) { + _Pos += _Start_at; + } + return _Pos; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + _String_bitmap<_Elem> _Matches; if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { @@ -1071,6 +1122,17 @@ constexpr size_t _Traits_find_last_not_of(_In_reads_(_Hay_size) const _Traits_pt if constexpr (_Is_implementation_handled_char_traits<_Traits>) { using _Elem = typename _Traits::char_type; +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (sizeof(_Elem) <= 2) { + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_start + 1; + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { // same threshold for first/last + return _Find_last_not_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size); + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + _String_bitmap<_Elem> _Matches; if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 15442d480d1..559de319996 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2976,6 +2976,8 @@ namespace { return _Result; } + enum class _Find_meow_of_predicate { _Any_of, _None_of }; + #ifndef _M_ARM64EC namespace __std_find_meow_of_bitmap_details { __m256i _Bitmap_step(const __m256i _Bitmap, const __m256i _Data) noexcept { @@ -3246,7 +3248,7 @@ namespace { } } - template + template size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { using namespace __std_find_meow_of_bitmap_details; @@ -3260,9 +3262,14 @@ namespace { const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { - const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); - const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); - const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + + if constexpr (_Pred == _Find_meow_of_predicate::_None_of) { + _Bingo ^= 0xFF; + } + if (_Bingo != 0) { return _Ix + _tzcnt_u32(_Bingo); } @@ -3273,7 +3280,12 @@ namespace { const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail); const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); - const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + + if constexpr (_Pred == _Find_meow_of_predicate::_None_of) { + _Bingo ^= _Tail_bingo_mask; + } + if (_Bingo != 0) { return _Haystack_length_vec + _tzcnt_u32(_Bingo); } @@ -3282,7 +3294,7 @@ namespace { return static_cast(-1); } - template + template size_t _Impl_last_avx(const void* const _Haystack, size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { using namespace __std_find_meow_of_bitmap_details; @@ -3296,9 +3308,14 @@ namespace { while (_Haystack_length >= 8) { _Haystack_length -= 8; - const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); - const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); - const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + + if constexpr (_Pred == _Find_meow_of_predicate::_None_of) { + _Bingo ^= 0xFF; + } + if (_Bingo != 0) { return _Haystack_length + 31 - _lzcnt_u32(_Bingo); } @@ -3309,7 +3326,12 @@ namespace { const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr, _Haystack_length_tail); const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); - const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + + if constexpr (_Pred == _Find_meow_of_predicate::_None_of) { + _Bingo ^= _Tail_bingo_mask; + } + if (_Bingo != 0) { return 31 - _lzcnt_u32(_Bingo); } @@ -3355,7 +3377,7 @@ namespace { } #endif // !_M_ARM64EC - template + template size_t _Impl_first_scalar( const void* const _Haystack, const size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { const auto _Haystack_ptr = static_cast(_Haystack); @@ -3365,19 +3387,29 @@ namespace { if constexpr (sizeof(_Val) > 1) { if (_Val >= 256) { - continue; + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + continue; + } else { + return _Ix; + } } } - if (_Table[_Val]) { - return _Ix; + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + if (_Table[_Val]) { + return _Ix; + } + } else { + if (!_Table[_Val]) { + return _Ix; + } } } return static_cast(-1); } - template + template size_t _Impl_last_scalar( const void* const _Haystack, size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { const auto _Haystack_ptr = static_cast(_Haystack); @@ -3389,12 +3421,22 @@ namespace { if constexpr (sizeof(_Val) > 1) { if (_Val >= 256) { - continue; + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + continue; + } else { + return _Haystack_length; + } } } - if (_Table[_Val]) { - return _Haystack_length; + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + if (_Table[_Val]) { + return _Haystack_length; + } + } else { + if (!_Table[_Val]) { + return _Haystack_length; + } } } @@ -3403,7 +3445,7 @@ namespace { } // namespace __std_find_meow_of_bitmap namespace __std_find_first_of { - template + template const void* _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { auto _Ptr_haystack = static_cast(_First1); @@ -3412,8 +3454,22 @@ namespace { const auto _Ptr_needle_end = static_cast(_Last2); for (; _Ptr_haystack != _Ptr_haystack_end; ++_Ptr_haystack) { - for (auto _Ptr = _Ptr_needle; _Ptr != _Ptr_needle_end; ++_Ptr) { - if (*_Ptr_haystack == *_Ptr) { + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + for (auto _Ptr = _Ptr_needle; _Ptr != _Ptr_needle_end; ++_Ptr) { + if (*_Ptr_haystack == *_Ptr) { + return _Ptr_haystack; + } + } + } else { + bool _Match = false; + for (auto _Ptr = _Ptr_needle; _Ptr != _Ptr_needle_end; ++_Ptr) { + if (*_Ptr_haystack == *_Ptr) { + _Match = true; + break; + } + } + + if (!_Match) { return _Ptr_haystack; } } @@ -3423,11 +3479,13 @@ namespace { } #ifndef _M_ARM64EC - template + template const void* _Impl_pcmpestri(const void* _First1, const size_t _Haystack_length, const void* const _First2, const size_t _Needle_length) noexcept { - constexpr int _Op = - (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + constexpr int _Op_base = + (_Pred == _Find_meow_of_predicate::_Any_of ? _SIDD_POSITIVE_POLARITY : _SIDD_MASKED_NEGATIVE_POLARITY) + | (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY; + constexpr int _Op = _Op_base | _SIDD_LEAST_SIGNIFICANT; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; const void* _Stop_at = _First1; @@ -3483,37 +3541,78 @@ namespace { const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); constexpr int _Not_found = 16; // arbitrary value greater than any found value +#pragma warning(push) +#pragma warning(disable : 4324) // structure was padded due to alignment specifier + const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1, + const int _Found_pos_init) noexcept { + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + int _Found_pos = _Found_pos_init; + + const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, + const int _Size1) noexcept { + if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); + if (_Pos < _Found_pos) { + _Found_pos = _Pos; + } + } + }; - int _Found_pos = _Not_found; + const void* _Cur_needle = _First2; + do { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + _Step(_Data2, _Part_size_el, _Data1, _Size1); + _Advance_bytes(_Cur_needle, 16); + } while (_Cur_needle != _Last_needle); - const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, - const int _Size1) noexcept { - if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); - if (_Pos < _Found_pos) { - _Found_pos = _Pos; + if (_Last_needle_length_el != 0) { + _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); } - } - }; -#pragma warning(push) -#pragma warning(disable : 4324) // structure was padded due to alignment specifier - const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { - const void* _Cur_needle = _First2; - do { - const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); - _Step(_Data2, _Part_size_el, _Data1, _Size1); + return _Found_pos; + } else { + constexpr int _Op_mask = _Op_base | _SIDD_BIT_MASK; + + const void* _Cur_needle = _First2; + + const __m128i _Data2_first = _mm_loadu_si128(static_cast(_Cur_needle)); + + __m128i _Found = _mm_cmpestrm(_Data2_first, _Part_size_el, _Data1, _Size1, _Op_mask); _Advance_bytes(_Cur_needle, 16); - } while (_Cur_needle != _Last_needle); - if (_Last_needle_length_el != 0) { - _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); + while (_Cur_needle != _Last_needle) { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + const __m128i _Found_part = _mm_cmpestrm(_Data2, _Part_size_el, _Data1, _Size1, _Op_mask); + _Found = _mm_and_si128(_Found, _Found_part); + _Advance_bytes(_Cur_needle, 16); + } + + if (_Last_needle_length_el != 0) { + const __m128i _Found_part = + _mm_cmpestrm(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1, _Op_mask); + _Found = _mm_and_si128(_Found, _Found_part); + } + + const int _Bingo = _mm_cvtsi128_si32(_Found); + int _Found_pos = _Found_pos_init; + + if (_Bingo != 0) { + unsigned long _Tmp; + // CodeQL [SM02313] _Tmp is always initialized: we just tested `if (_Bingo != 0)`. + _BitScanForward(&_Tmp, _Bingo); + if (_Found_pos > static_cast(_Tmp)) { + _Found_pos = static_cast(_Tmp); + } + } + + return _Found_pos; } }; #pragma warning(pop) while (_First1 != _Stop_at) { - _Test_whole_needle(_mm_loadu_si128(static_cast(_First1)), _Part_size_el); + const int _Found_pos = _Test_whole_needle( + _mm_loadu_si128(static_cast(_First1)), _Part_size_el, _Not_found); if (_Found_pos != _Not_found) { _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); @@ -3530,9 +3629,7 @@ namespace { memcpy(_Tmp1, _First1, _Last_part_size); const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - _Found_pos = _Last_part_size_el; - - _Test_whole_needle(_Data1, _Last_part_size_el); + const int _Found_pos = _Test_whole_needle(_Data1, _Last_part_size_el, _Last_part_size_el); _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); } @@ -3757,7 +3854,7 @@ namespace { #ifndef _M_ARM64EC if constexpr (sizeof(_Ty) <= 2) { if (_Use_sse42()) { - return _Impl_pcmpestri<_Ty>( + return _Impl_pcmpestri<_Ty, _Find_meow_of_predicate::_Any_of>( _First1, _Byte_length(_First1, _Last1), _First2, _Byte_length(_First2, _Last2)); } } else { @@ -3768,7 +3865,7 @@ namespace { } #endif // !_M_ARM64EC - return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); + return _Fallback<_Ty, _Find_meow_of_predicate::_Any_of>(_First1, _Last1, _First2, _Last2); } template @@ -3781,7 +3878,7 @@ namespace { } #ifndef _M_ARM64EC - template + template size_t _Dispatch_pos_sse_1_2( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; @@ -3790,13 +3887,13 @@ namespace { if (_Strat == _Strategy::_Vector_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + return _Impl_first_avx<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } } else if (_Strat == _Strategy::_Scalar_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { alignas(32) _Scalar_table_t _Table = {}; _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); - return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + return _Impl_first_scalar<_Ty, _Pred>(_First1, _Count1, _Table); } } @@ -3805,7 +3902,7 @@ namespace { const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); return _Pos_from_ptr<_Ty>( - _Impl_pcmpestri<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); + _Impl_pcmpestri<_Ty, _Pred>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); } template @@ -3817,13 +3914,13 @@ namespace { if (_Strat == _Strategy::_Vector_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + return _Impl_first_avx<_Ty, _Find_meow_of_predicate::_Any_of>(_First1, _Count1, _First2, _Count2); } } else if (_Strat == _Strategy::_Scalar_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { alignas(32) _Scalar_table_t _Table = {}; _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); - return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + return _Impl_first_scalar<_Ty, _Find_meow_of_predicate::_Any_of>(_First1, _Count1, _Table); } } @@ -3835,42 +3932,44 @@ namespace { } #endif // !_M_ARM64EC - template + template size_t _Dispatch_pos_fallback( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; _Scalar_table_t _Table = {}; if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { - return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + return _Impl_first_scalar<_Ty, _Pred>(_First1, _Count1, _Table); } const void* const _Last1 = static_cast(_First1) + _Count1; const void* const _Last2 = static_cast(_First2) + _Count2; - return _Pos_from_ptr<_Ty>(_Fallback<_Ty>(_First1, _Last1, _First2, _Last2), _First1, _Last1); + return _Pos_from_ptr<_Ty>(_Fallback<_Ty, _Pred>(_First1, _Last1, _First2, _Last2), _First1, _Last1); } - template + template size_t _Dispatch_pos( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { #ifndef _M_ARM64EC if constexpr (sizeof(_Ty) <= 2) { if (_Use_sse42()) { - return _Dispatch_pos_sse_1_2<_Ty>(_First1, _Count1, _First2, _Count2); + return _Dispatch_pos_sse_1_2<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } } else { if (_Use_avx2()) { + static_assert(_Pred == _Find_meow_of_predicate::_Any_of); + return _Dispatch_pos_avx_4_8<_Ty>(_First1, _Count1, _First2, _Count2); } } #endif // !_M_ARM64EC - return _Dispatch_pos_fallback<_Ty>(_First1, _Count1, _First2, _Count2); + return _Dispatch_pos_fallback<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } } // namespace __std_find_first_of namespace __std_find_last_of { - template + template size_t __stdcall _Fallback(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { @@ -3881,8 +3980,22 @@ namespace { while (_Pos != 0) { --_Pos; - for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { - if (_Ptr_haystack[_Pos] == *_Ptr) { + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { + if (_Ptr_haystack[_Pos] == *_Ptr) { + return _Pos; + } + } + } else { + bool _Match = false; + for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { + if (_Ptr_haystack[_Pos] == *_Ptr) { + _Match = true; + break; + } + } + + if (!_Match) { return _Pos; } } @@ -3892,13 +4005,15 @@ namespace { } #ifndef _M_ARM64EC - template + template size_t _Impl(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { const size_t _Haystack_length_bytes = _Haystack_length * sizeof(_Ty); - constexpr int _Op = - (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT; + constexpr int _Op_base = + (_Pred == _Find_meow_of_predicate::_Any_of ? _SIDD_POSITIVE_POLARITY : _SIDD_MASKED_NEGATIVE_POLARITY) + | (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY; + constexpr int _Op = _Op_base | _SIDD_MOST_SIGNIFICANT; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; const size_t _Last_part_size = _Haystack_length_bytes & 0xF; @@ -3959,37 +4074,77 @@ namespace { const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); constexpr int _Not_found = -1; // equal to npos when treated as size_t; also less than any found value - int _Found_pos = _Not_found; - - const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, - const int _Size1) noexcept { - if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); - if (_Pos > _Found_pos) { - _Found_pos = _Pos; - } - } - }; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { - const void* _Cur_needle = _Needle; - do { - const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); - _Step(_Data2, _Part_size_el, _Data1, _Size1); - _Advance_bytes(_Cur_needle, 16); - } while (_Cur_needle != _Last_needle); + if constexpr (_Pred == _Find_meow_of_predicate::_Any_of) { + int _Found_pos = _Not_found; + + const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, + const int _Size1) noexcept { + if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); + if (_Pos > _Found_pos) { + _Found_pos = _Pos; + } + } + }; + + const void* _Cur_needle = _Needle; + do { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + _Step(_Data2, _Part_size_el, _Data1, _Size1); + _Advance_bytes(_Cur_needle, 16); + } while (_Cur_needle != _Last_needle); + + if (_Last_needle_length_el != 0) { + _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); + } + + return _Found_pos; + } else { + constexpr int _Op_mask = _Op_base | _SIDD_BIT_MASK; + + const void* _Cur_needle = _Needle; + + const __m128i _Data2_first = _mm_loadu_si128(static_cast(_Cur_needle)); + + __m128i _Found = _mm_cmpestrm(_Data2_first, _Part_size_el, _Data1, _Size1, _Op_mask); + + while (_Cur_needle != _Last_needle) { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + const __m128i _Found_part = _mm_cmpestrm(_Data2, _Part_size_el, _Data1, _Size1, _Op_mask); + _Found = _mm_and_si128(_Found, _Found_part); + _Advance_bytes(_Cur_needle, 16); + } + + if (_Last_needle_length_el != 0) { + const __m128i _Found_part = + _mm_cmpestrm(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1, _Op_mask); + _Found = _mm_and_si128(_Found, _Found_part); + _Advance_bytes(_Cur_needle, 16); + } + + const int _Bingo = _mm_cvtsi128_si32(_Found); + int _Found_pos = _Not_found; + + if (_Bingo != 0) { + unsigned long _Tmp; + // CodeQL [SM02313] _Tmp is always initialized: we just tested `if (_Bingo != 0)`. + _BitScanReverse(&_Tmp, _Bingo); + _Found_pos = static_cast(_Tmp); + } - if (_Last_needle_length_el != 0) { - _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); + return _Found_pos; } }; #pragma warning(pop) while (_Cur != _Stop_at) { _Rewind_bytes(_Cur, 16); - _Test_whole_needle(_mm_loadu_si128(static_cast(_Cur)), _Part_size_el); + const int _Found_pos = + _Test_whole_needle(_mm_loadu_si128(static_cast(_Cur)), _Part_size_el); if (_Found_pos != _Not_found) { return _Byte_length(_Haystack, _Cur) / sizeof(_Ty) + _Found_pos; @@ -4008,15 +4163,15 @@ namespace { _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); } - _Test_whole_needle(_Data1, _Last_part_size_el); + return _Test_whole_needle(_Data1, _Last_part_size_el); } - return static_cast(_Found_pos); + return static_cast(_Not_found); } } #endif // !_M_ARM64EC - template + template size_t _Dispatch_pos( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; @@ -4027,26 +4182,26 @@ namespace { if (_Strat == _Strategy::_Vector_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Impl_last_avx<_Ty>(_First1, _Count1, _First2, _Count2); + return _Impl_last_avx<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } } else if (_Strat == _Strategy::_Scalar_bitmap) { if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { alignas(32) _Scalar_table_t _Table = {}; _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); - return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); + return _Impl_last_scalar<_Ty, _Pred>(_First1, _Count1, _Table); } } - return _Impl<_Ty>(_First1, _Count1, _First2, _Count2); + return _Impl<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } else #endif // !_M_ARM64EC { alignas(32) _Scalar_table_t _Table = {}; if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { - return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); + return _Impl_last_scalar<_Ty, _Pred>(_First1, _Count1, _Table); } - return _Fallback<_Ty>(_First1, _Count1, _First2, _Count2); + return _Fallback<_Ty, _Pred>(_First1, _Count1, _First2, _Count2); } } } // namespace __std_find_last_of @@ -4675,32 +4830,62 @@ const void* __stdcall __std_find_first_of_trivial_8( __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_not_of_trivial_pos_1(const void* const _Haystack, + const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_not_of_trivial_pos_2(const void* const _Haystack, + const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_last_not_of_trivial_pos_1(const void* const _Haystack, + const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + return __std_find_last_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_last_not_of_trivial_pos_2(const void* const _Haystack, + const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + return __std_find_last_of::_Dispatch_pos( + _Haystack, _Haystack_length, _Needle, _Needle_length); } const void* __stdcall __std_search_1( diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 9ccaadf6ea3..2e5a86f1a35 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1064,12 +1064,25 @@ void test_bitset(mt19937_64& gen) { } template -void test_case_string_find_first_of(const basic_string& input_haystack, const basic_string& input_needle) { - auto expected_iter = last_known_good_find_first_of( - input_haystack.begin(), input_haystack.end(), input_needle.begin(), input_needle.end()); - auto expected = (expected_iter != input_haystack.end()) ? expected_iter - input_haystack.begin() : ptrdiff_t{-1}; - auto actual = static_cast(input_haystack.find_first_of(input_needle)); - assert(expected == actual); +size_t last_known_good_find_first_of(const basic_string& h, const basic_string& n) { + for (size_t pos = 0, pos_max = h.size(); pos != pos_max; ++pos) { + if (n.find(h[pos]) != basic_string::npos) { + return pos; + } + } + + return basic_string::npos; +} + +template +size_t last_known_good_find_first_not_of(const basic_string& h, const basic_string& n) { + for (size_t pos = 0, pos_max = h.size(); pos != pos_max; ++pos) { + if (n.find(h[pos]) == basic_string::npos) { + return pos; + } + } + + return basic_string::npos; } template @@ -1085,6 +1098,33 @@ size_t last_known_good_find_last_of(const basic_string& h, const basic_string return basic_string::npos; } +template +size_t last_known_good_find_last_not_of(const basic_string& h, const basic_string& n) { + size_t pos = h.size(); + while (pos != 0) { + --pos; + if (n.find(h[pos]) == basic_string::npos) { + return pos; + } + } + + return basic_string::npos; +} + +template +void test_case_string_find_first_of(const basic_string& input_haystack, const basic_string& input_needle) { + size_t expected = last_known_good_find_first_of(input_haystack, input_needle); + size_t actual = input_haystack.find_first_of(input_needle); + assert(expected == actual); +} + +template +void test_case_string_find_first_not_of(const basic_string& input_haystack, const basic_string& input_needle) { + size_t expected = last_known_good_find_first_not_of(input_haystack, input_needle); + size_t actual = input_haystack.find_first_not_of(input_needle); + assert(expected == actual); +} + template void test_case_string_find_last_of(const basic_string& input_haystack, const basic_string& input_needle) { size_t expected = last_known_good_find_last_of(input_haystack, input_needle); @@ -1092,6 +1132,13 @@ void test_case_string_find_last_of(const basic_string& input_haystack, const assert(expected == actual); } +template +void test_case_string_find_last_not_of(const basic_string& input_haystack, const basic_string& input_needle) { + size_t expected = last_known_good_find_last_not_of(input_haystack, input_needle); + size_t actual = input_haystack.find_last_not_of(input_needle); + assert(expected == actual); +} + template void test_case_string_find_ch(const basic_string& input_haystack, const T value) { ptrdiff_t expected; @@ -1216,6 +1263,8 @@ void test_basic_string_dis(mt19937_64& gen, D& dis) { test_case_string_find_first_of(input_haystack, input_needle); test_case_string_find_last_of(input_haystack, input_needle); + test_case_string_find_first_not_of(input_haystack, input_needle); + test_case_string_find_last_not_of(input_haystack, input_needle); test_case_string_find_str(input_haystack, input_needle); test_case_string_rfind_str(input_haystack, input_needle); @@ -1223,6 +1272,8 @@ void test_basic_string_dis(mt19937_64& gen, D& dis) { input_needle.push_back(static_cast(dis(gen))); test_case_string_find_first_of(input_haystack, input_needle); test_case_string_find_last_of(input_haystack, input_needle); + test_case_string_find_first_not_of(input_haystack, input_needle); + test_case_string_find_last_not_of(input_haystack, input_needle); test_case_string_find_str(input_haystack, input_needle); test_case_string_rfind_str(input_haystack, input_needle);