Skip to content

Vectorize search for 32-bit and 64-bit elements, also improve 8-bit and 16-bit vectorization #5484

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 17, 2025
Merged
17 changes: 15 additions & 2 deletions benchmarks/src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ constexpr data_and_pattern patterns[] = {
// std::basic_string specialization that keeps the default std::char_traits<T> but allocates
// through not_highly_aligned_allocator<T> (declared outside this excerpt; presumably it hands
// out storage that is deliberately not highly aligned - confirm against its definition).
template <class T>
using not_highly_aligned_basic_string = std::basic_string<T, std::char_traits<T>, not_highly_aligned_allocator<T>>;

// Concrete string types for char, wchar_t and char32_t elements.
// NOTE(review): the char and wchar_t aliases appear twice; this looks like the removed and the
// added side of the diff hunk shown together, with only whitespace alignment differing - confirm.
using not_highly_aligned_string = not_highly_aligned_basic_string<char>;
using not_highly_aligned_wstring = not_highly_aligned_basic_string<wchar_t>;
using not_highly_aligned_string = not_highly_aligned_basic_string<char>;
using not_highly_aligned_wstring = not_highly_aligned_basic_string<wchar_t>;
using not_highly_aligned_u32string = not_highly_aligned_basic_string<char32_t>;

void c_strstr(benchmark::State& state) {
const auto& src_haystack = patterns[static_cast<size_t>(state.range())].data;
Expand Down Expand Up @@ -190,23 +191,35 @@ BENCHMARK(c_strstr)->Apply(common_args);

// Benchmark registrations. Every entry below is configured through common_args.

// classic_search over 1-, 2-, 4- and 8-byte unsigned elements.
BENCHMARK(classic_search<std::uint8_t>)->Apply(common_args);
BENCHMARK(classic_search<std::uint16_t>)->Apply(common_args);
BENCHMARK(classic_search<std::uint32_t>)->Apply(common_args);
BENCHMARK(classic_search<std::uint64_t>)->Apply(common_args);

// ranges_search over the same four element widths.
BENCHMARK(ranges_search<std::uint8_t>)->Apply(common_args);
BENCHMARK(ranges_search<std::uint16_t>)->Apply(common_args);
BENCHMARK(ranges_search<std::uint32_t>)->Apply(common_args);
BENCHMARK(ranges_search<std::uint64_t>)->Apply(common_args);

// search_default_searcher over the same four element widths.
BENCHMARK(search_default_searcher<std::uint8_t>)->Apply(common_args);
BENCHMARK(search_default_searcher<std::uint16_t>)->Apply(common_args);
BENCHMARK(search_default_searcher<std::uint32_t>)->Apply(common_args);
BENCHMARK(search_default_searcher<std::uint64_t>)->Apply(common_args);

// member_find over the char, wchar_t and char32_t string aliases.
BENCHMARK(member_find<not_highly_aligned_string>)->Apply(common_args);
BENCHMARK(member_find<not_highly_aligned_wstring>)->Apply(common_args);
BENCHMARK(member_find<not_highly_aligned_u32string>)->Apply(common_args);

// classic_find_end over 1-, 2-, 4- and 8-byte unsigned elements.
BENCHMARK(classic_find_end<std::uint8_t>)->Apply(common_args);
BENCHMARK(classic_find_end<std::uint16_t>)->Apply(common_args);
BENCHMARK(classic_find_end<std::uint32_t>)->Apply(common_args);
BENCHMARK(classic_find_end<std::uint64_t>)->Apply(common_args);

// ranges_find_end over the same four element widths.
BENCHMARK(ranges_find_end<std::uint8_t>)->Apply(common_args);
BENCHMARK(ranges_find_end<std::uint16_t>)->Apply(common_args);
BENCHMARK(ranges_find_end<std::uint32_t>)->Apply(common_args);
BENCHMARK(ranges_find_end<std::uint64_t>)->Apply(common_args);

// member_rfind over the char, wchar_t and char32_t string aliases.
BENCHMARK(member_rfind<not_highly_aligned_string>)->Apply(common_args);
BENCHMARK(member_rfind<not_highly_aligned_wstring>)->Apply(common_args);
BENCHMARK(member_rfind<not_highly_aligned_u32string>)->Apply(common_args);

// Google Benchmark macro that supplies the program's main().
BENCHMARK_MAIN();
4 changes: 2 additions & 2 deletions stl/inc/__msvc_string_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ constexpr size_t _Traits_find(_In_reads_(_Hay_size) const _Traits_ptr_t<_Traits>
}

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Is_implementation_handled_char_traits<_Traits> && sizeof(typename _Traits::char_type) <= 2) {
if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
if (!_STD _Is_constant_evaluated()) {
const auto _End = _Haystack + _Hay_size;
const auto _Ptr = _STD _Search_vectorized(_Haystack + _Start_at, _End, _Needle, _Needle_size);
Expand Down Expand Up @@ -808,7 +808,7 @@ constexpr size_t _Traits_rfind(_In_reads_(_Hay_size) const _Traits_ptr_t<_Traits
const size_t _Actual_start_at = (_STD min)(_Start_at, _Hay_size - _Needle_size);

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Is_implementation_handled_char_traits<_Traits> && sizeof(typename _Traits::char_type) <= 2) {
if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
if (!_STD _Is_constant_evaluated()) {
// _Find_end_vectorized takes into account the needle length when locating the search start.
// As a potentially earlier start position can be specified, we need to take it into account,
Expand Down
20 changes: 18 additions & 2 deletions stl/inc/xutility
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,20 @@ const void* __stdcall __std_search_1(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// __std_search_N: type-erased search entry points taking the haystack range [_First1, _Last1),
// the needle start _First2 and the needle length _Count2. The numeric suffix is the element
// size in bytes: _Search_vectorized selects the function whose suffix equals sizeof(_Ty1).
// NOTE(review): the meaning of the returned pointer (match position vs. _Last1 on failure) is
// not visible in these declarations - confirm against the implementation.
const void* __stdcall __std_search_2(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// 4-byte element variant.
const void* __stdcall __std_search_4(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// 8-byte element variant.
const void* __stdcall __std_search_8(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;

// __std_find_end_N: type-erased find_end entry points with the same parameter list as the
// __std_search_N family: haystack range [_First1, _Last1), needle start _First2, needle length
// _Count2. The numeric suffix is the element size in bytes: _Find_end_vectorized selects the
// function whose suffix equals sizeof(_Ty1).
// NOTE(review): the meaning of the returned pointer is not visible here - confirm against the
// implementation.
const void* __stdcall __std_find_end_1(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// 2-byte element variant.
const void* __stdcall __std_find_end_2(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// 4-byte element variant.
const void* __stdcall __std_find_end_4(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
// 8-byte element variant.
const void* __stdcall __std_find_end_8(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;


const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept;
const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept;
Expand Down Expand Up @@ -270,6 +279,10 @@ _Ty1* _Search_vectorized(_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _F
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_search_1(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 2) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_search_2(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 4) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_search_4(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 8) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_search_8(_First1, _Last1, _First2, _Count2)));
} else {
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
}
Expand All @@ -283,6 +296,10 @@ _Ty1* _Find_end_vectorized(
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_1(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 2) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_2(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 4) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_4(_First1, _Last1, _First2, _Count2)));
} else if constexpr (sizeof(_Ty1) == 8) {
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_8(_First1, _Last1, _First2, _Count2)));
} else {
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
}
Expand Down Expand Up @@ -5502,8 +5519,7 @@ constexpr bool _Equal_memcmp_is_safe =

// Can we activate the vector algorithms for std::search?
// NOTE(review): two definitions follow. The hunk header reports 8 old lines vs. 7 new lines, so
// the first (two-line) form is presumably the removed side and the last (one-line) form the
// added side, which drops the element-size restriction - confirm against the actual diff.
template <class _It1, class _It2, class _Pr>
constexpr bool _Vector_alg_in_search_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr> // can search bitwise
&& sizeof(_Iter_value_t<_It1>) <= 2; // pcmpestri compatible element size
// Form without the size limit: safe whenever the elements can be compared bitwise.
constexpr bool _Vector_alg_in_search_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr>;

template <class _CtgIt1, class _CtgIt2>
_NODISCARD int _Memcmp_count(_CtgIt1 _First1, _CtgIt2 _First2, const size_t _Count) {
Expand Down
Loading