Skip to content

Commit d275277

Browse files
authored
[libc] Optimize memcpy size thresholds (#70049)
Adjust boundary conditions for sizes = 16/32/64. See the added comment for explanations.

Results on a machine with AVX2, so sizes 64/128 are affected:

```
                 │  baseline   │              adjusted               │
                 │   sec/op    │   sec/op     vs base                │
memcpy/Google_A    5.701n ± 0%   5.551n ± 1%   -2.63% (n=100)
memcpy/Google_B    3.817n ± 0%   3.776n ± 0%   -1.07% (p=0.000 n=100)
memcpy/Google_D    11.35n ± 1%   11.32n ± 0%        ~ (p=0.066 n=100)
memcpy/Google_U    3.874n ± 1%   3.821n ± 1%   -1.37% (p=0.001 n=100)
memcpy/64          3.843n ± 0%   3.105n ± 3%  -19.22% (n=50)
memcpy/128         4.842n ± 0%   3.818n ± 0%  -21.15% (p=0.000 n=50)
```
1 parent a0cd626 commit d275277

File tree

1 file changed

+25
-7
lines changed

1 file changed

+25
-7
lines changed

libc/src/string/memory_utils/x86_64/inline_memcpy.h

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
5555
[[maybe_unused]] LIBC_INLINE void
5656
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
5757
size_t count) {
58-
if (count < 128)
58+
if (count <= 128)
5959
return builtin::Memcpy<64>::head_tail(dst, src, count);
6060
builtin::Memcpy<32>::block(dst, src);
6161
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
6565
[[maybe_unused]] LIBC_INLINE void
6666
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
6767
size_t count) {
68-
if (count < 128)
68+
if (count <= 128)
6969
return builtin::Memcpy<64>::head_tail(dst, src, count);
7070
if (count < 256)
7171
return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
7979
CPtr __restrict src, size_t count) {
8080
using namespace LIBC_NAMESPACE::x86;
8181
prefetch_to_local_cache(src + kOneCacheline);
82-
if (count < 128)
82+
if (count <= 128)
8383
return builtin::Memcpy<64>::head_tail(dst, src, count);
8484
prefetch_to_local_cache(src + kTwoCachelines);
8585
// Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
120120
CPtr __restrict src, size_t count) {
121121
using namespace LIBC_NAMESPACE::x86;
122122
prefetch_to_local_cache(src + kOneCacheline);
123-
if (count < 128)
123+
if (count <= 128)
124124
return builtin::Memcpy<64>::head_tail(dst, src, count);
125125
prefetch_to_local_cache(src + kTwoCachelines);
126126
prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
149149

150150
[[maybe_unused]] LIBC_INLINE void
151151
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
152+
#if defined(__AVX512F__)
153+
constexpr size_t vector_size = 64;
154+
#elif defined(__AVX__)
155+
constexpr size_t vector_size = 32;
156+
#elif defined(__SSE2__)
157+
constexpr size_t vector_size = 16;
158+
#else
159+
constexpr size_t vector_size = 8;
160+
#endif
152161
if (count == 0)
153162
return;
154163
if (count == 1)
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
161170
return builtin::Memcpy<4>::block(dst, src);
162171
if (count < 8)
163172
return builtin::Memcpy<4>::head_tail(dst, src, count);
164-
if (count < 16)
173+
// If count is equal to a power of 2, we can handle it as head-tail
174+
// of both smaller size and larger size (head-tail are either
175+
// non-overlapping for smaller size, or completely collapsed
176+
// for larger size). It seems to be more profitable to do the copy
177+
// with the larger size, if it's natively supported (e.g. doing
178+
// 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
179+
// But it's not profitable to use larger size if it's not natively
180+
// supported: we will both use more instructions and handle fewer
181+
// sizes in earlier branches.
182+
if (vector_size >= 16 ? count < 16 : count <= 16)
165183
return builtin::Memcpy<8>::head_tail(dst, src, count);
166-
if (count < 32)
184+
if (vector_size >= 32 ? count < 32 : count <= 32)
167185
return builtin::Memcpy<16>::head_tail(dst, src, count);
168-
if (count < 64)
186+
if (vector_size >= 64 ? count < 64 : count <= 64)
169187
return builtin::Memcpy<32>::head_tail(dst, src, count);
170188
if constexpr (x86::kAvx) {
171189
if constexpr (x86::kUseSoftwarePrefetching) {

0 commit comments

Comments
 (0)