@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   builtin::Memcpy<32>::block(dst, src);
   align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   if (count < 256)
     return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
   prefetch_to_local_cache(src + kOneCacheline);
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   prefetch_to_local_cache(src + kTwoCachelines);
   // Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
   prefetch_to_local_cache(src + kOneCacheline);
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   prefetch_to_local_cache(src + kTwoCachelines);
   prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
 
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+#if defined(__AVX512F__)
+  constexpr size_t vector_size = 64;
+#elif defined(__AVX__)
+  constexpr size_t vector_size = 32;
+#elif defined(__SSE2__)
+  constexpr size_t vector_size = 16;
+#else
+  constexpr size_t vector_size = 8;
+#endif
   if (count == 0)
     return;
   if (count == 1)
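For context on the new `vector_size` constant: the preprocessor cascade picks the widest register class the target guarantees (AVX-512 registers are 64 bytes, AVX 32, SSE2 16, with an 8-byte scalar fallback). The standalone sketch below is not part of the patch; `main`, the file name, and the printout are purely illustrative, showing which width the same checks select under common x86 compiler flags.

// Sketch only: compile this file on its own to see the detected width, e.g.
//   clang++ -msse2 width_sketch.cpp             -> 16
//   clang++ -mavx2 width_sketch.cpp             -> 32
//   clang++ -march=skylake-avx512 width_sketch.cpp -> 64
#include <cstddef>
#include <cstdio>

int main() {
#if defined(__AVX512F__)
  constexpr std::size_t vector_size = 64; // 512-bit zmm registers
#elif defined(__AVX__)
  constexpr std::size_t vector_size = 32; // 256-bit ymm registers
#elif defined(__SSE2__)
  constexpr std::size_t vector_size = 16; // 128-bit xmm registers
#else
  constexpr std::size_t vector_size = 8;  // scalar 64-bit fallback
#endif
  std::printf("native vector width: %zu bytes\n", vector_size);
  return 0;
}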
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
     return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
     return builtin::Memcpy<4>::head_tail(dst, src, count);
-  if (count < 16)
+  // If count is equal to a power of 2, we can handle it as head-tail
+  // of both smaller size and larger size (head-tail are either
+  // non-overlapping for smaller size, or completely collapsed
+  // for larger size). It seems to be more profitable to do the copy
+  // with the larger size, if it's natively supported (e.g. doing
+  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+  // But it's not profitable to use larger size if it's not natively
+  // supported: we will both use more instructions and handle fewer
+  // sizes in earlier branches.
+  if (vector_size >= 16 ? count < 16 : count <= 16)
     return builtin::Memcpy<8>::head_tail(dst, src, count);
-  if (count < 32)
+  if (vector_size >= 32 ? count < 32 : count <= 32)
     return builtin::Memcpy<16>::head_tail(dst, src, count);
-  if (count < 64)
+  if (vector_size >= 64 ? count < 64 : count <= 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
   if constexpr (x86::kAvx) {
     if constexpr (x86::kUseSoftwarePrefetching) {
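The comment introduced in the hunk above leans on how head-tail copies behave at the boundaries. Below is a minimal sketch of that idea, not the library's implementation: `head_tail_sketch` is a made-up name and plain `std::memcpy` stands in for `builtin::Memcpy<N>`, which in the real code lowers to vector loads and stores. For N <= count <= 2*N, copying the first N and the last N bytes covers the whole range; the two blocks overlap when count < 2*N and coincide when count == N, which is why a power-of-2 count can be served by either the smaller or the larger specialization.

// Sketch only, assuming distinct src/dst buffers as memcpy requires.
#include <cstddef>
#include <cstring>

template <std::size_t N>
void head_tail_sketch(char *dst, const char *src, std::size_t count) {
  // Valid for N <= count <= 2 * N.
  std::memcpy(dst, src, N);                         // head block
  std::memcpy(dst + count - N, src + count - N, N); // tail block: overlaps the
                                                    // head, or coincides with
                                                    // it when count == N
}

For example, with count == 32, head_tail_sketch<32> collapses into two copies of the same 32 bytes, while head_tail_sketch<16> issues two adjacent 16-byte copies; the ternaries in the patch send a power-of-2 count to the larger specialization only when the hardware has registers of that width.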