/* memmove/memcpy/mempcpy optimized for aligned access with SSSE3.
   All versions must be listed in ifunc-impl-list.c.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (2)
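
/* Built only when the runtime dispatcher may still select an ISA
   level 2 (SSSE3) implementation; builds with a higher baseline ISA
   level omit this file entirely.  */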

# define MEMMOVE	__memmove_ssse3
# define MEMMOVE_CHK	__memmove_chk_ssse3
# define MEMCPY	__memcpy_ssse3
# define MEMCPY_CHK	__memcpy_chk_ssse3
# define MEMPCPY	__mempcpy_ssse3
# define MEMPCPY_CHK	__mempcpy_chk_ssse3

	.section .text.ssse3, "ax", @progbits
	jb	HIDDEN_JUMPTARGET(__chk_fail)

	jb	HIDDEN_JUMPTARGET(__chk_fail)

ENTRY_P2ALIGN(MEMMOVE, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif

	/* These loads are always useful.  */
	movups	0(%rsi), %xmm0
	movups	-16(%rsi, %rdx), %xmm7

	movups	%xmm0, 0(%rdi)
	movups	%xmm7, -16(%rdi, %rdx)
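
	/* Sizes in [16, 32]: the first and last 16 bytes cover the
	   whole range (possibly overlapping in the middle), and both
	   loads were issued before either store, so overlapping
	   buffers are handled correctly.  */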

	movq	-8(%rsi, %rdx), %rsi
	movq	%rsi, -8(%rdi, %rdx)

	movl	-4(%rsi, %rdx), %esi
	movl	%esi, -4(%rdi, %rdx)

	movzwl	-1(%rsi, %rdx), %esi
	movw	%si, -1(%rdi, %rdx)
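
	/* The smaller tails follow the same pattern: the (possibly
	   overlapping) head and tail pieces are loaded before either
	   store, keeping the copies safe for overlap.  */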

	movups	16(%rsi), %xmm1
	movups	-32(%rsi, %rdx), %xmm2

	movups	%xmm0, 0(%rdi)
	movups	%xmm1, 16(%rdi)
	movups	%xmm2, -32(%rdi, %rdx)
	movups	%xmm7, -16(%rdi, %rdx)
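
	/* Sizes in (32, 64]: the first 32 bytes (xmm0/xmm1) and the
	   last 32 (xmm2/xmm7) cover the whole range, again with every
	   load completed before the first store.  */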

	/* We use rcx later to get the alignr value.  */

	/* Copy backward when dst > src and the ranges overlap; this is
	   what makes memmove safe.  */

	/* -16(%rsi, %rdx) is already loaded into xmm7.  */
	movups	-32(%rsi, %rdx), %xmm8
	movups	-48(%rsi, %rdx), %xmm9
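
	/* Keep the last 48 bytes (plus xmm7) in registers across the
	   forward loop: for overlapping buffers the loop may overwrite
	   these bytes in memory before they are needed.  */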

	/* Get misalignment.  */

	/* Get the first vec for `palignr`.  */

	/* We have loaded (%rsi) so it is safe to do this store before
	   the loop.  */
	movups	%xmm0, (%rdi)
# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
# else
	cmp	__x86_shared_cache_size_half(%rip), %rdx
# endif
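
	/* Copies larger than half of the shared cache take the
	   non-temporal-store path (L(large_loop_fwd_start)) so that a
	   huge copy does not flush the entire cache.  */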

	leaq	-64(%rdi, %rdx), %r8

	leaq	L(loop_fwd_start)(%rip), %r9

	/* (%rsi) is already loaded into xmm0.  */
	movups	16(%rsi), %xmm4
	movups	32(%rsi), %xmm5

	leaq	-49(%rdi, %rdx), %rdi

	movaps	48(%rsi), %xmm6

	leaq	L(loop_bkwd_start)(%rip), %r9

	movups	-64(%r9, %rdx), %xmm10
	movups	-80(%r9, %rdx), %xmm11

	leal	(%rcx, %rcx, 2), %r8d
	leaq	-96(%rdi, %rdx), %rcx

	leaq	L(large_loop_fwd_start)(%rip), %rdx
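
	/* The large loops below are laid out at 96-byte intervals, so
	   the dispatch offset is align * 96: r8d = 3 * align from the
	   leal above, presumably scaled by a further 32 when the jump
	   target is formed.  */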

	/* Instead of a typical jump table, all 16 loops are exactly 64
	   bytes in size, so we can just jump to the first loop + r8 *
	   64.  Before modifying any loop, ensure all their sizes still
	   match!  */
L(loop_fwd_start):
L(loop_fwd_0x0):
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	%xmm1, 16(%rdi)
	movaps	%xmm2, 32(%rdi)
	movaps	%xmm3, 48(%rdi)

L(end_loop_fwd):
	movups	%xmm9, 16(%r8)
	movups	%xmm8, 32(%r8)
	movups	%xmm7, 48(%r8)
	ret
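
	/* Forward `palignr` loops: the destination is written with
	   aligned stores while the misaligned source is realigned in
	   registers.  Each iteration loads three aligned source
	   vectors, forms each output vector from two adjacent source
	   vectors with `palignr $align_by`, and carries the last
	   source vector in xmm1 into the next iteration.  */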

	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
	   60 bytes otherwise.  */
# define ALIGNED_LOOP_FWD(align_by);	\
	.p2align 6;	\
L(loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	%xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm4, %xmm1;	\
	movaps	%xmm0, 16(%rdi);	\
	movaps	%xmm2, 32(%rdi);	\
	movaps	%xmm3, 48(%rdi);	\
	addq	$48, %rsi;	\
	addq	$48, %rdi;	\
	cmpq	%rdi, %r8;	\
	ja	L(loop_fwd_ ## align_by);	\
	jmp	L(end_loop_fwd);

	/* Must be in descending order, so that the loop for shift `a`
	   lands at L(loop_fwd_start) + (0x10 - a) * 64 (the aligned
	   0x0 loop sits at offset 0).  */
ALIGNED_LOOP_FWD (0xf)
ALIGNED_LOOP_FWD (0xe)
ALIGNED_LOOP_FWD (0xd)
ALIGNED_LOOP_FWD (0xc)
ALIGNED_LOOP_FWD (0xb)
ALIGNED_LOOP_FWD (0xa)
ALIGNED_LOOP_FWD (0x9)
ALIGNED_LOOP_FWD (0x8)
ALIGNED_LOOP_FWD (0x7)
ALIGNED_LOOP_FWD (0x6)
ALIGNED_LOOP_FWD (0x5)
ALIGNED_LOOP_FWD (0x4)
ALIGNED_LOOP_FWD (0x3)
ALIGNED_LOOP_FWD (0x2)
ALIGNED_LOOP_FWD (0x1)

L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	64(%rsi), %xmm4
	movaps	80(%rsi), %xmm5
	movntps	%xmm1, 16(%rdi)
	movntps	%xmm2, 32(%rdi)
	movntps	%xmm3, 48(%rdi)
	movntps	%xmm4, 64(%rdi)
	movntps	%xmm5, 80(%rdi)
	addq	$80, %rsi
	addq	$80, %rdi
	cmpq	%rdi, %rcx
	ja	L(large_loop_fwd_0x0)
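
	/* The movntps stores above bypass the cache; the sfence below
	   orders them before the trailing movups stores.  */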

	/* Ensure no icache line split on tail.  */
	.p2align 4
L(end_large_loop_fwd):
	sfence
	movups	%xmm11, 16(%rcx)
	movups	%xmm10, 32(%rcx)
	movups	%xmm9, 48(%rcx)
	movups	%xmm8, 64(%rcx)
	movups	%xmm7, 80(%rcx)
	ret

	/* Each of these loops is > 64 and <= 96 bytes long; aligning
	   them to 32 bytes therefore gives exactly 96-byte spacing
	   between them.  */
# define ALIGNED_LARGE_LOOP_FWD(align_by);	\
	.p2align 5;	\
L(large_loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	64(%rsi), %xmm4;	\
	movaps	80(%rsi), %xmm5;	\
	movaps	%xmm5, %xmm6;	\
	palignr	$align_by, %xmm4, %xmm5;	\
	palignr	$align_by, %xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm6, %xmm1;	\
	movntps	%xmm0, 16(%rdi);	\
	movntps	%xmm2, 32(%rdi);	\
	movntps	%xmm3, 48(%rdi);	\
	movntps	%xmm4, 64(%rdi);	\
	movntps	%xmm5, 80(%rdi);	\
	addq	$80, %rsi;	\
	addq	$80, %rdi;	\
	cmpq	%rdi, %rcx;	\
	ja	L(large_loop_fwd_ ## align_by);	\
	jmp	L(end_large_loop_fwd);

	/* Must be in descending order.  */
ALIGNED_LARGE_LOOP_FWD (0xf)
ALIGNED_LARGE_LOOP_FWD (0xe)
ALIGNED_LARGE_LOOP_FWD (0xd)
ALIGNED_LARGE_LOOP_FWD (0xc)
ALIGNED_LARGE_LOOP_FWD (0xb)
ALIGNED_LARGE_LOOP_FWD (0xa)
ALIGNED_LARGE_LOOP_FWD (0x9)
ALIGNED_LARGE_LOOP_FWD (0x8)
ALIGNED_LARGE_LOOP_FWD (0x7)
ALIGNED_LARGE_LOOP_FWD (0x6)
ALIGNED_LARGE_LOOP_FWD (0x5)
ALIGNED_LARGE_LOOP_FWD (0x4)
ALIGNED_LARGE_LOOP_FWD (0x3)
ALIGNED_LARGE_LOOP_FWD (0x2)
ALIGNED_LARGE_LOOP_FWD (0x1)

L(loop_bkwd_start):
L(loop_bkwd_0x0):
	movaps	32(%rsi), %xmm1
	movaps	16(%rsi), %xmm2
	movaps	0(%rsi), %xmm3
	movaps	%xmm1, 32(%rdi)
	movaps	%xmm2, 16(%rdi)
	movaps	%xmm3, 0(%rdi)
	subq	$48, %rsi
	subq	$48, %rdi
	cmpq	%rdi, %r8
	jb	L(loop_bkwd_0x0)

L(end_loop_bkwd):
	movups	%xmm7, -16(%r8, %rdx)
	movups	%xmm0, (%r8)
	movups	%xmm4, 16(%r8)
	movups	%xmm5, 32(%r8)
	ret
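
	/* The first 48 bytes (xmm0/xmm4/xmm5) and the last 16 (xmm7)
	   were loaded before the loop and are stored last: a backward
	   overlapping copy may already have clobbered them in
	   memory.  */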

	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
	   60 bytes otherwise.  */
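
	/* Backward `palignr` loops: the mirror image of
	   ALIGNED_LOOP_FWD.  Vectors are loaded from high to low
	   addresses, and xmm6 carries the lowest source vector of the
	   previous (higher-addressed) block into the next
	   iteration.  */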
# define ALIGNED_LOOP_BKWD(align_by);	\
	.p2align 6;	\
L(loop_bkwd_ ## align_by):	\
	movaps	32(%rsi), %xmm1;	\
	movaps	16(%rsi), %xmm2;	\
	movaps	0(%rsi), %xmm3;	\
	palignr	$align_by, %xmm1, %xmm6;	\
	palignr	$align_by, %xmm2, %xmm1;	\
	palignr	$align_by, %xmm3, %xmm2;	\
	movaps	%xmm6, 32(%rdi);	\
	movaps	%xmm1, 16(%rdi);	\
	movaps	%xmm2, 0(%rdi);	\
	subq	$48, %rsi;	\
	subq	$48, %rdi;	\
	movaps	%xmm3, %xmm6;	\
	cmpq	%rdi, %r8;	\
	jb	L(loop_bkwd_ ## align_by);	\
	jmp	L(end_loop_bkwd);

	/* Must be in descending order.  */
ALIGNED_LOOP_BKWD (0xf)
ALIGNED_LOOP_BKWD (0xe)
ALIGNED_LOOP_BKWD (0xd)
ALIGNED_LOOP_BKWD (0xc)
ALIGNED_LOOP_BKWD (0xb)
ALIGNED_LOOP_BKWD (0xa)
ALIGNED_LOOP_BKWD (0x9)
ALIGNED_LOOP_BKWD (0x8)
ALIGNED_LOOP_BKWD (0x7)
ALIGNED_LOOP_BKWD (0x6)
ALIGNED_LOOP_BKWD (0x5)
ALIGNED_LOOP_BKWD (0x4)
ALIGNED_LOOP_BKWD (0x3)
ALIGNED_LOOP_BKWD (0x2)
ALIGNED_LOOP_BKWD (0x1)

END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
#endif