/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# define MEMCPY		__memcpy_sse2_unaligned
# define MEMCPY_CHK	__memcpy_chk_sse2_unaligned

# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4

# define CFI_PUSH(REG) \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
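
/* RETURN is used for exits in the middle of the function: it pops
   %ebx and returns, then re-issues CFI_PUSH because CFI directives
   are positional and the code that follows such an exit still has
   %ebx saved on the stack.  */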

	.section .text.sse2,"ax",@progbits

# if defined SHARED && IS_IN (libc)
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
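
/* Throughout the function: %eax = source, %edx = destination,
   %ecx = length in bytes.  */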

# ifdef USE_AS_MEMMOVE
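/* memmove must tolerate overlap: when the destination lies above the
   source the copy proceeds from the end backward, otherwise forward.
   The backward cases come first below.  */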

L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %ecx
	ja	L(mm_len_32_or_more_backward)
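
/* Each size class below loads chunks from both ends of the buffer
   before storing anything; the chunks may overlap in the middle, so
   any length in the range is covered without a byte loop.  */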

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):

/* Aligning the address of the destination.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
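
/* %xmm4-%xmm7 now hold the first 64 bytes of the source and %xmm0
   its last 16, so the aligned loop below may overwrite those regions
   freely; %esi marks the end of the destination.  */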
	leal	(%eax, %ebx), %eax

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_backward)
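
/* Main backward loop: 64 bytes per iteration, unaligned loads and
   aligned stores, walking from high addresses to low.  Copies of at
   least half the shared cache size were routed above to the
   non-temporal variant so they do not evict the entire cache.  */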

L(mm_main_loop_backward):
	prefetcht0 -128(%eax)
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	jnz	L(mm_main_loop_backward)
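
/* Write back the saved head and tail; these stores also cover the
   bytes the aligned loop did not reach.  */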
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	jnz	L(mm_len_5_8_bytes_backward)
	test	%ecx, %ecx
	jz	L(mm_return)
	testb	$2, %cl
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	POP (%esi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

/* Big length copy backward part.  */
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	jnz	L(mm_large_page_loop_backward)
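
/* movntdq stores are weakly ordered, so the copy must execute an
   sfence before returning to make them globally visible.  */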
	sfence
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_or_more_forward):

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):

/* Aligning the address of the destination.  */
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
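
/* %xmm4-%xmm7 hold the last 64 bytes of the source so the aligned
   loop below may overwrite that region freely; %esi marks the end of
   the destination.  */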

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)
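
/* Same cache-size heuristic as in the backward path: copies of at
   least half the shared cache take the non-temporal loop.  */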

L(mm_main_loop_forward):
	prefetcht0 128(%eax)
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movaps	%xmm0, (%ecx)
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	jnz	L(mm_main_loop_forward)
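
/* Write back the saved last 64 source bytes to the end of the
   destination; they also cover whatever the loop left over.  */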
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	jne	L(mm_len_5_8_bytes_forward)
	test	%ecx, %ecx
	jz	L(mm_return)
	testb	$2, %cl
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_return_pop_all):
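/* Restore the registers pushed on the 128-or-more paths and return
   DEST in %eax, since memmove returns its first argument.  */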

/* Big length copy forward part.  */
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	jmp	L(mm_return_pop_all)

# else

	cmpl	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	ja	L(large_page)
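
/* 17..128 bytes: copy 16-byte chunks from both ends, widening on
   each pass and returning as soon as the length is covered; the
   chunks may overlap in the middle.  */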

	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  */

/* We should stop two iterations before the termination
   (in order not to misprefetch).  */
	je	L(main_loop_just_one_iteration)

	je	L(main_loop_last_two_iterations)
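
/* The cached loop prefetches 128 bytes ahead of its loads, so it has
   to stop early; the final one or two 64-byte blocks are copied by
   the unrolled tails below.  */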

L(main_loop_cache):
	prefetcht0 128(%ebx, %eax)
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jne	L(main_loop_cache)
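
/* %ebx walks the aligned destination while %eax holds the distance
   from destination to source, so (%ebx, %eax) addresses the matching
   source bytes; one register update advances both streams.  */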

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	RETURN

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	RETURN

L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
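
/* For very large copies the first and last 128 bytes are copied
   unaligned up front; the aligned middle then goes through
   non-temporal stores so the copy does not pollute the cache.  */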
	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores.  We align
   the address of the destination.  */
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	jne	L(main_loop_large_page)
	sfence
	RETURN

L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
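
/* The remaining small cases copy from both ends of the buffer with
   the widest moves that fit: words, dwords, then 8-byte chunks.  */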

	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)

L(len_9_16_bytes):
	movq	-8(%eax, %ecx), %xmm1
	movq	(%eax), %xmm0
	movq	%xmm1, -8(%edx, %ecx)
	movq	%xmm0, (%edx)
	RETURN

	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)

# ifdef USE_AS_MEMPCPY
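/* mempcpy returns DEST + LEN, a pointer one past the last byte
   written, rather than DEST.  */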