1 /* memcpy with SSSE3 and REP string.
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
23 || defined USE_AS_MEMMOVE \
24 || !defined USE_MULTIARCH)
26 #include "asm-syntax.h"
29 # define MEMCPY __memcpy_ssse3_rep
30 # define MEMCPY_CHK __memcpy_chk_ssse3_rep
/* Unwind annotations that accompany a 4-byte push of REG: grow the CFA
   offset by 4 and record REG as saved at relative offset 0.  The cfi_*
   helpers are defined elsewhere; presumably they wrap the assembler's
   .cfi_adjust_cfa_offset / .cfi_rel_offset directives -- confirm.  */
37 #define CFI_PUSH(REG) \
38 cfi_adjust_cfa_offset (4); \
39 cfi_rel_offset (REG, 0)
41 #define CFI_POP(REG) \
42 cfi_adjust_cfa_offset (-4); \
/* PUSH/POP: save or restore REG with pushl/popl and emit the matching
   CFI_PUSH/CFI_POP annotation immediately afterwards, so the unwind
   information is updated together with the stack.  */
45 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
46 #define POP(REG) popl REG; CFI_POP (REG)
49 # define PARMS 8 /* Preserve EBX. */
50 # define ENTRANCE PUSH (%ebx);
51 # define RETURN_END POP (%ebx); ret
52 # define RETURN RETURN_END; CFI_PUSH (%ebx)
53 # define JMPTBL(I, B) I - B
55 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
56 jump table with relative offsets. INDEX is a register that contains the
57 index into the jump table. SCALE is the scale of INDEX. */
58 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
59 /* We first load PC into EBX. */ \
61 /* Get the address of the jump table. */ \
62 addl $(TABLE - .), %ebx; \
63 /* Get the entry and convert the relative offset to the \
64 absolute address. */ \
65 addl (%ebx,INDEX,SCALE), %ebx; \
66 /* We loaded the jump table. Go. */ \
/* First half of a split jump-table branch: add the distance from the
   current location to TABLE into EBX.  Presumably EBX already holds the
   current PC when this is used, so that EBX ends up holding the address
   of TABLE -- confirm at the call sites.  */
69 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
70 addl $(TABLE - .), %ebx
72 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
73 addl (%ebx,INDEX,SCALE), %ebx; \
74 /* We loaded the jump table. Go. */ \
79 # define RETURN_END ret
80 # define RETURN RETURN_END
81 # define JMPTBL(I, B) I
83 /* Branch to an entry in a jump table. TABLE is a jump table with
84 absolute offsets. INDEX is a register that contains the index into the
85 jump table. SCALE is the scale of INDEX. */
86 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
87 jmp *TABLE(,INDEX,SCALE)
/* In this variant the table base does not have to be computed into a
   register beforehand, so the VALUE step expands to nothing.  */
89 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
/* The TAIL step performs the whole dispatch: an indirect jump through
   the TABLE entry selected by INDEX * SCALE.  */
91 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
92 jmp *TABLE(,INDEX,SCALE)
95 .section .text.ssse3,"ax",@progbits
100 jb HIDDEN_JUMPTARGET (__chk_fail)
107 movl DEST(%esp), %edx
109 #ifdef USE_AS_MEMMOVE
112 je L(fwd_write_0bytes)
114 jb L(bk_write_less48bytes)
125 L(fwd_write_less32bytes):
126 #ifndef USE_AS_MEMMOVE
132 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
133 #ifndef USE_AS_MEMMOVE
135 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
139 /* ECX > 32 and EDX is 4 byte aligned. */
153 #ifdef SHARED_CACHE_SIZE_HALF
154 cmp $SHARED_CACHE_SIZE_HALF, %ecx
158 add $_GLOBAL_OFFSET_TABLE_, %ebx
159 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
161 cmp __x86_shared_cache_size_half, %ecx
170 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
180 movdqa (%eax, %edi), %xmm0
181 movdqa 16(%eax, %edi), %xmm1
183 movdqa %xmm0, (%edx, %edi)
184 movdqa %xmm1, 16(%edx, %edi)
188 movdqa (%eax, %edi), %xmm0
189 movdqa 16(%eax, %edi), %xmm1
191 movdqa %xmm0, (%edx, %edi)
192 movdqa %xmm1, 16(%edx, %edi)
196 movdqa (%eax, %edi), %xmm0
197 movdqa 16(%eax, %edi), %xmm1
199 movdqa %xmm0, (%edx, %edi)
200 movdqa %xmm1, 16(%edx, %edi)
204 movdqa (%eax, %edi), %xmm0
205 movdqa 16(%eax, %edi), %xmm1
207 movdqa %xmm0, (%edx, %edi)
208 movdqa %xmm1, 16(%edx, %edi)
217 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
223 #ifdef DATA_CACHE_SIZE_HALF
224 cmp $DATA_CACHE_SIZE_HALF, %ecx
228 add $_GLOBAL_OFFSET_TABLE_, %ebx
229 mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
231 mov __x86_data_cache_size_half, %edi
238 jae L(shl_0_gobble_mem_start)
241 L(shl_0_gobble_cache_loop):
243 movaps 0x10(%eax), %xmm1
244 movaps 0x20(%eax), %xmm2
245 movaps 0x30(%eax), %xmm3
246 movaps 0x40(%eax), %xmm4
247 movaps 0x50(%eax), %xmm5
248 movaps 0x60(%eax), %xmm6
249 movaps 0x70(%eax), %xmm7
253 movaps %xmm1, 0x10(%edx)
254 movaps %xmm2, 0x20(%edx)
255 movaps %xmm3, 0x30(%edx)
256 movaps %xmm4, 0x40(%edx)
257 movaps %xmm5, 0x50(%edx)
258 movaps %xmm6, 0x60(%edx)
259 movaps %xmm7, 0x70(%edx)
262 jae L(shl_0_gobble_cache_loop)
265 jb L(shl_0_cache_less_64bytes)
269 movdqa 0x10(%eax), %xmm1
272 movdqa %xmm1, 0x10(%edx)
274 movdqa 0x20(%eax), %xmm0
275 movdqa 0x30(%eax), %xmm1
278 movdqa %xmm0, 0x20(%edx)
279 movdqa %xmm1, 0x30(%edx)
281 L(shl_0_cache_less_64bytes):
283 jb L(shl_0_cache_less_32bytes)
286 movdqa 0x10(%eax), %xmm1
289 movdqa %xmm1, 0x10(%edx)
291 L(shl_0_cache_less_32bytes):
293 jb L(shl_0_cache_less_16bytes)
299 L(shl_0_cache_less_16bytes):
304 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
309 L(shl_0_gobble_mem_start):
311 je L(copy_page_by_rep)
313 L(shl_0_gobble_mem_loop):
314 prefetchnta 0x1c0(%eax)
315 prefetchnta 0x280(%eax)
316 prefetchnta 0x1c0(%edx)
317 prefetchnta 0x280(%edx)
320 movaps 0x10(%eax), %xmm1
321 movaps 0x20(%eax), %xmm2
322 movaps 0x30(%eax), %xmm3
323 movaps 0x40(%eax), %xmm4
324 movaps 0x50(%eax), %xmm5
325 movaps 0x60(%eax), %xmm6
326 movaps 0x70(%eax), %xmm7
330 movaps %xmm1, 0x10(%edx)
331 movaps %xmm2, 0x20(%edx)
332 movaps %xmm3, 0x30(%edx)
333 movaps %xmm4, 0x40(%edx)
334 movaps %xmm5, 0x50(%edx)
335 movaps %xmm6, 0x60(%edx)
336 movaps %xmm7, 0x70(%edx)
339 jae L(shl_0_gobble_mem_loop)
342 jb L(shl_0_mem_less_64bytes)
346 movdqa 0x10(%eax), %xmm1
349 movdqa %xmm1, 0x10(%edx)
351 movdqa 0x20(%eax), %xmm0
352 movdqa 0x30(%eax), %xmm1
355 movdqa %xmm0, 0x20(%edx)
356 movdqa %xmm1, 0x30(%edx)
358 L(shl_0_mem_less_64bytes):
360 jb L(shl_0_mem_less_32bytes)
363 movdqa 0x10(%eax), %xmm1
366 movdqa %xmm1, 0x10(%edx)
368 L(shl_0_mem_less_32bytes):
370 jb L(shl_0_mem_less_16bytes)
376 L(shl_0_mem_less_16bytes):
381 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
387 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
396 movdqa 16(%eax, %edi), %xmm2
398 movdqa 32(%eax, %edi), %xmm3
400 palignr $1, %xmm2, %xmm3
401 palignr $1, %xmm1, %xmm2
403 movdqa %xmm2, -32(%edx, %edi)
404 movdqa %xmm3, -16(%edx, %edi)
408 movdqa 16(%eax, %edi), %xmm2
410 movdqa 32(%eax, %edi), %xmm3
412 palignr $1, %xmm2, %xmm3
413 palignr $1, %xmm4, %xmm2
415 movdqa %xmm2, -32(%edx, %edi)
416 movdqa %xmm3, -16(%edx, %edi)
424 lea 1(%edi, %eax), %eax
426 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
432 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
441 movdqa 16(%eax, %edi), %xmm2
443 movdqa 32(%eax, %edi), %xmm3
445 palignr $2, %xmm2, %xmm3
446 palignr $2, %xmm1, %xmm2
448 movdqa %xmm2, -32(%edx, %edi)
449 movdqa %xmm3, -16(%edx, %edi)
453 movdqa 16(%eax, %edi), %xmm2
455 movdqa 32(%eax, %edi), %xmm3
457 palignr $2, %xmm2, %xmm3
458 palignr $2, %xmm4, %xmm2
460 movdqa %xmm2, -32(%edx, %edi)
461 movdqa %xmm3, -16(%edx, %edi)
469 lea 2(%edi, %eax), %eax
471 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
477 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
486 movdqa 16(%eax, %edi), %xmm2
488 movdqa 32(%eax, %edi), %xmm3
490 palignr $3, %xmm2, %xmm3
491 palignr $3, %xmm1, %xmm2
493 movdqa %xmm2, -32(%edx, %edi)
494 movdqa %xmm3, -16(%edx, %edi)
498 movdqa 16(%eax, %edi), %xmm2
500 movdqa 32(%eax, %edi), %xmm3
502 palignr $3, %xmm2, %xmm3
503 palignr $3, %xmm4, %xmm2
505 movdqa %xmm2, -32(%edx, %edi)
506 movdqa %xmm3, -16(%edx, %edi)
514 lea 3(%edi, %eax), %eax
516 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
522 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
531 movdqa 16(%eax, %edi), %xmm2
533 movdqa 32(%eax, %edi), %xmm3
535 palignr $4, %xmm2, %xmm3
536 palignr $4, %xmm1, %xmm2
538 movdqa %xmm2, -32(%edx, %edi)
539 movdqa %xmm3, -16(%edx, %edi)
543 movdqa 16(%eax, %edi), %xmm2
545 movdqa 32(%eax, %edi), %xmm3
547 palignr $4, %xmm2, %xmm3
548 palignr $4, %xmm4, %xmm2
550 movdqa %xmm2, -32(%edx, %edi)
551 movdqa %xmm3, -16(%edx, %edi)
559 lea 4(%edi, %eax), %eax
561 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
567 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
576 movdqa 16(%eax, %edi), %xmm2
578 movdqa 32(%eax, %edi), %xmm3
580 palignr $5, %xmm2, %xmm3
581 palignr $5, %xmm1, %xmm2
583 movdqa %xmm2, -32(%edx, %edi)
584 movdqa %xmm3, -16(%edx, %edi)
588 movdqa 16(%eax, %edi), %xmm2
590 movdqa 32(%eax, %edi), %xmm3
592 palignr $5, %xmm2, %xmm3
593 palignr $5, %xmm4, %xmm2
595 movdqa %xmm2, -32(%edx, %edi)
596 movdqa %xmm3, -16(%edx, %edi)
604 lea 5(%edi, %eax), %eax
606 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
612 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
621 movdqa 16(%eax, %edi), %xmm2
623 movdqa 32(%eax, %edi), %xmm3
625 palignr $6, %xmm2, %xmm3
626 palignr $6, %xmm1, %xmm2
628 movdqa %xmm2, -32(%edx, %edi)
629 movdqa %xmm3, -16(%edx, %edi)
633 movdqa 16(%eax, %edi), %xmm2
635 movdqa 32(%eax, %edi), %xmm3
637 palignr $6, %xmm2, %xmm3
638 palignr $6, %xmm4, %xmm2
640 movdqa %xmm2, -32(%edx, %edi)
641 movdqa %xmm3, -16(%edx, %edi)
649 lea 6(%edi, %eax), %eax
651 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
657 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
666 movdqa 16(%eax, %edi), %xmm2
668 movdqa 32(%eax, %edi), %xmm3
670 palignr $7, %xmm2, %xmm3
671 palignr $7, %xmm1, %xmm2
673 movdqa %xmm2, -32(%edx, %edi)
674 movdqa %xmm3, -16(%edx, %edi)
678 movdqa 16(%eax, %edi), %xmm2
680 movdqa 32(%eax, %edi), %xmm3
682 palignr $7, %xmm2, %xmm3
683 palignr $7, %xmm4, %xmm2
685 movdqa %xmm2, -32(%edx, %edi)
686 movdqa %xmm3, -16(%edx, %edi)
694 lea 7(%edi, %eax), %eax
696 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
702 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
711 movdqa 16(%eax, %edi), %xmm2
713 movdqa 32(%eax, %edi), %xmm3
715 palignr $8, %xmm2, %xmm3
716 palignr $8, %xmm1, %xmm2
718 movdqa %xmm2, -32(%edx, %edi)
719 movdqa %xmm3, -16(%edx, %edi)
723 movdqa 16(%eax, %edi), %xmm2
725 movdqa 32(%eax, %edi), %xmm3
727 palignr $8, %xmm2, %xmm3
728 palignr $8, %xmm4, %xmm2
730 movdqa %xmm2, -32(%edx, %edi)
731 movdqa %xmm3, -16(%edx, %edi)
739 lea 8(%edi, %eax), %eax
741 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
747 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
756 movdqa 16(%eax, %edi), %xmm2
758 movdqa 32(%eax, %edi), %xmm3
760 palignr $9, %xmm2, %xmm3
761 palignr $9, %xmm1, %xmm2
763 movdqa %xmm2, -32(%edx, %edi)
764 movdqa %xmm3, -16(%edx, %edi)
768 movdqa 16(%eax, %edi), %xmm2
770 movdqa 32(%eax, %edi), %xmm3
772 palignr $9, %xmm2, %xmm3
773 palignr $9, %xmm4, %xmm2
775 movdqa %xmm2, -32(%edx, %edi)
776 movdqa %xmm3, -16(%edx, %edi)
784 lea 9(%edi, %eax), %eax
786 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
792 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
801 movdqa 16(%eax, %edi), %xmm2
803 movdqa 32(%eax, %edi), %xmm3
805 palignr $10, %xmm2, %xmm3
806 palignr $10, %xmm1, %xmm2
808 movdqa %xmm2, -32(%edx, %edi)
809 movdqa %xmm3, -16(%edx, %edi)
813 movdqa 16(%eax, %edi), %xmm2
815 movdqa 32(%eax, %edi), %xmm3
817 palignr $10, %xmm2, %xmm3
818 palignr $10, %xmm4, %xmm2
820 movdqa %xmm2, -32(%edx, %edi)
821 movdqa %xmm3, -16(%edx, %edi)
829 lea 10(%edi, %eax), %eax
831 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
837 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
846 movdqa 16(%eax, %edi), %xmm2
848 movdqa 32(%eax, %edi), %xmm3
850 palignr $11, %xmm2, %xmm3
851 palignr $11, %xmm1, %xmm2
853 movdqa %xmm2, -32(%edx, %edi)
854 movdqa %xmm3, -16(%edx, %edi)
858 movdqa 16(%eax, %edi), %xmm2
860 movdqa 32(%eax, %edi), %xmm3
862 palignr $11, %xmm2, %xmm3
863 palignr $11, %xmm4, %xmm2
865 movdqa %xmm2, -32(%edx, %edi)
866 movdqa %xmm3, -16(%edx, %edi)
874 lea 11(%edi, %eax), %eax
876 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
882 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
891 movdqa 16(%eax, %edi), %xmm2
893 movdqa 32(%eax, %edi), %xmm3
895 palignr $12, %xmm2, %xmm3
896 palignr $12, %xmm1, %xmm2
898 movdqa %xmm2, -32(%edx, %edi)
899 movdqa %xmm3, -16(%edx, %edi)
903 movdqa 16(%eax, %edi), %xmm2
905 movdqa 32(%eax, %edi), %xmm3
907 palignr $12, %xmm2, %xmm3
908 palignr $12, %xmm4, %xmm2
910 movdqa %xmm2, -32(%edx, %edi)
911 movdqa %xmm3, -16(%edx, %edi)
919 lea 12(%edi, %eax), %eax
921 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
927 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
936 movdqa 16(%eax, %edi), %xmm2
938 movdqa 32(%eax, %edi), %xmm3
940 palignr $13, %xmm2, %xmm3
941 palignr $13, %xmm1, %xmm2
943 movdqa %xmm2, -32(%edx, %edi)
944 movdqa %xmm3, -16(%edx, %edi)
948 movdqa 16(%eax, %edi), %xmm2
950 movdqa 32(%eax, %edi), %xmm3
952 palignr $13, %xmm2, %xmm3
953 palignr $13, %xmm4, %xmm2
955 movdqa %xmm2, -32(%edx, %edi)
956 movdqa %xmm3, -16(%edx, %edi)
964 lea 13(%edi, %eax), %eax
966 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
972 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
981 movdqa 16(%eax, %edi), %xmm2
983 movdqa 32(%eax, %edi), %xmm3
985 palignr $14, %xmm2, %xmm3
986 palignr $14, %xmm1, %xmm2
988 movdqa %xmm2, -32(%edx, %edi)
989 movdqa %xmm3, -16(%edx, %edi)
993 movdqa 16(%eax, %edi), %xmm2
995 movdqa 32(%eax, %edi), %xmm3
997 palignr $14, %xmm2, %xmm3
998 palignr $14, %xmm4, %xmm2
1000 movdqa %xmm2, -32(%edx, %edi)
1001 movdqa %xmm3, -16(%edx, %edi)
1009 lea 14(%edi, %eax), %eax
1011 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1017 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1019 movaps (%eax), %xmm1
1022 movdqu %xmm0, (%esi)
1026 movdqa 16(%eax, %edi), %xmm2
1028 movdqa 32(%eax, %edi), %xmm3
1030 palignr $15, %xmm2, %xmm3
1031 palignr $15, %xmm1, %xmm2
1033 movdqa %xmm2, -32(%edx, %edi)
1034 movdqa %xmm3, -16(%edx, %edi)
1038 movdqa 16(%eax, %edi), %xmm2
1040 movdqa 32(%eax, %edi), %xmm3
1042 palignr $15, %xmm2, %xmm3
1043 palignr $15, %xmm4, %xmm2
1045 movdqa %xmm2, -32(%edx, %edi)
1046 movdqa %xmm3, -16(%edx, %edi)
1054 lea 15(%edi, %eax), %eax
1056 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Fall-through chain for tails whose length is a multiple of 4 (44 down
   to 12 bytes).  Each label copies one 32-bit word from a negative
   displacement off EAX to the same displacement off EDX, using ECX as
   scratch (its previous value is overwritten), and then falls into the
   label for four fewer bytes.
   NOTE(review): the negative displacements suggest EAX/EDX point just
   past the end of the source/destination -- confirm against the code
   that dispatches here.  */
1060 L(fwd_write_44bytes):
1061 movl -44(%eax), %ecx
1062 movl %ecx, -44(%edx)
1063 L(fwd_write_40bytes):
1064 movl -40(%eax), %ecx
1065 movl %ecx, -40(%edx)
1066 L(fwd_write_36bytes):
1067 movl -36(%eax), %ecx
1068 movl %ecx, -36(%edx)
1069 L(fwd_write_32bytes):
1070 movl -32(%eax), %ecx
1071 movl %ecx, -32(%edx)
1072 L(fwd_write_28bytes):
1073 movl -28(%eax), %ecx
1074 movl %ecx, -28(%edx)
1075 L(fwd_write_24bytes):
1076 movl -24(%eax), %ecx
1077 movl %ecx, -24(%edx)
1078 L(fwd_write_20bytes):
1079 movl -20(%eax), %ecx
1080 movl %ecx, -20(%edx)
1081 L(fwd_write_16bytes):
1082 movl -16(%eax), %ecx
1083 movl %ecx, -16(%edx)
1084 L(fwd_write_12bytes):
1085 movl -12(%eax), %ecx
1086 movl %ecx, -12(%edx)
1087 L(fwd_write_8bytes):
1090 L(fwd_write_4bytes):
1093 L(fwd_write_0bytes):
1094 #ifdef USE_AS_MEMPCPY
1097 movl DEST(%esp), %eax
1102 L(fwd_write_5bytes):
1107 #ifdef USE_AS_MEMPCPY
1110 movl DEST(%esp), %eax
/* Fall-through chain for tails of length 4k+1 (45 down to 13 bytes).
   Each label moves one 32-bit word from a negative displacement off EAX
   to the same displacement off EDX through ECX (clobbered), then falls
   into the label for four fewer bytes.
   NOTE(review): EAX/EDX presumably point just past the end of the
   source/destination -- confirm against the dispatching code.  */
1115 L(fwd_write_45bytes):
1116 movl -45(%eax), %ecx
1117 movl %ecx, -45(%edx)
1118 L(fwd_write_41bytes):
1119 movl -41(%eax), %ecx
1120 movl %ecx, -41(%edx)
1121 L(fwd_write_37bytes):
1122 movl -37(%eax), %ecx
1123 movl %ecx, -37(%edx)
1124 L(fwd_write_33bytes):
1125 movl -33(%eax), %ecx
1126 movl %ecx, -33(%edx)
1127 L(fwd_write_29bytes):
1128 movl -29(%eax), %ecx
1129 movl %ecx, -29(%edx)
1130 L(fwd_write_25bytes):
1131 movl -25(%eax), %ecx
1132 movl %ecx, -25(%edx)
1133 L(fwd_write_21bytes):
1134 movl -21(%eax), %ecx
1135 movl %ecx, -21(%edx)
1136 L(fwd_write_17bytes):
1137 movl -17(%eax), %ecx
1138 movl %ecx, -17(%edx)
1139 L(fwd_write_13bytes):
1140 movl -13(%eax), %ecx
1141 movl %ecx, -13(%edx)
1142 L(fwd_write_9bytes):
1147 L(fwd_write_1bytes):
1148 movzbl -1(%eax), %ecx
1150 #ifdef USE_AS_MEMPCPY
1153 movl DEST(%esp), %eax
/* Fall-through chain for tails of length 4k+2 (46 down to 10 bytes).
   Each label moves one 32-bit word from a negative displacement off EAX
   to the same displacement off EDX through ECX (clobbered), then falls
   into the label for four fewer bytes.
   NOTE(review): EAX/EDX presumably point just past the end of the
   source/destination -- confirm against the dispatching code.  */
1158 L(fwd_write_46bytes):
1159 movl -46(%eax), %ecx
1160 movl %ecx, -46(%edx)
1161 L(fwd_write_42bytes):
1162 movl -42(%eax), %ecx
1163 movl %ecx, -42(%edx)
1164 L(fwd_write_38bytes):
1165 movl -38(%eax), %ecx
1166 movl %ecx, -38(%edx)
1167 L(fwd_write_34bytes):
1168 movl -34(%eax), %ecx
1169 movl %ecx, -34(%edx)
1170 L(fwd_write_30bytes):
1171 movl -30(%eax), %ecx
1172 movl %ecx, -30(%edx)
1173 L(fwd_write_26bytes):
1174 movl -26(%eax), %ecx
1175 movl %ecx, -26(%edx)
1176 L(fwd_write_22bytes):
1177 movl -22(%eax), %ecx
1178 movl %ecx, -22(%edx)
1179 L(fwd_write_18bytes):
1180 movl -18(%eax), %ecx
1181 movl %ecx, -18(%edx)
1182 L(fwd_write_14bytes):
1183 movl -14(%eax), %ecx
1184 movl %ecx, -14(%edx)
1185 L(fwd_write_10bytes):
1186 movl -10(%eax), %ecx
1187 movl %ecx, -10(%edx)
1188 L(fwd_write_6bytes):
1191 L(fwd_write_2bytes):
1192 movzwl -2(%eax), %ecx
1194 #ifdef USE_AS_MEMPCPY
1197 movl DEST(%esp), %eax
/* Fall-through chain for tails of length 4k+3 (47 down to 11 bytes).
   Each label moves one 32-bit word from a negative displacement off EAX
   to the same displacement off EDX through ECX (clobbered), then falls
   into the label for four fewer bytes.
   NOTE(review): EAX/EDX presumably point just past the end of the
   source/destination -- confirm against the dispatching code.  */
1202 L(fwd_write_47bytes):
1203 movl -47(%eax), %ecx
1204 movl %ecx, -47(%edx)
1205 L(fwd_write_43bytes):
1206 movl -43(%eax), %ecx
1207 movl %ecx, -43(%edx)
1208 L(fwd_write_39bytes):
1209 movl -39(%eax), %ecx
1210 movl %ecx, -39(%edx)
1211 L(fwd_write_35bytes):
1212 movl -35(%eax), %ecx
1213 movl %ecx, -35(%edx)
1214 L(fwd_write_31bytes):
1215 movl -31(%eax), %ecx
1216 movl %ecx, -31(%edx)
1217 L(fwd_write_27bytes):
1218 movl -27(%eax), %ecx
1219 movl %ecx, -27(%edx)
1220 L(fwd_write_23bytes):
1221 movl -23(%eax), %ecx
1222 movl %ecx, -23(%edx)
1223 L(fwd_write_19bytes):
1224 movl -19(%eax), %ecx
1225 movl %ecx, -19(%edx)
1226 L(fwd_write_15bytes):
1227 movl -15(%eax), %ecx
1228 movl %ecx, -15(%edx)
1229 L(fwd_write_11bytes):
1230 movl -11(%eax), %ecx
1231 movl %ecx, -11(%edx)
1232 L(fwd_write_7bytes):
1235 L(fwd_write_3bytes):
1236 movzwl -3(%eax), %ecx
1237 movzbl -1(%eax), %eax
1240 #ifdef USE_AS_MEMPCPY
1243 movl DEST(%esp), %eax
1251 movdqu (%eax), %xmm1
1252 movdqu %xmm0, (%esi)
1253 movntdq %xmm1, (%edx)
1258 je L(copy_page_by_rep)
1259 L(large_page_loop_init):
1264 prefetchnta 0x1c0(%eax)
1265 prefetchnta 0x280(%eax)
1266 movdqu (%eax), %xmm0
1267 movdqu 0x10(%eax), %xmm1
1268 movdqu 0x20(%eax), %xmm2
1269 movdqu 0x30(%eax), %xmm3
1270 movdqu 0x40(%eax), %xmm4
1271 movdqu 0x50(%eax), %xmm5
1272 movdqu 0x60(%eax), %xmm6
1273 movdqu 0x70(%eax), %xmm7
1274 lea 0x80(%eax), %eax
1277 movntdq %xmm0, (%edx)
1278 movntdq %xmm1, 0x10(%edx)
1279 movntdq %xmm2, 0x20(%edx)
1280 movntdq %xmm3, 0x30(%edx)
1281 movntdq %xmm4, 0x40(%edx)
1282 movntdq %xmm5, 0x50(%edx)
1283 movntdq %xmm6, 0x60(%edx)
1284 movntdq %xmm7, 0x70(%edx)
1285 lea 0x80(%edx), %edx
1286 jae L(large_page_loop)
1289 jb L(large_page_less_64bytes)
1291 movdqu (%eax), %xmm0
1292 movdqu 0x10(%eax), %xmm1
1293 movdqu 0x20(%eax), %xmm2
1294 movdqu 0x30(%eax), %xmm3
1295 lea 0x40(%eax), %eax
1297 movntdq %xmm0, (%edx)
1298 movntdq %xmm1, 0x10(%edx)
1299 movntdq %xmm2, 0x20(%edx)
1300 movntdq %xmm3, 0x30(%edx)
1301 lea 0x40(%edx), %edx
1303 L(large_page_less_64bytes):
1305 jb L(large_page_less_32bytes)
1306 movdqu (%eax), %xmm0
1307 movdqu 0x10(%eax), %xmm1
1308 lea 0x20(%eax), %eax
1309 movntdq %xmm0, (%edx)
1310 movntdq %xmm1, 0x10(%edx)
1311 lea 0x20(%edx), %edx
1313 L(large_page_less_32bytes):
1317 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1322 L(copy_page_by_rep):
1329 jz L(copy_page_by_rep_exit)
1331 jb L(copy_page_by_rep_left_1)
1337 jz L(copy_page_by_rep_exit)
1338 L(copy_page_by_rep_left_1):
1341 L(copy_page_by_rep_exit):
1344 movl DEST(%esp), %eax
1345 #ifdef USE_AS_MEMPCPY
1346 movl LEN(%esp), %ecx
1352 L(bk_write_44bytes):
1355 L(bk_write_40bytes):
1358 L(bk_write_36bytes):
1361 L(bk_write_32bytes):
1364 L(bk_write_28bytes):
1367 L(bk_write_24bytes):
1370 L(bk_write_20bytes):
1373 L(bk_write_16bytes):
1376 L(bk_write_12bytes):
1386 movl DEST(%esp), %eax
1387 #ifdef USE_AS_MEMPCPY
1388 movl LEN(%esp), %ecx
1394 L(bk_write_45bytes):
1397 L(bk_write_41bytes):
1400 L(bk_write_37bytes):
1403 L(bk_write_33bytes):
1406 L(bk_write_29bytes):
1409 L(bk_write_25bytes):
1412 L(bk_write_21bytes):
1415 L(bk_write_17bytes):
1418 L(bk_write_13bytes):
1430 movl DEST(%esp), %eax
1431 #ifdef USE_AS_MEMPCPY
1432 movl LEN(%esp), %ecx
1438 L(bk_write_46bytes):
1441 L(bk_write_42bytes):
1444 L(bk_write_38bytes):
1447 L(bk_write_34bytes):
1450 L(bk_write_30bytes):
1453 L(bk_write_26bytes):
1456 L(bk_write_22bytes):
1459 L(bk_write_18bytes):
1462 L(bk_write_14bytes):
1465 L(bk_write_10bytes):
1474 movl DEST(%esp), %eax
1475 #ifdef USE_AS_MEMPCPY
1476 movl LEN(%esp), %ecx
1482 L(bk_write_47bytes):
1485 L(bk_write_43bytes):
1488 L(bk_write_39bytes):
1491 L(bk_write_35bytes):
1494 L(bk_write_31bytes):
1497 L(bk_write_27bytes):
1500 L(bk_write_23bytes):
1503 L(bk_write_19bytes):
1506 L(bk_write_15bytes):
1509 L(bk_write_11bytes):
1516 movzwl 1(%eax), %ecx
1520 movl DEST(%esp), %eax
1521 #ifdef USE_AS_MEMPCPY
1522 movl LEN(%esp), %ecx
1528 .pushsection .rodata.ssse3,"a",@progbits
/* Dispatch table for the forward tail copy: 48 four-byte entries, where
   entry N refers to L(fwd_write_Nbytes) for N = 0..47.  It is indexed
   by the BRANCH_TO_JMPTBL_ENTRY* macros with a scale of 4.  Each entry
   is produced by JMPTBL, so it is either the label itself or the label
   minus the table base, depending on which JMPTBL definition is in
   effect.  */
1530 L(table_48bytes_fwd):
1531 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1532 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1533 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1534 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1535 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1536 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1537 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1538 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1539 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1540 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1541 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1542 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1543 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1544 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1545 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1546 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1547 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1548 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1549 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1550 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1551 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1552 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1553 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1554 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1555 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1556 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1557 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1558 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1567 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1568 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1569 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1570 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1571 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1572 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1573 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1574 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1575 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1576 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1577 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1578 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1582 .int JMPTBL (L(shl_0), L(shl_table))
1583 .int JMPTBL (L(shl_1), L(shl_table))
1584 .int JMPTBL (L(shl_2), L(shl_table))
1585 .int JMPTBL (L(shl_3), L(shl_table))
1586 .int JMPTBL (L(shl_4), L(shl_table))
1587 .int JMPTBL (L(shl_5), L(shl_table))
1588 .int JMPTBL (L(shl_6), L(shl_table))
1589 .int JMPTBL (L(shl_7), L(shl_table))
1590 .int JMPTBL (L(shl_8), L(shl_table))
1591 .int JMPTBL (L(shl_9), L(shl_table))
1592 .int JMPTBL (L(shl_10), L(shl_table))
1593 .int JMPTBL (L(shl_11), L(shl_table))
1594 .int JMPTBL (L(shl_12), L(shl_table))
1595 .int JMPTBL (L(shl_13), L(shl_table))
1596 .int JMPTBL (L(shl_14), L(shl_table))
1597 .int JMPTBL (L(shl_15), L(shl_table))
/* Dispatch table for the backward tail copy: 48 four-byte entries,
   where entry N refers to L(bk_write_Nbytes) for N = 0..47.  It is
   indexed through BRANCH_TO_JMPTBL_ENTRY with a scale of 4.  Each entry
   is produced by JMPTBL, so it is either the label itself or the label
   minus the table base, depending on which JMPTBL definition is in
   effect.  */
1600 L(table_48_bytes_bwd):
1601 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1602 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1603 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1604 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1605 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1606 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1607 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1608 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1609 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1610 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1611 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1612 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1613 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1614 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1615 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1616 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1617 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1618 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1619 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1620 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1621 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1622 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1623 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1624 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1625 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1626 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1627 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1628 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1637 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1638 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1639 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1640 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1641 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1642 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1643 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1644 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1645 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1646 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1647 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1648 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1652 #ifdef USE_AS_MEMMOVE
1664 jae L(bk_write_more64bytes)
1666 L(bk_write_64bytesless):
1668 jb L(bk_write_less32bytes)
1670 L(bk_write_more32bytes):
1671 /* Copy 32 bytes at a time. */
1677 movl -12(%esi), %eax
1678 movl %eax, -12(%edx)
1679 movl -16(%esi), %eax
1680 movl %eax, -16(%edx)
1681 movl -20(%esi), %eax
1682 movl %eax, -20(%edx)
1683 movl -24(%esi), %eax
1684 movl %eax, -24(%edx)
1685 movl -28(%esi), %eax
1686 movl %eax, -28(%edx)
1687 movl -32(%esi), %eax
1688 movl %eax, -32(%edx)
1692 L(bk_write_less32bytes):
1697 L(bk_write_less48bytes):
1698 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1704 jbe L(bk_write_less32bytes)
1706 /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
1707 then (EDX & 2) must be != 0. */
1727 L(bk_write_more64bytes):
1728 /* Check alignment of last byte. */
1730 jz L(bk_ssse3_cpy_pre)
1732 /* EDX is aligned 4 bytes, but not 16 bytes. */
1741 jz L(bk_ssse3_cpy_pre)
1750 jz L(bk_ssse3_cpy_pre)
1758 L(bk_ssse3_cpy_pre):
1760 jb L(bk_write_more32bytes)
1766 movdqu 0x30(%esi), %xmm3
1767 movdqa %xmm3, 0x30(%edx)
1768 movdqu 0x20(%esi), %xmm2
1769 movdqa %xmm2, 0x20(%edx)
1770 movdqu 0x10(%esi), %xmm1
1771 movdqa %xmm1, 0x10(%edx)
1772 movdqu (%esi), %xmm0
1773 movdqa %xmm0, (%edx)
1776 jmp L(bk_write_64bytesless)