1 /* memcpy with SSSE3 and REP string.
2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.  */
23 #if !defined NOT_IN_libc \
25 || defined USE_AS_MEMMOVE \
26 || !defined USE_MULTIARCH)
28 #include "asm-syntax.h"
/* Symbol names for this implementation and its _chk variant.
   NOTE(review): presumably these name the entry points; the lines that
   use them are not visible in this view.  */
31 # define MEMCPY __memcpy_ssse3_rep
32 # define MEMCPY_CHK __memcpy_chk_ssse3_rep
/* Unwind annotation for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at relative offset 0.  */
45 #define CFI_PUSH(REG) \
46 cfi_adjust_cfa_offset (4); \
47 cfi_rel_offset (REG, 0)
/* Unwind annotation for a 4-byte pop of REG, the inverse of CFI_PUSH:
   the CFA offset shrinks by 4 and REG is marked as holding its original
   value again.  The last line must not end in a continuation, otherwise
   the following #define becomes part of this macro.  */
49 #define CFI_POP(REG) \
50 cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
/* Push or pop REG and emit the matching unwind annotation in one step.  */
53 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
54 #define POP(REG) popl REG; CFI_POP (REG)
/* Definitions for the variant that saves EBX on entry (EBX is used by
   the EBX-relative jump-table macros).
   PARMS       offset of the first argument; NOTE(review): presumably
               4 bytes of return address plus the 4 bytes pushed by
               ENTRANCE.
   ENTRANCE    saves EBX.
   RETURN_END  restores EBX and returns.
   RETURN      same as RETURN_END, then re-emits CFI_PUSH (%ebx) so the
               unwind annotations for code placed after the ret again
               describe EBX as pushed.
   JMPTBL      a table entry is the offset of label I from base B.  */
57 # define PARMS 8 /* Preserve EBX. */
58 # define ENTRANCE PUSH (%ebx);
59 # define RETURN_END POP (%ebx); ret
60 # define RETURN RETURN_END; CFI_PUSH (%ebx)
61 # define JMPTBL(I, B) I - B
63 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
64 jump table with relative offsets. INDEX is a register that contains the
65 index into the jump table. SCALE is the scale of INDEX. */
/* EBX is overwritten: it first receives the PC, then the address of
   TABLE, then the absolute address of the selected entry.  The macro
   ends with the indirect jump through EBX; its last line must not end
   in a continuation, otherwise the following #define becomes part of
   this macro.  */
66 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
67 /* We first load PC into EBX. */ \
68 call __i686.get_pc_thunk.bx; \
69 /* Get the address of the jump table. */ \
70 addl $(TABLE - .), %ebx; \
71 /* Get the entry and convert the relative offset to the \
72 absolute address. */ \
73 addl (%ebx,INDEX,SCALE), %ebx; \
74 /* We loaded the jump table. Go. */ \
jmp *%ebx
/* First half of a split jump-table dispatch: add the offset of TABLE
   from the current location to EBX.
   NOTE(review): presumably EBX holds the address of this instruction
   (loaded by the PC thunk just before), so that EBX ends up pointing at
   TABLE -- confirm at the use sites.  */
77 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
78 addl $(TABLE - .), %ebx
/* Second half of a split jump-table dispatch.  Adds the relative entry
   selected by INDEX * SCALE to EBX and jumps through EBX.
   NOTE(review): presumably EBX must already hold the address of TABLE,
   as set up by BRANCH_TO_JMPTBL_ENTRY_VALUE; TABLE itself is not used in
   this expansion.  The last line must not end in a continuation,
   otherwise the following directive becomes part of this macro.  */
80 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
81 addl (%ebx,INDEX,SCALE), %ebx; \
82 /* We loaded the jump table. Go. */ \
jmp *%ebx
/* PC thunk called by the jump-table macros to load the PC into EBX.
   It is placed in its own link-once section and declared as a hidden,
   global function symbol.
   NOTE(review): the instructions that follow the label are not visible
   in this view.  */
85 .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
86 .globl __i686.get_pc_thunk.bx
87 .hidden __i686.get_pc_thunk.bx
89 .type __i686.get_pc_thunk.bx,@function
90 __i686.get_pc_thunk.bx:
/* Definitions for the variant that does not save EBX: returning is a
   plain ret, and a jump-table entry is the label I itself (the base B
   is ignored).  */
96 # define RETURN_END ret
97 # define RETURN RETURN_END
98 # define JMPTBL(I, B) I
100 /* Branch to an entry in a jump table. TABLE is a jump table with
101 absolute offsets. INDEX is a register that contains the index into the
102 jump table. SCALE is the scale of INDEX. */
103 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
104 jmp *TABLE(,INDEX,SCALE)
/* Split-dispatch macros for the absolute-table variant.  The VALUE half
   expands to nothing because no register has to be prepared; the TAIL
   half is the same indirect jump as BRANCH_TO_JMPTBL_ENTRY.  */
106 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
108 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
109 jmp *TABLE(,INDEX,SCALE)
/* The code of this implementation is placed in section .text.ssse3.  */
112 .section .text.ssse3,"ax",@progbits
/* Only for shared, in-libc, non-bcopy builds: branch to __chk_fail on
   "below".  The comparison that sets the flags is not visible in this
   view.  */
113 #if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
117 jb HIDDEN_JUMPTARGET (__chk_fail)
/* EDX = the DEST stack argument.  */
124 movl DEST(%esp), %edx
/* memmove builds only.  NOTE(review): the tests that feed the two
   branches below are not visible in this view.  */
126 #ifdef USE_AS_MEMMOVE
129 je L(fwd_write_0bytes)
132 jmp L(bk_write_less32bytes_2)
/* Short-copy dispatch.  ECX indexes (scale 4) the forward tail table and,
   further down, the backward tail table; entry N of each table is the
   label for an N-byte tail.
   NOTE(review): the lines that close the two #ifndef blocks and the
   pointer adjustments between them are not visible in this view.  */
144 L(fwd_write_less32bytes):
145 #ifndef USE_AS_MEMMOVE
151 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
152 #ifndef USE_AS_MEMMOVE
154 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
158 /* ECX > 32 and EDX is 4 byte aligned. */
/* Compare ECX with half the shared cache size.  Three forms appear: a
   build-time constant when SHARED_CACHE_SIZE_HALF is defined, the
   variable __x86_shared_cache_size_half addressed GOT-relative through
   EBX, and the same variable addressed absolutely.
   NOTE(review): the conditional lines that select between the last two
   forms, and the branch that consumes the comparison, are not visible
   in this view.  */
171 #ifdef SHARED_CACHE_SIZE_HALF
172 cmp $SHARED_CACHE_SIZE_HALF, %ecx
175 call __i686.get_pc_thunk.bx
176 add $_GLOBAL_OFFSET_TABLE_, %ebx
177 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
179 cmp __x86_shared_cache_size_half, %ecx
/* Dispatch through L(shl_table), indexed by EDI with scale 4.
   NOTE(review): the table has entries shl_0 .. shl_15, so EDI is
   presumably a 0..15 misalignment value -- confirm where EDI is set.  */
188 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
/* Four 32-byte copy steps.  Each step loads 32 bytes from EAX+EDI and
   stores them to EDX+EDI with movdqa, which requires both addresses to
   be 16-byte aligned.
   NOTE(review): the instructions between the steps (index update, count
   update and branches) are not visible in this view.  */
198 movdqa (%eax, %edi), %xmm0
199 movdqa 16(%eax, %edi), %xmm1
201 movdqa %xmm0, (%edx, %edi)
202 movdqa %xmm1, 16(%edx, %edi)
206 movdqa (%eax, %edi), %xmm0
207 movdqa 16(%eax, %edi), %xmm1
209 movdqa %xmm0, (%edx, %edi)
210 movdqa %xmm1, 16(%edx, %edi)
214 movdqa (%eax, %edi), %xmm0
215 movdqa 16(%eax, %edi), %xmm1
217 movdqa %xmm0, (%edx, %edi)
218 movdqa %xmm1, 16(%edx, %edi)
222 movdqa (%eax, %edi), %xmm0
223 movdqa 16(%eax, %edi), %xmm1
225 movdqa %xmm0, (%edx, %edi)
226 movdqa %xmm1, 16(%edx, %edi)
/* Finish through the forward tail table, indexed by ECX.  */
235 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Aligned bulk copy for sizes handled inside the data cache.
   The size is first checked against half the data cache size: either a
   build-time constant compared directly with ECX, or the variable
   __x86_data_cache_size_half loaded into EDI (GOT-relative through EBX,
   or absolute).  On "greater or equal" control goes to
   L(shl_0_gobble_mem_start).
   NOTE(review): the conditional lines selecting between the forms and
   the comparison against EDI are not visible in this view.  */
239 #ifdef DATA_CACHE_SIZE_HALF
240 cmp $DATA_CACHE_SIZE_HALF, %ecx
243 call __i686.get_pc_thunk.bx
244 add $_GLOBAL_OFFSET_TABLE_, %ebx
245 mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
247 mov __x86_data_cache_size_half, %edi
254 jge L(shl_0_gobble_mem_start)
/* Main loop: aligned loads from EAX into XMM1..XMM7 and aligned stores
   to the same offsets from EDX.
   NOTE(review): the transfer at offset 0, the pointer advance and the
   count update that feeds the jge are not visible in this view.  */
257 L(shl_0_gobble_cache_loop):
259 movaps 0x10(%eax), %xmm1
260 movaps 0x20(%eax), %xmm2
261 movaps 0x30(%eax), %xmm3
262 movaps 0x40(%eax), %xmm4
263 movaps 0x50(%eax), %xmm5
264 movaps 0x60(%eax), %xmm6
265 movaps 0x70(%eax), %xmm7
269 movaps %xmm1, 0x10(%edx)
270 movaps %xmm2, 0x20(%edx)
271 movaps %xmm3, 0x30(%edx)
272 movaps %xmm4, 0x40(%edx)
273 movaps %xmm5, 0x50(%edx)
274 movaps %xmm6, 0x60(%edx)
275 movaps %xmm7, 0x70(%edx)
278 jge L(shl_0_gobble_cache_loop)
/* Tail after the loop: a 64-byte piece, then a 32-byte piece, then a
   16-byte piece, each skipped with jl when the remaining count is too
   small.  The comparisons themselves are not visible in this view.  */
279 L(shl_0_gobble_cache_loop_tail):
282 jl L(shl_0_cache_less_64bytes)
286 movdqa 0x10(%eax), %xmm1
289 movdqa %xmm1, 0x10(%edx)
291 movdqa 0x20(%eax), %xmm0
292 movdqa 0x30(%eax), %xmm1
295 movdqa %xmm0, 0x20(%edx)
296 movdqa %xmm1, 0x30(%edx)
298 L(shl_0_cache_less_64bytes):
300 jl L(shl_0_cache_less_32bytes)
303 movdqa 0x10(%eax), %xmm1
306 movdqa %xmm1, 0x10(%edx)
308 L(shl_0_cache_less_32bytes):
310 jl L(shl_0_cache_less_16bytes)
316 L(shl_0_cache_less_16bytes):
/* Finish through the forward tail table, indexed by ECX.  */
321 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Aligned bulk copy for sizes at or above the data-cache threshold.
   On "equal" the copy is handed to L(copy_page_by_rep).
   NOTE(review): the test that feeds the je is not visible in this
   view.  */
325 L(shl_0_gobble_mem_start):
327 je L(copy_page_by_rep)
/* Main loop: prefetch with a non-temporal hint at 0x1c0 and 0x280 bytes
   past both the source (EAX) and the destination (EDX), then move data
   with aligned loads into XMM1..XMM7 and aligned stores to the same
   offsets from EDX.
   NOTE(review): the transfer at offset 0, the pointer advance and the
   count update that feeds the jge are not visible in this view.  */
329 L(shl_0_gobble_mem_loop):
330 prefetchnta 0x1c0(%eax)
331 prefetchnta 0x280(%eax)
332 prefetchnta 0x1c0(%edx)
333 prefetchnta 0x280(%edx)
336 movaps 0x10(%eax), %xmm1
337 movaps 0x20(%eax), %xmm2
338 movaps 0x30(%eax), %xmm3
339 movaps 0x40(%eax), %xmm4
340 movaps 0x50(%eax), %xmm5
341 movaps 0x60(%eax), %xmm6
342 movaps 0x70(%eax), %xmm7
346 movaps %xmm1, 0x10(%edx)
347 movaps %xmm2, 0x20(%edx)
348 movaps %xmm3, 0x30(%edx)
349 movaps %xmm4, 0x40(%edx)
350 movaps %xmm5, 0x50(%edx)
351 movaps %xmm6, 0x60(%edx)
352 movaps %xmm7, 0x70(%edx)
355 jge L(shl_0_gobble_mem_loop)
/* Tail after the loop: a 64-byte piece, then a 32-byte piece, then a
   16-byte piece, each skipped with jl when the remaining count is too
   small.  The comparisons themselves are not visible in this view.  */
358 jl L(shl_0_mem_less_64bytes)
362 movdqa 0x10(%eax), %xmm1
365 movdqa %xmm1, 0x10(%edx)
367 movdqa 0x20(%eax), %xmm0
368 movdqa 0x30(%eax), %xmm1
371 movdqa %xmm0, 0x20(%edx)
372 movdqa %xmm1, 0x30(%edx)
374 L(shl_0_mem_less_64bytes):
376 jl L(shl_0_mem_less_32bytes)
379 movdqa 0x10(%eax), %xmm1
382 movdqa %xmm1, 0x10(%edx)
384 L(shl_0_mem_less_32bytes):
386 jl L(shl_0_mem_less_16bytes)
392 L(shl_0_mem_less_16bytes):
/* Finish through the forward tail table, indexed by ECX.  */
397 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-1 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $1 to splice every chunk with the chunk
   before it: the result is bytes 1..15 of the earlier chunk followed by
   byte 0 of the later one.  The two results are written with aligned
   stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
402 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
411 movdqa 16(%eax, %edi), %xmm2
413 movdqa 32(%eax, %edi), %xmm3
415 palignr $1, %xmm2, %xmm3
416 palignr $1, %xmm1, %xmm2
418 movdqa %xmm2, -32(%edx, %edi)
419 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
423 movdqa 16(%eax, %edi), %xmm2
425 movdqa 32(%eax, %edi), %xmm3
427 palignr $1, %xmm2, %xmm3
428 palignr $1, %xmm4, %xmm2
430 movdqa %xmm2, -32(%edx, %edi)
431 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 1, then finish through the forward tail table,
   indexed by ECX.  */
439 lea 1(%edi, %eax), %eax
441 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-2 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $2 to splice every chunk with the chunk
   before it: the result is bytes 2..15 of the earlier chunk followed by
   the low 2 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
445 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
454 movdqa 16(%eax, %edi), %xmm2
456 movdqa 32(%eax, %edi), %xmm3
458 palignr $2, %xmm2, %xmm3
459 palignr $2, %xmm1, %xmm2
461 movdqa %xmm2, -32(%edx, %edi)
462 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
466 movdqa 16(%eax, %edi), %xmm2
468 movdqa 32(%eax, %edi), %xmm3
470 palignr $2, %xmm2, %xmm3
471 palignr $2, %xmm4, %xmm2
473 movdqa %xmm2, -32(%edx, %edi)
474 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 2, then finish through the forward tail table,
   indexed by ECX.  */
482 lea 2(%edi, %eax), %eax
484 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-3 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $3 to splice every chunk with the chunk
   before it: the result is bytes 3..15 of the earlier chunk followed by
   the low 3 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
488 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
497 movdqa 16(%eax, %edi), %xmm2
499 movdqa 32(%eax, %edi), %xmm3
501 palignr $3, %xmm2, %xmm3
502 palignr $3, %xmm1, %xmm2
504 movdqa %xmm2, -32(%edx, %edi)
505 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
509 movdqa 16(%eax, %edi), %xmm2
511 movdqa 32(%eax, %edi), %xmm3
513 palignr $3, %xmm2, %xmm3
514 palignr $3, %xmm4, %xmm2
516 movdqa %xmm2, -32(%edx, %edi)
517 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 3, then finish through the forward tail table,
   indexed by ECX.  */
525 lea 3(%edi, %eax), %eax
527 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-4 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $4 to splice every chunk with the chunk
   before it: the result is bytes 4..15 of the earlier chunk followed by
   the low 4 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
531 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
540 movdqa 16(%eax, %edi), %xmm2
542 movdqa 32(%eax, %edi), %xmm3
544 palignr $4, %xmm2, %xmm3
545 palignr $4, %xmm1, %xmm2
547 movdqa %xmm2, -32(%edx, %edi)
548 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
552 movdqa 16(%eax, %edi), %xmm2
554 movdqa 32(%eax, %edi), %xmm3
556 palignr $4, %xmm2, %xmm3
557 palignr $4, %xmm4, %xmm2
559 movdqa %xmm2, -32(%edx, %edi)
560 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 4, then finish through the forward tail table,
   indexed by ECX.  */
568 lea 4(%edi, %eax), %eax
570 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-5 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $5 to splice every chunk with the chunk
   before it: the result is bytes 5..15 of the earlier chunk followed by
   the low 5 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
574 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
583 movdqa 16(%eax, %edi), %xmm2
585 movdqa 32(%eax, %edi), %xmm3
587 palignr $5, %xmm2, %xmm3
588 palignr $5, %xmm1, %xmm2
590 movdqa %xmm2, -32(%edx, %edi)
591 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
595 movdqa 16(%eax, %edi), %xmm2
597 movdqa 32(%eax, %edi), %xmm3
599 palignr $5, %xmm2, %xmm3
600 palignr $5, %xmm4, %xmm2
602 movdqa %xmm2, -32(%edx, %edi)
603 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 5, then finish through the forward tail table,
   indexed by ECX.  */
611 lea 5(%edi, %eax), %eax
613 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-6 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $6 to splice every chunk with the chunk
   before it: the result is bytes 6..15 of the earlier chunk followed by
   the low 6 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
618 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
627 movdqa 16(%eax, %edi), %xmm2
629 movdqa 32(%eax, %edi), %xmm3
631 palignr $6, %xmm2, %xmm3
632 palignr $6, %xmm1, %xmm2
634 movdqa %xmm2, -32(%edx, %edi)
635 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
639 movdqa 16(%eax, %edi), %xmm2
641 movdqa 32(%eax, %edi), %xmm3
643 palignr $6, %xmm2, %xmm3
644 palignr $6, %xmm4, %xmm2
646 movdqa %xmm2, -32(%edx, %edi)
647 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 6, then finish through the forward tail table,
   indexed by ECX.  */
655 lea 6(%edi, %eax), %eax
657 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-7 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $7 to splice every chunk with the chunk
   before it: the result is bytes 7..15 of the earlier chunk followed by
   the low 7 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
661 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
670 movdqa 16(%eax, %edi), %xmm2
672 movdqa 32(%eax, %edi), %xmm3
674 palignr $7, %xmm2, %xmm3
675 palignr $7, %xmm1, %xmm2
677 movdqa %xmm2, -32(%edx, %edi)
678 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
682 movdqa 16(%eax, %edi), %xmm2
684 movdqa 32(%eax, %edi), %xmm3
686 palignr $7, %xmm2, %xmm3
687 palignr $7, %xmm4, %xmm2
689 movdqa %xmm2, -32(%edx, %edi)
690 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 7, then finish through the forward tail table,
   indexed by ECX.  */
698 lea 7(%edi, %eax), %eax
700 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-8 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $8 to splice every chunk with the chunk
   before it: the result is bytes 8..15 of the earlier chunk followed by
   the low 8 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
704 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
713 movdqa 16(%eax, %edi), %xmm2
715 movdqa 32(%eax, %edi), %xmm3
717 palignr $8, %xmm2, %xmm3
718 palignr $8, %xmm1, %xmm2
720 movdqa %xmm2, -32(%edx, %edi)
721 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
725 movdqa 16(%eax, %edi), %xmm2
727 movdqa 32(%eax, %edi), %xmm3
729 palignr $8, %xmm2, %xmm3
730 palignr $8, %xmm4, %xmm2
732 movdqa %xmm2, -32(%edx, %edi)
733 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 8, then finish through the forward tail table,
   indexed by ECX.  */
741 lea 8(%edi, %eax), %eax
743 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-9 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $9 to splice every chunk with the chunk
   before it: the result is bytes 9..15 of the earlier chunk followed by
   the low 9 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
747 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
756 movdqa 16(%eax, %edi), %xmm2
758 movdqa 32(%eax, %edi), %xmm3
760 palignr $9, %xmm2, %xmm3
761 palignr $9, %xmm1, %xmm2
763 movdqa %xmm2, -32(%edx, %edi)
764 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
768 movdqa 16(%eax, %edi), %xmm2
770 movdqa 32(%eax, %edi), %xmm3
772 palignr $9, %xmm2, %xmm3
773 palignr $9, %xmm4, %xmm2
775 movdqa %xmm2, -32(%edx, %edi)
776 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 9, then finish through the forward tail table,
   indexed by ECX.  */
784 lea 9(%edi, %eax), %eax
786 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-10 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $10 to splice every chunk with the chunk
   before it: the result is bytes 10..15 of the earlier chunk followed by
   the low 10 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
790 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
799 movdqa 16(%eax, %edi), %xmm2
801 movdqa 32(%eax, %edi), %xmm3
803 palignr $10, %xmm2, %xmm3
804 palignr $10, %xmm1, %xmm2
806 movdqa %xmm2, -32(%edx, %edi)
807 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
811 movdqa 16(%eax, %edi), %xmm2
813 movdqa 32(%eax, %edi), %xmm3
815 palignr $10, %xmm2, %xmm3
816 palignr $10, %xmm4, %xmm2
818 movdqa %xmm2, -32(%edx, %edi)
819 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 10, then finish through the forward tail table,
   indexed by ECX.  */
827 lea 10(%edi, %eax), %eax
829 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-11 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $11 to splice every chunk with the chunk
   before it: the result is bytes 11..15 of the earlier chunk followed by
   the low 11 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
833 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
842 movdqa 16(%eax, %edi), %xmm2
844 movdqa 32(%eax, %edi), %xmm3
846 palignr $11, %xmm2, %xmm3
847 palignr $11, %xmm1, %xmm2
849 movdqa %xmm2, -32(%edx, %edi)
850 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
854 movdqa 16(%eax, %edi), %xmm2
856 movdqa 32(%eax, %edi), %xmm3
858 palignr $11, %xmm2, %xmm3
859 palignr $11, %xmm4, %xmm2
861 movdqa %xmm2, -32(%edx, %edi)
862 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 11, then finish through the forward tail table,
   indexed by ECX.  */
870 lea 11(%edi, %eax), %eax
872 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-12 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $12 to splice every chunk with the chunk
   before it: the result is bytes 12..15 of the earlier chunk followed by
   the low 12 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
876 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
885 movdqa 16(%eax, %edi), %xmm2
887 movdqa 32(%eax, %edi), %xmm3
889 palignr $12, %xmm2, %xmm3
890 palignr $12, %xmm1, %xmm2
892 movdqa %xmm2, -32(%edx, %edi)
893 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
897 movdqa 16(%eax, %edi), %xmm2
899 movdqa 32(%eax, %edi), %xmm3
901 palignr $12, %xmm2, %xmm3
902 palignr $12, %xmm4, %xmm2
904 movdqa %xmm2, -32(%edx, %edi)
905 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 12, then finish through the forward tail table,
   indexed by ECX.  */
913 lea 12(%edi, %eax), %eax
915 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-13 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $13 to splice every chunk with the chunk
   before it: the result is bytes 13..15 of the earlier chunk followed by
   the low 13 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
919 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
928 movdqa 16(%eax, %edi), %xmm2
930 movdqa 32(%eax, %edi), %xmm3
932 palignr $13, %xmm2, %xmm3
933 palignr $13, %xmm1, %xmm2
935 movdqa %xmm2, -32(%edx, %edi)
936 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
940 movdqa 16(%eax, %edi), %xmm2
942 movdqa 32(%eax, %edi), %xmm3
944 palignr $13, %xmm2, %xmm3
945 palignr $13, %xmm4, %xmm2
947 movdqa %xmm2, -32(%edx, %edi)
948 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 13, then finish through the forward tail table,
   indexed by ECX.  */
956 lea 13(%edi, %eax), %eax
958 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-14 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $14 to splice every chunk with the chunk
   before it: the result is bytes 14..15 of the earlier chunk followed by
   the low 14 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
962 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* First step: the preceding chunk is in XMM1.  */
971 movdqa 16(%eax, %edi), %xmm2
973 movdqa 32(%eax, %edi), %xmm3
975 palignr $14, %xmm2, %xmm3
976 palignr $14, %xmm1, %xmm2
978 movdqa %xmm2, -32(%edx, %edi)
979 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
983 movdqa 16(%eax, %edi), %xmm2
985 movdqa 32(%eax, %edi), %xmm3
987 palignr $14, %xmm2, %xmm3
988 palignr $14, %xmm4, %xmm2
990 movdqa %xmm2, -32(%edx, %edi)
991 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 14, then finish through the forward tail table,
   indexed by ECX.  */
999 lea 14(%edi, %eax), %eax
1001 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Shift-by-15 copy path (only part of it is visible in this view).
   Each step loads the aligned 16-byte source chunks at EAX+EDI+16 and
   EAX+EDI+32, then uses palignr $15 to splice every chunk with the chunk
   before it: the result is byte 15 of the earlier chunk followed by the
   low 15 bytes of the later one.  The two results are written with
   aligned stores to EDX+EDI-32 and EDX+EDI-16.  */
/* Prepare EBX for the split dispatch at the end (expands to nothing in
   the absolute-table variant).  */
1006 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* XMM1 = aligned 16-byte chunk at EAX; it is the preceding chunk for
   the first palignr below.  */
1008 movaps (%eax), %xmm1
/* Unaligned 16-byte store of XMM0 to the address in ESI.
   NOTE(review): where XMM0 and ESI are set is not visible in this
   view.  */
1011 movdqu %xmm0, (%esi)
/* First step: the preceding chunk is in XMM1.  */
1015 movdqa 16(%eax, %edi), %xmm2
1017 movdqa 32(%eax, %edi), %xmm3
1019 palignr $15, %xmm2, %xmm3
1020 palignr $15, %xmm1, %xmm2
1022 movdqa %xmm2, -32(%edx, %edi)
1023 movdqa %xmm3, -16(%edx, %edi)
/* Second step: the preceding chunk is in XMM4.
   NOTE(review): the copy into XMM4 is not visible in this view.  */
1027 movdqa 16(%eax, %edi), %xmm2
1029 movdqa 32(%eax, %edi), %xmm3
1031 palignr $15, %xmm2, %xmm3
1032 palignr $15, %xmm4, %xmm2
1034 movdqa %xmm2, -32(%edx, %edi)
1035 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 15, then finish through the forward tail table,
   indexed by ECX.  */
1043 lea 15(%edi, %eax), %eax
1045 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Forward tail ladder for lengths that are a multiple of 4.
   Entering at L(fwd_write_Nbytes) copies one dword at offset -N from
   EAX to offset -N from EDX and falls through to the next lower rung;
   ECX is the scratch register.  The negative offsets mean EAX and EDX
   are expected to point just past the end of source and destination.
   NOTE(review): the instructions under the 8- and 4-byte labels are not
   visible in this view.  */
1049 L(fwd_write_44bytes):
1050 movl -44(%eax), %ecx
1051 movl %ecx, -44(%edx)
1052 L(fwd_write_40bytes):
1053 movl -40(%eax), %ecx
1054 movl %ecx, -40(%edx)
1055 L(fwd_write_36bytes):
1056 movl -36(%eax), %ecx
1057 movl %ecx, -36(%edx)
1058 L(fwd_write_32bytes):
1059 movl -32(%eax), %ecx
1060 movl %ecx, -32(%edx)
1061 L(fwd_write_28bytes):
1062 movl -28(%eax), %ecx
1063 movl %ecx, -28(%edx)
1064 L(fwd_write_24bytes):
1065 movl -24(%eax), %ecx
1066 movl %ecx, -24(%edx)
1067 L(fwd_write_20bytes):
1068 movl -20(%eax), %ecx
1069 movl %ecx, -20(%edx)
1070 L(fwd_write_16bytes):
1071 movl -16(%eax), %ecx
1072 movl %ecx, -16(%edx)
1073 L(fwd_write_12bytes):
1074 movl -12(%eax), %ecx
1075 movl %ecx, -12(%edx)
1076 L(fwd_write_8bytes):
1079 L(fwd_write_4bytes):
1082 L(fwd_write_0bytes):
/* Function result for non-bcopy builds: the DEST stack argument is
   loaded into EAX.  The mempcpy-specific lines and the return are not
   visible in this view.  */
1083 #ifndef USE_AS_BCOPY
1084 # ifdef USE_AS_MEMPCPY
1087 movl DEST(%esp), %eax
/* Entry for a 5-byte forward tail.
   NOTE(review): the copy instructions under this label are not visible
   in this view.  Non-bcopy builds then load the DEST stack argument
   into EAX as the function result.  */
1093 L(fwd_write_5bytes):
1098 #ifndef USE_AS_BCOPY
1099 # ifdef USE_AS_MEMPCPY
1102 movl DEST(%esp), %eax
/* Forward tail ladder for lengths of the form 4k + 1.
   Entering at L(fwd_write_Nbytes) copies one dword at offset -N from
   EAX to offset -N from EDX and falls through to the next lower rung;
   ECX is the scratch register.  The final byte is read zero-extended at
   L(fwd_write_1bytes).
   NOTE(review): the instructions under the 9-byte label and the store
   of the final byte are not visible in this view.  */
1108 L(fwd_write_45bytes):
1109 movl -45(%eax), %ecx
1110 movl %ecx, -45(%edx)
1111 L(fwd_write_41bytes):
1112 movl -41(%eax), %ecx
1113 movl %ecx, -41(%edx)
1114 L(fwd_write_37bytes):
1115 movl -37(%eax), %ecx
1116 movl %ecx, -37(%edx)
1117 L(fwd_write_33bytes):
1118 movl -33(%eax), %ecx
1119 movl %ecx, -33(%edx)
1120 L(fwd_write_29bytes):
1121 movl -29(%eax), %ecx
1122 movl %ecx, -29(%edx)
1123 L(fwd_write_25bytes):
1124 movl -25(%eax), %ecx
1125 movl %ecx, -25(%edx)
1126 L(fwd_write_21bytes):
1127 movl -21(%eax), %ecx
1128 movl %ecx, -21(%edx)
1129 L(fwd_write_17bytes):
1130 movl -17(%eax), %ecx
1131 movl %ecx, -17(%edx)
1132 L(fwd_write_13bytes):
1133 movl -13(%eax), %ecx
1134 movl %ecx, -13(%edx)
1135 L(fwd_write_9bytes):
1140 L(fwd_write_1bytes):
1141 movzbl -1(%eax), %ecx
/* Function result for non-bcopy builds: the DEST stack argument is
   loaded into EAX.  */
1143 #ifndef USE_AS_BCOPY
1144 # ifdef USE_AS_MEMPCPY
1147 movl DEST(%esp), %eax
/* Forward tail ladder for lengths of the form 4k + 2.
   Entering at L(fwd_write_Nbytes) copies one dword at offset -N from
   EAX to offset -N from EDX and falls through to the next lower rung;
   ECX is the scratch register.  The final two bytes are read
   zero-extended at L(fwd_write_2bytes).
   NOTE(review): the instructions under the 6-byte label and the store
   of the final word are not visible in this view.  */
1153 L(fwd_write_46bytes):
1154 movl -46(%eax), %ecx
1155 movl %ecx, -46(%edx)
1156 L(fwd_write_42bytes):
1157 movl -42(%eax), %ecx
1158 movl %ecx, -42(%edx)
1159 L(fwd_write_38bytes):
1160 movl -38(%eax), %ecx
1161 movl %ecx, -38(%edx)
1162 L(fwd_write_34bytes):
1163 movl -34(%eax), %ecx
1164 movl %ecx, -34(%edx)
1165 L(fwd_write_30bytes):
1166 movl -30(%eax), %ecx
1167 movl %ecx, -30(%edx)
1168 L(fwd_write_26bytes):
1169 movl -26(%eax), %ecx
1170 movl %ecx, -26(%edx)
1171 L(fwd_write_22bytes):
1172 movl -22(%eax), %ecx
1173 movl %ecx, -22(%edx)
1174 L(fwd_write_18bytes):
1175 movl -18(%eax), %ecx
1176 movl %ecx, -18(%edx)
1177 L(fwd_write_14bytes):
1178 movl -14(%eax), %ecx
1179 movl %ecx, -14(%edx)
1180 L(fwd_write_10bytes):
1181 movl -10(%eax), %ecx
1182 movl %ecx, -10(%edx)
1183 L(fwd_write_6bytes):
1186 L(fwd_write_2bytes):
1187 movzwl -2(%eax), %ecx
/* Function result for non-bcopy builds: the DEST stack argument is
   loaded into EAX.  */
1189 #ifndef USE_AS_BCOPY
1190 # ifdef USE_AS_MEMPCPY
1193 movl DEST(%esp), %eax
/* Forward tail ladder for lengths of the form 4k + 3.
   Entering at L(fwd_write_Nbytes) copies one dword at offset -N from
   EAX to offset -N from EDX and falls through to the next lower rung;
   ECX is the scratch register.  At L(fwd_write_3bytes) the last three
   bytes are read as a word (into ECX) plus a byte; the byte load
   overwrites EAX, so the source pointer is no longer available after
   it.
   NOTE(review): the instructions under the 7-byte label and the stores
   of the final word and byte are not visible in this view.  */
1199 L(fwd_write_47bytes):
1200 movl -47(%eax), %ecx
1201 movl %ecx, -47(%edx)
1202 L(fwd_write_43bytes):
1203 movl -43(%eax), %ecx
1204 movl %ecx, -43(%edx)
1205 L(fwd_write_39bytes):
1206 movl -39(%eax), %ecx
1207 movl %ecx, -39(%edx)
1208 L(fwd_write_35bytes):
1209 movl -35(%eax), %ecx
1210 movl %ecx, -35(%edx)
1211 L(fwd_write_31bytes):
1212 movl -31(%eax), %ecx
1213 movl %ecx, -31(%edx)
1214 L(fwd_write_27bytes):
1215 movl -27(%eax), %ecx
1216 movl %ecx, -27(%edx)
1217 L(fwd_write_23bytes):
1218 movl -23(%eax), %ecx
1219 movl %ecx, -23(%edx)
1220 L(fwd_write_19bytes):
1221 movl -19(%eax), %ecx
1222 movl %ecx, -19(%edx)
1223 L(fwd_write_15bytes):
1224 movl -15(%eax), %ecx
1225 movl %ecx, -15(%edx)
1226 L(fwd_write_11bytes):
1227 movl -11(%eax), %ecx
1228 movl %ecx, -11(%edx)
1229 L(fwd_write_7bytes):
1232 L(fwd_write_3bytes):
1233 movzwl -3(%eax), %ecx
1234 movzbl -1(%eax), %eax
/* Function result for non-bcopy builds: the DEST stack argument is
   loaded into EAX.  */
1237 #ifndef USE_AS_BCOPY
1238 # ifdef USE_AS_MEMPCPY
1241 movl DEST(%esp), %eax
/* Large copy with non-temporal stores (only part of it is visible in
   this view).  Source data is read with unaligned loads (movdqu) and
   written with movntdq, which needs a 16-byte aligned destination and
   carries a non-temporal hint.  */
/* Preamble: one 16-byte chunk from EAX goes to EDX non-temporally and
   XMM0 is stored unaligned at ESI.
   NOTE(review): where XMM0 and ESI are set, and the test feeding the je,
   are not visible in this view.  */
1248 movdqu (%eax), %xmm1
1250 movdqu %xmm0, (%esi)
1251 movntdq %xmm1, (%edx)
1254 je L(copy_page_by_rep)
1255 L(large_page_loop_init):
/* ECX -= 0x90.  */
1257 lea -0x90(%ecx), %ecx
/* Loop body: 128 bytes per iteration.  Both pointers are advanced with
   lea, which leaves the flags untouched, so the jae below tests flags
   set by an instruction that is not visible in this view.  */
1260 prefetchnta 0x1c0(%eax)
1261 prefetchnta 0x280(%eax)
1262 movdqu (%eax), %xmm0
1263 movdqu 0x10(%eax), %xmm1
1264 movdqu 0x20(%eax), %xmm2
1265 movdqu 0x30(%eax), %xmm3
1266 movdqu 0x40(%eax), %xmm4
1267 movdqu 0x50(%eax), %xmm5
1268 movdqu 0x60(%eax), %xmm6
1269 movdqu 0x70(%eax), %xmm7
1270 lea 0x80(%eax), %eax
1273 movntdq %xmm0, (%edx)
1274 movntdq %xmm1, 0x10(%edx)
1275 movntdq %xmm2, 0x20(%edx)
1276 movntdq %xmm3, 0x30(%edx)
1277 movntdq %xmm4, 0x40(%edx)
1278 movntdq %xmm5, 0x50(%edx)
1279 movntdq %xmm6, 0x60(%edx)
1280 movntdq %xmm7, 0x70(%edx)
1281 lea 0x80(%edx), %edx
1282 jae L(large_page_loop)
/* After the loop: ECX += 0x80, then an optional 64-byte piece.
   NOTE(review): the comparison feeding the jl is not visible in this
   view.  */
1284 lea 0x80(%ecx), %ecx
1285 jl L(large_page_less_64bytes)
1287 movdqu (%eax), %xmm0
1288 movdqu 0x10(%eax), %xmm1
1289 movdqu 0x20(%eax), %xmm2
1290 movdqu 0x30(%eax), %xmm3
1291 lea 0x40(%eax), %eax
1293 movntdq %xmm0, (%edx)
1294 movntdq %xmm1, 0x10(%edx)
1295 movntdq %xmm2, 0x20(%edx)
1296 movntdq %xmm3, 0x30(%edx)
1297 lea 0x40(%edx), %edx
/* Optional 32-byte piece.  */
1299 L(large_page_less_64bytes):
1301 jl L(large_page_less_32bytes)
1302 movdqu (%eax), %xmm0
1303 movdqu 0x10(%eax), %xmm1
1304 lea 0x20(%eax), %eax
1305 movntdq %xmm0, (%edx)
1306 movntdq %xmm1, 0x10(%edx)
1307 lea 0x20(%edx), %edx
1309 L(large_page_less_32bytes):
/* Finish through the forward tail table, indexed by ECX.  */
1313 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Copy path reached from the bulk loops via L(copy_page_by_rep).
   NOTE(review): the instructions that perform the copy and the tests
   that feed the jz/jb branches are not visible in this view, so only the
   control-flow skeleton can be described: two early exits to
   L(copy_page_by_rep_exit) and one branch to
   L(copy_page_by_rep_left_1).  */
1316 L(copy_page_by_rep):
1323 jz L(copy_page_by_rep_exit)
1325 jb L(copy_page_by_rep_left_1)
1331 jz L(copy_page_by_rep_exit)
1332 L(copy_page_by_rep_left_1):
1335 L(copy_page_by_rep_exit):
/* Function result for non-bcopy builds: EAX = DEST argument.  mempcpy
   builds additionally load the LEN argument into ECX.
   NOTE(review): presumably LEN is then added to EAX; that instruction
   is not visible in this view.  */
1338 #ifndef USE_AS_BCOPY
1339 movl DEST(%esp), %eax
1340 # ifdef USE_AS_MEMPCPY
1341 movl LEN(%esp), %ecx
/* Entry labels of the backward tail ladder for lengths 44, 40, ..., 12.
   They are referenced from L(table_48_bytes_bwd).
   NOTE(review): the copy instructions under each label are not visible
   in this view.  */
1348 L(bk_write_44bytes):
1351 L(bk_write_40bytes):
1354 L(bk_write_36bytes):
1357 L(bk_write_32bytes):
1360 L(bk_write_28bytes):
1363 L(bk_write_24bytes):
1366 L(bk_write_20bytes):
1369 L(bk_write_16bytes):
1372 L(bk_write_12bytes):
/* Function result for non-bcopy builds: EAX = DEST argument.  mempcpy
   builds additionally load the LEN argument into ECX.  */
1382 #ifndef USE_AS_BCOPY
1383 movl DEST(%esp), %eax
1384 # ifdef USE_AS_MEMPCPY
1385 movl LEN(%esp), %ecx
/* Entry labels of the backward tail ladder for lengths 45, 41, ..., 13.
   They are referenced from L(table_48_bytes_bwd).
   NOTE(review): the copy instructions under each label are not visible
   in this view.  */
1392 L(bk_write_45bytes):
1395 L(bk_write_41bytes):
1398 L(bk_write_37bytes):
1401 L(bk_write_33bytes):
1404 L(bk_write_29bytes):
1407 L(bk_write_25bytes):
1410 L(bk_write_21bytes):
1413 L(bk_write_17bytes):
1416 L(bk_write_13bytes):
/* Function result for non-bcopy builds: EAX = DEST argument.  mempcpy
   builds additionally load the LEN argument into ECX.  */
1428 #ifndef USE_AS_BCOPY
1429 movl DEST(%esp), %eax
1430 # ifdef USE_AS_MEMPCPY
1431 movl LEN(%esp), %ecx
/* Entry labels of the backward tail ladder for lengths 46, 42, ..., 10.
   They are referenced from L(table_48_bytes_bwd).
   NOTE(review): the copy instructions under each label are not visible
   in this view.  */
1438 L(bk_write_46bytes):
1441 L(bk_write_42bytes):
1444 L(bk_write_38bytes):
1447 L(bk_write_34bytes):
1450 L(bk_write_30bytes):
1453 L(bk_write_26bytes):
1456 L(bk_write_22bytes):
1459 L(bk_write_18bytes):
1462 L(bk_write_14bytes):
1465 L(bk_write_10bytes):
/* Function result for non-bcopy builds: EAX = DEST argument.  mempcpy
   builds additionally load the LEN argument into ECX.  */
1474 #ifndef USE_AS_BCOPY
1475 movl DEST(%esp), %eax
1476 # ifdef USE_AS_MEMPCPY
1477 movl LEN(%esp), %ecx
/* Entry labels of the backward tail ladder for lengths 47, 43, ..., 11.
   They are referenced from L(table_48_bytes_bwd).
   NOTE(review): the copy instructions under each label are not visible
   in this view.  */
1484 L(bk_write_47bytes):
1487 L(bk_write_43bytes):
1490 L(bk_write_39bytes):
1493 L(bk_write_35bytes):
1496 L(bk_write_31bytes):
1499 L(bk_write_27bytes):
1502 L(bk_write_23bytes):
1505 L(bk_write_19bytes):
1508 L(bk_write_15bytes):
1511 L(bk_write_11bytes):
/* Zero-extended word load from offset 1 of EAX into ECX.
   NOTE(review): the label this belongs to and the matching store are
   not visible in this view.  */
1518 movzwl 1(%eax), %ecx
/* Function result for non-bcopy builds: EAX = DEST argument.  mempcpy
   builds additionally load the LEN argument into ECX.  */
1522 #ifndef USE_AS_BCOPY
1523 movl DEST(%esp), %eax
1524 # ifdef USE_AS_MEMPCPY
1525 movl LEN(%esp), %ecx
/* Forward tail dispatch table, placed in section .rodata.ssse3.
   Entry N (N = 0..47) is JMPTBL (L(fwd_write_Nbytes), table): an offset
   relative to the table or the label itself, depending on which JMPTBL
   definition is active.  Entries are 4 bytes (.int), matching the scale
   of 4 used by the dispatch macros.  */
1532 .pushsection .rodata.ssse3,"a",@progbits
1534 L(table_48bytes_fwd):
1535 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1536 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1537 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1538 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1539 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1540 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1541 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1542 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1543 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1544 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1545 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1546 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1547 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1548 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1549 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1550 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1551 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1552 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1553 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1554 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1555 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1556 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1557 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1558 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1567 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1568 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1569 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1570 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1571 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1572 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1573 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1574 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1575 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1576 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1577 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1578 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1579 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1580 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1581 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1582 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
/* Entries of the shift dispatch table: entry N (N = 0..15) is
   JMPTBL (L(shl_N), L(shl_table)), selecting the copy path that uses a
   palignr shift of N bytes.
   NOTE(review): the L(shl_table) label line itself is not visible in
   this view.  */
1586 .int JMPTBL (L(shl_0), L(shl_table))
1587 .int JMPTBL (L(shl_1), L(shl_table))
1588 .int JMPTBL (L(shl_2), L(shl_table))
1589 .int JMPTBL (L(shl_3), L(shl_table))
1590 .int JMPTBL (L(shl_4), L(shl_table))
1591 .int JMPTBL (L(shl_5), L(shl_table))
1592 .int JMPTBL (L(shl_6), L(shl_table))
1593 .int JMPTBL (L(shl_7), L(shl_table))
1594 .int JMPTBL (L(shl_8), L(shl_table))
1595 .int JMPTBL (L(shl_9), L(shl_table))
1596 .int JMPTBL (L(shl_10), L(shl_table))
1597 .int JMPTBL (L(shl_11), L(shl_table))
1598 .int JMPTBL (L(shl_12), L(shl_table))
1599 .int JMPTBL (L(shl_13), L(shl_table))
1600 .int JMPTBL (L(shl_14), L(shl_table))
1601 .int JMPTBL (L(shl_15), L(shl_table))
/* Backward tail dispatch table.  Entry N (N = 0..47) is
   JMPTBL (L(bk_write_Nbytes), table): an offset relative to the table or
   the label itself, depending on which JMPTBL definition is active.
   Entries are 4 bytes (.int), matching the scale of 4 used by the
   dispatch macros.  */
1604 L(table_48_bytes_bwd):
1605 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1606 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1607 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1608 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1609 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1610 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1611 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1612 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1613 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1614 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1615 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1616 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1617 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1618 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1619 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1620 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1621 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1622 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1623 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1624 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1625 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1626 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1627 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1628 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1637 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1638 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1639 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1640 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1641 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1642 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1643 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1644 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1645 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1646 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1647 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1648 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1649 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1650 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1651 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1652 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
/* memmove-only backward copy.  Both pointers are moved to the end of
   their buffers: EDX = EDX + ECX and ESI = ESI + ECX.  lea is used, so
   the flags are not changed by these two instructions.
   NOTE(review): the comparison feeding the jge is not visible in this
   view; by the label name it selects the path for more than 64
   bytes.  */
1656 #ifdef USE_AS_MEMMOVE
1661 lea (%ecx,%edx,1),%edx
1662 lea (%ecx,%esi,1),%esi
1668 jge L(bk_write_more64bytes)
/* Backward copy of a 32-byte piece with dword moves, working from
   offset -12 down to offset -32 relative to ESI (source) and EDX
   (destination), using EAX as scratch.
   NOTE(review): the comparison feeding the jl, the moves for offsets -4
   and -8, and the pointer/count updates are not visible in this
   view.  */
1670 L(bk_write_64bytesless):
1672 jl L(bk_write_less32bytes)
1674 L(bk_write_more32bytes):
1675 /* Copy 32 bytes at a time. */
1681 movl -12(%esi), %eax
1682 movl %eax, -12(%edx)
1683 movl -16(%esi), %eax
1684 movl %eax, -16(%edx)
1685 movl -20(%esi), %eax
1686 movl %eax, -20(%edx)
1687 movl -24(%esi), %eax
1688 movl %eax, -24(%edx)
1689 movl -28(%esi), %eax
1690 movl %eax, -28(%edx)
1691 movl -32(%esi), %eax
1692 movl %eax, -32(%edx)
/* Backward short-copy dispatch: the remaining bytes are handled through
   the backward tail table, indexed by ECX with scale 4.
   NOTE(review): the instructions between the two labels and the
   comparison feeding the jle are not visible in this view.  */
1696 L(bk_write_less32bytes):
1701 L(bk_write_less32bytes_2):
1702 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1707 jle L(bk_write_less32bytes)
1709 /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
1710 then (EDX & 2) must be != 0. */
/* Alignment handling before the 16-byte backward copy.  Each jz goes to
   L(bk_ssse3_cpy_pre) once the tested alignment condition holds.
   NOTE(review): the test instructions and the small copies between the
   branches are not visible in this view.  */
1730 L(bk_write_more64bytes):
1731 /* Check alignment of last byte. */
1733 jz L(bk_ssse3_cpy_pre)
1735 /* EDX is aligned 4 bytes, but not 16 bytes. */
1744 jz L(bk_ssse3_cpy_pre)
1753 jz L(bk_ssse3_cpy_pre)
/* Backward copy of a 64-byte piece, highest 16-byte chunk first.  The
   source (ESI) is read with unaligned loads; the destination (EDX) is
   written with movdqa and therefore must be 16-byte aligned here.
   Afterwards control returns to L(bk_write_64bytesless) for the rest.
   NOTE(review): the comparison feeding the jl and the pointer/count
   updates around the copy are not visible in this view.  */
1761 L(bk_ssse3_cpy_pre):
1763 jl L(bk_write_more32bytes)
1769 movdqu 0x30(%esi), %xmm3
1770 movdqa %xmm3, 0x30(%edx)
1771 movdqu 0x20(%esi), %xmm2
1772 movdqa %xmm2, 0x20(%edx)
1773 movdqu 0x10(%esi), %xmm1
1774 movdqa %xmm1, 0x10(%edx)
1775 movdqu (%esi), %xmm0
1776 movdqa %xmm0, (%edx)
1779 jmp L(bk_write_64bytesless)