2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #if !defined NOT_IN_libc \
25 || defined USE_AS_MEMMOVE \
26 || !defined USE_MULTIARCH)
28 #include "asm-syntax.h"
31 # define MEMCPY __memcpy_ssse3
32 # define MEMCPY_CHK __memcpy_chk_ssse3
/* Stack helpers with matching unwind (CFI) annotations.
   CFI_PUSH records a 4-byte growth of the frame and notes that REG is
   saved at offset 0; CFI_POP records the matching 4-byte shrink.
   PUSH and POP pair pushl/popl with those annotations.  */
45 #define CFI_PUSH(REG) \
46 cfi_adjust_cfa_offset (4); \
47 cfi_rel_offset (REG, 0)
49 #define CFI_POP(REG) \
50 cfi_adjust_cfa_offset (-4); \
53 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
54 #define POP(REG) popl REG; CFI_POP (REG)
/* Macro set that derives addresses from the PC held in EBX.
   ENTRANCE pushes EBX and RETURN_END pops it before the ret; PARMS is 8
   (presumably return address plus the saved EBX - confirm where PARMS is
   used).  RETURN repeats the CFI_PUSH annotation for EBX after the ret.
   JMPTBL stores each entry as the offset of label I from table base B.  */
57 # define PARMS 8 /* Preserve EBX. */
58 # define ENTRANCE PUSH (%ebx);
59 # define RETURN_END POP (%ebx); ret
60 # define RETURN RETURN_END; CFI_PUSH (%ebx)
61 # define JMPTBL(I, B) I - B
63 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
64 jump table with relative offsets. INDEX is a register that contains the
65 index into the jump table. SCALE is the scale of INDEX. */
66 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
67 /* We first load PC into EBX. */ \
68 call __i686.get_pc_thunk.bx; \
69 /* Get the address of the jump table. */ \
70 addl $(TABLE - .), %ebx; \
71 /* Get the entry and convert the relative offset to the \
72 absolute address. */ \
73 addl (%ebx,INDEX,SCALE), %ebx; \
74 /* We loaded the jump table. Go. */ \
77 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
78 addl $(TABLE - .), %ebx
/* BRANCH_TO_JMPTBL_ENTRY_VALUE (above) and BRANCH_TO_JMPTBL_ENTRY_TAIL
   (below) split the sequence in two: VALUE adds the table's offset to EBX,
   TAIL adds the selected table entry to EBX.
   NOTE(review): VALUE appears to expect EBX to already hold the PC -
   confirm at the call sites.  */
80 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
81 addl (%ebx,INDEX,SCALE), %ebx; \
82 /* We loaded the jump table. Go. */ \
85 .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
/* The thunk symbol is global but hidden, has function type, and is placed
   in its own .gnu.linkonce section.  BRANCH_TO_JMPTBL_ENTRY calls it to
   get the PC into EBX.  */
86 .globl __i686.get_pc_thunk.bx
87 .hidden __i686.get_pc_thunk.bx
89 .type __i686.get_pc_thunk.bx,@function
90 __i686.get_pc_thunk.bx:
/* Macro set for jump tables that hold absolute addresses: returning is a
   bare ret and JMPTBL yields the label I itself (B is unused).  */
96 # define RETURN_END ret
97 # define RETURN RETURN_END
98 # define JMPTBL(I, B) I
100 /* Branch to an entry in a jump table. TABLE is a jump table with
101 absolute offsets. INDEX is a register that contains the index into the
102 jump table. SCALE is the scale of INDEX. */
103 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
104 jmp *TABLE(,INDEX,SCALE)
/* No table base has to be prepared in this variant, so VALUE is empty.  */
106 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
/* TAIL is the same indirect jump as BRANCH_TO_JMPTBL_ENTRY.  */
108 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
109 jmp *TABLE(,INDEX,SCALE)
112 .section .text.ssse3,"ax",@progbits
/* The code that follows is assembled into the .text.ssse3 section.  */
113 #if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
/* Control goes to __chk_fail when the preceding compare reports an
   unsigned "below" result.  */
117 jb HIDDEN_JUMPTARGET (__chk_fail)
/* EDX = the DEST stack argument.  */
124 movl DEST(%esp), %edx
126 #ifdef USE_AS_MEMMOVE
129 je L(fwd_write_0bytes)
132 jmp L(bk_write_less32bytes_2)
144 L(fwd_write_less32bytes):
145 #ifndef USE_AS_MEMMOVE
/* Dispatch through the forward tail table, indexed by ECX * 4.  */
151 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
152 #ifndef USE_AS_MEMMOVE
/* Dispatch through the backward tail table, indexed by ECX * 4.  */
154 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
158 /* ECX > 32 and EDX is 4 byte aligned. */
/* Compare ECX with half the shared cache size.  Three forms are visible:
   a compile-time constant, a GOT-relative load through EBX, and a direct
   reference to __x86_shared_cache_size_half.  */
172 #ifdef SHARED_CACHE_SIZE_HALF
173 cmp $SHARED_CACHE_SIZE_HALF, %ecx
/* EBX = PC from the thunk, then rebased to the GOT.  */
176 call __i686.get_pc_thunk.bx
177 add $_GLOBAL_OFFSET_TABLE_, %ebx
178 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
180 cmp __x86_shared_cache_size_half, %ecx
/* Dispatch through the shift table, indexed by EDI * 4.  */
189 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
/* Four identical groups follow.  Each loads 32 bytes at EAX+EDI into
   XMM0/XMM1 and stores them at EDX+EDI.  All accesses use movdqa, so both
   addresses must be 16-byte aligned.  */
202 movdqa (%eax, %edi), %xmm0
203 movdqa 16(%eax, %edi), %xmm1
205 movdqa %xmm0, (%edx, %edi)
206 movdqa %xmm1, 16(%edx, %edi)
210 movdqa (%eax, %edi), %xmm0
211 movdqa 16(%eax, %edi), %xmm1
213 movdqa %xmm0, (%edx, %edi)
214 movdqa %xmm1, 16(%edx, %edi)
218 movdqa (%eax, %edi), %xmm0
219 movdqa 16(%eax, %edi), %xmm1
221 movdqa %xmm0, (%edx, %edi)
222 movdqa %xmm1, 16(%edx, %edi)
226 movdqa (%eax, %edi), %xmm0
227 movdqa 16(%eax, %edi), %xmm1
229 movdqa %xmm0, (%edx, %edi)
230 movdqa %xmm1, 16(%edx, %edi)
/* Finish through the forward tail table, indexed by ECX * 4.  */
238 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
243 #ifdef DATA_CACHE_SIZE_HALF
/* Compare ECX with half the data cache size: compile-time constant here,
   GOT-relative load or direct reference in the forms below.  */
244 cmp $DATA_CACHE_SIZE_HALF, %ecx
/* EBX = PC from the thunk, then rebased to the GOT.  */
247 call __i686.get_pc_thunk.bx
248 add $_GLOBAL_OFFSET_TABLE_, %ebx
249 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
251 cmp __x86_data_cache_size_half, %ecx
/* Take the prefetching loop on an unsigned above-or-equal result;
   otherwise fall through to the loop without prefetch.  */
257 jae L(shl_0_gobble_mem_loop)
258 L(shl_0_gobble_cache_loop):
/* Loop body: aligned 16-byte loads from EAX+0x10 .. EAX+0x70 into
   XMM1-XMM7, then aligned stores to the same offsets from EDX.  */
260 movdqa 0x10(%eax), %xmm1
261 movdqa 0x20(%eax), %xmm2
262 movdqa 0x30(%eax), %xmm3
263 movdqa 0x40(%eax), %xmm4
264 movdqa 0x50(%eax), %xmm5
265 movdqa 0x60(%eax), %xmm6
266 movdqa 0x70(%eax), %xmm7
270 movdqa %xmm1, 0x10(%edx)
271 movdqa %xmm2, 0x20(%edx)
272 movdqa %xmm3, 0x30(%edx)
273 movdqa %xmm4, 0x40(%edx)
274 movdqa %xmm5, 0x50(%edx)
275 movdqa %xmm6, 0x60(%edx)
276 movdqa %xmm7, 0x70(%edx)
/* Loop again on an unsigned above-or-equal result.  */
279 jae L(shl_0_gobble_cache_loop)
/* Signed "less" result skips the 64-byte step.  */
282 jl L(shl_0_cache_less_64bytes)
/* 64-byte step, again with aligned moves between EAX and EDX.  */
286 movdqa 0x10(%eax), %xmm1
289 movdqa %xmm1, 0x10(%edx)
291 movdqa 0x20(%eax), %xmm0
292 movdqa 0x30(%eax), %xmm1
295 movdqa %xmm0, 0x20(%edx)
296 movdqa %xmm1, 0x30(%edx)
298 L(shl_0_cache_less_64bytes):
300 jb L(shl_0_cache_less_32bytes)
/* 32-byte step.  */
303 movdqa 0x10(%eax), %xmm1
306 movdqa %xmm1, 0x10(%edx)
308 L(shl_0_cache_less_32bytes):
310 jb L(shl_0_cache_less_16bytes)
316 L(shl_0_cache_less_16bytes):
/* Finish through the forward tail table, indexed by ECX * 4.  */
319 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
323 L(shl_0_gobble_mem_loop):
/* Prefetch ahead of the current position: two lines of the source (EAX)
   and one of the destination (EDX).  */
324 prefetcht0 0x1c0(%eax)
325 prefetcht0 0x280(%eax)
326 prefetcht0 0x1c0(%edx)
/* Loop body: aligned 16-byte loads from EAX+0x10 .. EAX+0x70 into
   XMM1-XMM7, then aligned stores to the same offsets from EDX.  */
329 movdqa 0x10(%eax), %xmm1
330 movdqa 0x20(%eax), %xmm2
331 movdqa 0x30(%eax), %xmm3
332 movdqa 0x40(%eax), %xmm4
333 movdqa 0x50(%eax), %xmm5
334 movdqa 0x60(%eax), %xmm6
335 movdqa 0x70(%eax), %xmm7
339 movdqa %xmm1, 0x10(%edx)
340 movdqa %xmm2, 0x20(%edx)
341 movdqa %xmm3, 0x30(%edx)
342 movdqa %xmm4, 0x40(%edx)
343 movdqa %xmm5, 0x50(%edx)
344 movdqa %xmm6, 0x60(%edx)
345 movdqa %xmm7, 0x70(%edx)
/* Loop again on an unsigned above-or-equal result.  */
348 jae L(shl_0_gobble_mem_loop)
/* Signed "less" result skips the 64-byte step.  */
351 jl L(shl_0_mem_less_64bytes)
/* 64-byte step, again with aligned moves between EAX and EDX.  */
355 movdqa 0x10(%eax), %xmm1
358 movdqa %xmm1, 0x10(%edx)
360 movdqa 0x20(%eax), %xmm0
361 movdqa 0x30(%eax), %xmm1
364 movdqa %xmm0, 0x20(%edx)
365 movdqa %xmm1, 0x30(%edx)
367 L(shl_0_mem_less_64bytes):
369 jb L(shl_0_mem_less_32bytes)
/* 32-byte step.  */
372 movdqa 0x10(%eax), %xmm1
375 movdqa %xmm1, 0x10(%edx)
377 L(shl_0_mem_less_32bytes):
379 jb L(shl_0_mem_less_16bytes)
385 L(shl_0_mem_less_16bytes):
/* Finish through the forward tail table, indexed by ECX * 4.  */
388 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
394 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-1 copy fragment (presumably the L(shl_1) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $1 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   1 byte in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
403 movdqa 16(%eax, %edi), %xmm2
405 movdqa 32(%eax, %edi), %xmm3
407 palignr $1, %xmm2, %xmm3
408 palignr $1, %xmm1, %xmm2
410 movdqa %xmm2, -32(%edx, %edi)
411 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
415 movdqa 16(%eax, %edi), %xmm2
417 movdqa 32(%eax, %edi), %xmm3
419 palignr $1, %xmm2, %xmm3
420 palignr $1, %xmm4, %xmm2
422 movdqa %xmm2, -32(%edx, %edi)
423 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 1.  */
431 lea 1(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
433 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
439 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-2 copy fragment (presumably the L(shl_2) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $2 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   2 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
448 movdqa 16(%eax, %edi), %xmm2
450 movdqa 32(%eax, %edi), %xmm3
452 palignr $2, %xmm2, %xmm3
453 palignr $2, %xmm1, %xmm2
455 movdqa %xmm2, -32(%edx, %edi)
456 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
460 movdqa 16(%eax, %edi), %xmm2
462 movdqa 32(%eax, %edi), %xmm3
464 palignr $2, %xmm2, %xmm3
465 palignr $2, %xmm4, %xmm2
467 movdqa %xmm2, -32(%edx, %edi)
468 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 2.  */
476 lea 2(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
478 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
484 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-3 copy fragment (presumably the L(shl_3) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $3 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   3 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
493 movdqa 16(%eax, %edi), %xmm2
495 movdqa 32(%eax, %edi), %xmm3
497 palignr $3, %xmm2, %xmm3
498 palignr $3, %xmm1, %xmm2
500 movdqa %xmm2, -32(%edx, %edi)
501 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
505 movdqa 16(%eax, %edi), %xmm2
507 movdqa 32(%eax, %edi), %xmm3
509 palignr $3, %xmm2, %xmm3
510 palignr $3, %xmm4, %xmm2
512 movdqa %xmm2, -32(%edx, %edi)
513 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 3.  */
521 lea 3(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
523 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
529 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-4 copy fragment (presumably the L(shl_4) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $4 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   4 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
538 movdqa 16(%eax, %edi), %xmm2
540 movdqa 32(%eax, %edi), %xmm3
542 palignr $4, %xmm2, %xmm3
543 palignr $4, %xmm1, %xmm2
545 movdqa %xmm2, -32(%edx, %edi)
546 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
550 movdqa 16(%eax, %edi), %xmm2
552 movdqa 32(%eax, %edi), %xmm3
554 palignr $4, %xmm2, %xmm3
555 palignr $4, %xmm4, %xmm2
557 movdqa %xmm2, -32(%edx, %edi)
558 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 4.  */
566 lea 4(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
568 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
574 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-5 copy fragment (presumably the L(shl_5) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $5 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   5 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
583 movdqa 16(%eax, %edi), %xmm2
585 movdqa 32(%eax, %edi), %xmm3
587 palignr $5, %xmm2, %xmm3
588 palignr $5, %xmm1, %xmm2
590 movdqa %xmm2, -32(%edx, %edi)
591 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
595 movdqa 16(%eax, %edi), %xmm2
597 movdqa 32(%eax, %edi), %xmm3
599 palignr $5, %xmm2, %xmm3
600 palignr $5, %xmm4, %xmm2
602 movdqa %xmm2, -32(%edx, %edi)
603 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 5.  */
611 lea 5(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
613 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
619 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-6 copy fragment (presumably the L(shl_6) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $6 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   6 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
628 movdqa 16(%eax, %edi), %xmm2
630 movdqa 32(%eax, %edi), %xmm3
632 palignr $6, %xmm2, %xmm3
633 palignr $6, %xmm1, %xmm2
635 movdqa %xmm2, -32(%edx, %edi)
636 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
640 movdqa 16(%eax, %edi), %xmm2
642 movdqa 32(%eax, %edi), %xmm3
644 palignr $6, %xmm2, %xmm3
645 palignr $6, %xmm4, %xmm2
647 movdqa %xmm2, -32(%edx, %edi)
648 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 6.  */
656 lea 6(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
658 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
664 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-7 copy fragment (presumably the L(shl_7) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $7 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   7 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
673 movdqa 16(%eax, %edi), %xmm2
675 movdqa 32(%eax, %edi), %xmm3
677 palignr $7, %xmm2, %xmm3
678 palignr $7, %xmm1, %xmm2
680 movdqa %xmm2, -32(%edx, %edi)
681 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
685 movdqa 16(%eax, %edi), %xmm2
687 movdqa 32(%eax, %edi), %xmm3
689 palignr $7, %xmm2, %xmm3
690 palignr $7, %xmm4, %xmm2
692 movdqa %xmm2, -32(%edx, %edi)
693 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 7.  */
701 lea 7(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
703 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
709 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-8 copy fragment (presumably the L(shl_8) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $8 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   8 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
718 movdqa 16(%eax, %edi), %xmm2
720 movdqa 32(%eax, %edi), %xmm3
722 palignr $8, %xmm2, %xmm3
723 palignr $8, %xmm1, %xmm2
725 movdqa %xmm2, -32(%edx, %edi)
726 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
730 movdqa 16(%eax, %edi), %xmm2
732 movdqa 32(%eax, %edi), %xmm3
734 palignr $8, %xmm2, %xmm3
735 palignr $8, %xmm4, %xmm2
737 movdqa %xmm2, -32(%edx, %edi)
738 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 8.  */
746 lea 8(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
748 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
754 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-9 copy fragment (presumably the L(shl_9) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $9 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   9 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
763 movdqa 16(%eax, %edi), %xmm2
765 movdqa 32(%eax, %edi), %xmm3
767 palignr $9, %xmm2, %xmm3
768 palignr $9, %xmm1, %xmm2
770 movdqa %xmm2, -32(%edx, %edi)
771 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
775 movdqa 16(%eax, %edi), %xmm2
777 movdqa 32(%eax, %edi), %xmm3
779 palignr $9, %xmm2, %xmm3
780 palignr $9, %xmm4, %xmm2
782 movdqa %xmm2, -32(%edx, %edi)
783 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 9.  */
791 lea 9(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
793 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
799 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-10 copy fragment (presumably the L(shl_10) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $10 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   10 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
808 movdqa 16(%eax, %edi), %xmm2
810 movdqa 32(%eax, %edi), %xmm3
812 palignr $10, %xmm2, %xmm3
813 palignr $10, %xmm1, %xmm2
815 movdqa %xmm2, -32(%edx, %edi)
816 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
820 movdqa 16(%eax, %edi), %xmm2
822 movdqa 32(%eax, %edi), %xmm3
824 palignr $10, %xmm2, %xmm3
825 palignr $10, %xmm4, %xmm2
827 movdqa %xmm2, -32(%edx, %edi)
828 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 10.  */
836 lea 10(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
838 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
844 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-11 copy fragment (presumably the L(shl_11) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $11 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   11 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
853 movdqa 16(%eax, %edi), %xmm2
855 movdqa 32(%eax, %edi), %xmm3
857 palignr $11, %xmm2, %xmm3
858 palignr $11, %xmm1, %xmm2
860 movdqa %xmm2, -32(%edx, %edi)
861 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
865 movdqa 16(%eax, %edi), %xmm2
867 movdqa 32(%eax, %edi), %xmm3
869 palignr $11, %xmm2, %xmm3
870 palignr $11, %xmm4, %xmm2
872 movdqa %xmm2, -32(%edx, %edi)
873 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 11.  */
881 lea 11(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
883 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
889 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-12 copy fragment (presumably the L(shl_12) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $12 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   12 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
898 movdqa 16(%eax, %edi), %xmm2
900 movdqa 32(%eax, %edi), %xmm3
902 palignr $12, %xmm2, %xmm3
903 palignr $12, %xmm1, %xmm2
905 movdqa %xmm2, -32(%edx, %edi)
906 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
910 movdqa 16(%eax, %edi), %xmm2
912 movdqa 32(%eax, %edi), %xmm3
914 palignr $12, %xmm2, %xmm3
915 palignr $12, %xmm4, %xmm2
917 movdqa %xmm2, -32(%edx, %edi)
918 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 12.  */
926 lea 12(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
928 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
934 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-13 copy fragment (presumably the L(shl_13) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $13 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   13 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
943 movdqa 16(%eax, %edi), %xmm2
945 movdqa 32(%eax, %edi), %xmm3
947 palignr $13, %xmm2, %xmm3
948 palignr $13, %xmm1, %xmm2
950 movdqa %xmm2, -32(%edx, %edi)
951 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
955 movdqa 16(%eax, %edi), %xmm2
957 movdqa 32(%eax, %edi), %xmm3
959 palignr $13, %xmm2, %xmm3
960 palignr $13, %xmm4, %xmm2
962 movdqa %xmm2, -32(%edx, %edi)
963 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 13.  */
971 lea 13(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
973 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
979 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-14 copy fragment (presumably the L(shl_14) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $14 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   14 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
988 movdqa 16(%eax, %edi), %xmm2
990 movdqa 32(%eax, %edi), %xmm3
992 palignr $14, %xmm2, %xmm3
993 palignr $14, %xmm1, %xmm2
995 movdqa %xmm2, -32(%edx, %edi)
996 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
1000 movdqa 16(%eax, %edi), %xmm2
1002 movdqa 32(%eax, %edi), %xmm3
1004 palignr $14, %xmm2, %xmm3
1005 palignr $14, %xmm4, %xmm2
1007 movdqa %xmm2, -32(%edx, %edi)
1008 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 14.  */
1016 lea 14(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
1018 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1024 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Shift-by-15 copy fragment (presumably the L(shl_15) target listed in the
   shift table).  The macro above prepares EBX for the tail dispatch at the
   end; it is empty when the jump tables hold absolute addresses.  */
/* XMM1 = the aligned 16-byte block at EAX; it is the first "previous
   block" for the palignr steps below.  */
1026 movaps (%eax), %xmm1
/* Unaligned 16-byte store of XMM0 to ESI.  */
1029 movdqu %xmm0, (%esi)
/* Load two aligned 16-byte blocks at EAX+EDI.  palignr $15 joins each block
   with the one before it (XMM1 first) and keeps the 16 bytes starting
   15 bytes in.  Results go to EDX+EDI-32 and EDX+EDI-16 with aligned stores.  */
1033 movdqa 16(%eax, %edi), %xmm2
1035 movdqa 32(%eax, %edi), %xmm3
1037 palignr $15, %xmm2, %xmm3
1038 palignr $15, %xmm1, %xmm2
1040 movdqa %xmm2, -32(%edx, %edi)
1041 movdqa %xmm3, -16(%edx, %edi)
/* Same step again; here XMM4 is the block before XMM2.  */
1045 movdqa 16(%eax, %edi), %xmm2
1047 movdqa 32(%eax, %edi), %xmm3
1049 palignr $15, %xmm2, %xmm3
1050 palignr $15, %xmm4, %xmm2
1052 movdqa %xmm2, -32(%edx, %edi)
1053 movdqa %xmm3, -16(%edx, %edi)
/* EAX = EAX + EDI + 15.  */
1061 lea 15(%edi, %eax), %eax
/* Branch through the forward tail table, indexed by ECX * 4.  */
1063 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
/* Fall-through ladder for tail lengths that are multiples of 4.  Entering
   at L(fwd_write_Nbytes) copies one dword at offset -N from EAX to the same
   offset from EDX (through ECX) and drops into the next lower label.
   The negative offsets imply that EAX and EDX point just past the data.  */
1067 L(fwd_write_44bytes):
1068 movl -44(%eax), %ecx
1069 movl %ecx, -44(%edx)
1070 L(fwd_write_40bytes):
1071 movl -40(%eax), %ecx
1072 movl %ecx, -40(%edx)
1073 L(fwd_write_36bytes):
1074 movl -36(%eax), %ecx
1075 movl %ecx, -36(%edx)
1076 L(fwd_write_32bytes):
1077 movl -32(%eax), %ecx
1078 movl %ecx, -32(%edx)
1079 L(fwd_write_28bytes):
1080 movl -28(%eax), %ecx
1081 movl %ecx, -28(%edx)
1082 L(fwd_write_24bytes):
1083 movl -24(%eax), %ecx
1084 movl %ecx, -24(%edx)
1085 L(fwd_write_20bytes):
1086 movl -20(%eax), %ecx
1087 movl %ecx, -20(%edx)
1088 L(fwd_write_16bytes):
1089 movl -16(%eax), %ecx
1090 movl %ecx, -16(%edx)
1091 L(fwd_write_12bytes):
1092 movl -12(%eax), %ecx
1093 movl %ecx, -12(%edx)
1094 L(fwd_write_8bytes):
1097 L(fwd_write_4bytes):
1100 L(fwd_write_0bytes):
1101 #ifndef USE_AS_BCOPY
1102 # ifdef USE_AS_MEMPCPY
/* EAX = the DEST stack argument (presumably the return value - confirm
   against the surrounding conditionals).  */
1105 movl DEST(%esp), %eax
/* Forward tail entry for 5 bytes, reached from the forward tail table.  */
1111 L(fwd_write_5bytes):
1116 #ifndef USE_AS_BCOPY
1117 # ifdef USE_AS_MEMPCPY
/* EAX = the DEST stack argument.  */
1120 movl DEST(%esp), %eax
/* Fall-through ladder for tail lengths of the form 4k + 1.  Entering at
   L(fwd_write_Nbytes) copies one dword at offset -N from EAX to the same
   offset from EDX (through ECX) and drops into the next lower label.  */
1126 L(fwd_write_45bytes):
1127 movl -45(%eax), %ecx
1128 movl %ecx, -45(%edx)
1129 L(fwd_write_41bytes):
1130 movl -41(%eax), %ecx
1131 movl %ecx, -41(%edx)
1132 L(fwd_write_37bytes):
1133 movl -37(%eax), %ecx
1134 movl %ecx, -37(%edx)
1135 L(fwd_write_33bytes):
1136 movl -33(%eax), %ecx
1137 movl %ecx, -33(%edx)
1138 L(fwd_write_29bytes):
1139 movl -29(%eax), %ecx
1140 movl %ecx, -29(%edx)
1141 L(fwd_write_25bytes):
1142 movl -25(%eax), %ecx
1143 movl %ecx, -25(%edx)
1144 L(fwd_write_21bytes):
1145 movl -21(%eax), %ecx
1146 movl %ecx, -21(%edx)
1147 L(fwd_write_17bytes):
1148 movl -17(%eax), %ecx
1149 movl %ecx, -17(%edx)
1150 L(fwd_write_13bytes):
1151 movl -13(%eax), %ecx
1152 movl %ecx, -13(%edx)
1153 L(fwd_write_9bytes):
1158 L(fwd_write_1bytes):
/* ECX = the last source byte, zero-extended.  */
1159 movzbl -1(%eax), %ecx
1161 #ifndef USE_AS_BCOPY
1162 # ifdef USE_AS_MEMPCPY
/* EAX = the DEST stack argument.  */
1165 movl DEST(%esp), %eax
/* Fall-through ladder for tail lengths of the form 4k + 2.  Entering at
   L(fwd_write_Nbytes) copies one dword at offset -N from EAX to the same
   offset from EDX (through ECX) and drops into the next lower label.  */
1171 L(fwd_write_46bytes):
1172 movl -46(%eax), %ecx
1173 movl %ecx, -46(%edx)
1174 L(fwd_write_42bytes):
1175 movl -42(%eax), %ecx
1176 movl %ecx, -42(%edx)
1177 L(fwd_write_38bytes):
1178 movl -38(%eax), %ecx
1179 movl %ecx, -38(%edx)
1180 L(fwd_write_34bytes):
1181 movl -34(%eax), %ecx
1182 movl %ecx, -34(%edx)
1183 L(fwd_write_30bytes):
1184 movl -30(%eax), %ecx
1185 movl %ecx, -30(%edx)
1186 L(fwd_write_26bytes):
1187 movl -26(%eax), %ecx
1188 movl %ecx, -26(%edx)
1189 L(fwd_write_22bytes):
1190 movl -22(%eax), %ecx
1191 movl %ecx, -22(%edx)
1192 L(fwd_write_18bytes):
1193 movl -18(%eax), %ecx
1194 movl %ecx, -18(%edx)
1195 L(fwd_write_14bytes):
1196 movl -14(%eax), %ecx
1197 movl %ecx, -14(%edx)
1198 L(fwd_write_10bytes):
1199 movl -10(%eax), %ecx
1200 movl %ecx, -10(%edx)
1201 L(fwd_write_6bytes):
1204 L(fwd_write_2bytes):
/* ECX = the last two source bytes, zero-extended.  */
1205 movzwl -2(%eax), %ecx
1207 #ifndef USE_AS_BCOPY
1208 # ifdef USE_AS_MEMPCPY
/* EAX = the DEST stack argument.  */
1211 movl DEST(%esp), %eax
/* Fall-through ladder for tail lengths of the form 4k + 3.  Entering at
   L(fwd_write_Nbytes) copies one dword at offset -N from EAX to the same
   offset from EDX (through ECX) and drops into the next lower label.  */
1217 L(fwd_write_47bytes):
1218 movl -47(%eax), %ecx
1219 movl %ecx, -47(%edx)
1220 L(fwd_write_43bytes):
1221 movl -43(%eax), %ecx
1222 movl %ecx, -43(%edx)
1223 L(fwd_write_39bytes):
1224 movl -39(%eax), %ecx
1225 movl %ecx, -39(%edx)
1226 L(fwd_write_35bytes):
1227 movl -35(%eax), %ecx
1228 movl %ecx, -35(%edx)
1229 L(fwd_write_31bytes):
1230 movl -31(%eax), %ecx
1231 movl %ecx, -31(%edx)
1232 L(fwd_write_27bytes):
1233 movl -27(%eax), %ecx
1234 movl %ecx, -27(%edx)
1235 L(fwd_write_23bytes):
1236 movl -23(%eax), %ecx
1237 movl %ecx, -23(%edx)
1238 L(fwd_write_19bytes):
1239 movl -19(%eax), %ecx
1240 movl %ecx, -19(%edx)
1241 L(fwd_write_15bytes):
1242 movl -15(%eax), %ecx
1243 movl %ecx, -15(%edx)
1244 L(fwd_write_11bytes):
1245 movl -11(%eax), %ecx
1246 movl %ecx, -11(%edx)
1247 L(fwd_write_7bytes):
1250 L(fwd_write_3bytes):
/* Last three source bytes: ECX = the 16-bit word at -3, EAX = the byte at
   -1, both zero-extended.  EAX is overwritten here, so it no longer points
   at the source afterwards.  */
1251 movzwl -3(%eax), %ecx
1252 movzbl -1(%eax), %eax
1255 #ifndef USE_AS_BCOPY
1256 # ifdef USE_AS_MEMPCPY
/* EAX = the DEST stack argument.  */
1259 movl DEST(%esp), %eax
/* Large-copy path.  Loads are unaligned (movdqu) from EAX; stores to EDX
   are non-temporal (movntdq), which requires EDX to be 16-byte aligned.  */
1268 movdqu (%eax), %xmm1
/* Unaligned 16-byte store of XMM0 to ESI.  */
1270 movdqu %xmm0, (%esi)
1271 movntdq %xmm1, (%edx)
/* ECX -= 0x90.  */
1274 lea -0x90(%ecx), %ecx
/* 128-byte iteration: load eight 16-byte blocks, advance EAX by 0x80,
   store them non-temporally, advance EDX by 0x80.  */
1277 movdqu (%eax), %xmm0
1278 movdqu 0x10(%eax), %xmm1
1279 movdqu 0x20(%eax), %xmm2
1280 movdqu 0x30(%eax), %xmm3
1281 movdqu 0x40(%eax), %xmm4
1282 movdqu 0x50(%eax), %xmm5
1283 movdqu 0x60(%eax), %xmm6
1284 movdqu 0x70(%eax), %xmm7
1285 lea 0x80(%eax), %eax
1288 movntdq %xmm0, (%edx)
1289 movntdq %xmm1, 0x10(%edx)
1290 movntdq %xmm2, 0x20(%edx)
1291 movntdq %xmm3, 0x30(%edx)
1292 movntdq %xmm4, 0x40(%edx)
1293 movntdq %xmm5, 0x50(%edx)
1294 movntdq %xmm6, 0x60(%edx)
1295 movntdq %xmm7, 0x70(%edx)
1296 lea 0x80(%edx), %edx
/* lea does not change flags, so this branch tests the flags set before
   the pointer updates.  */
1297 jae L(large_page_loop)
/* ECX += 0x80; the flags for the jl below come from an earlier
   instruction because lea leaves them untouched.  */
1299 lea 0x80(%ecx), %ecx
1300 jl L(large_page_less_64bytes)
/* 64-byte step.  */
1302 movdqu (%eax), %xmm0
1303 movdqu 0x10(%eax), %xmm1
1304 movdqu 0x20(%eax), %xmm2
1305 movdqu 0x30(%eax), %xmm3
1306 lea 0x40(%eax), %eax
1308 movntdq %xmm0, (%edx)
1309 movntdq %xmm1, 0x10(%edx)
1310 movntdq %xmm2, 0x20(%edx)
1311 movntdq %xmm3, 0x30(%edx)
1312 lea 0x40(%edx), %edx
1314 L(large_page_less_64bytes):
1316 jb L(large_page_less_32bytes)
/* 32-byte step.  */
1317 movdqu (%eax), %xmm0
1318 movdqu 0x10(%eax), %xmm1
1319 lea 0x20(%eax), %eax
1320 movntdq %xmm0, (%edx)
1321 movntdq %xmm1, 0x10(%edx)
1322 lea 0x20(%edx), %edx
1324 L(large_page_less_32bytes):
/* Finish through the forward tail table, indexed by ECX * 4.  */
1328 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Backward tail entry points; the backward tail table refers to these
   labels.  Lengths are grouped by their remainder modulo 4.  */
/* Lengths that are multiples of 4.  */
1332 L(bk_write_44bytes):
1335 L(bk_write_40bytes):
1338 L(bk_write_36bytes):
1341 L(bk_write_32bytes):
1344 L(bk_write_28bytes):
1347 L(bk_write_24bytes):
1350 L(bk_write_20bytes):
1353 L(bk_write_16bytes):
1356 L(bk_write_12bytes):
1366 #ifndef USE_AS_BCOPY
/* EAX = the DEST stack argument; for the mempcpy build ECX = the LEN
   stack argument as well.  */
1367 movl DEST(%esp), %eax
1368 # ifdef USE_AS_MEMPCPY
1369 movl LEN(%esp), %ecx
/* Lengths of the form 4k + 1.  */
1376 L(bk_write_45bytes):
1379 L(bk_write_41bytes):
1382 L(bk_write_37bytes):
1385 L(bk_write_33bytes):
1388 L(bk_write_29bytes):
1391 L(bk_write_25bytes):
1394 L(bk_write_21bytes):
1397 L(bk_write_17bytes):
1400 L(bk_write_13bytes):
1412 #ifndef USE_AS_BCOPY
1413 movl DEST(%esp), %eax
1414 # ifdef USE_AS_MEMPCPY
1415 movl LEN(%esp), %ecx
/* Lengths of the form 4k + 2.  */
1422 L(bk_write_46bytes):
1425 L(bk_write_42bytes):
1428 L(bk_write_38bytes):
1431 L(bk_write_34bytes):
1434 L(bk_write_30bytes):
1437 L(bk_write_26bytes):
1440 L(bk_write_22bytes):
1443 L(bk_write_18bytes):
1446 L(bk_write_14bytes):
1449 L(bk_write_10bytes):
1458 #ifndef USE_AS_BCOPY
1459 movl DEST(%esp), %eax
1460 # ifdef USE_AS_MEMPCPY
1461 movl LEN(%esp), %ecx
/* Lengths of the form 4k + 3.  */
1468 L(bk_write_47bytes):
1471 L(bk_write_43bytes):
1474 L(bk_write_39bytes):
1477 L(bk_write_35bytes):
1480 L(bk_write_31bytes):
1483 L(bk_write_27bytes):
1486 L(bk_write_23bytes):
1489 L(bk_write_19bytes):
1492 L(bk_write_15bytes):
1495 L(bk_write_11bytes):
/* ECX = the 16-bit word at EAX+1, zero-extended.  */
1502 movzwl 1(%eax), %ecx
1506 #ifndef USE_AS_BCOPY
1507 movl DEST(%esp), %eax
1508 # ifdef USE_AS_MEMPCPY
1509 movl LEN(%esp), %ecx
1516 .pushsection .rodata.ssse3,"a",@progbits
/* Forward tail dispatch table: entry N (N = 0..47) is the handler
   L(fwd_write_Nbytes).  Callers index it with ECX scaled by 4.  JMPTBL
   decides whether an entry is stored as an absolute address or as an
   offset from the table base.  */
1518 L(table_48bytes_fwd):
1519 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1520 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1521 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1522 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1523 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1524 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1525 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1526 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1527 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1528 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1529 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1530 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1531 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1532 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1533 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1534 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1535 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1536 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1537 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1538 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1539 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1540 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1541 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1542 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1543 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1544 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1545 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1546 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1547 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1548 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1549 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1550 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1551 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1552 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1553 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1554 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1555 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1556 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1557 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1558 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
/* Shift dispatch table: entry N (N = 0..15) is the handler L(shl_N).
   The dispatch site indexes it with EDI scaled by 4.  */
1570 .int JMPTBL (L(shl_0), L(shl_table))
1571 .int JMPTBL (L(shl_1), L(shl_table))
1572 .int JMPTBL (L(shl_2), L(shl_table))
1573 .int JMPTBL (L(shl_3), L(shl_table))
1574 .int JMPTBL (L(shl_4), L(shl_table))
1575 .int JMPTBL (L(shl_5), L(shl_table))
1576 .int JMPTBL (L(shl_6), L(shl_table))
1577 .int JMPTBL (L(shl_7), L(shl_table))
1578 .int JMPTBL (L(shl_8), L(shl_table))
1579 .int JMPTBL (L(shl_9), L(shl_table))
1580 .int JMPTBL (L(shl_10), L(shl_table))
1581 .int JMPTBL (L(shl_11), L(shl_table))
1582 .int JMPTBL (L(shl_12), L(shl_table))
1583 .int JMPTBL (L(shl_13), L(shl_table))
1584 .int JMPTBL (L(shl_14), L(shl_table))
1585 .int JMPTBL (L(shl_15), L(shl_table))
/* Backward tail dispatch table: entry N (N = 0..47) is the handler
   L(bk_write_Nbytes).  Callers index it with ECX scaled by 4.  */
1588 L(table_48_bytes_bwd):
1589 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1590 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1591 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1592 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1593 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1594 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1595 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1596 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1597 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1598 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1599 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1600 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1601 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1602 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1603 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1604 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1605 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1606 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1607 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1608 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1609 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1610 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1611 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1612 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1613 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1614 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1615 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1616 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1617 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1618 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1619 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1620 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1621 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1622 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1623 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1624 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1625 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1626 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1627 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1628 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1640 #ifdef USE_AS_MEMMOVE
/* Backward copy, built only for memmove.  Source pointer is ESI and
   destination pointer is EDX; data is addressed at or below them.  */
/* EDX += ECX and ESI += ECX (presumably moving both pointers to the end
   of their buffers, with ECX as the length - confirm).  */
1645 lea (%ecx,%edx,1),%edx
1646 lea (%ecx,%esi,1),%esi
1652 jae L(bk_write_more64bytes)
1654 L(bk_write_64bytesless):
1656 jb L(bk_write_less32bytes)
1658 L(bk_write_more32bytes):
1659 /* Copy 32 bytes at a time. */
/* Dwords are moved through EAX from the highest offset down to -32.  */
1665 movl -12(%esi), %eax
1666 movl %eax, -12(%edx)
1667 movl -16(%esi), %eax
1668 movl %eax, -16(%edx)
1669 movl -20(%esi), %eax
1670 movl %eax, -20(%edx)
1671 movl -24(%esi), %eax
1672 movl %eax, -24(%edx)
1673 movl -28(%esi), %eax
1674 movl %eax, -28(%edx)
1675 movl -32(%esi), %eax
1676 movl %eax, -32(%edx)
1680 L(bk_write_less32bytes):
1685 L(bk_write_less32bytes_2):
/* Finish through the backward tail table, indexed by ECX * 4.  */
1686 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1692 jbe L(bk_write_less32bytes)
1694 /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
1695 then (EDX & 2) must be != 0. */
1715 L(bk_write_more64bytes):
1716 /* Check alignment of last byte. */
1718 jz L(bk_ssse3_cpy_pre)
1720 /* EDX is aligned 4 bytes, but not 16 bytes. */
1729 jz L(bk_ssse3_cpy_pre)
1738 jz L(bk_ssse3_cpy_pre)
1746 L(bk_ssse3_cpy_pre):
1748 jb L(bk_write_more32bytes)
/* 64-byte step, highest block first: unaligned loads from ESI, aligned
   stores to EDX (so EDX must be 16-byte aligned here).  */
1754 movdqu 0x30(%esi), %xmm3
1755 movdqa %xmm3, 0x30(%edx)
1756 movdqu 0x20(%esi), %xmm2
1757 movdqa %xmm2, 0x20(%edx)
1758 movdqu 0x10(%esi), %xmm1
1759 movdqa %xmm1, 0x10(%edx)
1760 movdqu (%esi), %xmm0
1761 movdqa %xmm0, (%edx)
/* Continue with the under-64-byte handling.  */
1764 jmp L(bk_write_64bytesless)