2 Copyright (C) 2010-2017 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
22 || defined USE_AS_MEMMOVE \
23 || !defined USE_MULTIARCH)
26 # include "asm-syntax.h"
29 # define MEMCPY __memcpy_ssse3
30 # define MEMCPY_CHK __memcpy_chk_ssse3
/* Helpers that keep the unwind (CFI) annotations in step with 4-byte
   stack pushes and pops.  CFI_PUSH grows the CFA offset by 4 and records
   REG at relative offset 0.  */
43 # define CFI_PUSH(REG) \
44 cfi_adjust_cfa_offset (4); \
45 cfi_rel_offset (REG, 0)
/* CFI_POP shrinks the CFA offset by 4.  PUSH and POP, defined right after
   it, pair the pushl/popl instruction with the matching CFI_PUSH/CFI_POP
   annotation.  */
47 # define CFI_POP(REG) \
48 cfi_adjust_cfa_offset (-4); \
51 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
52 # define POP(REG) popl REG; CFI_POP (REG)
/* Definitions for the variant that saves %ebx: ENTRANCE pushes %ebx,
   RETURN_END pops it and returns, and RETURN is RETURN_END followed by
   CFI_PUSH (%ebx).  JMPTBL (I, B) expands to I - B, i.e. I expressed
   relative to B.
   NOTE(review): PARMS is presumably the %esp offset of the first argument
   once %ebx has been pushed, and the CFI_PUSH after the ret presumably
   re-establishes the unwind state for the code that follows -- confirm
   against the full file.  */
55 # define PARMS 8 /* Preserve EBX. */
56 # define ENTRANCE PUSH (%ebx);
57 # define RETURN_END POP (%ebx); ret
58 # define RETURN RETURN_END; CFI_PUSH (%ebx)
59 # define JMPTBL(I, B) I - B
61 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
62 jump table with relative offsets. INDEX is a register that contains
63 the index into the jump table. SCALE is the scale applied to INDEX. */
65 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
66 /* We first load PC into EBX. */ \
68 /* Get the address of the jump table. */ \
69 addl $(TABLE - .), %ebx; \
70 /* Get the entry and convert the relative offset to the \
71 absolute address. */ \
72 addl (%ebx, INDEX, SCALE), %ebx; \
73 /* We loaded the jump table. Go. */ \
79 # define RETURN_END ret
80 # define RETURN RETURN_END
81 # define JMPTBL(I, B) I
/* In this variant RETURN_END is a bare ret, RETURN is the same thing, and
   JMPTBL (I, B) is just I, so table entries are the labels themselves.  */
83 /* Branch to an entry in a jump table. TABLE is a jump table with
84 absolute offsets. INDEX is a register that contains the index into the
85 jump table. SCALE is the scale applied to INDEX. */
87 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
88 jmp *TABLE(, INDEX, SCALE)
91 .section .text.ssse3,"ax",@progbits
92 # if !defined USE_AS_BCOPY
96 jb HIDDEN_JUMPTARGET (__chk_fail)
103 movl DEST(%esp), %edx
105 # ifdef USE_AS_MEMMOVE
108 je L(fwd_write_0bytes)
111 jmp L(bk_write_less32bytes_2)
125 L(fwd_write_less32bytes):
126 # ifndef USE_AS_MEMMOVE
132 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
133 # ifndef USE_AS_MEMMOVE
136 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
141 # ifndef USE_AS_MEMMOVE
143 movlpd 8(%eax), %xmm1
145 movlpd %xmm1, 8(%edx)
157 # ifdef SHARED_CACHE_SIZE_HALF
158 cmp $SHARED_CACHE_SIZE_HALF, %ecx
162 add $_GLOBAL_OFFSET_TABLE_, %ebx
163 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
165 cmp __x86_shared_cache_size_half, %ecx
173 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
177 # ifdef USE_AS_MEMMOVE
178 movl DEST+4(%esp), %edi
188 movdqa (%eax, %edi), %xmm0
189 movdqa 16(%eax, %edi), %xmm1
191 movdqa %xmm0, (%edx, %edi)
192 movdqa %xmm1, 16(%edx, %edi)
196 movdqa (%eax, %edi), %xmm0
197 movdqa 16(%eax, %edi), %xmm1
199 movdqa %xmm0, (%edx, %edi)
200 movdqa %xmm1, 16(%edx, %edi)
204 movdqa (%eax, %edi), %xmm0
205 movdqa 16(%eax, %edi), %xmm1
207 movdqa %xmm0, (%edx, %edi)
208 movdqa %xmm1, 16(%edx, %edi)
212 movdqa (%eax, %edi), %xmm0
213 movdqa 16(%eax, %edi), %xmm1
215 movdqa %xmm0, (%edx, %edi)
216 movdqa %xmm1, 16(%edx, %edi)
225 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
231 # ifdef DATA_CACHE_SIZE_HALF
232 cmp $DATA_CACHE_SIZE_HALF, %ecx
236 add $_GLOBAL_OFFSET_TABLE_, %ebx
237 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
239 cmp __x86_data_cache_size_half, %ecx
244 jae L(shl_0_gobble_mem_loop)
/* Aligned bulk-copy loop: 16-byte blocks are loaded from %eax with movdqa
   into %xmm1..%xmm7 and stored to %edx at the same offsets (0x10..0x70).
   NOTE(review): the instructions that set the flags tested by the
   branches and that advance the pointers/count are not visible in this
   view.  */
247 L(shl_0_gobble_cache_loop):
249 movdqa 0x10(%eax), %xmm1
250 movdqa 0x20(%eax), %xmm2
251 movdqa 0x30(%eax), %xmm3
252 movdqa 0x40(%eax), %xmm4
253 movdqa 0x50(%eax), %xmm5
254 movdqa 0x60(%eax), %xmm6
255 movdqa 0x70(%eax), %xmm7
259 movdqa %xmm1, 0x10(%edx)
260 movdqa %xmm2, 0x20(%edx)
261 movdqa %xmm3, 0x30(%edx)
262 movdqa %xmm4, 0x40(%edx)
263 movdqa %xmm5, 0x50(%edx)
264 movdqa %xmm6, 0x60(%edx)
265 movdqa %xmm7, 0x70(%edx)
268 jae L(shl_0_gobble_cache_loop)
/* Loop exit: either skip to the sub-64-byte tail or copy the aligned
   blocks at offsets 0x10..0x30 first.  */
271 jl L(shl_0_cache_less_64bytes)
275 movdqa 0x10(%eax), %xmm1
277 movdqa %xmm1, 0x10(%edx)
278 movdqa 0x20(%eax), %xmm0
279 movdqa 0x30(%eax), %xmm1
281 movdqa %xmm0, 0x20(%edx)
282 movdqa %xmm1, 0x30(%edx)
/* Progressively smaller tails; each label can branch past its own copy
   step to the next smaller one.  */
285 L(shl_0_cache_less_64bytes):
287 jb L(shl_0_cache_less_32bytes)
290 movdqa 0x10(%eax), %xmm1
293 movdqa %xmm1, 0x10(%edx)
296 L(shl_0_cache_less_32bytes):
298 jb L(shl_0_cache_less_16bytes)
/* Whatever remains is dispatched through L(table_48bytes_fwd), indexed by
   %ecx with a scale of 4.  */
305 L(shl_0_cache_less_16bytes):
308 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Aligned bulk-copy loop with software prefetch: it prefetches 0x1c0 and
   0x280 bytes ahead of %eax and 0x1c0 bytes ahead of %edx, then moves
   16-byte blocks from %eax to %edx with movdqa through %xmm1..%xmm7.
   NOTE(review): the instructions that set the flags tested by the
   branches and that advance the pointers/count are not visible in this
   view.  */
311 L(shl_0_gobble_mem_loop):
312 prefetcht0 0x1c0(%eax)
313 prefetcht0 0x280(%eax)
314 prefetcht0 0x1c0(%edx)
317 movdqa 0x10(%eax), %xmm1
318 movdqa 0x20(%eax), %xmm2
319 movdqa 0x30(%eax), %xmm3
320 movdqa 0x40(%eax), %xmm4
321 movdqa 0x50(%eax), %xmm5
322 movdqa 0x60(%eax), %xmm6
323 movdqa 0x70(%eax), %xmm7
327 movdqa %xmm1, 0x10(%edx)
328 movdqa %xmm2, 0x20(%edx)
329 movdqa %xmm3, 0x30(%edx)
330 movdqa %xmm4, 0x40(%edx)
331 movdqa %xmm5, 0x50(%edx)
332 movdqa %xmm6, 0x60(%edx)
333 movdqa %xmm7, 0x70(%edx)
336 jae L(shl_0_gobble_mem_loop)
/* Loop exit: either skip to the sub-64-byte tail or copy the aligned
   blocks at offsets 0x10..0x30 first.  */
339 jl L(shl_0_mem_less_64bytes)
343 movdqa 0x10(%eax), %xmm1
346 movdqa %xmm1, 0x10(%edx)
348 movdqa 0x20(%eax), %xmm0
349 movdqa 0x30(%eax), %xmm1
352 movdqa %xmm0, 0x20(%edx)
353 movdqa %xmm1, 0x30(%edx)
/* Progressively smaller tails; each label can branch past its own copy
   step to the next smaller one.  */
356 L(shl_0_mem_less_64bytes):
358 jb L(shl_0_mem_less_32bytes)
361 movdqa 0x10(%eax), %xmm1
364 movdqa %xmm1, 0x10(%edx)
367 L(shl_0_mem_less_32bytes):
369 jb L(shl_0_mem_less_16bytes)
/* Whatever remains is dispatched through L(table_48bytes_fwd_align),
   indexed by %ecx with a scale of 4.  */
376 L(shl_0_mem_less_16bytes):
379 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
383 # ifndef USE_AS_MEMMOVE
384 movaps -1(%eax), %xmm1
386 movl DEST+4(%esp), %edi
387 movaps -1(%eax), %xmm1
390 # ifdef DATA_CACHE_SIZE_HALF
391 cmp $DATA_CACHE_SIZE_HALF, %ecx
395 add $_GLOBAL_OFFSET_TABLE_, %ebx
396 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
398 cmp __x86_data_cache_size_half, %ecx
401 jb L(sh_1_no_prefetch)
407 prefetcht0 0x1c0(%eax)
408 prefetcht0 0x1c0(%edx)
409 movaps 15(%eax), %xmm2
410 movaps 31(%eax), %xmm3
411 movaps 47(%eax), %xmm4
412 movaps 63(%eax), %xmm5
414 palignr $1, %xmm4, %xmm5
415 palignr $1, %xmm3, %xmm4
416 movaps %xmm5, 48(%edx)
417 palignr $1, %xmm2, %xmm3
419 palignr $1, %xmm1, %xmm2
420 movaps %xmm4, 32(%edx)
421 movaps %xmm3, 16(%edx)
432 movaps 15(%eax), %xmm2
433 movaps 31(%eax), %xmm3
434 palignr $1, %xmm2, %xmm3
435 palignr $1, %xmm1, %xmm2
437 movaps %xmm3, 16(%edx)
438 lea 32(%edx, %ecx), %edx
439 lea 32(%eax, %ecx), %eax
441 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $1 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
452 L(sh_1_no_prefetch_loop):
453 movdqa 16(%eax, %edi), %xmm2
455 movdqa 32(%eax, %edi), %xmm3
457 palignr $1, %xmm2, %xmm3
458 palignr $1, %xmm1, %xmm2
460 movdqa %xmm2, -32(%edx, %edi)
461 movdqa %xmm3, -16(%edx, %edi)
462 jb L(sh_1_end_no_prefetch_loop)
464 movdqa 16(%eax, %edi), %xmm2
466 movdqa 32(%eax, %edi), %xmm3
468 palignr $1, %xmm2, %xmm3
469 palignr $1, %xmm4, %xmm2
471 movdqa %xmm2, -32(%edx, %edi)
472 movdqa %xmm3, -16(%edx, %edi)
473 jae L(sh_1_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 1, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
475 L(sh_1_end_no_prefetch_loop):
479 lea 1(%edi, %eax), %eax
481 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
487 # ifndef USE_AS_MEMMOVE
488 movaps -2(%eax), %xmm1
490 movl DEST+4(%esp), %edi
491 movaps -2(%eax), %xmm1
494 # ifdef DATA_CACHE_SIZE_HALF
495 cmp $DATA_CACHE_SIZE_HALF, %ecx
499 add $_GLOBAL_OFFSET_TABLE_, %ebx
500 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
502 cmp __x86_data_cache_size_half, %ecx
505 jb L(sh_2_no_prefetch)
511 prefetcht0 0x1c0(%eax)
512 prefetcht0 0x1c0(%edx)
513 movaps 14(%eax), %xmm2
514 movaps 30(%eax), %xmm3
515 movaps 46(%eax), %xmm4
516 movaps 62(%eax), %xmm5
518 palignr $2, %xmm4, %xmm5
519 palignr $2, %xmm3, %xmm4
520 movaps %xmm5, 48(%edx)
521 palignr $2, %xmm2, %xmm3
523 palignr $2, %xmm1, %xmm2
524 movaps %xmm4, 32(%edx)
525 movaps %xmm3, 16(%edx)
536 movaps 14(%eax), %xmm2
537 movaps 30(%eax), %xmm3
538 palignr $2, %xmm2, %xmm3
539 palignr $2, %xmm1, %xmm2
541 movaps %xmm3, 16(%edx)
542 lea 32(%edx, %ecx), %edx
543 lea 32(%eax, %ecx), %eax
545 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $2 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
556 L(sh_2_no_prefetch_loop):
557 movdqa 16(%eax, %edi), %xmm2
559 movdqa 32(%eax, %edi), %xmm3
561 palignr $2, %xmm2, %xmm3
562 palignr $2, %xmm1, %xmm2
564 movdqa %xmm2, -32(%edx, %edi)
565 movdqa %xmm3, -16(%edx, %edi)
566 jb L(sh_2_end_no_prefetch_loop)
568 movdqa 16(%eax, %edi), %xmm2
570 movdqa 32(%eax, %edi), %xmm3
572 palignr $2, %xmm2, %xmm3
573 palignr $2, %xmm4, %xmm2
575 movdqa %xmm2, -32(%edx, %edi)
576 movdqa %xmm3, -16(%edx, %edi)
577 jae L(sh_2_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 2, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
579 L(sh_2_end_no_prefetch_loop):
583 lea 2(%edi, %eax), %eax
585 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
591 # ifndef USE_AS_MEMMOVE
592 movaps -3(%eax), %xmm1
594 movl DEST+4(%esp), %edi
595 movaps -3(%eax), %xmm1
598 # ifdef DATA_CACHE_SIZE_HALF
599 cmp $DATA_CACHE_SIZE_HALF, %ecx
603 add $_GLOBAL_OFFSET_TABLE_, %ebx
604 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
606 cmp __x86_data_cache_size_half, %ecx
609 jb L(sh_3_no_prefetch)
615 prefetcht0 0x1c0(%eax)
616 prefetcht0 0x1c0(%edx)
617 movaps 13(%eax), %xmm2
618 movaps 29(%eax), %xmm3
619 movaps 45(%eax), %xmm4
620 movaps 61(%eax), %xmm5
622 palignr $3, %xmm4, %xmm5
623 palignr $3, %xmm3, %xmm4
624 movaps %xmm5, 48(%edx)
625 palignr $3, %xmm2, %xmm3
627 palignr $3, %xmm1, %xmm2
628 movaps %xmm4, 32(%edx)
629 movaps %xmm3, 16(%edx)
640 movaps 13(%eax), %xmm2
641 movaps 29(%eax), %xmm3
642 palignr $3, %xmm2, %xmm3
643 palignr $3, %xmm1, %xmm2
645 movaps %xmm3, 16(%edx)
646 lea 32(%edx, %ecx), %edx
647 lea 32(%eax, %ecx), %eax
649 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $3 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
660 L(sh_3_no_prefetch_loop):
661 movdqa 16(%eax, %edi), %xmm2
663 movdqa 32(%eax, %edi), %xmm3
665 palignr $3, %xmm2, %xmm3
666 palignr $3, %xmm1, %xmm2
668 movdqa %xmm2, -32(%edx, %edi)
669 movdqa %xmm3, -16(%edx, %edi)
671 jb L(sh_3_end_no_prefetch_loop)
673 movdqa 16(%eax, %edi), %xmm2
675 movdqa 32(%eax, %edi), %xmm3
677 palignr $3, %xmm2, %xmm3
678 palignr $3, %xmm4, %xmm2
680 movdqa %xmm2, -32(%edx, %edi)
681 movdqa %xmm3, -16(%edx, %edi)
683 jae L(sh_3_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 3, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
685 L(sh_3_end_no_prefetch_loop):
689 lea 3(%edi, %eax), %eax
691 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
697 # ifndef USE_AS_MEMMOVE
698 movaps -4(%eax), %xmm1
700 movl DEST+4(%esp), %edi
701 movaps -4(%eax), %xmm1
704 # ifdef DATA_CACHE_SIZE_HALF
705 cmp $DATA_CACHE_SIZE_HALF, %ecx
709 add $_GLOBAL_OFFSET_TABLE_, %ebx
710 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
712 cmp __x86_data_cache_size_half, %ecx
715 jb L(sh_4_no_prefetch)
721 prefetcht0 0x1c0(%eax)
722 prefetcht0 0x1c0(%edx)
723 movaps 12(%eax), %xmm2
724 movaps 28(%eax), %xmm3
725 movaps 44(%eax), %xmm4
726 movaps 60(%eax), %xmm5
728 palignr $4, %xmm4, %xmm5
729 palignr $4, %xmm3, %xmm4
730 movaps %xmm5, 48(%edx)
731 palignr $4, %xmm2, %xmm3
733 palignr $4, %xmm1, %xmm2
734 movaps %xmm4, 32(%edx)
735 movaps %xmm3, 16(%edx)
746 movaps 12(%eax), %xmm2
747 movaps 28(%eax), %xmm3
748 palignr $4, %xmm2, %xmm3
749 palignr $4, %xmm1, %xmm2
751 movaps %xmm3, 16(%edx)
752 lea 32(%edx, %ecx), %edx
753 lea 32(%eax, %ecx), %eax
755 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $4 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
766 L(sh_4_no_prefetch_loop):
767 movdqa 16(%eax, %edi), %xmm2
769 movdqa 32(%eax, %edi), %xmm3
771 palignr $4, %xmm2, %xmm3
772 palignr $4, %xmm1, %xmm2
774 movdqa %xmm2, -32(%edx, %edi)
775 movdqa %xmm3, -16(%edx, %edi)
777 jb L(sh_4_end_no_prefetch_loop)
779 movdqa 16(%eax, %edi), %xmm2
781 movdqa 32(%eax, %edi), %xmm3
783 palignr $4, %xmm2, %xmm3
784 palignr $4, %xmm4, %xmm2
786 movdqa %xmm2, -32(%edx, %edi)
787 movdqa %xmm3, -16(%edx, %edi)
789 jae L(sh_4_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 4, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
791 L(sh_4_end_no_prefetch_loop):
795 lea 4(%edi, %eax), %eax
797 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
803 # ifndef USE_AS_MEMMOVE
804 movaps -5(%eax), %xmm1
806 movl DEST+4(%esp), %edi
807 movaps -5(%eax), %xmm1
810 # ifdef DATA_CACHE_SIZE_HALF
811 cmp $DATA_CACHE_SIZE_HALF, %ecx
815 add $_GLOBAL_OFFSET_TABLE_, %ebx
816 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
818 cmp __x86_data_cache_size_half, %ecx
821 jb L(sh_5_no_prefetch)
827 prefetcht0 0x1c0(%eax)
828 prefetcht0 0x1c0(%edx)
829 movaps 11(%eax), %xmm2
830 movaps 27(%eax), %xmm3
831 movaps 43(%eax), %xmm4
832 movaps 59(%eax), %xmm5
834 palignr $5, %xmm4, %xmm5
835 palignr $5, %xmm3, %xmm4
836 movaps %xmm5, 48(%edx)
837 palignr $5, %xmm2, %xmm3
839 palignr $5, %xmm1, %xmm2
840 movaps %xmm4, 32(%edx)
841 movaps %xmm3, 16(%edx)
852 movaps 11(%eax), %xmm2
853 movaps 27(%eax), %xmm3
854 palignr $5, %xmm2, %xmm3
855 palignr $5, %xmm1, %xmm2
857 movaps %xmm3, 16(%edx)
858 lea 32(%edx, %ecx), %edx
859 lea 32(%eax, %ecx), %eax
861 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $5 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
872 L(sh_5_no_prefetch_loop):
873 movdqa 16(%eax, %edi), %xmm2
875 movdqa 32(%eax, %edi), %xmm3
877 palignr $5, %xmm2, %xmm3
878 palignr $5, %xmm1, %xmm2
880 movdqa %xmm2, -32(%edx, %edi)
881 movdqa %xmm3, -16(%edx, %edi)
883 jb L(sh_5_end_no_prefetch_loop)
885 movdqa 16(%eax, %edi), %xmm2
887 movdqa 32(%eax, %edi), %xmm3
889 palignr $5, %xmm2, %xmm3
890 palignr $5, %xmm4, %xmm2
892 movdqa %xmm2, -32(%edx, %edi)
893 movdqa %xmm3, -16(%edx, %edi)
895 jae L(sh_5_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 5, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
897 L(sh_5_end_no_prefetch_loop):
901 lea 5(%edi, %eax), %eax
903 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
909 # ifndef USE_AS_MEMMOVE
910 movaps -6(%eax), %xmm1
912 movl DEST+4(%esp), %edi
913 movaps -6(%eax), %xmm1
916 # ifdef DATA_CACHE_SIZE_HALF
917 cmp $DATA_CACHE_SIZE_HALF, %ecx
921 add $_GLOBAL_OFFSET_TABLE_, %ebx
922 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
924 cmp __x86_data_cache_size_half, %ecx
927 jb L(sh_6_no_prefetch)
933 prefetcht0 0x1c0(%eax)
934 prefetcht0 0x1c0(%edx)
935 movaps 10(%eax), %xmm2
936 movaps 26(%eax), %xmm3
937 movaps 42(%eax), %xmm4
938 movaps 58(%eax), %xmm5
940 palignr $6, %xmm4, %xmm5
941 palignr $6, %xmm3, %xmm4
942 movaps %xmm5, 48(%edx)
943 palignr $6, %xmm2, %xmm3
945 palignr $6, %xmm1, %xmm2
946 movaps %xmm4, 32(%edx)
947 movaps %xmm3, 16(%edx)
958 movaps 10(%eax), %xmm2
959 movaps 26(%eax), %xmm3
960 palignr $6, %xmm2, %xmm3
961 palignr $6, %xmm1, %xmm2
963 movaps %xmm3, 16(%edx)
964 lea 32(%edx, %ecx), %edx
965 lea 32(%eax, %ecx), %eax
967 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $6 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
978 L(sh_6_no_prefetch_loop):
979 movdqa 16(%eax, %edi), %xmm2
981 movdqa 32(%eax, %edi), %xmm3
983 palignr $6, %xmm2, %xmm3
984 palignr $6, %xmm1, %xmm2
986 movdqa %xmm2, -32(%edx, %edi)
987 movdqa %xmm3, -16(%edx, %edi)
989 jb L(sh_6_end_no_prefetch_loop)
991 movdqa 16(%eax, %edi), %xmm2
993 movdqa 32(%eax, %edi), %xmm3
995 palignr $6, %xmm2, %xmm3
996 palignr $6, %xmm4, %xmm2
998 movdqa %xmm2, -32(%edx, %edi)
999 movdqa %xmm3, -16(%edx, %edi)
1001 jae L(sh_6_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 6, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1003 L(sh_6_end_no_prefetch_loop):
1007 lea 6(%edi, %eax), %eax
1009 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1015 # ifndef USE_AS_MEMMOVE
1016 movaps -7(%eax), %xmm1
1018 movl DEST+4(%esp), %edi
1019 movaps -7(%eax), %xmm1
1020 movdqu %xmm0, (%edi)
1022 # ifdef DATA_CACHE_SIZE_HALF
1023 cmp $DATA_CACHE_SIZE_HALF, %ecx
1027 add $_GLOBAL_OFFSET_TABLE_, %ebx
1028 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1030 cmp __x86_data_cache_size_half, %ecx
1033 jb L(sh_7_no_prefetch)
1039 prefetcht0 0x1c0(%eax)
1040 prefetcht0 0x1c0(%edx)
1041 movaps 9(%eax), %xmm2
1042 movaps 25(%eax), %xmm3
1043 movaps 41(%eax), %xmm4
1044 movaps 57(%eax), %xmm5
1046 palignr $7, %xmm4, %xmm5
1047 palignr $7, %xmm3, %xmm4
1048 movaps %xmm5, 48(%edx)
1049 palignr $7, %xmm2, %xmm3
1051 palignr $7, %xmm1, %xmm2
1052 movaps %xmm4, 32(%edx)
1053 movaps %xmm3, 16(%edx)
1055 movaps %xmm2, (%edx)
1064 movaps 9(%eax), %xmm2
1065 movaps 25(%eax), %xmm3
1066 palignr $7, %xmm2, %xmm3
1067 palignr $7, %xmm1, %xmm2
1068 movaps %xmm2, (%edx)
1069 movaps %xmm3, 16(%edx)
1070 lea 32(%edx, %ecx), %edx
1071 lea 32(%eax, %ecx), %eax
1073 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1078 L(sh_7_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $7 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1084 L(sh_7_no_prefetch_loop):
1085 movdqa 16(%eax, %edi), %xmm2
1087 movdqa 32(%eax, %edi), %xmm3
1089 palignr $7, %xmm2, %xmm3
1090 palignr $7, %xmm1, %xmm2
1092 movdqa %xmm2, -32(%edx, %edi)
1093 movdqa %xmm3, -16(%edx, %edi)
1094 jb L(sh_7_end_no_prefetch_loop)
1096 movdqa 16(%eax, %edi), %xmm2
1098 movdqa 32(%eax, %edi), %xmm3
1100 palignr $7, %xmm2, %xmm3
1101 palignr $7, %xmm4, %xmm2
1103 movdqa %xmm2, -32(%edx, %edi)
1104 movdqa %xmm3, -16(%edx, %edi)
1105 jae L(sh_7_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 7, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1107 L(sh_7_end_no_prefetch_loop):
1111 lea 7(%edi, %eax), %eax
1113 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1119 # ifndef USE_AS_MEMMOVE
1120 movaps -8(%eax), %xmm1
1122 movl DEST+4(%esp), %edi
1123 movaps -8(%eax), %xmm1
1124 movdqu %xmm0, (%edi)
1126 # ifdef DATA_CACHE_SIZE_HALF
1127 cmp $DATA_CACHE_SIZE_HALF, %ecx
1131 add $_GLOBAL_OFFSET_TABLE_, %ebx
1132 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1134 cmp __x86_data_cache_size_half, %ecx
1137 jb L(sh_8_no_prefetch)
1143 prefetcht0 0x1c0(%eax)
1144 prefetcht0 0x1c0(%edx)
1145 movaps 8(%eax), %xmm2
1146 movaps 24(%eax), %xmm3
1147 movaps 40(%eax), %xmm4
1148 movaps 56(%eax), %xmm5
1150 palignr $8, %xmm4, %xmm5
1151 palignr $8, %xmm3, %xmm4
1152 movaps %xmm5, 48(%edx)
1153 palignr $8, %xmm2, %xmm3
1155 palignr $8, %xmm1, %xmm2
1156 movaps %xmm4, 32(%edx)
1157 movaps %xmm3, 16(%edx)
1159 movaps %xmm2, (%edx)
1168 movaps 8(%eax), %xmm2
1169 movaps 24(%eax), %xmm3
1170 palignr $8, %xmm2, %xmm3
1171 palignr $8, %xmm1, %xmm2
1172 movaps %xmm2, (%edx)
1173 movaps %xmm3, 16(%edx)
1174 lea 32(%edx, %ecx), %edx
1175 lea 32(%eax, %ecx), %eax
1177 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1182 L(sh_8_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $8 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1188 L(sh_8_no_prefetch_loop):
1189 movdqa 16(%eax, %edi), %xmm2
1191 movdqa 32(%eax, %edi), %xmm3
1193 palignr $8, %xmm2, %xmm3
1194 palignr $8, %xmm1, %xmm2
1196 movdqa %xmm2, -32(%edx, %edi)
1197 movdqa %xmm3, -16(%edx, %edi)
1198 jb L(sh_8_end_no_prefetch_loop)
1200 movdqa 16(%eax, %edi), %xmm2
1202 movdqa 32(%eax, %edi), %xmm3
1204 palignr $8, %xmm2, %xmm3
1205 palignr $8, %xmm4, %xmm2
1207 movdqa %xmm2, -32(%edx, %edi)
1208 movdqa %xmm3, -16(%edx, %edi)
1209 jae L(sh_8_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 8, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1211 L(sh_8_end_no_prefetch_loop):
1215 lea 8(%edi, %eax), %eax
1217 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1223 # ifndef USE_AS_MEMMOVE
1224 movaps -9(%eax), %xmm1
1226 movl DEST+4(%esp), %edi
1227 movaps -9(%eax), %xmm1
1228 movdqu %xmm0, (%edi)
1230 # ifdef DATA_CACHE_SIZE_HALF
1231 cmp $DATA_CACHE_SIZE_HALF, %ecx
1235 add $_GLOBAL_OFFSET_TABLE_, %ebx
1236 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1238 cmp __x86_data_cache_size_half, %ecx
1241 jb L(sh_9_no_prefetch)
1247 prefetcht0 0x1c0(%eax)
1248 prefetcht0 0x1c0(%edx)
1249 movaps 7(%eax), %xmm2
1250 movaps 23(%eax), %xmm3
1251 movaps 39(%eax), %xmm4
1252 movaps 55(%eax), %xmm5
1254 palignr $9, %xmm4, %xmm5
1255 palignr $9, %xmm3, %xmm4
1256 movaps %xmm5, 48(%edx)
1257 palignr $9, %xmm2, %xmm3
1259 palignr $9, %xmm1, %xmm2
1260 movaps %xmm4, 32(%edx)
1261 movaps %xmm3, 16(%edx)
1263 movaps %xmm2, (%edx)
1272 movaps 7(%eax), %xmm2
1273 movaps 23(%eax), %xmm3
1274 palignr $9, %xmm2, %xmm3
1275 palignr $9, %xmm1, %xmm2
1277 movaps %xmm2, (%edx)
1278 movaps %xmm3, 16(%edx)
1279 lea 32(%edx, %ecx), %edx
1280 lea 32(%eax, %ecx), %eax
1282 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1287 L(sh_9_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $9 to splice them with the preceding register
   (%xmm1 in the first half, %xmm4 in the second) and stores the two
   results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1293 L(sh_9_no_prefetch_loop):
1294 movdqa 16(%eax, %edi), %xmm2
1296 movdqa 32(%eax, %edi), %xmm3
1298 palignr $9, %xmm2, %xmm3
1299 palignr $9, %xmm1, %xmm2
1301 movdqa %xmm2, -32(%edx, %edi)
1302 movdqa %xmm3, -16(%edx, %edi)
1303 jb L(sh_9_end_no_prefetch_loop)
1305 movdqa 16(%eax, %edi), %xmm2
1307 movdqa 32(%eax, %edi), %xmm3
1309 palignr $9, %xmm2, %xmm3
1310 palignr $9, %xmm4, %xmm2
1312 movdqa %xmm2, -32(%edx, %edi)
1313 movdqa %xmm3, -16(%edx, %edi)
1314 jae L(sh_9_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 9, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1316 L(sh_9_end_no_prefetch_loop):
1320 lea 9(%edi, %eax), %eax
1322 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1328 # ifndef USE_AS_MEMMOVE
1329 movaps -10(%eax), %xmm1
1331 movl DEST+4(%esp), %edi
1332 movaps -10(%eax), %xmm1
1333 movdqu %xmm0, (%edi)
1335 # ifdef DATA_CACHE_SIZE_HALF
1336 cmp $DATA_CACHE_SIZE_HALF, %ecx
1340 add $_GLOBAL_OFFSET_TABLE_, %ebx
1341 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1343 cmp __x86_data_cache_size_half, %ecx
1346 jb L(sh_10_no_prefetch)
1352 prefetcht0 0x1c0(%eax)
1353 prefetcht0 0x1c0(%edx)
1354 movaps 6(%eax), %xmm2
1355 movaps 22(%eax), %xmm3
1356 movaps 38(%eax), %xmm4
1357 movaps 54(%eax), %xmm5
1359 palignr $10, %xmm4, %xmm5
1360 palignr $10, %xmm3, %xmm4
1361 movaps %xmm5, 48(%edx)
1362 palignr $10, %xmm2, %xmm3
1364 palignr $10, %xmm1, %xmm2
1365 movaps %xmm4, 32(%edx)
1366 movaps %xmm3, 16(%edx)
1368 movaps %xmm2, (%edx)
1371 ja L(Shl10LoopStart)
1377 movaps 6(%eax), %xmm2
1378 movaps 22(%eax), %xmm3
1379 palignr $10, %xmm2, %xmm3
1380 palignr $10, %xmm1, %xmm2
1382 movaps %xmm2, (%edx)
1383 movaps %xmm3, 16(%edx)
1384 lea 32(%edx, %ecx), %edx
1385 lea 32(%eax, %ecx), %eax
1387 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1392 L(sh_10_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $10 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1398 L(sh_10_no_prefetch_loop):
1399 movdqa 16(%eax, %edi), %xmm2
1401 movdqa 32(%eax, %edi), %xmm3
1403 palignr $10, %xmm2, %xmm3
1404 palignr $10, %xmm1, %xmm2
1406 movdqa %xmm2, -32(%edx, %edi)
1407 movdqa %xmm3, -16(%edx, %edi)
1408 jb L(sh_10_end_no_prefetch_loop)
1410 movdqa 16(%eax, %edi), %xmm2
1412 movdqa 32(%eax, %edi), %xmm3
1414 palignr $10, %xmm2, %xmm3
1415 palignr $10, %xmm4, %xmm2
1417 movdqa %xmm2, -32(%edx, %edi)
1418 movdqa %xmm3, -16(%edx, %edi)
1419 jae L(sh_10_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 10, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1421 L(sh_10_end_no_prefetch_loop):
1425 lea 10(%edi, %eax), %eax
1427 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1433 # ifndef USE_AS_MEMMOVE
1434 movaps -11(%eax), %xmm1
1436 movl DEST+4(%esp), %edi
1437 movaps -11(%eax), %xmm1
1438 movdqu %xmm0, (%edi)
1440 # ifdef DATA_CACHE_SIZE_HALF
1441 cmp $DATA_CACHE_SIZE_HALF, %ecx
1445 add $_GLOBAL_OFFSET_TABLE_, %ebx
1446 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1448 cmp __x86_data_cache_size_half, %ecx
1451 jb L(sh_11_no_prefetch)
1457 prefetcht0 0x1c0(%eax)
1458 prefetcht0 0x1c0(%edx)
1459 movaps 5(%eax), %xmm2
1460 movaps 21(%eax), %xmm3
1461 movaps 37(%eax), %xmm4
1462 movaps 53(%eax), %xmm5
1464 palignr $11, %xmm4, %xmm5
1465 palignr $11, %xmm3, %xmm4
1466 movaps %xmm5, 48(%edx)
1467 palignr $11, %xmm2, %xmm3
1469 palignr $11, %xmm1, %xmm2
1470 movaps %xmm4, 32(%edx)
1471 movaps %xmm3, 16(%edx)
1473 movaps %xmm2, (%edx)
1476 ja L(Shl11LoopStart)
1482 movaps 5(%eax), %xmm2
1483 movaps 21(%eax), %xmm3
1484 palignr $11, %xmm2, %xmm3
1485 palignr $11, %xmm1, %xmm2
1487 movaps %xmm2, (%edx)
1488 movaps %xmm3, 16(%edx)
1489 lea 32(%edx, %ecx), %edx
1490 lea 32(%eax, %ecx), %eax
1492 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1497 L(sh_11_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $11 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1503 L(sh_11_no_prefetch_loop):
1504 movdqa 16(%eax, %edi), %xmm2
1506 movdqa 32(%eax, %edi), %xmm3
1508 palignr $11, %xmm2, %xmm3
1509 palignr $11, %xmm1, %xmm2
1511 movdqa %xmm2, -32(%edx, %edi)
1512 movdqa %xmm3, -16(%edx, %edi)
1513 jb L(sh_11_end_no_prefetch_loop)
1515 movdqa 16(%eax, %edi), %xmm2
1517 movdqa 32(%eax, %edi), %xmm3
1519 palignr $11, %xmm2, %xmm3
1520 palignr $11, %xmm4, %xmm2
1522 movdqa %xmm2, -32(%edx, %edi)
1523 movdqa %xmm3, -16(%edx, %edi)
1524 jae L(sh_11_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 11, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1526 L(sh_11_end_no_prefetch_loop):
1530 lea 11(%edi, %eax), %eax
1532 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1538 # ifndef USE_AS_MEMMOVE
1539 movaps -12(%eax), %xmm1
1541 movl DEST+4(%esp), %edi
1542 movaps -12(%eax), %xmm1
1543 movdqu %xmm0, (%edi)
1545 # ifdef DATA_CACHE_SIZE_HALF
1546 cmp $DATA_CACHE_SIZE_HALF, %ecx
1550 add $_GLOBAL_OFFSET_TABLE_, %ebx
1551 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1553 cmp __x86_data_cache_size_half, %ecx
1556 jb L(sh_12_no_prefetch)
1562 prefetcht0 0x1c0(%eax)
1563 prefetcht0 0x1c0(%edx)
1564 movaps 4(%eax), %xmm2
1565 movaps 20(%eax), %xmm3
1566 movaps 36(%eax), %xmm4
1567 movaps 52(%eax), %xmm5
1569 palignr $12, %xmm4, %xmm5
1570 palignr $12, %xmm3, %xmm4
1571 movaps %xmm5, 48(%edx)
1572 palignr $12, %xmm2, %xmm3
1574 palignr $12, %xmm1, %xmm2
1575 movaps %xmm4, 32(%edx)
1576 movaps %xmm3, 16(%edx)
1578 movaps %xmm2, (%edx)
1581 ja L(Shl12LoopStart)
1587 movaps 4(%eax), %xmm2
1588 movaps 20(%eax), %xmm3
1589 palignr $12, %xmm2, %xmm3
1590 palignr $12, %xmm1, %xmm2
1592 movaps %xmm2, (%edx)
1593 movaps %xmm3, 16(%edx)
1594 lea 32(%edx, %ecx), %edx
1595 lea 32(%eax, %ecx), %eax
1597 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1602 L(sh_12_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $12 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1608 L(sh_12_no_prefetch_loop):
1609 movdqa 16(%eax, %edi), %xmm2
1611 movdqa 32(%eax, %edi), %xmm3
1613 palignr $12, %xmm2, %xmm3
1614 palignr $12, %xmm1, %xmm2
1616 movdqa %xmm2, -32(%edx, %edi)
1617 movdqa %xmm3, -16(%edx, %edi)
1618 jb L(sh_12_end_no_prefetch_loop)
1620 movdqa 16(%eax, %edi), %xmm2
1622 movdqa 32(%eax, %edi), %xmm3
1624 palignr $12, %xmm2, %xmm3
1625 palignr $12, %xmm4, %xmm2
1627 movdqa %xmm2, -32(%edx, %edi)
1628 movdqa %xmm3, -16(%edx, %edi)
1629 jae L(sh_12_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 12, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1631 L(sh_12_end_no_prefetch_loop):
1635 lea 12(%edi, %eax), %eax
1637 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1643 # ifndef USE_AS_MEMMOVE
1644 movaps -13(%eax), %xmm1
1646 movl DEST+4(%esp), %edi
1647 movaps -13(%eax), %xmm1
1648 movdqu %xmm0, (%edi)
1650 # ifdef DATA_CACHE_SIZE_HALF
1651 cmp $DATA_CACHE_SIZE_HALF, %ecx
1655 add $_GLOBAL_OFFSET_TABLE_, %ebx
1656 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1658 cmp __x86_data_cache_size_half, %ecx
1661 jb L(sh_13_no_prefetch)
1667 prefetcht0 0x1c0(%eax)
1668 prefetcht0 0x1c0(%edx)
1669 movaps 3(%eax), %xmm2
1670 movaps 19(%eax), %xmm3
1671 movaps 35(%eax), %xmm4
1672 movaps 51(%eax), %xmm5
1674 palignr $13, %xmm4, %xmm5
1675 palignr $13, %xmm3, %xmm4
1676 movaps %xmm5, 48(%edx)
1677 palignr $13, %xmm2, %xmm3
1679 palignr $13, %xmm1, %xmm2
1680 movaps %xmm4, 32(%edx)
1681 movaps %xmm3, 16(%edx)
1683 movaps %xmm2, (%edx)
1686 ja L(Shl13LoopStart)
1692 movaps 3(%eax), %xmm2
1693 movaps 19(%eax), %xmm3
1694 palignr $13, %xmm2, %xmm3
1695 palignr $13, %xmm1, %xmm2
1697 movaps %xmm2, (%edx)
1698 movaps %xmm3, 16(%edx)
1699 lea 32(%edx, %ecx), %edx
1700 lea 32(%eax, %ecx), %eax
1702 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1707 L(sh_13_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $13 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1713 L(sh_13_no_prefetch_loop):
1714 movdqa 16(%eax, %edi), %xmm2
1716 movdqa 32(%eax, %edi), %xmm3
1718 palignr $13, %xmm2, %xmm3
1719 palignr $13, %xmm1, %xmm2
1721 movdqa %xmm2, -32(%edx, %edi)
1722 movdqa %xmm3, -16(%edx, %edi)
1723 jb L(sh_13_end_no_prefetch_loop)
1725 movdqa 16(%eax, %edi), %xmm2
1727 movdqa 32(%eax, %edi), %xmm3
1729 palignr $13, %xmm2, %xmm3
1730 palignr $13, %xmm4, %xmm2
1732 movdqa %xmm2, -32(%edx, %edi)
1733 movdqa %xmm3, -16(%edx, %edi)
1734 jae L(sh_13_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 13, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1736 L(sh_13_end_no_prefetch_loop):
1740 lea 13(%edi, %eax), %eax
1742 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1748 # ifndef USE_AS_MEMMOVE
1749 movaps -14(%eax), %xmm1
1751 movl DEST+4(%esp), %edi
1752 movaps -14(%eax), %xmm1
1753 movdqu %xmm0, (%edi)
1755 # ifdef DATA_CACHE_SIZE_HALF
1756 cmp $DATA_CACHE_SIZE_HALF, %ecx
1760 add $_GLOBAL_OFFSET_TABLE_, %ebx
1761 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1763 cmp __x86_data_cache_size_half, %ecx
1766 jb L(sh_14_no_prefetch)
1772 prefetcht0 0x1c0(%eax)
1773 prefetcht0 0x1c0(%edx)
1774 movaps 2(%eax), %xmm2
1775 movaps 18(%eax), %xmm3
1776 movaps 34(%eax), %xmm4
1777 movaps 50(%eax), %xmm5
1779 palignr $14, %xmm4, %xmm5
1780 palignr $14, %xmm3, %xmm4
1781 movaps %xmm5, 48(%edx)
1782 palignr $14, %xmm2, %xmm3
1784 palignr $14, %xmm1, %xmm2
1785 movaps %xmm4, 32(%edx)
1786 movaps %xmm3, 16(%edx)
1788 movaps %xmm2, (%edx)
1791 ja L(Shl14LoopStart)
1797 movaps 2(%eax), %xmm2
1798 movaps 18(%eax), %xmm3
1799 palignr $14, %xmm2, %xmm3
1800 palignr $14, %xmm1, %xmm2
1802 movaps %xmm2, (%edx)
1803 movaps %xmm3, 16(%edx)
1804 lea 32(%edx, %ecx), %edx
1805 lea 32(%eax, %ecx), %eax
1807 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1812 L(sh_14_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $14 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1818 L(sh_14_no_prefetch_loop):
1819 movdqa 16(%eax, %edi), %xmm2
1821 movdqa 32(%eax, %edi), %xmm3
1823 palignr $14, %xmm2, %xmm3
1824 palignr $14, %xmm1, %xmm2
1826 movdqa %xmm2, -32(%edx, %edi)
1827 movdqa %xmm3, -16(%edx, %edi)
1828 jb L(sh_14_end_no_prefetch_loop)
1830 movdqa 16(%eax, %edi), %xmm2
1832 movdqa 32(%eax, %edi), %xmm3
1834 palignr $14, %xmm2, %xmm3
1835 palignr $14, %xmm4, %xmm2
1837 movdqa %xmm2, -32(%edx, %edi)
1838 movdqa %xmm3, -16(%edx, %edi)
1839 jae L(sh_14_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 14, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1841 L(sh_14_end_no_prefetch_loop):
1845 lea 14(%edi, %eax), %eax
1847 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1853 # ifndef USE_AS_MEMMOVE
1854 movaps -15(%eax), %xmm1
1856 movl DEST+4(%esp), %edi
1857 movaps -15(%eax), %xmm1
1858 movdqu %xmm0, (%edi)
1860 # ifdef DATA_CACHE_SIZE_HALF
1861 cmp $DATA_CACHE_SIZE_HALF, %ecx
1865 add $_GLOBAL_OFFSET_TABLE_, %ebx
1866 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1868 cmp __x86_data_cache_size_half, %ecx
1871 jb L(sh_15_no_prefetch)
1877 prefetcht0 0x1c0(%eax)
1878 prefetcht0 0x1c0(%edx)
1879 movaps 1(%eax), %xmm2
1880 movaps 17(%eax), %xmm3
1881 movaps 33(%eax), %xmm4
1882 movaps 49(%eax), %xmm5
1884 palignr $15, %xmm4, %xmm5
1885 palignr $15, %xmm3, %xmm4
1886 movaps %xmm5, 48(%edx)
1887 palignr $15, %xmm2, %xmm3
1889 palignr $15, %xmm1, %xmm2
1890 movaps %xmm4, 32(%edx)
1891 movaps %xmm3, 16(%edx)
1893 movaps %xmm2, (%edx)
1896 ja L(Shl15LoopStart)
1902 movaps 1(%eax), %xmm2
1903 movaps 17(%eax), %xmm3
1904 palignr $15, %xmm2, %xmm3
1905 palignr $15, %xmm1, %xmm2
1907 movaps %xmm2, (%edx)
1908 movaps %xmm3, 16(%edx)
1909 lea 32(%edx, %ecx), %edx
1910 lea 32(%eax, %ecx), %eax
1912 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Entry to the non-prefetching path; any setup between this label and the
   loop label is not visible in this view.  */
1917 L(sh_15_no_prefetch):
/* Two-way unrolled loop.  Each half does two aligned 16-byte loads from
   (%eax,%edi), uses palignr $15 to splice them with the preceding
   register (%xmm1 in the first half, %xmm4 in the second) and stores the
   two results with aligned stores relative to (%edx,%edi).
   NOTE(review): the instructions that set the flags for jb/jae, advance
   %edi and refresh %xmm1/%xmm4 are not visible in this view.  */
1923 L(sh_15_no_prefetch_loop):
1924 movdqa 16(%eax, %edi), %xmm2
1926 movdqa 32(%eax, %edi), %xmm3
1928 palignr $15, %xmm2, %xmm3
1929 palignr $15, %xmm1, %xmm2
1931 movdqa %xmm2, -32(%edx, %edi)
1932 movdqa %xmm3, -16(%edx, %edi)
1933 jb L(sh_15_end_no_prefetch_loop)
1935 movdqa 16(%eax, %edi), %xmm2
1937 movdqa 32(%eax, %edi), %xmm3
1939 palignr $15, %xmm2, %xmm3
1940 palignr $15, %xmm4, %xmm2
1942 movdqa %xmm2, -32(%edx, %edi)
1943 movdqa %xmm3, -16(%edx, %edi)
1944 jae L(sh_15_no_prefetch_loop)
/* Loop exit: %eax becomes %eax + %edi + 15, then the remainder is
   dispatched through L(table_48bytes_fwd) indexed by %ecx.  */
1946 L(sh_15_end_no_prefetch_loop):
1950 lea 15(%edi, %eax), %eax
1952 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1959 lea (%edx, %ecx), %edx
1960 lea (%eax, %ecx), %eax
1962 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Fall-through chain of 8-byte copies: each label moves one quadword from
   the given negative offset off %eax to the same offset off %edx through
   %xmm0, then drops into the next label.  */
1965 L(fwd_write_44bytes):
1966 movq -44(%eax), %xmm0
1967 movq %xmm0, -44(%edx)
1968 L(fwd_write_36bytes):
1969 movq -36(%eax), %xmm0
1970 movq %xmm0, -36(%edx)
1971 L(fwd_write_28bytes):
1972 movq -28(%eax), %xmm0
1973 movq %xmm0, -28(%edx)
1974 L(fwd_write_20bytes):
1975 movq -20(%eax), %xmm0
1976 movq %xmm0, -20(%edx)
1977 L(fwd_write_12bytes):
1978 movq -12(%eax), %xmm0
1979 movq %xmm0, -12(%edx)
/* NOTE(review): the copy of the final 4 bytes is not visible in this
   view.  */
1980 L(fwd_write_4bytes):
/* %eax is loaded from the DEST stack slot, presumably as the return
   value; the rest of this conditional is not visible in this view.  */
1983 # ifndef USE_AS_BCOPY
1984 # ifdef USE_AS_MEMPCPY
1987 movl DEST(%esp), %eax
/* Fall-through chain of 8-byte copies: each label moves one quadword from
   the given negative offset off %eax to the same offset off %edx through
   %xmm0, then drops into the next label.  */
1993 L(fwd_write_40bytes):
1994 movq -40(%eax), %xmm0
1995 movq %xmm0, -40(%edx)
1996 L(fwd_write_32bytes):
1997 movq -32(%eax), %xmm0
1998 movq %xmm0, -32(%edx)
1999 L(fwd_write_24bytes):
2000 movq -24(%eax), %xmm0
2001 movq %xmm0, -24(%edx)
2002 L(fwd_write_16bytes):
2003 movq -16(%eax), %xmm0
2004 movq %xmm0, -16(%edx)
2005 L(fwd_write_8bytes):
2006 movq -8(%eax), %xmm0
2007 movq %xmm0, -8(%edx)
/* Nothing left to copy at this label.  %eax is loaded from the DEST stack
   slot, presumably as the return value; the rest of this conditional is
   not visible in this view.  */
2008 L(fwd_write_0bytes):
2009 # ifndef USE_AS_BCOPY
2010 # ifdef USE_AS_MEMPCPY
2013 movl DEST(%esp), %eax
/* L(fwd_write_5bytes) is entry 5 of L(table_48bytes_fwd).  It ends by
   reloading %eax from the DEST argument slot.  */
2019 L(fwd_write_5bytes):
2024 # ifndef USE_AS_BCOPY
2025 # ifdef USE_AS_MEMPCPY
2028 movl DEST(%esp), %eax
/* Forward tail handlers for 45, 37, 29, 21 and 13 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  */
2034 L(fwd_write_45bytes):
2035 movq -45(%eax), %xmm0
2036 movq %xmm0, -45(%edx)
2037 L(fwd_write_37bytes):
2038 movq -37(%eax), %xmm0
2039 movq %xmm0, -37(%edx)
2040 L(fwd_write_29bytes):
2041 movq -29(%eax), %xmm0
2042 movq %xmm0, -29(%edx)
2043 L(fwd_write_21bytes):
2044 movq -21(%eax), %xmm0
2045 movq %xmm0, -21(%edx)
2046 L(fwd_write_13bytes):
2047 movq -13(%eax), %xmm0
2048 movq %xmm0, -13(%edx)
/* %ecx = the source byte at offset -1, zero-extended.  */
2051 movzbl -1(%eax), %ecx
2053 # ifndef USE_AS_BCOPY
2054 # ifdef USE_AS_MEMPCPY
2057 movl DEST(%esp), %eax
/* Forward tail handlers for 41, 33, 25, 17, 9 and 1 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  L(fwd_write_1bytes) loads the source
   byte at offset -1, zero-extended, into %ecx.  %eax is then reloaded
   from the DEST argument slot.  */
2063 L(fwd_write_41bytes):
2064 movq -41(%eax), %xmm0
2065 movq %xmm0, -41(%edx)
2066 L(fwd_write_33bytes):
2067 movq -33(%eax), %xmm0
2068 movq %xmm0, -33(%edx)
2069 L(fwd_write_25bytes):
2070 movq -25(%eax), %xmm0
2071 movq %xmm0, -25(%edx)
2072 L(fwd_write_17bytes):
2073 movq -17(%eax), %xmm0
2074 movq %xmm0, -17(%edx)
2075 L(fwd_write_9bytes):
2076 movq -9(%eax), %xmm0
2077 movq %xmm0, -9(%edx)
2078 L(fwd_write_1bytes):
2079 movzbl -1(%eax), %ecx
2081 # ifndef USE_AS_BCOPY
2082 # ifdef USE_AS_MEMPCPY
2085 movl DEST(%esp), %eax
/* Forward tail handlers for 46, 38, 30, 22, 14 and 6 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  L(fwd_write_6bytes) loads the 16-bit
   word at offset -2, zero-extended, into %ecx.  %eax is then reloaded
   from the DEST argument slot.  */
2091 L(fwd_write_46bytes):
2092 movq -46(%eax), %xmm0
2093 movq %xmm0, -46(%edx)
2094 L(fwd_write_38bytes):
2095 movq -38(%eax), %xmm0
2096 movq %xmm0, -38(%edx)
2097 L(fwd_write_30bytes):
2098 movq -30(%eax), %xmm0
2099 movq %xmm0, -30(%edx)
2100 L(fwd_write_22bytes):
2101 movq -22(%eax), %xmm0
2102 movq %xmm0, -22(%edx)
2103 L(fwd_write_14bytes):
2104 movq -14(%eax), %xmm0
2105 movq %xmm0, -14(%edx)
2106 L(fwd_write_6bytes):
2109 movzwl -2(%eax), %ecx
2111 # ifndef USE_AS_BCOPY
2112 # ifdef USE_AS_MEMPCPY
2115 movl DEST(%esp), %eax
/* Forward tail handlers for 42, 34, 26, 18, 10 and 2 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  L(fwd_write_2bytes) loads the 16-bit
   word at offset -2, zero-extended, into %ecx.  %eax is then reloaded
   from the DEST argument slot.  */
2121 L(fwd_write_42bytes):
2122 movq -42(%eax), %xmm0
2123 movq %xmm0, -42(%edx)
2124 L(fwd_write_34bytes):
2125 movq -34(%eax), %xmm0
2126 movq %xmm0, -34(%edx)
2127 L(fwd_write_26bytes):
2128 movq -26(%eax), %xmm0
2129 movq %xmm0, -26(%edx)
2130 L(fwd_write_18bytes):
2131 movq -18(%eax), %xmm0
2132 movq %xmm0, -18(%edx)
2133 L(fwd_write_10bytes):
2134 movq -10(%eax), %xmm0
2135 movq %xmm0, -10(%edx)
2136 L(fwd_write_2bytes):
2137 movzwl -2(%eax), %ecx
2139 # ifndef USE_AS_BCOPY
2140 # ifdef USE_AS_MEMPCPY
2143 movl DEST(%esp), %eax
/* Forward tail handlers for 47, 39, 31, 23, 15 and 7 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  */
2149 L(fwd_write_47bytes):
2150 movq -47(%eax), %xmm0
2151 movq %xmm0, -47(%edx)
2152 L(fwd_write_39bytes):
2153 movq -39(%eax), %xmm0
2154 movq %xmm0, -39(%edx)
2155 L(fwd_write_31bytes):
2156 movq -31(%eax), %xmm0
2157 movq %xmm0, -31(%edx)
2158 L(fwd_write_23bytes):
2159 movq -23(%eax), %xmm0
2160 movq %xmm0, -23(%edx)
2161 L(fwd_write_15bytes):
2162 movq -15(%eax), %xmm0
2163 movq %xmm0, -15(%edx)
2164 L(fwd_write_7bytes):
/* %ecx = 16-bit word at offset -3, %eax = byte at offset -1, both
   zero-extended.  The second load overwrites the source pointer, so
   %eax can no longer be used to address the source afterwards.  */
2167 movzwl -3(%eax), %ecx
2168 movzbl -1(%eax), %eax
2171 # ifndef USE_AS_BCOPY
2172 # ifdef USE_AS_MEMPCPY
2175 movl DEST(%esp), %eax
/* Forward tail handlers for 43, 35, 27, 19, 11 and 3 bytes.  Label N
   copies the 8 bytes at offset -N from %eax to %edx through %xmm0 and
   falls through to label N - 8.  */
2181 L(fwd_write_43bytes):
2182 movq -43(%eax), %xmm0
2183 movq %xmm0, -43(%edx)
2184 L(fwd_write_35bytes):
2185 movq -35(%eax), %xmm0
2186 movq %xmm0, -35(%edx)
2187 L(fwd_write_27bytes):
2188 movq -27(%eax), %xmm0
2189 movq %xmm0, -27(%edx)
2190 L(fwd_write_19bytes):
2191 movq -19(%eax), %xmm0
2192 movq %xmm0, -19(%edx)
2193 L(fwd_write_11bytes):
2194 movq -11(%eax), %xmm0
2195 movq %xmm0, -11(%edx)
2196 L(fwd_write_3bytes):
/* %ecx = 16-bit word at offset -3, %eax = byte at offset -1, both
   zero-extended.  The second load overwrites the source pointer.  */
2197 movzwl -3(%eax), %ecx
2198 movzbl -1(%eax), %eax
2201 # ifndef USE_AS_BCOPY
2202 # ifdef USE_AS_MEMPCPY
2205 movl DEST(%esp), %eax
/* Aligned forward tail handlers reached through
   L(table_48bytes_fwd_align).  They copy 16 bytes at a time with
   movdqa, which faults unless its memory operand is 16-byte aligned, so
   both %eax - N and %edx - N must be 16-byte aligned on entry to a
   label that uses it.
   Chain 40 -> 24 -> 8 -> 0: two 16-byte copies, then one 8-byte movq
   copy.  %eax is then reloaded from the DEST argument slot.  */
2211 L(fwd_write_40bytes_align):
2212 movdqa -40(%eax), %xmm0
2213 movdqa %xmm0, -40(%edx)
2214 L(fwd_write_24bytes_align):
2215 movdqa -24(%eax), %xmm0
2216 movdqa %xmm0, -24(%edx)
2217 L(fwd_write_8bytes_align):
2218 movq -8(%eax), %xmm0
2219 movq %xmm0, -8(%edx)
2220 L(fwd_write_0bytes_align):
2221 # ifndef USE_AS_BCOPY
2222 # ifdef USE_AS_MEMPCPY
2225 movl DEST(%esp), %eax
/* Chain 32 -> 16: two 16-byte movdqa copies cover all remaining bytes.
   %eax is then reloaded from the DEST argument slot.  */
2231 L(fwd_write_32bytes_align):
2232 movdqa -32(%eax), %xmm0
2233 movdqa %xmm0, -32(%edx)
2234 L(fwd_write_16bytes_align):
2235 movdqa -16(%eax), %xmm0
2236 movdqa %xmm0, -16(%edx)
2237 # ifndef USE_AS_BCOPY
2238 # ifdef USE_AS_MEMPCPY
2241 movl DEST(%esp), %eax
/* L(fwd_write_5bytes_align) is entry 5 of L(table_48bytes_fwd_align).
   It ends by reloading %eax from the DEST argument slot.  */
2247 L(fwd_write_5bytes_align):
2252 # ifndef USE_AS_BCOPY
2253 # ifdef USE_AS_MEMPCPY
2256 movl DEST(%esp), %eax
/* Chain 45 -> 29 -> 13: two 16-byte movdqa copies (both operands must
   be 16-byte aligned), then one 8-byte movq copy at offset -13.  */
2262 L(fwd_write_45bytes_align):
2263 movdqa -45(%eax), %xmm0
2264 movdqa %xmm0, -45(%edx)
2265 L(fwd_write_29bytes_align):
2266 movdqa -29(%eax), %xmm0
2267 movdqa %xmm0, -29(%edx)
2268 L(fwd_write_13bytes_align):
2269 movq -13(%eax), %xmm0
2270 movq %xmm0, -13(%edx)
/* %ecx = the source byte at offset -1, zero-extended.  */
2273 movzbl -1(%eax), %ecx
2275 # ifndef USE_AS_BCOPY
2276 # ifdef USE_AS_MEMPCPY
2279 movl DEST(%esp), %eax
/* Chain 37 -> 21: two 16-byte movdqa copies, then the source byte at
   offset -1 is loaded, zero-extended, into %ecx.  */
2285 L(fwd_write_37bytes_align):
2286 movdqa -37(%eax), %xmm0
2287 movdqa %xmm0, -37(%edx)
2288 L(fwd_write_21bytes_align):
2289 movdqa -21(%eax), %xmm0
2290 movdqa %xmm0, -21(%edx)
2293 movzbl -1(%eax), %ecx
2295 # ifndef USE_AS_BCOPY
2296 # ifdef USE_AS_MEMPCPY
2299 movl DEST(%esp), %eax
/* Chain 41 -> 25 -> 9 -> 1: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), one 8-byte movq copy at offset -9, then the
   source byte at offset -1 is loaded, zero-extended, into %ecx.  %eax
   is then reloaded from the DEST argument slot.  */
2305 L(fwd_write_41bytes_align):
2306 movdqa -41(%eax), %xmm0
2307 movdqa %xmm0, -41(%edx)
2308 L(fwd_write_25bytes_align):
2309 movdqa -25(%eax), %xmm0
2310 movdqa %xmm0, -25(%edx)
2311 L(fwd_write_9bytes_align):
2312 movq -9(%eax), %xmm0
2313 movq %xmm0, -9(%edx)
2314 L(fwd_write_1bytes_align):
2315 movzbl -1(%eax), %ecx
2317 # ifndef USE_AS_BCOPY
2318 # ifdef USE_AS_MEMPCPY
2321 movl DEST(%esp), %eax
/* Chain 33 -> 17: two 16-byte movdqa copies, then the source byte at
   offset -1 is loaded, zero-extended, into %ecx.  */
2327 L(fwd_write_33bytes_align):
2328 movdqa -33(%eax), %xmm0
2329 movdqa %xmm0, -33(%edx)
2330 L(fwd_write_17bytes_align):
2331 movdqa -17(%eax), %xmm0
2332 movdqa %xmm0, -17(%edx)
2333 movzbl -1(%eax), %ecx
2335 # ifndef USE_AS_BCOPY
2336 # ifdef USE_AS_MEMPCPY
2339 movl DEST(%esp), %eax
/* Chain 46 -> 30 -> 14 -> 6: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), one 8-byte movq copy at offset -14, then
   the 16-bit word at offset -2 is loaded, zero-extended, into %ecx.
   %eax is then reloaded from the DEST argument slot.  */
2345 L(fwd_write_46bytes_align):
2346 movdqa -46(%eax), %xmm0
2347 movdqa %xmm0, -46(%edx)
2348 L(fwd_write_30bytes_align):
2349 movdqa -30(%eax), %xmm0
2350 movdqa %xmm0, -30(%edx)
2351 L(fwd_write_14bytes_align):
2352 movq -14(%eax), %xmm0
2353 movq %xmm0, -14(%edx)
2354 L(fwd_write_6bytes_align):
2357 movzwl -2(%eax), %ecx
2359 # ifndef USE_AS_BCOPY
2360 # ifdef USE_AS_MEMPCPY
2363 movl DEST(%esp), %eax
/* Chain 38 -> 22: two 16-byte movdqa copies, then the 16-bit word at
   offset -2 is loaded, zero-extended, into %ecx.  */
2369 L(fwd_write_38bytes_align):
2370 movdqa -38(%eax), %xmm0
2371 movdqa %xmm0, -38(%edx)
2372 L(fwd_write_22bytes_align):
2373 movdqa -22(%eax), %xmm0
2374 movdqa %xmm0, -22(%edx)
2377 movzwl -2(%eax), %ecx
2379 # ifndef USE_AS_BCOPY
2380 # ifdef USE_AS_MEMPCPY
2383 movl DEST(%esp), %eax
/* Chain 42 -> 26 -> 10 -> 2: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), one 8-byte movq copy at offset -10, then
   the 16-bit word at offset -2 is loaded, zero-extended, into %ecx.
   %eax is then reloaded from the DEST argument slot.  */
2389 L(fwd_write_42bytes_align):
2390 movdqa -42(%eax), %xmm0
2391 movdqa %xmm0, -42(%edx)
2392 L(fwd_write_26bytes_align):
2393 movdqa -26(%eax), %xmm0
2394 movdqa %xmm0, -26(%edx)
2395 L(fwd_write_10bytes_align):
2396 movq -10(%eax), %xmm0
2397 movq %xmm0, -10(%edx)
2398 L(fwd_write_2bytes_align):
2399 movzwl -2(%eax), %ecx
2401 # ifndef USE_AS_BCOPY
2402 # ifdef USE_AS_MEMPCPY
2405 movl DEST(%esp), %eax
/* Chain 34 -> 18: two 16-byte movdqa copies, then the 16-bit word at
   offset -2 is loaded, zero-extended, into %ecx.  */
2411 L(fwd_write_34bytes_align):
2412 movdqa -34(%eax), %xmm0
2413 movdqa %xmm0, -34(%edx)
2414 L(fwd_write_18bytes_align):
2415 movdqa -18(%eax), %xmm0
2416 movdqa %xmm0, -18(%edx)
2417 movzwl -2(%eax), %ecx
2419 # ifndef USE_AS_BCOPY
2420 # ifdef USE_AS_MEMPCPY
2423 movl DEST(%esp), %eax
/* Chain 47 -> 31 -> 15 -> 7: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), then one 8-byte movq copy at offset -15.  */
2429 L(fwd_write_47bytes_align):
2430 movdqa -47(%eax), %xmm0
2431 movdqa %xmm0, -47(%edx)
2432 L(fwd_write_31bytes_align):
2433 movdqa -31(%eax), %xmm0
2434 movdqa %xmm0, -31(%edx)
2435 L(fwd_write_15bytes_align):
2436 movq -15(%eax), %xmm0
2437 movq %xmm0, -15(%edx)
2438 L(fwd_write_7bytes_align):
/* %ecx = 16-bit word at offset -3, %eax = byte at offset -1, both
   zero-extended.  The second load overwrites the source pointer.  */
2441 movzwl -3(%eax), %ecx
2442 movzbl -1(%eax), %eax
2445 # ifndef USE_AS_BCOPY
2446 # ifdef USE_AS_MEMPCPY
2449 movl DEST(%esp), %eax
/* Chain 39 -> 23: two 16-byte movdqa copies, followed by the same
   word-at--3 / byte-at--1 loads into %ecx and %eax.  */
2455 L(fwd_write_39bytes_align):
2456 movdqa -39(%eax), %xmm0
2457 movdqa %xmm0, -39(%edx)
2458 L(fwd_write_23bytes_align):
2459 movdqa -23(%eax), %xmm0
2460 movdqa %xmm0, -23(%edx)
2463 movzwl -3(%eax), %ecx
2464 movzbl -1(%eax), %eax
2467 # ifndef USE_AS_BCOPY
2468 # ifdef USE_AS_MEMPCPY
2471 movl DEST(%esp), %eax
/* Chain 43 -> 27 -> 11 -> 3: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), then one 8-byte movq copy at offset -11.  */
2477 L(fwd_write_43bytes_align):
2478 movdqa -43(%eax), %xmm0
2479 movdqa %xmm0, -43(%edx)
2480 L(fwd_write_27bytes_align):
2481 movdqa -27(%eax), %xmm0
2482 movdqa %xmm0, -27(%edx)
2483 L(fwd_write_11bytes_align):
2484 movq -11(%eax), %xmm0
2485 movq %xmm0, -11(%edx)
2486 L(fwd_write_3bytes_align):
/* %ecx = 16-bit word at offset -3, %eax = byte at offset -1, both
   zero-extended.  The second load overwrites the source pointer.  */
2487 movzwl -3(%eax), %ecx
2488 movzbl -1(%eax), %eax
2491 # ifndef USE_AS_BCOPY
2492 # ifdef USE_AS_MEMPCPY
2495 movl DEST(%esp), %eax
/* Chain 35 -> 19: two 16-byte movdqa copies, followed by the same
   word-at--3 / byte-at--1 loads into %ecx and %eax.  */
2501 L(fwd_write_35bytes_align):
2502 movdqa -35(%eax), %xmm0
2503 movdqa %xmm0, -35(%edx)
2504 L(fwd_write_19bytes_align):
2505 movdqa -19(%eax), %xmm0
2506 movdqa %xmm0, -19(%edx)
2507 movzwl -3(%eax), %ecx
2508 movzbl -1(%eax), %eax
2511 # ifndef USE_AS_BCOPY
2512 # ifdef USE_AS_MEMPCPY
2515 movl DEST(%esp), %eax
/* Chain 44 -> 28 -> 12 -> 4: two 16-byte movdqa copies (both operands
   must be 16-byte aligned), then one 8-byte movq copy at offset -12
   before falling into L(fwd_write_4bytes_align).  %eax is then reloaded
   from the DEST argument slot.  */
2521 L(fwd_write_44bytes_align):
2522 movdqa -44(%eax), %xmm0
2523 movdqa %xmm0, -44(%edx)
2524 L(fwd_write_28bytes_align):
2525 movdqa -28(%eax), %xmm0
2526 movdqa %xmm0, -28(%edx)
2527 L(fwd_write_12bytes_align):
2528 movq -12(%eax), %xmm0
2529 movq %xmm0, -12(%edx)
2530 L(fwd_write_4bytes_align):
2533 # ifndef USE_AS_BCOPY
2534 # ifdef USE_AS_MEMPCPY
2537 movl DEST(%esp), %eax
/* Chain 36 -> 20: two 16-byte movdqa copies at offsets -36 and -20.
   %eax is then reloaded from the DEST argument slot.  */
2543 L(fwd_write_36bytes_align):
2544 movdqa -36(%eax), %xmm0
2545 movdqa %xmm0, -36(%edx)
2546 L(fwd_write_20bytes_align):
2547 movdqa -20(%eax), %xmm0
2548 movdqa %xmm0, -20(%edx)
2551 # ifndef USE_AS_BCOPY
2552 # ifdef USE_AS_MEMPCPY
2555 movl DEST(%esp), %eax
/* Large-copy path.  The source is read with unaligned loads (movdqu)
   and the destination is written with non-temporal stores (movntdq),
   which require a 16-byte aligned address.  */
2564 movdqu (%eax), %xmm1
2565 # ifdef USE_AS_MEMMOVE
/* memmove build: %edi = DEST argument (addressed as DEST+4(%esp)), and
   %xmm0 is stored there with an unaligned store.  */
2566 movl DEST+4(%esp), %edi
2567 movdqu %xmm0, (%edi)
2570 movntdq %xmm1, (%edx)
/* %ecx -= 0x90.  lea is used, so the flags are left untouched.  */
2572 lea -0x90(%ecx), %ecx
/* Main loop body: load 0x80 bytes from (%eax) into %xmm0-%xmm7 and
   advance %eax by 0x80 ...  */
2577 movdqu (%eax), %xmm0
2578 movdqu 0x10(%eax), %xmm1
2579 movdqu 0x20(%eax), %xmm2
2580 movdqu 0x30(%eax), %xmm3
2581 movdqu 0x40(%eax), %xmm4
2582 movdqu 0x50(%eax), %xmm5
2583 movdqu 0x60(%eax), %xmm6
2584 movdqu 0x70(%eax), %xmm7
2585 lea 0x80(%eax), %eax
/* ... then stream the same 0x80 bytes to (%edx), advance %edx by 0x80
   and repeat while the carry flag is clear.  */
2588 movntdq %xmm0, (%edx)
2589 movntdq %xmm1, 0x10(%edx)
2590 movntdq %xmm2, 0x20(%edx)
2591 movntdq %xmm3, 0x30(%edx)
2592 movntdq %xmm4, 0x40(%edx)
2593 movntdq %xmm5, 0x50(%edx)
2594 movntdq %xmm6, 0x60(%edx)
2595 movntdq %xmm7, 0x70(%edx)
2596 lea 0x80(%edx), %edx
2597 jae L(large_page_loop)
/* %ecx += 0x80 (flags untouched by lea); the jl skips the 64-byte step
   below.  */
2599 lea 0x80(%ecx), %ecx
2600 jl L(large_page_less_64bytes)
/* 64-byte step: four unaligned loads, four non-temporal stores, both
   pointers advanced by 0x40.  */
2602 movdqu (%eax), %xmm0
2603 movdqu 0x10(%eax), %xmm1
2604 movdqu 0x20(%eax), %xmm2
2605 movdqu 0x30(%eax), %xmm3
2606 lea 0x40(%eax), %eax
2608 movntdq %xmm0, (%edx)
2609 movntdq %xmm1, 0x10(%edx)
2610 movntdq %xmm2, 0x20(%edx)
2611 movntdq %xmm3, 0x30(%edx)
2612 lea 0x40(%edx), %edx
2614 L(large_page_less_64bytes):
/* 32-byte step, skipped when the carry flag is set.  */
2616 jb L(large_page_less_32bytes)
2617 movdqu (%eax), %xmm0
2618 movdqu 0x10(%eax), %xmm1
2619 lea 0x20(%eax), %eax
2620 movntdq %xmm0, (%edx)
2621 movntdq %xmm1, 0x10(%edx)
2622 lea 0x20(%edx), %edx
2624 L(large_page_less_32bytes):
/* Finish through entry %ecx of the forward tail table.  */
2628 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Backward tail handlers reached through L(table_48_bytes_bwd) for 44,
   36, 28, 20 and 12 bytes.  Data is addressed with non-negative offsets
   from %eax (source) and %edx (destination).  Label N copies the 8
   bytes at offset N - 8 with a movq pair through %xmm0 and falls
   through to label N - 8.  After L(bk_write_12bytes), %eax is reloaded
   from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2631 L(bk_write_44bytes):
2632 movq 36(%eax), %xmm0
2633 movq %xmm0, 36(%edx)
2634 L(bk_write_36bytes):
2635 movq 28(%eax), %xmm0
2636 movq %xmm0, 28(%edx)
2637 L(bk_write_28bytes):
2638 movq 20(%eax), %xmm0
2639 movq %xmm0, 20(%edx)
2640 L(bk_write_20bytes):
2641 movq 12(%eax), %xmm0
2642 movq %xmm0, 12(%edx)
2643 L(bk_write_12bytes):
2650 # ifndef USE_AS_BCOPY
2651 movl DEST(%esp), %eax
2652 # ifdef USE_AS_MEMPCPY
2653 movl LEN(%esp), %ecx
/* Backward tail handlers for 40, 32, 24 and 16 bytes.  Label N copies
   the 8 bytes at offset N - 8 from %eax to %edx through %xmm0 and falls
   through to label N - 8.  After L(bk_write_16bytes), %eax is reloaded
   from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2660 L(bk_write_40bytes):
2661 movq 32(%eax), %xmm0
2662 movq %xmm0, 32(%edx)
2663 L(bk_write_32bytes):
2664 movq 24(%eax), %xmm0
2665 movq %xmm0, 24(%edx)
2666 L(bk_write_24bytes):
2667 movq 16(%eax), %xmm0
2668 movq %xmm0, 16(%edx)
2669 L(bk_write_16bytes):
2675 # ifndef USE_AS_BCOPY
2676 movl DEST(%esp), %eax
2677 # ifdef USE_AS_MEMPCPY
2678 movl LEN(%esp), %ecx
/* Backward tail handlers for 45, 37, 29, 21 and 13 bytes.  Label N
   copies the 8 bytes at offset N - 8 from %eax to %edx through %xmm0
   and falls through to label N - 8.  After L(bk_write_13bytes), %eax is
   reloaded from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2685 L(bk_write_45bytes):
2686 movq 37(%eax), %xmm0
2687 movq %xmm0, 37(%edx)
2688 L(bk_write_37bytes):
2689 movq 29(%eax), %xmm0
2690 movq %xmm0, 29(%edx)
2691 L(bk_write_29bytes):
2692 movq 21(%eax), %xmm0
2693 movq %xmm0, 21(%edx)
2694 L(bk_write_21bytes):
2695 movq 13(%eax), %xmm0
2696 movq %xmm0, 13(%edx)
2697 L(bk_write_13bytes):
2706 # ifndef USE_AS_BCOPY
2707 movl DEST(%esp), %eax
2708 # ifdef USE_AS_MEMPCPY
2709 movl LEN(%esp), %ecx
/* Backward tail handlers for 41, 33, 25 and 17 bytes.  Label N copies
   the 8 bytes at offset N - 8 from %eax to %edx through %xmm0 and falls
   through to label N - 8.  After L(bk_write_17bytes), %eax is reloaded
   from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2716 L(bk_write_41bytes):
2717 movq 33(%eax), %xmm0
2718 movq %xmm0, 33(%edx)
2719 L(bk_write_33bytes):
2720 movq 25(%eax), %xmm0
2721 movq %xmm0, 25(%edx)
2722 L(bk_write_25bytes):
2723 movq 17(%eax), %xmm0
2724 movq %xmm0, 17(%edx)
2725 L(bk_write_17bytes):
2733 # ifndef USE_AS_BCOPY
2734 movl DEST(%esp), %eax
2735 # ifdef USE_AS_MEMPCPY
2736 movl LEN(%esp), %ecx
/* Backward tail handlers for 46, 38, 30, 22 and 14 bytes.  Label N
   copies the 8 bytes at offset N - 8 from %eax to %edx through %xmm0
   and falls through to label N - 8.  After L(bk_write_14bytes), %eax is
   reloaded from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2743 L(bk_write_46bytes):
2744 movq 38(%eax), %xmm0
2745 movq %xmm0, 38(%edx)
2746 L(bk_write_38bytes):
2747 movq 30(%eax), %xmm0
2748 movq %xmm0, 30(%edx)
2749 L(bk_write_30bytes):
2750 movq 22(%eax), %xmm0
2751 movq %xmm0, 22(%edx)
2752 L(bk_write_22bytes):
2753 movq 14(%eax), %xmm0
2754 movq %xmm0, 14(%edx)
2755 L(bk_write_14bytes):
2763 # ifndef USE_AS_BCOPY
2764 movl DEST(%esp), %eax
2765 # ifdef USE_AS_MEMPCPY
2766 movl LEN(%esp), %ecx
/* Backward tail handlers for 42, 34, 26, 18 and 10 bytes.  Label N
   copies the 8 bytes at offset N - 8 from %eax to %edx through %xmm0
   and falls through to label N - 8.  After L(bk_write_10bytes), %eax is
   reloaded from the DEST argument unless USE_AS_BCOPY is defined, and a
   USE_AS_MEMPCPY build also reloads the LEN argument into %ecx.  */
2773 L(bk_write_42bytes):
2774 movq 34(%eax), %xmm0
2775 movq %xmm0, 34(%edx)
2776 L(bk_write_34bytes):
2777 movq 26(%eax), %xmm0
2778 movq %xmm0, 26(%edx)
2779 L(bk_write_26bytes):
2780 movq 18(%eax), %xmm0
2781 movq %xmm0, 18(%edx)
2782 L(bk_write_18bytes):
2783 movq 10(%eax), %xmm0
2784 movq %xmm0, 10(%edx)
2785 L(bk_write_10bytes):
2791 # ifndef USE_AS_BCOPY
2792 movl DEST(%esp), %eax
2793 # ifdef USE_AS_MEMPCPY
2794 movl LEN(%esp), %ecx
/* Backward tail handlers for 47, 39, 31, 23 and 15 bytes.  Label N
   copies the 8 bytes at offset N - 8 from %eax to %edx through %xmm0
   and falls through to label N - 8.  */
2801 L(bk_write_47bytes):
2802 movq 39(%eax), %xmm0
2803 movq %xmm0, 39(%edx)
2804 L(bk_write_39bytes):
2805 movq 31(%eax), %xmm0
2806 movq %xmm0, 31(%edx)
2807 L(bk_write_31bytes):
2808 movq 23(%eax), %xmm0
2809 movq %xmm0, 23(%edx)
2810 L(bk_write_23bytes):
2811 movq 15(%eax), %xmm0
2812 movq %xmm0, 15(%edx)
2813 L(bk_write_15bytes):
/* %ecx = the 16-bit word at offset 1 of the source, zero-extended.  */
2819 movzwl 1(%eax), %ecx
/* %eax is reloaded from the DEST argument unless USE_AS_BCOPY is
   defined; a USE_AS_MEMPCPY build also reloads LEN into %ecx.  */
2823 # ifndef USE_AS_BCOPY
2824 movl DEST(%esp), %eax
2825 # ifdef USE_AS_MEMPCPY
2826 movl LEN(%esp), %ecx
/* Backward tail handlers for 43, 35, 27, 19 and 11 bytes.  Label N
   copies the 8 bytes at offset N - 8 from %eax to %edx through %xmm0
   and falls through to label N - 8.  */
2833 L(bk_write_43bytes):
2834 movq 35(%eax), %xmm0
2835 movq %xmm0, 35(%edx)
2836 L(bk_write_35bytes):
2837 movq 27(%eax), %xmm0
2838 movq %xmm0, 27(%edx)
2839 L(bk_write_27bytes):
2840 movq 19(%eax), %xmm0
2841 movq %xmm0, 19(%edx)
2842 L(bk_write_19bytes):
2843 movq 11(%eax), %xmm0
2844 movq %xmm0, 11(%edx)
2845 L(bk_write_11bytes):
/* %ecx = the 16-bit word at offset 1 of the source, zero-extended.  */
2849 movzwl 1(%eax), %ecx
/* %eax is reloaded from the DEST argument unless USE_AS_BCOPY is
   defined; a USE_AS_MEMPCPY build also reloads LEN into %ecx.  */
2853 # ifndef USE_AS_BCOPY
2854 movl DEST(%esp), %eax
2855 # ifdef USE_AS_MEMPCPY
2856 movl LEN(%esp), %ecx
/* The jump tables live in the allocatable section .rodata.ssse3;
   .pushsection saves the current section so it can be restored
   later.  */
2863 .pushsection .rodata.ssse3,"a",@progbits
/* Forward tail dispatch table: 48 four-byte entries, where entry N
   refers to L(fwd_write_<N>bytes) for N = 0..47.  Each entry is written
   as JMPTBL (target, table); with the JMPTBL definition that expands to
   I - B this is the target's offset from the start of the table.  It is
   indexed with scale 4 by BRANCH_TO_JMPTBL_ENTRY.  */
2865 L(table_48bytes_fwd):
2866 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2878 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2879 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2880 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2881 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2882 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2883 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2884 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2885 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2886 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2887 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2888 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2889 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2890 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2891 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2892 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2893 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2894 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2895 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2896 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2897 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2898 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2899 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2900 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2901 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2902 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2903 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2904 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2913 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
/* Forward tail dispatch table for the aligned handlers: 48 four-byte
   entries, where entry N refers to L(fwd_write_<N>bytes_align) for
   N = 0..47, each written as JMPTBL (target, table).  The handlers it
   selects use movdqa, so both pointers must satisfy that instruction's
   16-byte alignment requirement when this table is used.  */
2916 L(table_48bytes_fwd_align):
2917 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2918 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2919 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2920 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2921 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2922 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2923 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2924 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2925 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2926 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2927 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2928 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2929 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2930 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2931 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2932 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2933 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2934 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2935 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2936 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2937 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2938 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2939 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2940 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2941 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2942 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2943 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2944 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2945 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2946 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2947 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2948 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2949 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2950 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2951 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2952 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2953 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2954 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2955 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2964 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
/* Entries of L(shl_table): 16 four-byte entries, where entry K refers
   to L(shl_<K>) for K = 0..15, each written as JMPTBL (target,
   L(shl_table)).  */
2968 .int JMPTBL (L(shl_0), L(shl_table))
2969 .int JMPTBL (L(shl_1), L(shl_table))
2970 .int JMPTBL (L(shl_2), L(shl_table))
2971 .int JMPTBL (L(shl_3), L(shl_table))
2972 .int JMPTBL (L(shl_4), L(shl_table))
2973 .int JMPTBL (L(shl_5), L(shl_table))
2974 .int JMPTBL (L(shl_6), L(shl_table))
2975 .int JMPTBL (L(shl_7), L(shl_table))
2976 .int JMPTBL (L(shl_8), L(shl_table))
2977 .int JMPTBL (L(shl_9), L(shl_table))
2978 .int JMPTBL (L(shl_10), L(shl_table))
2979 .int JMPTBL (L(shl_11), L(shl_table))
2980 .int JMPTBL (L(shl_12), L(shl_table))
2981 .int JMPTBL (L(shl_13), L(shl_table))
2982 .int JMPTBL (L(shl_14), L(shl_table))
2983 .int JMPTBL (L(shl_15), L(shl_table))
/* Backward tail dispatch table: 48 four-byte entries, where entry N
   refers to L(bk_write_<N>bytes) for N = 0..47, each written as
   JMPTBL (target, table).  It is indexed with scale 4 by
   BRANCH_TO_JMPTBL_ENTRY.  */
2986 L(table_48_bytes_bwd):
2987 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2988 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2989 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2990 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2991 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2992 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2993 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2994 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2995 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2996 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2997 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2998 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2999 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
3000 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3001 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3002 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3003 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3004 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3005 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3006 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3007 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3008 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3009 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3010 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3011 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3012 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3013 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3014 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3015 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3016 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3017 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3018 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3019 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3020 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3021 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3022 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3023 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3024 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3025 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3034 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
/* Backward copy path, assembled only when USE_AS_MEMMOVE is defined.
   The source is read through %edi and the destination written through
   %edx.  */
3038 # ifdef USE_AS_MEMMOVE
/* Advance both %edx and %edi by %ecx; the copies below then use
   negative offsets from them.  */
3043 lea (%ecx,%edx,1),%edx
3044 lea (%ecx,%edi,1),%edi
3050 jae L(bk_write_more64bytes)
3052 L(bk_write_64bytesless):
3054 jb L(bk_write_less32bytes)
3056 L(bk_write_more32bytes):
3057 /* Copy 32 bytes at a time. */
/* Four 8-byte movq copies through %xmm0, highest address first
   (offsets -8, -16, -24, -32).  */
3059 movq -8(%edi), %xmm0
3060 movq %xmm0, -8(%edx)
3061 movq -16(%edi), %xmm0
3062 movq %xmm0, -16(%edx)
3063 movq -24(%edi), %xmm0
3064 movq %xmm0, -24(%edx)
3065 movq -32(%edi), %xmm0
3066 movq %xmm0, -32(%edx)
3070 L(bk_write_less32bytes):
3075 L(bk_write_less32bytes_2):
/* Finish through entry %ecx of the backward tail table.  */
3076 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3083 jbe L(bk_write_less32bytes)
3085 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
3086 then (EDX & 2) must be != 0. */
3106 L(bk_write_more64bytes):
3107 /* Check alignment of last byte. */
3109 jz L(bk_ssse3_cpy_pre)
3111 /* EDX is aligned 4 bytes, but not 16 bytes. */
3120 jz L(bk_ssse3_cpy_pre)
3129 jz L(bk_ssse3_cpy_pre)
3137 L(bk_ssse3_cpy_pre):
3139 jb L(bk_write_more32bytes)
/* Copy 64 bytes, highest 16-byte chunk first: unaligned loads (movdqu)
   from %edi, aligned stores (movdqa) to %edx, so %edx must be 16-byte
   aligned here.  */
3146 movdqu 0x30(%edi), %xmm3
3147 movdqa %xmm3, 0x30(%edx)
3148 movdqu 0x20(%edi), %xmm2
3149 movdqa %xmm2, 0x20(%edx)
3150 movdqu 0x10(%edi), %xmm1
3151 movdqa %xmm1, 0x10(%edx)
3152 movdqu (%edi), %xmm0
3153 movdqa %xmm0, (%edx)
/* Fewer than 64 bytes are handled by the code at
   L(bk_write_64bytesless).  */
3156 jmp L(bk_write_64bytesless)