2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
25 # include "asm-syntax.h"
28 # define MEMCPY __memcpy_ssse3
29 # define MEMCPY_CHK __memcpy_chk_ssse3
36 # define CFI_PUSH(REG) \
37 cfi_adjust_cfa_offset (4); \
38 cfi_rel_offset (REG, 0)
40 # define CFI_POP(REG) \
41 cfi_adjust_cfa_offset (-4); \
44 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
45 # define POP(REG) popl REG; CFI_POP (REG)
48 # define PARMS 8 /* Preserve EBX: value is 8 rather than 4 because ENTRANCE pushes 4 more bytes (presumably on top of a 4-byte return address -- confirm against the DEST/SRC/LEN definitions). */
49 # define ENTRANCE PUSH (%ebx); /* Save EBX on entry; this variant's BRANCH_TO_JMPTBL_ENTRY uses EBX as scratch. */
50 # define RETURN_END POP (%ebx); ret /* Restore EBX, then return. */
51 # define RETURN RETURN_END; CFI_PUSH (%ebx) /* Like RETURN_END, followed by CFI_PUSH to re-declare the "EBX pushed" unwind state for whatever code comes after the ret. */
52 # define JMPTBL(I, B) I - B /* Jump-table entry: offset of label I relative to table base B. */
54 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
55 jump table with relative offsets. INDEX is a register that contains
56 the index into the jump table. SCALE is the scale of INDEX. */
58 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
59 /* We first load PC into EBX. */ \
61 /* Get the address of the jump table. */ \
62 addl $(TABLE - .), %ebx; \
63 /* Get the entry and convert the relative offset to the \
64 absolute address. */ \
65 addl (%ebx, INDEX, SCALE), %ebx; \
66 /* We loaded the jump table. Go. */ \
72 # define RETURN_END ret
73 # define RETURN RETURN_END
74 # define JMPTBL(I, B) I
76 /* Branch to an entry in a jump table. TABLE is a jump table with
77 absolute addresses. INDEX is a register that contains the index into
78 the jump table. SCALE is the scale of INDEX. */
80 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
81 jmp *TABLE(, INDEX, SCALE)
84 .section .text.ssse3,"ax",@progbits
89 jb HIDDEN_JUMPTARGET (__chk_fail)
98 # ifdef USE_AS_MEMMOVE
101 je L(fwd_write_0bytes)
104 jmp L(bk_write_less32bytes_2)
118 L(fwd_write_less32bytes):
119 # ifndef USE_AS_MEMMOVE
125 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
126 # ifndef USE_AS_MEMMOVE
129 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
134 # ifndef USE_AS_MEMMOVE
136 movlpd 8(%eax), %xmm1
138 movlpd %xmm1, 8(%edx)
150 # ifdef SHARED_CACHE_SIZE_HALF
151 cmp $SHARED_CACHE_SIZE_HALF, %ecx
155 add $_GLOBAL_OFFSET_TABLE_, %ebx
156 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
158 cmp __x86_shared_cache_size_half, %ecx
166 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
170 # ifdef USE_AS_MEMMOVE
171 movl DEST+4(%esp), %edi
181 movdqa (%eax, %edi), %xmm0
182 movdqa 16(%eax, %edi), %xmm1
184 movdqa %xmm0, (%edx, %edi)
185 movdqa %xmm1, 16(%edx, %edi)
189 movdqa (%eax, %edi), %xmm0
190 movdqa 16(%eax, %edi), %xmm1
192 movdqa %xmm0, (%edx, %edi)
193 movdqa %xmm1, 16(%edx, %edi)
197 movdqa (%eax, %edi), %xmm0
198 movdqa 16(%eax, %edi), %xmm1
200 movdqa %xmm0, (%edx, %edi)
201 movdqa %xmm1, 16(%edx, %edi)
205 movdqa (%eax, %edi), %xmm0
206 movdqa 16(%eax, %edi), %xmm1
208 movdqa %xmm0, (%edx, %edi)
209 movdqa %xmm1, 16(%edx, %edi)
218 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
224 # ifdef DATA_CACHE_SIZE_HALF
225 cmp $DATA_CACHE_SIZE_HALF, %ecx
229 add $_GLOBAL_OFFSET_TABLE_, %ebx
230 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
232 cmp __x86_data_cache_size_half, %ecx
237 jae L(shl_0_gobble_mem_loop)
240 L(shl_0_gobble_cache_loop):
242 movdqa 0x10(%eax), %xmm1
243 movdqa 0x20(%eax), %xmm2
244 movdqa 0x30(%eax), %xmm3
245 movdqa 0x40(%eax), %xmm4
246 movdqa 0x50(%eax), %xmm5
247 movdqa 0x60(%eax), %xmm6
248 movdqa 0x70(%eax), %xmm7
252 movdqa %xmm1, 0x10(%edx)
253 movdqa %xmm2, 0x20(%edx)
254 movdqa %xmm3, 0x30(%edx)
255 movdqa %xmm4, 0x40(%edx)
256 movdqa %xmm5, 0x50(%edx)
257 movdqa %xmm6, 0x60(%edx)
258 movdqa %xmm7, 0x70(%edx)
261 jae L(shl_0_gobble_cache_loop)
264 jl L(shl_0_cache_less_64bytes)
268 movdqa 0x10(%eax), %xmm1
270 movdqa %xmm1, 0x10(%edx)
271 movdqa 0x20(%eax), %xmm0
272 movdqa 0x30(%eax), %xmm1
274 movdqa %xmm0, 0x20(%edx)
275 movdqa %xmm1, 0x30(%edx)
278 L(shl_0_cache_less_64bytes):
280 jb L(shl_0_cache_less_32bytes)
283 movdqa 0x10(%eax), %xmm1
286 movdqa %xmm1, 0x10(%edx)
289 L(shl_0_cache_less_32bytes):
291 jb L(shl_0_cache_less_16bytes)
298 L(shl_0_cache_less_16bytes):
301 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
304 L(shl_0_gobble_mem_loop):
305 prefetcht0 0x1c0(%eax)
306 prefetcht0 0x280(%eax)
307 prefetcht0 0x1c0(%edx)
310 movdqa 0x10(%eax), %xmm1
311 movdqa 0x20(%eax), %xmm2
312 movdqa 0x30(%eax), %xmm3
313 movdqa 0x40(%eax), %xmm4
314 movdqa 0x50(%eax), %xmm5
315 movdqa 0x60(%eax), %xmm6
316 movdqa 0x70(%eax), %xmm7
320 movdqa %xmm1, 0x10(%edx)
321 movdqa %xmm2, 0x20(%edx)
322 movdqa %xmm3, 0x30(%edx)
323 movdqa %xmm4, 0x40(%edx)
324 movdqa %xmm5, 0x50(%edx)
325 movdqa %xmm6, 0x60(%edx)
326 movdqa %xmm7, 0x70(%edx)
329 jae L(shl_0_gobble_mem_loop)
332 jl L(shl_0_mem_less_64bytes)
336 movdqa 0x10(%eax), %xmm1
339 movdqa %xmm1, 0x10(%edx)
341 movdqa 0x20(%eax), %xmm0
342 movdqa 0x30(%eax), %xmm1
345 movdqa %xmm0, 0x20(%edx)
346 movdqa %xmm1, 0x30(%edx)
349 L(shl_0_mem_less_64bytes):
351 jb L(shl_0_mem_less_32bytes)
354 movdqa 0x10(%eax), %xmm1
357 movdqa %xmm1, 0x10(%edx)
360 L(shl_0_mem_less_32bytes):
362 jb L(shl_0_mem_less_16bytes)
369 L(shl_0_mem_less_16bytes):
372 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
376 # ifndef USE_AS_MEMMOVE
377 movaps -1(%eax), %xmm1
379 movl DEST+4(%esp), %edi
380 movaps -1(%eax), %xmm1
383 # ifdef DATA_CACHE_SIZE_HALF
384 cmp $DATA_CACHE_SIZE_HALF, %ecx
388 add $_GLOBAL_OFFSET_TABLE_, %ebx
389 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
391 cmp __x86_data_cache_size_half, %ecx
394 jb L(sh_1_no_prefetch)
400 prefetcht0 0x1c0(%eax)
401 prefetcht0 0x1c0(%edx)
402 movaps 15(%eax), %xmm2
403 movaps 31(%eax), %xmm3
404 movaps 47(%eax), %xmm4
405 movaps 63(%eax), %xmm5
407 palignr $1, %xmm4, %xmm5
408 palignr $1, %xmm3, %xmm4
409 movaps %xmm5, 48(%edx)
410 palignr $1, %xmm2, %xmm3
412 palignr $1, %xmm1, %xmm2
413 movaps %xmm4, 32(%edx)
414 movaps %xmm3, 16(%edx)
425 movaps 15(%eax), %xmm2
426 movaps 31(%eax), %xmm3
427 palignr $1, %xmm2, %xmm3
428 palignr $1, %xmm1, %xmm2
430 movaps %xmm3, 16(%edx)
431 lea 32(%edx, %ecx), %edx
432 lea 32(%eax, %ecx), %eax
434 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
445 L(sh_1_no_prefetch_loop):
446 movdqa 16(%eax, %edi), %xmm2
448 movdqa 32(%eax, %edi), %xmm3
450 palignr $1, %xmm2, %xmm3
451 palignr $1, %xmm1, %xmm2
453 movdqa %xmm2, -32(%edx, %edi)
454 movdqa %xmm3, -16(%edx, %edi)
455 jb L(sh_1_end_no_prefetch_loop)
457 movdqa 16(%eax, %edi), %xmm2
459 movdqa 32(%eax, %edi), %xmm3
461 palignr $1, %xmm2, %xmm3
462 palignr $1, %xmm4, %xmm2
464 movdqa %xmm2, -32(%edx, %edi)
465 movdqa %xmm3, -16(%edx, %edi)
466 jae L(sh_1_no_prefetch_loop)
468 L(sh_1_end_no_prefetch_loop):
472 lea 1(%edi, %eax), %eax
474 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
480 # ifndef USE_AS_MEMMOVE
481 movaps -2(%eax), %xmm1
483 movl DEST+4(%esp), %edi
484 movaps -2(%eax), %xmm1
487 # ifdef DATA_CACHE_SIZE_HALF
488 cmp $DATA_CACHE_SIZE_HALF, %ecx
492 add $_GLOBAL_OFFSET_TABLE_, %ebx
493 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
495 cmp __x86_data_cache_size_half, %ecx
498 jb L(sh_2_no_prefetch)
504 prefetcht0 0x1c0(%eax)
505 prefetcht0 0x1c0(%edx)
506 movaps 14(%eax), %xmm2
507 movaps 30(%eax), %xmm3
508 movaps 46(%eax), %xmm4
509 movaps 62(%eax), %xmm5
511 palignr $2, %xmm4, %xmm5
512 palignr $2, %xmm3, %xmm4
513 movaps %xmm5, 48(%edx)
514 palignr $2, %xmm2, %xmm3
516 palignr $2, %xmm1, %xmm2
517 movaps %xmm4, 32(%edx)
518 movaps %xmm3, 16(%edx)
529 movaps 14(%eax), %xmm2
530 movaps 30(%eax), %xmm3
531 palignr $2, %xmm2, %xmm3
532 palignr $2, %xmm1, %xmm2
534 movaps %xmm3, 16(%edx)
535 lea 32(%edx, %ecx), %edx
536 lea 32(%eax, %ecx), %eax
538 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
549 L(sh_2_no_prefetch_loop):
550 movdqa 16(%eax, %edi), %xmm2
552 movdqa 32(%eax, %edi), %xmm3
554 palignr $2, %xmm2, %xmm3
555 palignr $2, %xmm1, %xmm2
557 movdqa %xmm2, -32(%edx, %edi)
558 movdqa %xmm3, -16(%edx, %edi)
559 jb L(sh_2_end_no_prefetch_loop)
561 movdqa 16(%eax, %edi), %xmm2
563 movdqa 32(%eax, %edi), %xmm3
565 palignr $2, %xmm2, %xmm3
566 palignr $2, %xmm4, %xmm2
568 movdqa %xmm2, -32(%edx, %edi)
569 movdqa %xmm3, -16(%edx, %edi)
570 jae L(sh_2_no_prefetch_loop)
572 L(sh_2_end_no_prefetch_loop):
576 lea 2(%edi, %eax), %eax
578 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
584 # ifndef USE_AS_MEMMOVE
585 movaps -3(%eax), %xmm1
587 movl DEST+4(%esp), %edi
588 movaps -3(%eax), %xmm1
591 # ifdef DATA_CACHE_SIZE_HALF
592 cmp $DATA_CACHE_SIZE_HALF, %ecx
596 add $_GLOBAL_OFFSET_TABLE_, %ebx
597 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
599 cmp __x86_data_cache_size_half, %ecx
602 jb L(sh_3_no_prefetch)
608 prefetcht0 0x1c0(%eax)
609 prefetcht0 0x1c0(%edx)
610 movaps 13(%eax), %xmm2
611 movaps 29(%eax), %xmm3
612 movaps 45(%eax), %xmm4
613 movaps 61(%eax), %xmm5
615 palignr $3, %xmm4, %xmm5
616 palignr $3, %xmm3, %xmm4
617 movaps %xmm5, 48(%edx)
618 palignr $3, %xmm2, %xmm3
620 palignr $3, %xmm1, %xmm2
621 movaps %xmm4, 32(%edx)
622 movaps %xmm3, 16(%edx)
633 movaps 13(%eax), %xmm2
634 movaps 29(%eax), %xmm3
635 palignr $3, %xmm2, %xmm3
636 palignr $3, %xmm1, %xmm2
638 movaps %xmm3, 16(%edx)
639 lea 32(%edx, %ecx), %edx
640 lea 32(%eax, %ecx), %eax
642 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
653 L(sh_3_no_prefetch_loop):
654 movdqa 16(%eax, %edi), %xmm2
656 movdqa 32(%eax, %edi), %xmm3
658 palignr $3, %xmm2, %xmm3
659 palignr $3, %xmm1, %xmm2
661 movdqa %xmm2, -32(%edx, %edi)
662 movdqa %xmm3, -16(%edx, %edi)
664 jb L(sh_3_end_no_prefetch_loop)
666 movdqa 16(%eax, %edi), %xmm2
668 movdqa 32(%eax, %edi), %xmm3
670 palignr $3, %xmm2, %xmm3
671 palignr $3, %xmm4, %xmm2
673 movdqa %xmm2, -32(%edx, %edi)
674 movdqa %xmm3, -16(%edx, %edi)
676 jae L(sh_3_no_prefetch_loop)
678 L(sh_3_end_no_prefetch_loop):
682 lea 3(%edi, %eax), %eax
684 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
690 # ifndef USE_AS_MEMMOVE
691 movaps -4(%eax), %xmm1
693 movl DEST+4(%esp), %edi
694 movaps -4(%eax), %xmm1
697 # ifdef DATA_CACHE_SIZE_HALF
698 cmp $DATA_CACHE_SIZE_HALF, %ecx
702 add $_GLOBAL_OFFSET_TABLE_, %ebx
703 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
705 cmp __x86_data_cache_size_half, %ecx
708 jb L(sh_4_no_prefetch)
714 prefetcht0 0x1c0(%eax)
715 prefetcht0 0x1c0(%edx)
716 movaps 12(%eax), %xmm2
717 movaps 28(%eax), %xmm3
718 movaps 44(%eax), %xmm4
719 movaps 60(%eax), %xmm5
721 palignr $4, %xmm4, %xmm5
722 palignr $4, %xmm3, %xmm4
723 movaps %xmm5, 48(%edx)
724 palignr $4, %xmm2, %xmm3
726 palignr $4, %xmm1, %xmm2
727 movaps %xmm4, 32(%edx)
728 movaps %xmm3, 16(%edx)
739 movaps 12(%eax), %xmm2
740 movaps 28(%eax), %xmm3
741 palignr $4, %xmm2, %xmm3
742 palignr $4, %xmm1, %xmm2
744 movaps %xmm3, 16(%edx)
745 lea 32(%edx, %ecx), %edx
746 lea 32(%eax, %ecx), %eax
748 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
759 L(sh_4_no_prefetch_loop):
760 movdqa 16(%eax, %edi), %xmm2
762 movdqa 32(%eax, %edi), %xmm3
764 palignr $4, %xmm2, %xmm3
765 palignr $4, %xmm1, %xmm2
767 movdqa %xmm2, -32(%edx, %edi)
768 movdqa %xmm3, -16(%edx, %edi)
770 jb L(sh_4_end_no_prefetch_loop)
772 movdqa 16(%eax, %edi), %xmm2
774 movdqa 32(%eax, %edi), %xmm3
776 palignr $4, %xmm2, %xmm3
777 palignr $4, %xmm4, %xmm2
779 movdqa %xmm2, -32(%edx, %edi)
780 movdqa %xmm3, -16(%edx, %edi)
782 jae L(sh_4_no_prefetch_loop)
784 L(sh_4_end_no_prefetch_loop):
788 lea 4(%edi, %eax), %eax
790 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
796 # ifndef USE_AS_MEMMOVE
797 movaps -5(%eax), %xmm1
799 movl DEST+4(%esp), %edi
800 movaps -5(%eax), %xmm1
803 # ifdef DATA_CACHE_SIZE_HALF
804 cmp $DATA_CACHE_SIZE_HALF, %ecx
808 add $_GLOBAL_OFFSET_TABLE_, %ebx
809 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
811 cmp __x86_data_cache_size_half, %ecx
814 jb L(sh_5_no_prefetch)
820 prefetcht0 0x1c0(%eax)
821 prefetcht0 0x1c0(%edx)
822 movaps 11(%eax), %xmm2
823 movaps 27(%eax), %xmm3
824 movaps 43(%eax), %xmm4
825 movaps 59(%eax), %xmm5
827 palignr $5, %xmm4, %xmm5
828 palignr $5, %xmm3, %xmm4
829 movaps %xmm5, 48(%edx)
830 palignr $5, %xmm2, %xmm3
832 palignr $5, %xmm1, %xmm2
833 movaps %xmm4, 32(%edx)
834 movaps %xmm3, 16(%edx)
845 movaps 11(%eax), %xmm2
846 movaps 27(%eax), %xmm3
847 palignr $5, %xmm2, %xmm3
848 palignr $5, %xmm1, %xmm2
850 movaps %xmm3, 16(%edx)
851 lea 32(%edx, %ecx), %edx
852 lea 32(%eax, %ecx), %eax
854 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
865 L(sh_5_no_prefetch_loop):
866 movdqa 16(%eax, %edi), %xmm2
868 movdqa 32(%eax, %edi), %xmm3
870 palignr $5, %xmm2, %xmm3
871 palignr $5, %xmm1, %xmm2
873 movdqa %xmm2, -32(%edx, %edi)
874 movdqa %xmm3, -16(%edx, %edi)
876 jb L(sh_5_end_no_prefetch_loop)
878 movdqa 16(%eax, %edi), %xmm2
880 movdqa 32(%eax, %edi), %xmm3
882 palignr $5, %xmm2, %xmm3
883 palignr $5, %xmm4, %xmm2
885 movdqa %xmm2, -32(%edx, %edi)
886 movdqa %xmm3, -16(%edx, %edi)
888 jae L(sh_5_no_prefetch_loop)
890 L(sh_5_end_no_prefetch_loop):
894 lea 5(%edi, %eax), %eax
896 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
902 # ifndef USE_AS_MEMMOVE
903 movaps -6(%eax), %xmm1
905 movl DEST+4(%esp), %edi
906 movaps -6(%eax), %xmm1
909 # ifdef DATA_CACHE_SIZE_HALF
910 cmp $DATA_CACHE_SIZE_HALF, %ecx
914 add $_GLOBAL_OFFSET_TABLE_, %ebx
915 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
917 cmp __x86_data_cache_size_half, %ecx
920 jb L(sh_6_no_prefetch)
926 prefetcht0 0x1c0(%eax)
927 prefetcht0 0x1c0(%edx)
928 movaps 10(%eax), %xmm2
929 movaps 26(%eax), %xmm3
930 movaps 42(%eax), %xmm4
931 movaps 58(%eax), %xmm5
933 palignr $6, %xmm4, %xmm5
934 palignr $6, %xmm3, %xmm4
935 movaps %xmm5, 48(%edx)
936 palignr $6, %xmm2, %xmm3
938 palignr $6, %xmm1, %xmm2
939 movaps %xmm4, 32(%edx)
940 movaps %xmm3, 16(%edx)
951 movaps 10(%eax), %xmm2
952 movaps 26(%eax), %xmm3
953 palignr $6, %xmm2, %xmm3
954 palignr $6, %xmm1, %xmm2
956 movaps %xmm3, 16(%edx)
957 lea 32(%edx, %ecx), %edx
958 lea 32(%eax, %ecx), %eax
960 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
971 L(sh_6_no_prefetch_loop):
972 movdqa 16(%eax, %edi), %xmm2
974 movdqa 32(%eax, %edi), %xmm3
976 palignr $6, %xmm2, %xmm3
977 palignr $6, %xmm1, %xmm2
979 movdqa %xmm2, -32(%edx, %edi)
980 movdqa %xmm3, -16(%edx, %edi)
982 jb L(sh_6_end_no_prefetch_loop)
984 movdqa 16(%eax, %edi), %xmm2
986 movdqa 32(%eax, %edi), %xmm3
988 palignr $6, %xmm2, %xmm3
989 palignr $6, %xmm4, %xmm2
991 movdqa %xmm2, -32(%edx, %edi)
992 movdqa %xmm3, -16(%edx, %edi)
994 jae L(sh_6_no_prefetch_loop)
996 L(sh_6_end_no_prefetch_loop):
1000 lea 6(%edi, %eax), %eax
1002 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1008 # ifndef USE_AS_MEMMOVE
1009 movaps -7(%eax), %xmm1
1011 movl DEST+4(%esp), %edi
1012 movaps -7(%eax), %xmm1
1013 movdqu %xmm0, (%edi)
1015 # ifdef DATA_CACHE_SIZE_HALF
1016 cmp $DATA_CACHE_SIZE_HALF, %ecx
1020 add $_GLOBAL_OFFSET_TABLE_, %ebx
1021 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1023 cmp __x86_data_cache_size_half, %ecx
1026 jb L(sh_7_no_prefetch)
1032 prefetcht0 0x1c0(%eax)
1033 prefetcht0 0x1c0(%edx)
1034 movaps 9(%eax), %xmm2
1035 movaps 25(%eax), %xmm3
1036 movaps 41(%eax), %xmm4
1037 movaps 57(%eax), %xmm5
1039 palignr $7, %xmm4, %xmm5
1040 palignr $7, %xmm3, %xmm4
1041 movaps %xmm5, 48(%edx)
1042 palignr $7, %xmm2, %xmm3
1044 palignr $7, %xmm1, %xmm2
1045 movaps %xmm4, 32(%edx)
1046 movaps %xmm3, 16(%edx)
1048 movaps %xmm2, (%edx)
1057 movaps 9(%eax), %xmm2
1058 movaps 25(%eax), %xmm3
1059 palignr $7, %xmm2, %xmm3
1060 palignr $7, %xmm1, %xmm2
1061 movaps %xmm2, (%edx)
1062 movaps %xmm3, 16(%edx)
1063 lea 32(%edx, %ecx), %edx
1064 lea 32(%eax, %ecx), %eax
1066 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1071 L(sh_7_no_prefetch):
1077 L(sh_7_no_prefetch_loop):
1078 movdqa 16(%eax, %edi), %xmm2
1080 movdqa 32(%eax, %edi), %xmm3
1082 palignr $7, %xmm2, %xmm3
1083 palignr $7, %xmm1, %xmm2
1085 movdqa %xmm2, -32(%edx, %edi)
1086 movdqa %xmm3, -16(%edx, %edi)
1087 jb L(sh_7_end_no_prefetch_loop)
1089 movdqa 16(%eax, %edi), %xmm2
1091 movdqa 32(%eax, %edi), %xmm3
1093 palignr $7, %xmm2, %xmm3
1094 palignr $7, %xmm4, %xmm2
1096 movdqa %xmm2, -32(%edx, %edi)
1097 movdqa %xmm3, -16(%edx, %edi)
1098 jae L(sh_7_no_prefetch_loop)
1100 L(sh_7_end_no_prefetch_loop):
1104 lea 7(%edi, %eax), %eax
1106 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1112 # ifndef USE_AS_MEMMOVE
1113 movaps -8(%eax), %xmm1
1115 movl DEST+4(%esp), %edi
1116 movaps -8(%eax), %xmm1
1117 movdqu %xmm0, (%edi)
1119 # ifdef DATA_CACHE_SIZE_HALF
1120 cmp $DATA_CACHE_SIZE_HALF, %ecx
1124 add $_GLOBAL_OFFSET_TABLE_, %ebx
1125 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1127 cmp __x86_data_cache_size_half, %ecx
1130 jb L(sh_8_no_prefetch)
1136 prefetcht0 0x1c0(%eax)
1137 prefetcht0 0x1c0(%edx)
1138 movaps 8(%eax), %xmm2
1139 movaps 24(%eax), %xmm3
1140 movaps 40(%eax), %xmm4
1141 movaps 56(%eax), %xmm5
1143 palignr $8, %xmm4, %xmm5
1144 palignr $8, %xmm3, %xmm4
1145 movaps %xmm5, 48(%edx)
1146 palignr $8, %xmm2, %xmm3
1148 palignr $8, %xmm1, %xmm2
1149 movaps %xmm4, 32(%edx)
1150 movaps %xmm3, 16(%edx)
1152 movaps %xmm2, (%edx)
1161 movaps 8(%eax), %xmm2
1162 movaps 24(%eax), %xmm3
1163 palignr $8, %xmm2, %xmm3
1164 palignr $8, %xmm1, %xmm2
1165 movaps %xmm2, (%edx)
1166 movaps %xmm3, 16(%edx)
1167 lea 32(%edx, %ecx), %edx
1168 lea 32(%eax, %ecx), %eax
1170 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1175 L(sh_8_no_prefetch):
1181 L(sh_8_no_prefetch_loop):
1182 movdqa 16(%eax, %edi), %xmm2
1184 movdqa 32(%eax, %edi), %xmm3
1186 palignr $8, %xmm2, %xmm3
1187 palignr $8, %xmm1, %xmm2
1189 movdqa %xmm2, -32(%edx, %edi)
1190 movdqa %xmm3, -16(%edx, %edi)
1191 jb L(sh_8_end_no_prefetch_loop)
1193 movdqa 16(%eax, %edi), %xmm2
1195 movdqa 32(%eax, %edi), %xmm3
1197 palignr $8, %xmm2, %xmm3
1198 palignr $8, %xmm4, %xmm2
1200 movdqa %xmm2, -32(%edx, %edi)
1201 movdqa %xmm3, -16(%edx, %edi)
1202 jae L(sh_8_no_prefetch_loop)
1204 L(sh_8_end_no_prefetch_loop):
1208 lea 8(%edi, %eax), %eax
1210 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1216 # ifndef USE_AS_MEMMOVE
1217 movaps -9(%eax), %xmm1
1219 movl DEST+4(%esp), %edi
1220 movaps -9(%eax), %xmm1
1221 movdqu %xmm0, (%edi)
1223 # ifdef DATA_CACHE_SIZE_HALF
1224 cmp $DATA_CACHE_SIZE_HALF, %ecx
1228 add $_GLOBAL_OFFSET_TABLE_, %ebx
1229 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1231 cmp __x86_data_cache_size_half, %ecx
1234 jb L(sh_9_no_prefetch)
1240 prefetcht0 0x1c0(%eax)
1241 prefetcht0 0x1c0(%edx)
1242 movaps 7(%eax), %xmm2
1243 movaps 23(%eax), %xmm3
1244 movaps 39(%eax), %xmm4
1245 movaps 55(%eax), %xmm5
1247 palignr $9, %xmm4, %xmm5
1248 palignr $9, %xmm3, %xmm4
1249 movaps %xmm5, 48(%edx)
1250 palignr $9, %xmm2, %xmm3
1252 palignr $9, %xmm1, %xmm2
1253 movaps %xmm4, 32(%edx)
1254 movaps %xmm3, 16(%edx)
1256 movaps %xmm2, (%edx)
1265 movaps 7(%eax), %xmm2
1266 movaps 23(%eax), %xmm3
1267 palignr $9, %xmm2, %xmm3
1268 palignr $9, %xmm1, %xmm2
1270 movaps %xmm2, (%edx)
1271 movaps %xmm3, 16(%edx)
1272 lea 32(%edx, %ecx), %edx
1273 lea 32(%eax, %ecx), %eax
1275 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1280 L(sh_9_no_prefetch):
1286 L(sh_9_no_prefetch_loop):
1287 movdqa 16(%eax, %edi), %xmm2
1289 movdqa 32(%eax, %edi), %xmm3
1291 palignr $9, %xmm2, %xmm3
1292 palignr $9, %xmm1, %xmm2
1294 movdqa %xmm2, -32(%edx, %edi)
1295 movdqa %xmm3, -16(%edx, %edi)
1296 jb L(sh_9_end_no_prefetch_loop)
1298 movdqa 16(%eax, %edi), %xmm2
1300 movdqa 32(%eax, %edi), %xmm3
1302 palignr $9, %xmm2, %xmm3
1303 palignr $9, %xmm4, %xmm2
1305 movdqa %xmm2, -32(%edx, %edi)
1306 movdqa %xmm3, -16(%edx, %edi)
1307 jae L(sh_9_no_prefetch_loop)
1309 L(sh_9_end_no_prefetch_loop):
1313 lea 9(%edi, %eax), %eax
1315 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1321 # ifndef USE_AS_MEMMOVE
1322 movaps -10(%eax), %xmm1
1324 movl DEST+4(%esp), %edi
1325 movaps -10(%eax), %xmm1
1326 movdqu %xmm0, (%edi)
1328 # ifdef DATA_CACHE_SIZE_HALF
1329 cmp $DATA_CACHE_SIZE_HALF, %ecx
1333 add $_GLOBAL_OFFSET_TABLE_, %ebx
1334 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1336 cmp __x86_data_cache_size_half, %ecx
1339 jb L(sh_10_no_prefetch)
1345 prefetcht0 0x1c0(%eax)
1346 prefetcht0 0x1c0(%edx)
1347 movaps 6(%eax), %xmm2
1348 movaps 22(%eax), %xmm3
1349 movaps 38(%eax), %xmm4
1350 movaps 54(%eax), %xmm5
1352 palignr $10, %xmm4, %xmm5
1353 palignr $10, %xmm3, %xmm4
1354 movaps %xmm5, 48(%edx)
1355 palignr $10, %xmm2, %xmm3
1357 palignr $10, %xmm1, %xmm2
1358 movaps %xmm4, 32(%edx)
1359 movaps %xmm3, 16(%edx)
1361 movaps %xmm2, (%edx)
1364 ja L(Shl10LoopStart)
1370 movaps 6(%eax), %xmm2
1371 movaps 22(%eax), %xmm3
1372 palignr $10, %xmm2, %xmm3
1373 palignr $10, %xmm1, %xmm2
1375 movaps %xmm2, (%edx)
1376 movaps %xmm3, 16(%edx)
1377 lea 32(%edx, %ecx), %edx
1378 lea 32(%eax, %ecx), %eax
1380 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1385 L(sh_10_no_prefetch):
1391 L(sh_10_no_prefetch_loop):
1392 movdqa 16(%eax, %edi), %xmm2
1394 movdqa 32(%eax, %edi), %xmm3
1396 palignr $10, %xmm2, %xmm3
1397 palignr $10, %xmm1, %xmm2
1399 movdqa %xmm2, -32(%edx, %edi)
1400 movdqa %xmm3, -16(%edx, %edi)
1401 jb L(sh_10_end_no_prefetch_loop)
1403 movdqa 16(%eax, %edi), %xmm2
1405 movdqa 32(%eax, %edi), %xmm3
1407 palignr $10, %xmm2, %xmm3
1408 palignr $10, %xmm4, %xmm2
1410 movdqa %xmm2, -32(%edx, %edi)
1411 movdqa %xmm3, -16(%edx, %edi)
1412 jae L(sh_10_no_prefetch_loop)
1414 L(sh_10_end_no_prefetch_loop):
1418 lea 10(%edi, %eax), %eax
1420 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1426 # ifndef USE_AS_MEMMOVE
1427 movaps -11(%eax), %xmm1
1429 movl DEST+4(%esp), %edi
1430 movaps -11(%eax), %xmm1
1431 movdqu %xmm0, (%edi)
1433 # ifdef DATA_CACHE_SIZE_HALF
1434 cmp $DATA_CACHE_SIZE_HALF, %ecx
1438 add $_GLOBAL_OFFSET_TABLE_, %ebx
1439 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1441 cmp __x86_data_cache_size_half, %ecx
1444 jb L(sh_11_no_prefetch)
1450 prefetcht0 0x1c0(%eax)
1451 prefetcht0 0x1c0(%edx)
1452 movaps 5(%eax), %xmm2
1453 movaps 21(%eax), %xmm3
1454 movaps 37(%eax), %xmm4
1455 movaps 53(%eax), %xmm5
1457 palignr $11, %xmm4, %xmm5
1458 palignr $11, %xmm3, %xmm4
1459 movaps %xmm5, 48(%edx)
1460 palignr $11, %xmm2, %xmm3
1462 palignr $11, %xmm1, %xmm2
1463 movaps %xmm4, 32(%edx)
1464 movaps %xmm3, 16(%edx)
1466 movaps %xmm2, (%edx)
1469 ja L(Shl11LoopStart)
1475 movaps 5(%eax), %xmm2
1476 movaps 21(%eax), %xmm3
1477 palignr $11, %xmm2, %xmm3
1478 palignr $11, %xmm1, %xmm2
1480 movaps %xmm2, (%edx)
1481 movaps %xmm3, 16(%edx)
1482 lea 32(%edx, %ecx), %edx
1483 lea 32(%eax, %ecx), %eax
1485 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1490 L(sh_11_no_prefetch):
1496 L(sh_11_no_prefetch_loop):
1497 movdqa 16(%eax, %edi), %xmm2
1499 movdqa 32(%eax, %edi), %xmm3
1501 palignr $11, %xmm2, %xmm3
1502 palignr $11, %xmm1, %xmm2
1504 movdqa %xmm2, -32(%edx, %edi)
1505 movdqa %xmm3, -16(%edx, %edi)
1506 jb L(sh_11_end_no_prefetch_loop)
1508 movdqa 16(%eax, %edi), %xmm2
1510 movdqa 32(%eax, %edi), %xmm3
1512 palignr $11, %xmm2, %xmm3
1513 palignr $11, %xmm4, %xmm2
1515 movdqa %xmm2, -32(%edx, %edi)
1516 movdqa %xmm3, -16(%edx, %edi)
1517 jae L(sh_11_no_prefetch_loop)
1519 L(sh_11_end_no_prefetch_loop):
1523 lea 11(%edi, %eax), %eax
1525 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1531 # ifndef USE_AS_MEMMOVE
1532 movaps -12(%eax), %xmm1
1534 movl DEST+4(%esp), %edi
1535 movaps -12(%eax), %xmm1
1536 movdqu %xmm0, (%edi)
1538 # ifdef DATA_CACHE_SIZE_HALF
1539 cmp $DATA_CACHE_SIZE_HALF, %ecx
1543 add $_GLOBAL_OFFSET_TABLE_, %ebx
1544 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1546 cmp __x86_data_cache_size_half, %ecx
1549 jb L(sh_12_no_prefetch)
1555 prefetcht0 0x1c0(%eax)
1556 prefetcht0 0x1c0(%edx)
1557 movaps 4(%eax), %xmm2
1558 movaps 20(%eax), %xmm3
1559 movaps 36(%eax), %xmm4
1560 movaps 52(%eax), %xmm5
1562 palignr $12, %xmm4, %xmm5
1563 palignr $12, %xmm3, %xmm4
1564 movaps %xmm5, 48(%edx)
1565 palignr $12, %xmm2, %xmm3
1567 palignr $12, %xmm1, %xmm2
1568 movaps %xmm4, 32(%edx)
1569 movaps %xmm3, 16(%edx)
1571 movaps %xmm2, (%edx)
1574 ja L(Shl12LoopStart)
1580 movaps 4(%eax), %xmm2
1581 movaps 20(%eax), %xmm3
1582 palignr $12, %xmm2, %xmm3
1583 palignr $12, %xmm1, %xmm2
1585 movaps %xmm2, (%edx)
1586 movaps %xmm3, 16(%edx)
1587 lea 32(%edx, %ecx), %edx
1588 lea 32(%eax, %ecx), %eax
1590 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1595 L(sh_12_no_prefetch):
1601 L(sh_12_no_prefetch_loop):
1602 movdqa 16(%eax, %edi), %xmm2
1604 movdqa 32(%eax, %edi), %xmm3
1606 palignr $12, %xmm2, %xmm3
1607 palignr $12, %xmm1, %xmm2
1609 movdqa %xmm2, -32(%edx, %edi)
1610 movdqa %xmm3, -16(%edx, %edi)
1611 jb L(sh_12_end_no_prefetch_loop)
1613 movdqa 16(%eax, %edi), %xmm2
1615 movdqa 32(%eax, %edi), %xmm3
1617 palignr $12, %xmm2, %xmm3
1618 palignr $12, %xmm4, %xmm2
1620 movdqa %xmm2, -32(%edx, %edi)
1621 movdqa %xmm3, -16(%edx, %edi)
1622 jae L(sh_12_no_prefetch_loop)
1624 L(sh_12_end_no_prefetch_loop):
1628 lea 12(%edi, %eax), %eax
1630 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1636 # ifndef USE_AS_MEMMOVE
1637 movaps -13(%eax), %xmm1
1639 movl DEST+4(%esp), %edi
1640 movaps -13(%eax), %xmm1
1641 movdqu %xmm0, (%edi)
1643 # ifdef DATA_CACHE_SIZE_HALF
1644 cmp $DATA_CACHE_SIZE_HALF, %ecx
1648 add $_GLOBAL_OFFSET_TABLE_, %ebx
1649 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1651 cmp __x86_data_cache_size_half, %ecx
1654 jb L(sh_13_no_prefetch)
1660 prefetcht0 0x1c0(%eax)
1661 prefetcht0 0x1c0(%edx)
1662 movaps 3(%eax), %xmm2
1663 movaps 19(%eax), %xmm3
1664 movaps 35(%eax), %xmm4
1665 movaps 51(%eax), %xmm5
1667 palignr $13, %xmm4, %xmm5
1668 palignr $13, %xmm3, %xmm4
1669 movaps %xmm5, 48(%edx)
1670 palignr $13, %xmm2, %xmm3
1672 palignr $13, %xmm1, %xmm2
1673 movaps %xmm4, 32(%edx)
1674 movaps %xmm3, 16(%edx)
1676 movaps %xmm2, (%edx)
1679 ja L(Shl13LoopStart)
1685 movaps 3(%eax), %xmm2
1686 movaps 19(%eax), %xmm3
1687 palignr $13, %xmm2, %xmm3
1688 palignr $13, %xmm1, %xmm2
1690 movaps %xmm2, (%edx)
1691 movaps %xmm3, 16(%edx)
1692 lea 32(%edx, %ecx), %edx
1693 lea 32(%eax, %ecx), %eax
1695 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1700 L(sh_13_no_prefetch):
1706 L(sh_13_no_prefetch_loop):
1707 movdqa 16(%eax, %edi), %xmm2
1709 movdqa 32(%eax, %edi), %xmm3
1711 palignr $13, %xmm2, %xmm3
1712 palignr $13, %xmm1, %xmm2
1714 movdqa %xmm2, -32(%edx, %edi)
1715 movdqa %xmm3, -16(%edx, %edi)
1716 jb L(sh_13_end_no_prefetch_loop)
1718 movdqa 16(%eax, %edi), %xmm2
1720 movdqa 32(%eax, %edi), %xmm3
1722 palignr $13, %xmm2, %xmm3
1723 palignr $13, %xmm4, %xmm2
1725 movdqa %xmm2, -32(%edx, %edi)
1726 movdqa %xmm3, -16(%edx, %edi)
1727 jae L(sh_13_no_prefetch_loop)
1729 L(sh_13_end_no_prefetch_loop):
1733 lea 13(%edi, %eax), %eax
1735 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1741 # ifndef USE_AS_MEMMOVE
1742 movaps -14(%eax), %xmm1
1744 movl DEST+4(%esp), %edi
1745 movaps -14(%eax), %xmm1
1746 movdqu %xmm0, (%edi)
1748 # ifdef DATA_CACHE_SIZE_HALF
1749 cmp $DATA_CACHE_SIZE_HALF, %ecx
1753 add $_GLOBAL_OFFSET_TABLE_, %ebx
1754 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1756 cmp __x86_data_cache_size_half, %ecx
1759 jb L(sh_14_no_prefetch)
1765 prefetcht0 0x1c0(%eax)
1766 prefetcht0 0x1c0(%edx)
1767 movaps 2(%eax), %xmm2
1768 movaps 18(%eax), %xmm3
1769 movaps 34(%eax), %xmm4
1770 movaps 50(%eax), %xmm5
1772 palignr $14, %xmm4, %xmm5
1773 palignr $14, %xmm3, %xmm4
1774 movaps %xmm5, 48(%edx)
1775 palignr $14, %xmm2, %xmm3
1777 palignr $14, %xmm1, %xmm2
1778 movaps %xmm4, 32(%edx)
1779 movaps %xmm3, 16(%edx)
1781 movaps %xmm2, (%edx)
1784 ja L(Shl14LoopStart)
1790 movaps 2(%eax), %xmm2
1791 movaps 18(%eax), %xmm3
1792 palignr $14, %xmm2, %xmm3
1793 palignr $14, %xmm1, %xmm2
1795 movaps %xmm2, (%edx)
1796 movaps %xmm3, 16(%edx)
1797 lea 32(%edx, %ecx), %edx
1798 lea 32(%eax, %ecx), %eax
1800 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1805 L(sh_14_no_prefetch):
1811 L(sh_14_no_prefetch_loop):
1812 movdqa 16(%eax, %edi), %xmm2
1814 movdqa 32(%eax, %edi), %xmm3
1816 palignr $14, %xmm2, %xmm3
1817 palignr $14, %xmm1, %xmm2
1819 movdqa %xmm2, -32(%edx, %edi)
1820 movdqa %xmm3, -16(%edx, %edi)
1821 jb L(sh_14_end_no_prefetch_loop)
1823 movdqa 16(%eax, %edi), %xmm2
1825 movdqa 32(%eax, %edi), %xmm3
1827 palignr $14, %xmm2, %xmm3
1828 palignr $14, %xmm4, %xmm2
1830 movdqa %xmm2, -32(%edx, %edi)
1831 movdqa %xmm3, -16(%edx, %edi)
1832 jae L(sh_14_no_prefetch_loop)
1834 L(sh_14_end_no_prefetch_loop):
1838 lea 14(%edi, %eax), %eax
1840 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1846 # ifndef USE_AS_MEMMOVE
1847 movaps -15(%eax), %xmm1
1849 movl DEST+4(%esp), %edi
1850 movaps -15(%eax), %xmm1
1851 movdqu %xmm0, (%edi)
1853 # ifdef DATA_CACHE_SIZE_HALF
1854 cmp $DATA_CACHE_SIZE_HALF, %ecx
1858 add $_GLOBAL_OFFSET_TABLE_, %ebx
1859 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1861 cmp __x86_data_cache_size_half, %ecx
1864 jb L(sh_15_no_prefetch)
1870 prefetcht0 0x1c0(%eax)
1871 prefetcht0 0x1c0(%edx)
1872 movaps 1(%eax), %xmm2
1873 movaps 17(%eax), %xmm3
1874 movaps 33(%eax), %xmm4
1875 movaps 49(%eax), %xmm5
1877 palignr $15, %xmm4, %xmm5
1878 palignr $15, %xmm3, %xmm4
1879 movaps %xmm5, 48(%edx)
1880 palignr $15, %xmm2, %xmm3
1882 palignr $15, %xmm1, %xmm2
1883 movaps %xmm4, 32(%edx)
1884 movaps %xmm3, 16(%edx)
1886 movaps %xmm2, (%edx)
1889 ja L(Shl15LoopStart)
1895 movaps 1(%eax), %xmm2
1896 movaps 17(%eax), %xmm3
1897 palignr $15, %xmm2, %xmm3
1898 palignr $15, %xmm1, %xmm2
1900 movaps %xmm2, (%edx)
1901 movaps %xmm3, 16(%edx)
1902 lea 32(%edx, %ecx), %edx
1903 lea 32(%eax, %ecx), %eax
1905 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1910 L(sh_15_no_prefetch):
1916 L(sh_15_no_prefetch_loop):
1917 movdqa 16(%eax, %edi), %xmm2
1919 movdqa 32(%eax, %edi), %xmm3
1921 palignr $15, %xmm2, %xmm3
1922 palignr $15, %xmm1, %xmm2
1924 movdqa %xmm2, -32(%edx, %edi)
1925 movdqa %xmm3, -16(%edx, %edi)
1926 jb L(sh_15_end_no_prefetch_loop)
1928 movdqa 16(%eax, %edi), %xmm2
1930 movdqa 32(%eax, %edi), %xmm3
1932 palignr $15, %xmm2, %xmm3
1933 palignr $15, %xmm4, %xmm2
1935 movdqa %xmm2, -32(%edx, %edi)
1936 movdqa %xmm3, -16(%edx, %edi)
1937 jae L(sh_15_no_prefetch_loop)
1939 L(sh_15_end_no_prefetch_loop):
1943 lea 15(%edi, %eax), %eax
1945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1952 lea (%edx, %ecx), %edx
1953 lea (%eax, %ecx), %eax
1955 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward tail copy, fall-through chain for the 44/36/28/20/12-byte
   labels.  Each label copies one 8-byte chunk through %xmm0 from a
   negative offset off %eax (loaded from) to the same offset off %edx
   (stored to), then falls through to the next label; the chain ends by
   falling into L(fwd_write_4bytes).  The negative offsets rely on
   %eax/%edx having been advanced by the remaining count before the
   jump-table dispatch (see the "lea (%eax, %ecx), %eax" style pairs
   ahead of BRANCH_TO_JMPTBL_ENTRY).  NOTE(review): the mapping from
   table index to these labels is presumed from the label names; the
   table itself is not visible here.  */
1958 L(fwd_write_44bytes):
1959 movq -44(%eax), %xmm0
1960 movq %xmm0, -44(%edx)
1961 L(fwd_write_36bytes):
1962 movq -36(%eax), %xmm0
1963 movq %xmm0, -36(%edx)
1964 L(fwd_write_28bytes):
1965 movq -28(%eax), %xmm0
1966 movq %xmm0, -28(%edx)
1967 L(fwd_write_20bytes):
1968 movq -20(%eax), %xmm0
1969 movq %xmm0, -20(%edx)
1970 L(fwd_write_12bytes):
1971 movq -12(%eax), %xmm0
1972 movq %xmm0, -12(%edx)
1973 L(fwd_write_4bytes):
1976 # ifdef USE_AS_MEMPCPY
1979 movl DEST(%esp), %eax
/* Forward tail copy, fall-through chain for the 40/32/24/16/8-byte
   labels.  Each label moves 8 bytes via %xmm0 from a negative offset
   off %eax to the same offset off %edx and falls through to the next
   label.  After the 8-byte step at offset -8 nothing is left to copy,
   and control falls into L(fwd_write_0bytes).  */
1984 L(fwd_write_40bytes):
1985 movq -40(%eax), %xmm0
1986 movq %xmm0, -40(%edx)
1987 L(fwd_write_32bytes):
1988 movq -32(%eax), %xmm0
1989 movq %xmm0, -32(%edx)
1990 L(fwd_write_24bytes):
1991 movq -24(%eax), %xmm0
1992 movq %xmm0, -24(%edx)
1993 L(fwd_write_16bytes):
1994 movq -16(%eax), %xmm0
1995 movq %xmm0, -16(%edx)
1996 L(fwd_write_8bytes):
1997 movq -8(%eax), %xmm0
1998 movq %xmm0, -8(%edx)
1999 L(fwd_write_0bytes):
2000 # ifdef USE_AS_MEMPCPY
2003 movl DEST(%esp), %eax
2008 L(fwd_write_5bytes):
2013 # ifdef USE_AS_MEMPCPY
2016 movl DEST(%esp), %eax
/* Forward tail copy, fall-through chain for the 45/37/29/21/13-byte
   labels.  Each label moves 8 bytes via %xmm0 from a negative offset
   off %eax to the same offset off %edx and falls through to the next
   label.  The bytes from offset -5 onward are left to the code that
   follows this chain (only partly visible in this listing).  */
2021 L(fwd_write_45bytes):
2022 movq -45(%eax), %xmm0
2023 movq %xmm0, -45(%edx)
2024 L(fwd_write_37bytes):
2025 movq -37(%eax), %xmm0
2026 movq %xmm0, -37(%edx)
2027 L(fwd_write_29bytes):
2028 movq -29(%eax), %xmm0
2029 movq %xmm0, -29(%edx)
2030 L(fwd_write_21bytes):
2031 movq -21(%eax), %xmm0
2032 movq %xmm0, -21(%edx)
2033 L(fwd_write_13bytes):
2034 movq -13(%eax), %xmm0
2035 movq %xmm0, -13(%edx)
2038 movzbl -1(%eax), %ecx
2040 # ifdef USE_AS_MEMPCPY
2043 movl DEST(%esp), %eax
/* Forward tail chain 41 -> 33 -> 25 -> 17 -> 9 -> 1, entered through
   L(table_48bytes_fwd).  Every step copies 8 bytes with movq from a
   negative offset off %eax (source) to the same offset off %edx
   (destination) and falls through to the next label.
   NOTE(review): at L(fwd_write_1bytes) only the load of the last byte is
   visible; its store, the rest of the USE_AS_MEMPCPY conditional and the
   return are not shown in this excerpt.  */
2048 L(fwd_write_41bytes):
2049 movq -41(%eax), %xmm0
2050 movq %xmm0, -41(%edx)
2051 L(fwd_write_33bytes):
2052 movq -33(%eax), %xmm0
2053 movq %xmm0, -33(%edx)
2054 L(fwd_write_25bytes):
2055 movq -25(%eax), %xmm0
2056 movq %xmm0, -25(%edx)
2057 L(fwd_write_17bytes):
2058 movq -17(%eax), %xmm0
2059 movq %xmm0, -17(%edx)
2060 L(fwd_write_9bytes):
2061 movq -9(%eax), %xmm0
2062 movq %xmm0, -9(%edx)
2063 L(fwd_write_1bytes):
/* %ecx = last source byte, zero-extended.  */
2064 movzbl -1(%eax), %ecx
2066 # ifdef USE_AS_MEMPCPY
2069 movl DEST(%esp), %eax
/* Forward tail chain 46 -> 38 -> 30 -> 22 -> 14 -> 6, entered through
   L(table_48bytes_fwd).  Every step copies 8 bytes with movq from a
   negative offset off %eax (source) to the same offset off %edx
   (destination) and falls through to the next label.
   NOTE(review): at L(fwd_write_6bytes) only the 16-bit load of the last
   two bytes is visible; the remaining loads/stores, the rest of the
   USE_AS_MEMPCPY conditional and the return are not shown in this
   excerpt.  */
2074 L(fwd_write_46bytes):
2075 movq -46(%eax), %xmm0
2076 movq %xmm0, -46(%edx)
2077 L(fwd_write_38bytes):
2078 movq -38(%eax), %xmm0
2079 movq %xmm0, -38(%edx)
2080 L(fwd_write_30bytes):
2081 movq -30(%eax), %xmm0
2082 movq %xmm0, -30(%edx)
2083 L(fwd_write_22bytes):
2084 movq -22(%eax), %xmm0
2085 movq %xmm0, -22(%edx)
2086 L(fwd_write_14bytes):
2087 movq -14(%eax), %xmm0
2088 movq %xmm0, -14(%edx)
2089 L(fwd_write_6bytes):
/* %ecx = last two source bytes, zero-extended.  */
2092 movzwl -2(%eax), %ecx
2094 # ifdef USE_AS_MEMPCPY
2097 movl DEST(%esp), %eax
/* Forward tail chain 42 -> 34 -> 26 -> 18 -> 10 -> 2, entered through
   L(table_48bytes_fwd).  Every step copies 8 bytes with movq from a
   negative offset off %eax (source) to the same offset off %edx
   (destination) and falls through to the next label.
   NOTE(review): at L(fwd_write_2bytes) only the 16-bit load of the last
   two bytes is visible; its store, the rest of the USE_AS_MEMPCPY
   conditional and the return are not shown in this excerpt.  */
2102 L(fwd_write_42bytes):
2103 movq -42(%eax), %xmm0
2104 movq %xmm0, -42(%edx)
2105 L(fwd_write_34bytes):
2106 movq -34(%eax), %xmm0
2107 movq %xmm0, -34(%edx)
2108 L(fwd_write_26bytes):
2109 movq -26(%eax), %xmm0
2110 movq %xmm0, -26(%edx)
2111 L(fwd_write_18bytes):
2112 movq -18(%eax), %xmm0
2113 movq %xmm0, -18(%edx)
2114 L(fwd_write_10bytes):
2115 movq -10(%eax), %xmm0
2116 movq %xmm0, -10(%edx)
2117 L(fwd_write_2bytes):
/* %ecx = last two source bytes, zero-extended.  */
2118 movzwl -2(%eax), %ecx
2120 # ifdef USE_AS_MEMPCPY
2123 movl DEST(%esp), %eax
/* Forward tail chain 47 -> 39 -> 31 -> 23 -> 15 -> 7, entered through
   L(table_48bytes_fwd).  Every step copies 8 bytes with movq from a
   negative offset off %eax (source) to the same offset off %edx
   (destination) and falls through to the next label.
   NOTE(review): at L(fwd_write_7bytes) only two loads of the final bytes
   are visible; the remaining loads/stores, the rest of the
   USE_AS_MEMPCPY conditional and the return are not shown in this
   excerpt.  */
2128 L(fwd_write_47bytes):
2129 movq -47(%eax), %xmm0
2130 movq %xmm0, -47(%edx)
2131 L(fwd_write_39bytes):
2132 movq -39(%eax), %xmm0
2133 movq %xmm0, -39(%edx)
2134 L(fwd_write_31bytes):
2135 movq -31(%eax), %xmm0
2136 movq %xmm0, -31(%edx)
2137 L(fwd_write_23bytes):
2138 movq -23(%eax), %xmm0
2139 movq %xmm0, -23(%edx)
2140 L(fwd_write_15bytes):
2141 movq -15(%eax), %xmm0
2142 movq %xmm0, -15(%edx)
2143 L(fwd_write_7bytes):
/* %ecx = 16-bit word at -3(%eax), zero-extended.  */
2146 movzwl -3(%eax), %ecx
/* Last source byte goes into %eax itself, so this is the final read
   through the source pointer.  */
2147 movzbl -1(%eax), %eax
2150 # ifdef USE_AS_MEMPCPY
2153 movl DEST(%esp), %eax
/* Forward tail chain 43 -> 35 -> 27 -> 19 -> 11 -> 3, entered through
   L(table_48bytes_fwd).  Every step copies 8 bytes with movq from a
   negative offset off %eax (source) to the same offset off %edx
   (destination) and falls through to the next label.
   NOTE(review): at L(fwd_write_3bytes) only the two loads of the final
   three bytes are visible; their stores, the rest of the USE_AS_MEMPCPY
   conditional and the return are not shown in this excerpt.  */
2158 L(fwd_write_43bytes):
2159 movq -43(%eax), %xmm0
2160 movq %xmm0, -43(%edx)
2161 L(fwd_write_35bytes):
2162 movq -35(%eax), %xmm0
2163 movq %xmm0, -35(%edx)
2164 L(fwd_write_27bytes):
2165 movq -27(%eax), %xmm0
2166 movq %xmm0, -27(%edx)
2167 L(fwd_write_19bytes):
2168 movq -19(%eax), %xmm0
2169 movq %xmm0, -19(%edx)
2170 L(fwd_write_11bytes):
2171 movq -11(%eax), %xmm0
2172 movq %xmm0, -11(%edx)
2173 L(fwd_write_3bytes):
/* %ecx = 16-bit word at -3(%eax), zero-extended.  */
2174 movzwl -3(%eax), %ecx
/* Last source byte goes into %eax itself, so this is the final read
   through the source pointer.  */
2175 movzbl -1(%eax), %eax
2178 # ifdef USE_AS_MEMPCPY
2181 movl DEST(%esp), %eax
/* Aligned forward tail chain 40 -> 24 -> 8 -> 0, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  The last 8 bytes are copied with movq.
   NOTE(review): the rest of the USE_AS_MEMPCPY conditional and the
   return are not visible in this excerpt.  */
2186 L(fwd_write_40bytes_align):
2187 movdqa -40(%eax), %xmm0
2188 movdqa %xmm0, -40(%edx)
2189 L(fwd_write_24bytes_align):
2190 movdqa -24(%eax), %xmm0
2191 movdqa %xmm0, -24(%edx)
2192 L(fwd_write_8bytes_align):
2193 movq -8(%eax), %xmm0
2194 movq %xmm0, -8(%edx)
2195 L(fwd_write_0bytes_align):
2196 # ifdef USE_AS_MEMPCPY
2199 movl DEST(%esp), %eax
/* Aligned forward tail chain 32 -> 16, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).
   NOTE(review): the rest of the USE_AS_MEMPCPY conditional and the
   return are not visible in this excerpt.  */
2204 L(fwd_write_32bytes_align):
2205 movdqa -32(%eax), %xmm0
2206 movdqa %xmm0, -32(%edx)
2207 L(fwd_write_16bytes_align):
2208 movdqa -16(%eax), %xmm0
2209 movdqa %xmm0, -16(%edx)
2210 # ifdef USE_AS_MEMPCPY
2213 movl DEST(%esp), %eax
/* Entry 5 of L(table_48bytes_fwd_align).
   NOTE(review): the instructions that copy the 5 bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt; only the load of %eax from DEST(%esp) is shown.  */
2218 L(fwd_write_5bytes_align):
2223 # ifdef USE_AS_MEMPCPY
2226 movl DEST(%esp), %eax
/* Aligned forward tail chain 45 -> 29 -> 13, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_13bytes_align) copies 8 bytes with movq.
   NOTE(review): only the load of the last byte is visible for the final
   5 bytes; the other loads/stores, the rest of the USE_AS_MEMPCPY
   conditional and the return are not shown in this excerpt.  */
2231 L(fwd_write_45bytes_align):
2232 movdqa -45(%eax), %xmm0
2233 movdqa %xmm0, -45(%edx)
2234 L(fwd_write_29bytes_align):
2235 movdqa -29(%eax), %xmm0
2236 movdqa %xmm0, -29(%edx)
2237 L(fwd_write_13bytes_align):
2238 movq -13(%eax), %xmm0
2239 movq %xmm0, -13(%edx)
/* %ecx = last source byte, zero-extended.  */
2242 movzbl -1(%eax), %ecx
2244 # ifdef USE_AS_MEMPCPY
2247 movl DEST(%esp), %eax
/* Aligned forward tail chain 37 -> 21, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).
   NOTE(review): only the load of the last byte is visible for the final
   5 bytes; the other loads/stores, the rest of the USE_AS_MEMPCPY
   conditional and the return are not shown in this excerpt.  */
2252 L(fwd_write_37bytes_align):
2253 movdqa -37(%eax), %xmm0
2254 movdqa %xmm0, -37(%edx)
2255 L(fwd_write_21bytes_align):
2256 movdqa -21(%eax), %xmm0
2257 movdqa %xmm0, -21(%edx)
/* %ecx = last source byte, zero-extended.  */
2260 movzbl -1(%eax), %ecx
2262 # ifdef USE_AS_MEMPCPY
2265 movl DEST(%esp), %eax
/* Aligned forward tail chain 41 -> 25 -> 9 -> 1, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_9bytes_align) copies 8 bytes with movq.
   NOTE(review): the store of the last byte, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2270 L(fwd_write_41bytes_align):
2271 movdqa -41(%eax), %xmm0
2272 movdqa %xmm0, -41(%edx)
2273 L(fwd_write_25bytes_align):
2274 movdqa -25(%eax), %xmm0
2275 movdqa %xmm0, -25(%edx)
2276 L(fwd_write_9bytes_align):
2277 movq -9(%eax), %xmm0
2278 movq %xmm0, -9(%edx)
2279 L(fwd_write_1bytes_align):
/* %ecx = last source byte, zero-extended.  */
2280 movzbl -1(%eax), %ecx
2282 # ifdef USE_AS_MEMPCPY
2285 movl DEST(%esp), %eax
/* Aligned forward tail chain 33 -> 17, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm); the last byte is then loaded
   zero-extended into %ecx.
   NOTE(review): the store of that byte, the rest of the USE_AS_MEMPCPY
   conditional and the return are not visible in this excerpt.  */
2290 L(fwd_write_33bytes_align):
2291 movdqa -33(%eax), %xmm0
2292 movdqa %xmm0, -33(%edx)
2293 L(fwd_write_17bytes_align):
2294 movdqa -17(%eax), %xmm0
2295 movdqa %xmm0, -17(%edx)
2296 movzbl -1(%eax), %ecx
2298 # ifdef USE_AS_MEMPCPY
2301 movl DEST(%esp), %eax
/* Aligned forward tail chain 46 -> 30 -> 14 -> 6, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_14bytes_align) copies 8 bytes with movq.
   NOTE(review): at L(fwd_write_6bytes_align) only the 16-bit load of the
   last two bytes is visible; the other loads/stores, the rest of the
   USE_AS_MEMPCPY conditional and the return are not shown here.  */
2306 L(fwd_write_46bytes_align):
2307 movdqa -46(%eax), %xmm0
2308 movdqa %xmm0, -46(%edx)
2309 L(fwd_write_30bytes_align):
2310 movdqa -30(%eax), %xmm0
2311 movdqa %xmm0, -30(%edx)
2312 L(fwd_write_14bytes_align):
2313 movq -14(%eax), %xmm0
2314 movq %xmm0, -14(%edx)
2315 L(fwd_write_6bytes_align):
/* %ecx = last two source bytes, zero-extended.  */
2318 movzwl -2(%eax), %ecx
2320 # ifdef USE_AS_MEMPCPY
2323 movl DEST(%esp), %eax
/* Aligned forward tail chain 38 -> 22, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).
   NOTE(review): for the final 6 bytes only the 16-bit load of the last
   two bytes is visible; the other loads/stores, the rest of the
   USE_AS_MEMPCPY conditional and the return are not shown here.  */
2328 L(fwd_write_38bytes_align):
2329 movdqa -38(%eax), %xmm0
2330 movdqa %xmm0, -38(%edx)
2331 L(fwd_write_22bytes_align):
2332 movdqa -22(%eax), %xmm0
2333 movdqa %xmm0, -22(%edx)
/* %ecx = last two source bytes, zero-extended.  */
2336 movzwl -2(%eax), %ecx
2338 # ifdef USE_AS_MEMPCPY
2341 movl DEST(%esp), %eax
/* Aligned forward tail chain 42 -> 26 -> 10 -> 2, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_10bytes_align) copies 8 bytes with movq.
   NOTE(review): the store of the last two bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2346 L(fwd_write_42bytes_align):
2347 movdqa -42(%eax), %xmm0
2348 movdqa %xmm0, -42(%edx)
2349 L(fwd_write_26bytes_align):
2350 movdqa -26(%eax), %xmm0
2351 movdqa %xmm0, -26(%edx)
2352 L(fwd_write_10bytes_align):
2353 movq -10(%eax), %xmm0
2354 movq %xmm0, -10(%edx)
2355 L(fwd_write_2bytes_align):
/* %ecx = last two source bytes, zero-extended.  */
2356 movzwl -2(%eax), %ecx
2358 # ifdef USE_AS_MEMPCPY
2361 movl DEST(%esp), %eax
/* Aligned forward tail chain 34 -> 18, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm); the last two bytes are then loaded
   zero-extended into %ecx.
   NOTE(review): the store of those bytes, the rest of the USE_AS_MEMPCPY
   conditional and the return are not visible in this excerpt.  */
2366 L(fwd_write_34bytes_align):
2367 movdqa -34(%eax), %xmm0
2368 movdqa %xmm0, -34(%edx)
2369 L(fwd_write_18bytes_align):
2370 movdqa -18(%eax), %xmm0
2371 movdqa %xmm0, -18(%edx)
2372 movzwl -2(%eax), %ecx
2374 # ifdef USE_AS_MEMPCPY
2377 movl DEST(%esp), %eax
/* Aligned forward tail chain 47 -> 31 -> 15 -> 7, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_15bytes_align) copies 8 bytes with movq.
   NOTE(review): at L(fwd_write_7bytes_align) only two loads of the final
   bytes are visible; the other loads/stores, the rest of the
   USE_AS_MEMPCPY conditional and the return are not shown here.  */
2382 L(fwd_write_47bytes_align):
2383 movdqa -47(%eax), %xmm0
2384 movdqa %xmm0, -47(%edx)
2385 L(fwd_write_31bytes_align):
2386 movdqa -31(%eax), %xmm0
2387 movdqa %xmm0, -31(%edx)
2388 L(fwd_write_15bytes_align):
2389 movq -15(%eax), %xmm0
2390 movq %xmm0, -15(%edx)
2391 L(fwd_write_7bytes_align):
/* %ecx = 16-bit word at -3(%eax), zero-extended.  */
2394 movzwl -3(%eax), %ecx
/* Last source byte goes into %eax itself, so this is the final read
   through the source pointer.  */
2395 movzbl -1(%eax), %eax
2398 # ifdef USE_AS_MEMPCPY
2401 movl DEST(%esp), %eax
/* Aligned forward tail chain 39 -> 23, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).
   NOTE(review): for the final 7 bytes only two loads are visible; the
   other loads/stores, the rest of the USE_AS_MEMPCPY conditional and the
   return are not shown in this excerpt.  */
2406 L(fwd_write_39bytes_align):
2407 movdqa -39(%eax), %xmm0
2408 movdqa %xmm0, -39(%edx)
2409 L(fwd_write_23bytes_align):
2410 movdqa -23(%eax), %xmm0
2411 movdqa %xmm0, -23(%edx)
/* %ecx = 16-bit word at -3(%eax), zero-extended.  */
2414 movzwl -3(%eax), %ecx
/* Last source byte goes into %eax itself, so this is the final read
   through the source pointer.  */
2415 movzbl -1(%eax), %eax
2418 # ifdef USE_AS_MEMPCPY
2421 movl DEST(%esp), %eax
/* Aligned forward tail chain 43 -> 27 -> 11 -> 3, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_11bytes_align) copies 8 bytes with movq.
   NOTE(review): the stores of the final three bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2426 L(fwd_write_43bytes_align):
2427 movdqa -43(%eax), %xmm0
2428 movdqa %xmm0, -43(%edx)
2429 L(fwd_write_27bytes_align):
2430 movdqa -27(%eax), %xmm0
2431 movdqa %xmm0, -27(%edx)
2432 L(fwd_write_11bytes_align):
2433 movq -11(%eax), %xmm0
2434 movq %xmm0, -11(%edx)
2435 L(fwd_write_3bytes_align):
/* %ecx = 16-bit word at -3(%eax), zero-extended.  */
2436 movzwl -3(%eax), %ecx
/* Last source byte goes into %eax itself, so this is the final read
   through the source pointer.  */
2437 movzbl -1(%eax), %eax
2440 # ifdef USE_AS_MEMPCPY
2443 movl DEST(%esp), %eax
/* Aligned forward tail chain 35 -> 19, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).  The final three bytes are then loaded
   as a 16-bit word into %ecx and a byte into %eax (overwriting the
   source pointer).
   NOTE(review): the stores of those bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2448 L(fwd_write_35bytes_align):
2449 movdqa -35(%eax), %xmm0
2450 movdqa %xmm0, -35(%edx)
2451 L(fwd_write_19bytes_align):
2452 movdqa -19(%eax), %xmm0
2453 movdqa %xmm0, -19(%edx)
2454 movzwl -3(%eax), %ecx
2455 movzbl -1(%eax), %eax
2458 # ifdef USE_AS_MEMPCPY
2461 movl DEST(%esp), %eax
/* Aligned forward tail chain 44 -> 28 -> 12 -> 4, entered through
   L(table_48bytes_fwd_align).  The 16-byte steps use movdqa, which
   requires 16-byte aligned addresses, so on entry at N both %eax - N and
   %edx - N are presumably 16-byte aligned (TODO confirm against the
   dispatch code).  L(fwd_write_12bytes_align) copies 8 bytes with movq.
   NOTE(review): the copy of the last 4 bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2466 L(fwd_write_44bytes_align):
2467 movdqa -44(%eax), %xmm0
2468 movdqa %xmm0, -44(%edx)
2469 L(fwd_write_28bytes_align):
2470 movdqa -28(%eax), %xmm0
2471 movdqa %xmm0, -28(%edx)
2472 L(fwd_write_12bytes_align):
2473 movq -12(%eax), %xmm0
2474 movq %xmm0, -12(%edx)
2475 L(fwd_write_4bytes_align):
2478 # ifdef USE_AS_MEMPCPY
2481 movl DEST(%esp), %eax
/* Aligned forward tail chain 36 -> 20, entered through
   L(table_48bytes_fwd_align).  Each step copies 16 bytes with movdqa,
   which requires 16-byte aligned addresses (presumably guaranteed by the
   dispatch code -- TODO confirm).
   NOTE(review): the copy of the last 4 bytes, the rest of the
   USE_AS_MEMPCPY conditional and the return are not visible in this
   excerpt.  */
2486 L(fwd_write_36bytes_align):
2487 movdqa -36(%eax), %xmm0
2488 movdqa %xmm0, -36(%edx)
2489 L(fwd_write_20bytes_align):
2490 movdqa -20(%eax), %xmm0
2491 movdqa %xmm0, -20(%edx)
2494 # ifdef USE_AS_MEMPCPY
2497 movl DEST(%esp), %eax
/* Non-temporal forward copy path.  Loads use movdqu (no alignment
   requirement); stores use movntdq, a non-temporal store that requires a
   16-byte aligned destination.  The code copies 0x80 bytes per loop
   iteration, then an optional 0x40-byte and 0x20-byte block, and finally
   dispatches the remainder through L(table_48bytes_fwd).
   NOTE(review): the entry label, the definition of L(large_page_loop),
   the sub/cmp instructions that set the flags for jae/jl/jb, the end of
   the USE_AS_MEMMOVE conditional and any store fence are not visible in
   this excerpt -- confirm against the full file.  */
2505 movdqu (%eax), %xmm1
2506 # ifdef USE_AS_MEMMOVE
/* memmove build: write %xmm0 (presumably loaded before this point --
   TODO confirm) to the address read from DEST+4(%esp).  */
2507 movl DEST+4(%esp), %edi
2508 movdqu %xmm0, (%edi)
2511 movntdq %xmm1, (%edx)
/* %ecx -= 0x90; lea leaves the flags untouched.  */
2513 lea -0x90(%ecx), %ecx
/* Load 0x80 source bytes into %xmm0-%xmm7, then advance %eax.  */
2518 movdqu (%eax), %xmm0
2519 movdqu 0x10(%eax), %xmm1
2520 movdqu 0x20(%eax), %xmm2
2521 movdqu 0x30(%eax), %xmm3
2522 movdqu 0x40(%eax), %xmm4
2523 movdqu 0x50(%eax), %xmm5
2524 movdqu 0x60(%eax), %xmm6
2525 movdqu 0x70(%eax), %xmm7
2526 lea 0x80(%eax), %eax
/* Stream the 0x80 bytes to the destination, then advance %edx.  */
2529 movntdq %xmm0, (%edx)
2530 movntdq %xmm1, 0x10(%edx)
2531 movntdq %xmm2, 0x20(%edx)
2532 movntdq %xmm3, 0x30(%edx)
2533 movntdq %xmm4, 0x40(%edx)
2534 movntdq %xmm5, 0x50(%edx)
2535 movntdq %xmm6, 0x60(%edx)
2536 movntdq %xmm7, 0x70(%edx)
2537 lea 0x80(%edx), %edx
2538 jae L(large_page_loop)
/* %ecx += 0x80 without touching the flags tested by the jl below.  */
2540 lea 0x80(%ecx), %ecx
2541 jl L(large_page_less_64bytes)
/* Copy one 0x40-byte block.  */
2543 movdqu (%eax), %xmm0
2544 movdqu 0x10(%eax), %xmm1
2545 movdqu 0x20(%eax), %xmm2
2546 movdqu 0x30(%eax), %xmm3
2547 lea 0x40(%eax), %eax
2549 movntdq %xmm0, (%edx)
2550 movntdq %xmm1, 0x10(%edx)
2551 movntdq %xmm2, 0x20(%edx)
2552 movntdq %xmm3, 0x30(%edx)
2553 lea 0x40(%edx), %edx
2555 L(large_page_less_64bytes):
2557 jb L(large_page_less_32bytes)
/* Copy one 0x20-byte block.  */
2558 movdqu (%eax), %xmm0
2559 movdqu 0x10(%eax), %xmm1
2560 lea 0x20(%eax), %eax
2561 movntdq %xmm0, (%edx)
2562 movntdq %xmm1, 0x10(%edx)
2563 lea 0x20(%edx), %edx
2565 L(large_page_less_32bytes):
/* Dispatch through L(table_48bytes_fwd), indexed by %ecx with scale 4.  */
2569 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Backward tail chain 44 -> 36 -> 28 -> 20 -> 12, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 12 bytes, the instruction that
   uses LEN in the USE_AS_MEMPCPY build, the end of that conditional and
   the return are not visible in this excerpt.  */
2572 L(bk_write_44bytes):
2573 movq 36(%eax), %xmm0
2574 movq %xmm0, 36(%edx)
2575 L(bk_write_36bytes):
2576 movq 28(%eax), %xmm0
2577 movq %xmm0, 28(%edx)
2578 L(bk_write_28bytes):
2579 movq 20(%eax), %xmm0
2580 movq %xmm0, 20(%edx)
2581 L(bk_write_20bytes):
2582 movq 12(%eax), %xmm0
2583 movq %xmm0, 12(%edx)
2584 L(bk_write_12bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2591 movl DEST(%esp), %eax
2592 # ifdef USE_AS_MEMPCPY
2593 movl LEN(%esp), %ecx
/* Backward tail chain 40 -> 32 -> 24 -> 16, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 16 bytes (and any labels for
   smaller counts), the instruction that uses LEN in the USE_AS_MEMPCPY
   build, the end of that conditional and the return are not visible in
   this excerpt.  */
2599 L(bk_write_40bytes):
2600 movq 32(%eax), %xmm0
2601 movq %xmm0, 32(%edx)
2602 L(bk_write_32bytes):
2603 movq 24(%eax), %xmm0
2604 movq %xmm0, 24(%edx)
2605 L(bk_write_24bytes):
2606 movq 16(%eax), %xmm0
2607 movq %xmm0, 16(%edx)
2608 L(bk_write_16bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2614 movl DEST(%esp), %eax
2615 # ifdef USE_AS_MEMPCPY
2616 movl LEN(%esp), %ecx
/* Backward tail chain 45 -> 37 -> 29 -> 21 -> 13, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 13 bytes, the instruction that
   uses LEN in the USE_AS_MEMPCPY build, the end of that conditional and
   the return are not visible in this excerpt.  */
2622 L(bk_write_45bytes):
2623 movq 37(%eax), %xmm0
2624 movq %xmm0, 37(%edx)
2625 L(bk_write_37bytes):
2626 movq 29(%eax), %xmm0
2627 movq %xmm0, 29(%edx)
2628 L(bk_write_29bytes):
2629 movq 21(%eax), %xmm0
2630 movq %xmm0, 21(%edx)
2631 L(bk_write_21bytes):
2632 movq 13(%eax), %xmm0
2633 movq %xmm0, 13(%edx)
2634 L(bk_write_13bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2643 movl DEST(%esp), %eax
2644 # ifdef USE_AS_MEMPCPY
2645 movl LEN(%esp), %ecx
/* Backward tail chain 41 -> 33 -> 25 -> 17, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 17 bytes (and any labels for
   smaller counts), the instruction that uses LEN in the USE_AS_MEMPCPY
   build, the end of that conditional and the return are not visible in
   this excerpt.  */
2651 L(bk_write_41bytes):
2652 movq 33(%eax), %xmm0
2653 movq %xmm0, 33(%edx)
2654 L(bk_write_33bytes):
2655 movq 25(%eax), %xmm0
2656 movq %xmm0, 25(%edx)
2657 L(bk_write_25bytes):
2658 movq 17(%eax), %xmm0
2659 movq %xmm0, 17(%edx)
2660 L(bk_write_17bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2668 movl DEST(%esp), %eax
2669 # ifdef USE_AS_MEMPCPY
2670 movl LEN(%esp), %ecx
/* Backward tail chain 46 -> 38 -> 30 -> 22 -> 14, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 14 bytes (and any labels for
   smaller counts), the instruction that uses LEN in the USE_AS_MEMPCPY
   build, the end of that conditional and the return are not visible in
   this excerpt.  */
2676 L(bk_write_46bytes):
2677 movq 38(%eax), %xmm0
2678 movq %xmm0, 38(%edx)
2679 L(bk_write_38bytes):
2680 movq 30(%eax), %xmm0
2681 movq %xmm0, 30(%edx)
2682 L(bk_write_30bytes):
2683 movq 22(%eax), %xmm0
2684 movq %xmm0, 22(%edx)
2685 L(bk_write_22bytes):
2686 movq 14(%eax), %xmm0
2687 movq %xmm0, 14(%edx)
2688 L(bk_write_14bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2696 movl DEST(%esp), %eax
2697 # ifdef USE_AS_MEMPCPY
2698 movl LEN(%esp), %ecx
/* Backward tail chain 42 -> 34 -> 26 -> 18 -> 10, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): the copy of the remaining 10 bytes (and any labels for
   smaller counts), the instruction that uses LEN in the USE_AS_MEMPCPY
   build, the end of that conditional and the return are not visible in
   this excerpt.  */
2704 L(bk_write_42bytes):
2705 movq 34(%eax), %xmm0
2706 movq %xmm0, 34(%edx)
2707 L(bk_write_34bytes):
2708 movq 26(%eax), %xmm0
2709 movq %xmm0, 26(%edx)
2710 L(bk_write_26bytes):
2711 movq 18(%eax), %xmm0
2712 movq %xmm0, 18(%edx)
2713 L(bk_write_18bytes):
2714 movq 10(%eax), %xmm0
2715 movq %xmm0, 10(%edx)
2716 L(bk_write_10bytes):
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2722 movl DEST(%esp), %eax
2723 # ifdef USE_AS_MEMPCPY
2724 movl LEN(%esp), %ecx
/* Backward tail chain 47 -> 39 -> 31 -> 23 -> 15, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): of the final 15 bytes only the 16-bit load from
   1(%eax) is visible; the other loads/stores (and any labels for smaller
   counts), the instruction that uses LEN in the USE_AS_MEMPCPY build,
   the end of that conditional and the return are not shown here.  */
2730 L(bk_write_47bytes):
2731 movq 39(%eax), %xmm0
2732 movq %xmm0, 39(%edx)
2733 L(bk_write_39bytes):
2734 movq 31(%eax), %xmm0
2735 movq %xmm0, 31(%edx)
2736 L(bk_write_31bytes):
2737 movq 23(%eax), %xmm0
2738 movq %xmm0, 23(%edx)
2739 L(bk_write_23bytes):
2740 movq 15(%eax), %xmm0
2741 movq %xmm0, 15(%edx)
2742 L(bk_write_15bytes):
/* %ecx = 16-bit word at 1(%eax), zero-extended.  */
2748 movzwl 1(%eax), %ecx
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2752 movl DEST(%esp), %eax
2753 # ifdef USE_AS_MEMPCPY
2754 movl LEN(%esp), %ecx
/* Backward tail chain 43 -> 35 -> 27 -> 19 -> 11, entered through
   L(table_48_bytes_bwd).  L(bk_write_Nbytes) copies the 8 bytes at offset
   N - 8 from %eax (source) to the same offset from %edx (destination)
   and falls through, so higher addresses are written first.
   NOTE(review): of the final 11 bytes only the 16-bit load from
   1(%eax) is visible; the other loads/stores (and any labels for smaller
   counts), the instruction that uses LEN in the USE_AS_MEMPCPY build,
   the end of that conditional and the return are not shown here.  */
2760 L(bk_write_43bytes):
2761 movq 35(%eax), %xmm0
2762 movq %xmm0, 35(%edx)
2763 L(bk_write_35bytes):
2764 movq 27(%eax), %xmm0
2765 movq %xmm0, 27(%edx)
2766 L(bk_write_27bytes):
2767 movq 19(%eax), %xmm0
2768 movq %xmm0, 19(%edx)
2769 L(bk_write_19bytes):
2770 movq 11(%eax), %xmm0
2771 movq %xmm0, 11(%edx)
2772 L(bk_write_11bytes):
/* %ecx = 16-bit word at 1(%eax), zero-extended.  */
2776 movzwl 1(%eax), %ecx
/* Load %eax from the DEST stack slot; the mempcpy build also loads LEN
   into %ecx.  */
2780 movl DEST(%esp), %eax
2781 # ifdef USE_AS_MEMPCPY
2782 movl LEN(%esp), %ecx
/* Switch to the allocatable read-only section .rodata.ssse3 for the jump
   tables.  NOTE(review): the matching .popsection and any alignment
   directives are not visible in this excerpt.  */
2788 .pushsection .rodata.ssse3,"a",@progbits
/* Jump table for the forward tail handlers: entry N (N = 0..47) refers
   to L(fwd_write_Nbytes).  Each entry is one .int built with JMPTBL,
   which (per the definitions at the top of the file) is either the
   target's offset from the table base or the target itself.  Callers
   index it through BRANCH_TO_JMPTBL_ENTRY with a scale of 4.  */
2790 L(table_48bytes_fwd):
2791 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2792 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2793 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2794 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2795 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2796 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2797 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2798 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2799 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2800 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2801 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2802 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2803 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2804 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2805 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2806 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2807 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2808 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2809 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2810 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2811 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2812 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2813 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2814 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2815 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2816 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2817 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2818 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2819 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2820 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2821 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2822 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2823 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2824 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2825 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2826 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2827 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2828 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2829 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2830 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2831 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2832 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2833 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2834 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2835 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2836 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2837 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2838 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
/* Jump table for the aligned forward tail handlers: entry N (N = 0..47)
   refers to L(fwd_write_Nbytes_align).  Each entry is one .int built
   with JMPTBL, which (per the definitions at the top of the file) is
   either the target's offset from the table base or the target itself.
   NOTE(review): the code that dispatches through this table is not
   visible in this excerpt.  */
2841 L(table_48bytes_fwd_align):
2842 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2843 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2844 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2845 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2846 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2847 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2848 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2849 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2850 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2851 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2852 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2853 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2854 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2855 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2856 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2857 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2858 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2859 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2860 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2861 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2862 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2863 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2864 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2865 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2866 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2867 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2868 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2869 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2870 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2871 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2872 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2873 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2874 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2875 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2876 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2877 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2878 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2879 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2880 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2881 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2882 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2883 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2884 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2885 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2886 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2887 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2888 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2889 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
/* Jump table with 16 entries: entry K (K = 0..15) refers to L(shl_K).
   Each entry is one .int built with JMPTBL relative to L(shl_table).
   NOTE(review): the label line for this table (presumably L(shl_table),
   the base named in every entry) and the code that dispatches through
   it are not visible in this excerpt.  */
2893 .int JMPTBL (L(shl_0), L(shl_table))
2894 .int JMPTBL (L(shl_1), L(shl_table))
2895 .int JMPTBL (L(shl_2), L(shl_table))
2896 .int JMPTBL (L(shl_3), L(shl_table))
2897 .int JMPTBL (L(shl_4), L(shl_table))
2898 .int JMPTBL (L(shl_5), L(shl_table))
2899 .int JMPTBL (L(shl_6), L(shl_table))
2900 .int JMPTBL (L(shl_7), L(shl_table))
2901 .int JMPTBL (L(shl_8), L(shl_table))
2902 .int JMPTBL (L(shl_9), L(shl_table))
2903 .int JMPTBL (L(shl_10), L(shl_table))
2904 .int JMPTBL (L(shl_11), L(shl_table))
2905 .int JMPTBL (L(shl_12), L(shl_table))
2906 .int JMPTBL (L(shl_13), L(shl_table))
2907 .int JMPTBL (L(shl_14), L(shl_table))
2908 .int JMPTBL (L(shl_15), L(shl_table))
/* Jump table for the backward tail handlers: entry N (N = 0..47) refers
   to L(bk_write_Nbytes).  Each entry is one .int built with JMPTBL,
   which (per the definitions at the top of the file) is either the
   target's offset from the table base or the target itself.  Callers
   index it through BRANCH_TO_JMPTBL_ENTRY with a scale of 4.  */
2911 L(table_48_bytes_bwd):
2912 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2913 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2914 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2915 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2916 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2917 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2918 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2919 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2920 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2921 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2922 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2923 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2924 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
2925 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
2926 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
2927 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
2928 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
2929 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
2930 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
2931 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
2932 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
2933 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
2934 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
2935 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
2936 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
2937 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
2938 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
2939 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
2940 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
2941 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
2942 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
2943 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
2944 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
2945 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
2946 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
2947 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
2948 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
2949 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
2950 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
2951 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
2952 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
2953 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
2954 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
2955 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
2956 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
2957 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
2958 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
2959 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
2963 # ifdef USE_AS_MEMMOVE
/* Backward copy path of the memmove build.  In this path the source is
   addressed through %edi and the destination through %edx.
   NOTE(review): the entry label, the cmp/test instructions that feed the
   conditional jumps, the pointer/count updates and several intermediate
   labels are not visible in this excerpt; the comments below describe
   only what is shown.  */
/* %edx += %ecx and %edi += %ecx; lea leaves the flags untouched.  */
2968 lea (%ecx,%edx,1),%edx
2969 lea (%ecx,%edi,1),%edi
2975 jae L(bk_write_more64bytes)
2977 L(bk_write_64bytesless):
2979 jb L(bk_write_less32bytes)
2981 L(bk_write_more32bytes):
2982 /* Copy 32 bytes at a time. */
/* Four 8-byte moves covering the 32 bytes that end at %edi / %edx,
   highest addresses first.  */
2984 movq -8(%edi), %xmm0
2985 movq %xmm0, -8(%edx)
2986 movq -16(%edi), %xmm0
2987 movq %xmm0, -16(%edx)
2988 movq -24(%edi), %xmm0
2989 movq %xmm0, -24(%edx)
2990 movq -32(%edi), %xmm0
2991 movq %xmm0, -32(%edx)
2995 L(bk_write_less32bytes):
3000 L(bk_write_less32bytes_2):
/* Dispatch through L(table_48_bytes_bwd), indexed by %ecx with scale 4.  */
3001 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3008 jbe L(bk_write_less32bytes)
3010 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
3011 then (EDX & 2) must be != 0. */
3031 L(bk_write_more64bytes):
3032 /* Check alignment of last byte. */
3034 jz L(bk_ssse3_cpy_pre)
3036 /* EDX is aligned 4 bytes, but not 16 bytes. */
3045 jz L(bk_ssse3_cpy_pre)
3054 jz L(bk_ssse3_cpy_pre)
3062 L(bk_ssse3_cpy_pre):
3064 jb L(bk_write_more32bytes)
/* Copy 64 bytes, highest 16-byte vector first: unaligned loads from the
   source, aligned (movdqa) stores to the destination, so %edx must be
   16-byte aligned here.  */
3071 movdqu 0x30(%edi), %xmm3
3072 movdqa %xmm3, 0x30(%edx)
3073 movdqu 0x20(%edi), %xmm2
3074 movdqa %xmm2, 0x20(%edx)
3075 movdqu 0x10(%edi), %xmm1
3076 movdqa %xmm1, 0x10(%edx)
3077 movdqu (%edi), %xmm0
3078 movdqa %xmm0, (%edx)
3081 jmp L(bk_write_64bytesless)