2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #if !defined NOT_IN_libc \
25 || defined USE_AS_MEMMOVE \
26 || !defined USE_MULTIARCH)
28 #include "asm-syntax.h"
31 # define MEMCPY __memcpy_ssse3
32 # define MEMCPY_CHK __memcpy_chk_ssse3
36 # define ALIGN(n) .p2align n
39 #define JMPTBL(I, B) I - B
41 /* Branch to an entry in a jump table. TABLE is a jump table with
42 relative offsets. INDEX is a register that contains the index into the
43 jump table. SCALE is the scale of INDEX. */
44 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
45 lea TABLE(%rip), %r11; \
46 movslq (%r11, INDEX, SCALE), INDEX; \
47 lea (%r11, INDEX), INDEX; \
51 .section .text.ssse3,"ax",@progbits
52 #if defined SHARED && !defined NOT_IN_libc
55 jb HIDDEN_JUMPTARGET (__chk_fail)
75 lea L(table_less_80bytes)(%rip), %r11
77 movslq (%r11, %rdx, 4), %r9
86 #ifndef USE_AS_MEMMOVE
100 #ifdef SHARED_CACHE_SIZE_HALF
101 mov $SHARED_CACHE_SIZE_HALF, %rcx
103 mov __x86_64_shared_cache_size_half(%rip), %rcx
110 #ifdef DATA_CACHE_SIZE_HALF
111 mov $DATA_CACHE_SIZE_HALF, %rcx
113 mov __x86_64_data_cache_size_half(%rip), %rcx
115 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
119 movdqu -16(%rsi, %rdx), %xmm0
121 lea -16(%rdi, %rdx), %r8
130 #ifdef SHARED_CACHE_SIZE_HALF
131 mov $SHARED_CACHE_SIZE_HALF, %rcx
133 mov __x86_64_shared_cache_size_half(%rip), %rcx
141 #ifdef DATA_CACHE_SIZE_HALF
142 mov $DATA_CACHE_SIZE_HALF, %rcx
144 mov __x86_64_data_cache_size_half(%rip), %rcx
146 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
159 jb L(shl_0_less_64bytes)
161 movaps 16(%rsi), %xmm1
162 movaps 32(%rsi), %xmm2
163 movaps 48(%rsi), %xmm3
165 movaps %xmm1, 16(%rdi)
166 movaps %xmm2, 32(%rdi)
167 movaps %xmm3, 48(%rdi)
/* Target of the `jb L(shl_0_less_64bytes)' above.  Hands off to the
   table_less_80bytes jump table, indexed by %rdx with 4-byte entries; the
   dispatch macro overwrites %r11 and the index register.
   NOTE(review): the lines between this label and the dispatch are absent
   from this excerpt.  */
171 L(shl_0_less_64bytes):
174 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
178 #ifdef DATA_CACHE_SIZE_HALF
179 cmp $DATA_CACHE_SIZE_HALF, %rdx
181 cmp __x86_64_data_cache_size_half(%rip), %rdx
184 jae L(shl_0_gobble_mem_loop)
/* Copy loop of the shl_0 path.  Every transfer is a 16-byte movaps from an
   offset off %rsi to the same offset off %rdi; movaps faults unless the
   address is 16-byte aligned.  Offsets 0x10-0x70 are visible here.
   NOTE(review): the offset-0 transfer, the pointer updates and the
   arithmetic that sets the flags for the branches are presumably on lines
   absent from this excerpt.  */
185 L(shl_0_gobble_cache_loop):
187 movaps 0x10(%rsi), %xmm1
188 movaps 0x20(%rsi), %xmm2
189 movaps 0x30(%rsi), %xmm3
192 movaps %xmm1, 0x10(%rdi)
193 movaps %xmm2, 0x20(%rdi)
194 movaps %xmm3, 0x30(%rdi)
197 movaps 0x40(%rsi), %xmm4
198 movaps 0x50(%rsi), %xmm5
199 movaps 0x60(%rsi), %xmm6
200 movaps 0x70(%rsi), %xmm7
202 movaps %xmm4, 0x40(%rdi)
203 movaps %xmm5, 0x50(%rdi)
204 movaps %xmm6, 0x60(%rdi)
205 movaps %xmm7, 0x70(%rdi)
/* Loop back while the (not visible) flag-setting instruction reports
   above-or-equal; the moves above do not modify the flags.  */
208 jae L(shl_0_gobble_cache_loop)
/* Skip the extra block below when the (not visible) compare says less.  */
211 jl L(shl_0_cache_less_64bytes)
/* Extra aligned block: offsets 0x10-0x30 are visible.  */
215 movdqa 0x10(%rsi), %xmm1
218 movdqa %xmm1, 0x10(%rdi)
220 movdqa 0x20(%rsi), %xmm4
221 movdqa 0x30(%rsi), %xmm1
224 movdqa %xmm4, 0x20(%rdi)
225 movdqa %xmm1, 0x30(%rdi)
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
227 L(shl_0_cache_less_64bytes):
230 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Copy loop of the shl_0 path that prefetches ahead of the source: two
   prefetcht0 hints at 0x1c0 and 0x280 bytes past %rsi, followed by aligned
   16-byte movdqa transfers from %rsi to %rdi (offsets 0x10-0x70 visible).
   NOTE(review): the offset-0 transfer, the pointer updates and the
   flag-setting arithmetic are presumably on lines absent from this
   excerpt.  */
233 L(shl_0_gobble_mem_loop):
234 prefetcht0 0x1c0(%rsi)
235 prefetcht0 0x280(%rsi)
238 movdqa 0x10(%rsi), %xmm1
239 movdqa 0x20(%rsi), %xmm2
240 movdqa 0x30(%rsi), %xmm3
241 movdqa 0x40(%rsi), %xmm4
242 movdqa 0x50(%rsi), %xmm5
243 movdqa 0x60(%rsi), %xmm6
244 movdqa 0x70(%rsi), %xmm7
248 movdqa %xmm1, 0x10(%rdi)
249 movdqa %xmm2, 0x20(%rdi)
250 movdqa %xmm3, 0x30(%rdi)
251 movdqa %xmm4, 0x40(%rdi)
252 movdqa %xmm5, 0x50(%rdi)
253 movdqa %xmm6, 0x60(%rdi)
254 movdqa %xmm7, 0x70(%rdi)
/* Loop back on above-or-equal; the flags come from a line not shown.  */
257 jae L(shl_0_gobble_mem_loop)
260 jl L(shl_0_mem_less_64bytes)
/* Extra aligned block: offsets 0x10-0x30 are visible.  */
264 movdqa 0x10(%rsi), %xmm1
267 movdqa %xmm1, 0x10(%rdi)
269 movdqa 0x20(%rsi), %xmm0
270 movdqa 0x30(%rsi), %xmm1
273 movdqa %xmm0, 0x20(%rdi)
274 movdqa %xmm1, 0x30(%rdi)
276 L(shl_0_mem_less_64bytes):
/* The compare feeding this jb is not visible in this excerpt.  */
278 jb L(shl_0_mem_less_32bytes)
281 movdqa 0x10(%rsi), %xmm1
284 movdqa %xmm1, 0x10(%rdi)
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
286 L(shl_0_mem_less_32bytes):
289 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
294 movdqa -0x10(%rsi), %xmm1
296 movdqa %xmm1, -0x10(%rdi)
300 ja L(shl_0_gobble_bwd)
302 jb L(shl_0_less_64bytes_bwd)
303 movaps -0x10(%rsi), %xmm0
304 movaps -0x20(%rsi), %xmm1
305 movaps -0x30(%rsi), %xmm2
306 movaps -0x40(%rsi), %xmm3
307 movaps %xmm0, -0x10(%rdi)
308 movaps %xmm1, -0x20(%rdi)
309 movaps %xmm2, -0x30(%rdi)
310 movaps %xmm3, -0x40(%rdi)
/* Target of the `jb L(shl_0_less_64bytes_bwd)' above.  Dispatches straight
   into the table_less_80bytes jump table, indexed by %rdx with 4-byte
   entries; the dispatch macro overwrites %r11 and the index register.  */
314 L(shl_0_less_64bytes_bwd):
315 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
319 #ifdef DATA_CACHE_SIZE_HALF
320 cmp $DATA_CACHE_SIZE_HALF, %rdx
322 cmp __x86_64_data_cache_size_half(%rip), %rdx
325 jae L(shl_0_gobble_mem_bwd_loop)
/* Descending copy loop of the shl_0 path.  Each pass moves eight 16-byte
   slots, at -0x10 down to -0x80 from %rsi to the same offsets from %rdi,
   and then lowers both pointers by 0x80.  All transfers are aligned moves
   (movdqa/movaps), which fault on addresses that are not 16-byte aligned.
   NOTE(review): the instruction that sets the flags for the jae/jl below
   is absent from this excerpt.  */
326 L(shl_0_gobble_bwd_loop):
327 movdqa -0x10(%rsi), %xmm0
328 movaps -0x20(%rsi), %xmm1
329 movaps -0x30(%rsi), %xmm2
330 movaps -0x40(%rsi), %xmm3
332 movdqa %xmm0, -0x10(%rdi)
333 movaps %xmm1, -0x20(%rdi)
334 movaps %xmm2, -0x30(%rdi)
335 movaps %xmm3, -0x40(%rdi)
338 movaps -0x50(%rsi), %xmm4
339 movaps -0x60(%rsi), %xmm5
340 movaps -0x70(%rsi), %xmm6
341 movaps -0x80(%rsi), %xmm7
/* lea lowers the pointer without touching the flags.  */
342 lea -0x80(%rsi), %rsi
343 movaps %xmm4, -0x50(%rdi)
344 movaps %xmm5, -0x60(%rdi)
345 movaps %xmm6, -0x70(%rdi)
346 movaps %xmm7, -0x80(%rdi)
347 lea -0x80(%rdi), %rdi
349 jae L(shl_0_gobble_bwd_loop)
352 jl L(shl_0_gobble_bwd_less_64bytes)
/* Extra block: four more slots, -0x10 down to -0x40.  */
354 movdqa -0x10(%rsi), %xmm0
356 movdqa -0x20(%rsi), %xmm1
358 movdqa %xmm0, -0x10(%rdi)
359 movdqa %xmm1, -0x20(%rdi)
361 movdqa -0x30(%rsi), %xmm0
362 movdqa -0x40(%rsi), %xmm1
365 movdqa %xmm0, -0x30(%rdi)
366 movdqa %xmm1, -0x40(%rdi)
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
368 L(shl_0_gobble_bwd_less_64bytes):
369 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Descending copy loop of the shl_0 path with prefetching: two prefetcht0
   hints at 0x1c0 and 0x280 bytes below %rsi, then eight aligned 16-byte
   movdqa transfers (-0x10 down to -0x80) from %rsi to %rdi, after which
   both pointers are lowered by 0x80.
   NOTE(review): the instructions that set the flags for the jae/jl/jb
   branches are absent from this excerpt.  */
372 L(shl_0_gobble_mem_bwd_loop):
373 prefetcht0 -0x1c0(%rsi)
374 prefetcht0 -0x280(%rsi)
375 movdqa -0x10(%rsi), %xmm0
376 movdqa -0x20(%rsi), %xmm1
377 movdqa -0x30(%rsi), %xmm2
378 movdqa -0x40(%rsi), %xmm3
379 movdqa -0x50(%rsi), %xmm4
380 movdqa -0x60(%rsi), %xmm5
381 movdqa -0x70(%rsi), %xmm6
382 movdqa -0x80(%rsi), %xmm7
383 lea -0x80(%rsi), %rsi
385 movdqa %xmm0, -0x10(%rdi)
386 movdqa %xmm1, -0x20(%rdi)
387 movdqa %xmm2, -0x30(%rdi)
388 movdqa %xmm3, -0x40(%rdi)
389 movdqa %xmm4, -0x50(%rdi)
390 movdqa %xmm5, -0x60(%rdi)
391 movdqa %xmm6, -0x70(%rdi)
392 movdqa %xmm7, -0x80(%rdi)
393 lea -0x80(%rdi), %rdi
395 jae L(shl_0_gobble_mem_bwd_loop)
398 jl L(shl_0_mem_bwd_less_64bytes)
/* Extra block: four more slots, -0x10 down to -0x40.  */
400 movdqa -0x10(%rsi), %xmm0
402 movdqa -0x20(%rsi), %xmm1
404 movdqa %xmm0, -0x10(%rdi)
405 movdqa %xmm1, -0x20(%rdi)
407 movdqa -0x30(%rsi), %xmm0
408 movdqa -0x40(%rsi), %xmm1
411 movdqa %xmm0, -0x30(%rdi)
412 movdqa %xmm1, -0x40(%rdi)
414 L(shl_0_mem_bwd_less_64bytes):
416 jb L(shl_0_mem_bwd_less_32bytes)
/* Extra block: two more slots, -0x10 and -0x20.  */
417 movdqa -0x10(%rsi), %xmm0
419 movdqa -0x20(%rsi), %xmm1
421 movdqa %xmm0, -0x10(%rdi)
422 movdqa %xmm1, -0x20(%rdi)
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
424 L(shl_0_mem_bwd_less_32bytes):
425 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
429 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
431 movaps -0x01(%rsi), %xmm1
433 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
439 prefetchnta 0x1c0(%rsi)
442 movaps 0x0f(%rsi), %xmm2
443 movaps 0x1f(%rsi), %xmm3
444 movaps 0x2f(%rsi), %xmm4
445 movaps 0x3f(%rsi), %xmm5
447 palignr $1, %xmm4, %xmm5
449 palignr $1, %xmm3, %xmm4
450 palignr $1, %xmm2, %xmm3
452 palignr $1, %xmm1, %xmm2
454 movdqa %xmm2, -0x40(%rdi)
455 movaps %xmm3, -0x30(%rdi)
457 movaps %xmm4, -0x20(%rdi)
458 movaps %xmm5, -0x10(%rdi)
462 movaps %xmm4, -0x20(%rdi)
464 movaps %xmm5, -0x10(%rdi)
468 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
472 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
474 movaps -0x01(%rsi), %xmm1
476 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_1 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
481 L(shl_1_bwd_loop_L2):
482 prefetchnta -0x1c0(%rsi)
483 L(shl_1_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x11 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
484 movaps -0x11(%rsi), %xmm2
486 movaps -0x21(%rsi), %xmm3
487 movaps -0x31(%rsi), %xmm4
488 movaps -0x41(%rsi), %xmm5
489 lea -0x40(%rsi), %rsi
/* palignr $1, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 1 byte and keeps the low 16
   bytes in DST.  */
490 palignr $1, %xmm2, %xmm1
491 palignr $1, %xmm3, %xmm2
492 palignr $1, %xmm4, %xmm3
493 palignr $1, %xmm5, %xmm4
495 movaps %xmm1, -0x10(%rdi)
498 movaps %xmm2, -0x20(%rdi)
499 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
501 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
510 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
514 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
516 movaps -0x02(%rsi), %xmm1
518 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
524 prefetchnta 0x1c0(%rsi)
527 movaps 0x0e(%rsi), %xmm2
528 movaps 0x1e(%rsi), %xmm3
529 movaps 0x2e(%rsi), %xmm4
530 movaps 0x3e(%rsi), %xmm5
532 palignr $2, %xmm4, %xmm5
534 palignr $2, %xmm3, %xmm4
535 palignr $2, %xmm2, %xmm3
537 palignr $2, %xmm1, %xmm2
539 movdqa %xmm2, -0x40(%rdi)
540 movaps %xmm3, -0x30(%rdi)
542 movaps %xmm4, -0x20(%rdi)
543 movaps %xmm5, -0x10(%rdi)
547 movaps %xmm4, -0x20(%rdi)
549 movaps %xmm5, -0x10(%rdi)
553 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
557 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
559 movaps -0x02(%rsi), %xmm1
561 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_2 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
566 L(shl_2_bwd_loop_L2):
567 prefetchnta -0x1c0(%rsi)
568 L(shl_2_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x12 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
569 movaps -0x12(%rsi), %xmm2
571 movaps -0x22(%rsi), %xmm3
572 movaps -0x32(%rsi), %xmm4
573 movaps -0x42(%rsi), %xmm5
574 lea -0x40(%rsi), %rsi
/* palignr $2, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 2 bytes and keeps the low 16
   bytes in DST.  */
575 palignr $2, %xmm2, %xmm1
576 palignr $2, %xmm3, %xmm2
577 palignr $2, %xmm4, %xmm3
578 palignr $2, %xmm5, %xmm4
580 movaps %xmm1, -0x10(%rdi)
583 movaps %xmm2, -0x20(%rdi)
584 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
586 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
595 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
599 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
601 movaps -0x03(%rsi), %xmm1
603 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
609 prefetchnta 0x1c0(%rsi)
612 movaps 0x0d(%rsi), %xmm2
613 movaps 0x1d(%rsi), %xmm3
614 movaps 0x2d(%rsi), %xmm4
615 movaps 0x3d(%rsi), %xmm5
617 palignr $3, %xmm4, %xmm5
619 palignr $3, %xmm3, %xmm4
620 palignr $3, %xmm2, %xmm3
622 palignr $3, %xmm1, %xmm2
624 movdqa %xmm2, -0x40(%rdi)
625 movaps %xmm3, -0x30(%rdi)
627 movaps %xmm4, -0x20(%rdi)
628 movaps %xmm5, -0x10(%rdi)
632 movaps %xmm4, -0x20(%rdi)
634 movaps %xmm5, -0x10(%rdi)
638 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
642 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
644 movaps -0x03(%rsi), %xmm1
646 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_3 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
651 L(shl_3_bwd_loop_L2):
652 prefetchnta -0x1c0(%rsi)
653 L(shl_3_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x13 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
654 movaps -0x13(%rsi), %xmm2
656 movaps -0x23(%rsi), %xmm3
657 movaps -0x33(%rsi), %xmm4
658 movaps -0x43(%rsi), %xmm5
659 lea -0x40(%rsi), %rsi
/* palignr $3, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 3 bytes and keeps the low 16
   bytes in DST.  */
660 palignr $3, %xmm2, %xmm1
661 palignr $3, %xmm3, %xmm2
662 palignr $3, %xmm4, %xmm3
663 palignr $3, %xmm5, %xmm4
665 movaps %xmm1, -0x10(%rdi)
668 movaps %xmm2, -0x20(%rdi)
669 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
671 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
680 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
684 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
686 movaps -0x04(%rsi), %xmm1
688 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
694 prefetchnta 0x1c0(%rsi)
697 movaps 0x0c(%rsi), %xmm2
698 movaps 0x1c(%rsi), %xmm3
699 movaps 0x2c(%rsi), %xmm4
700 movaps 0x3c(%rsi), %xmm5
702 palignr $4, %xmm4, %xmm5
704 palignr $4, %xmm3, %xmm4
705 palignr $4, %xmm2, %xmm3
707 palignr $4, %xmm1, %xmm2
709 movdqa %xmm2, -0x40(%rdi)
710 movaps %xmm3, -0x30(%rdi)
712 movaps %xmm4, -0x20(%rdi)
713 movaps %xmm5, -0x10(%rdi)
717 movaps %xmm4, -0x20(%rdi)
719 movaps %xmm5, -0x10(%rdi)
723 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
727 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
729 movaps -0x04(%rsi), %xmm1
731 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_4 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
736 L(shl_4_bwd_loop_L2):
737 prefetchnta -0x1c0(%rsi)
738 L(shl_4_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x14 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
739 movaps -0x14(%rsi), %xmm2
741 movaps -0x24(%rsi), %xmm3
742 movaps -0x34(%rsi), %xmm4
743 movaps -0x44(%rsi), %xmm5
744 lea -0x40(%rsi), %rsi
/* palignr $4, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 4 bytes and keeps the low 16
   bytes in DST.  */
745 palignr $4, %xmm2, %xmm1
746 palignr $4, %xmm3, %xmm2
747 palignr $4, %xmm4, %xmm3
748 palignr $4, %xmm5, %xmm4
750 movaps %xmm1, -0x10(%rdi)
753 movaps %xmm2, -0x20(%rdi)
754 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
756 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
765 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
769 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
771 movaps -0x05(%rsi), %xmm1
773 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
779 prefetchnta 0x1c0(%rsi)
782 movaps 0x0b(%rsi), %xmm2
783 movaps 0x1b(%rsi), %xmm3
784 movaps 0x2b(%rsi), %xmm4
785 movaps 0x3b(%rsi), %xmm5
787 palignr $5, %xmm4, %xmm5
789 palignr $5, %xmm3, %xmm4
790 palignr $5, %xmm2, %xmm3
792 palignr $5, %xmm1, %xmm2
794 movdqa %xmm2, -0x40(%rdi)
795 movaps %xmm3, -0x30(%rdi)
797 movaps %xmm4, -0x20(%rdi)
798 movaps %xmm5, -0x10(%rdi)
802 movaps %xmm4, -0x20(%rdi)
804 movaps %xmm5, -0x10(%rdi)
808 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
812 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
814 movaps -0x05(%rsi), %xmm1
816 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_5 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
821 L(shl_5_bwd_loop_L2):
822 prefetchnta -0x1c0(%rsi)
823 L(shl_5_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x15 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
824 movaps -0x15(%rsi), %xmm2
826 movaps -0x25(%rsi), %xmm3
827 movaps -0x35(%rsi), %xmm4
828 movaps -0x45(%rsi), %xmm5
829 lea -0x40(%rsi), %rsi
/* palignr $5, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 5 bytes and keeps the low 16
   bytes in DST.  */
830 palignr $5, %xmm2, %xmm1
831 palignr $5, %xmm3, %xmm2
832 palignr $5, %xmm4, %xmm3
833 palignr $5, %xmm5, %xmm4
835 movaps %xmm1, -0x10(%rdi)
838 movaps %xmm2, -0x20(%rdi)
839 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
841 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
850 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
854 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
856 movaps -0x06(%rsi), %xmm1
858 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
864 prefetchnta 0x1c0(%rsi)
867 movaps 0x0a(%rsi), %xmm2
868 movaps 0x1a(%rsi), %xmm3
869 movaps 0x2a(%rsi), %xmm4
870 movaps 0x3a(%rsi), %xmm5
872 palignr $6, %xmm4, %xmm5
874 palignr $6, %xmm3, %xmm4
875 palignr $6, %xmm2, %xmm3
877 palignr $6, %xmm1, %xmm2
879 movdqa %xmm2, -0x40(%rdi)
880 movaps %xmm3, -0x30(%rdi)
882 movaps %xmm4, -0x20(%rdi)
883 movaps %xmm5, -0x10(%rdi)
887 movaps %xmm4, -0x20(%rdi)
889 movaps %xmm5, -0x10(%rdi)
893 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
897 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
899 movaps -0x06(%rsi), %xmm1
901 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_6 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
906 L(shl_6_bwd_loop_L2):
907 prefetchnta -0x1c0(%rsi)
908 L(shl_6_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x16 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
909 movaps -0x16(%rsi), %xmm2
911 movaps -0x26(%rsi), %xmm3
912 movaps -0x36(%rsi), %xmm4
913 movaps -0x46(%rsi), %xmm5
914 lea -0x40(%rsi), %rsi
/* palignr $6, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 6 bytes and keeps the low 16
   bytes in DST.  */
915 palignr $6, %xmm2, %xmm1
916 palignr $6, %xmm3, %xmm2
917 palignr $6, %xmm4, %xmm3
918 palignr $6, %xmm5, %xmm4
920 movaps %xmm1, -0x10(%rdi)
923 movaps %xmm2, -0x20(%rdi)
924 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi.  */
926 movaps %xmm3, 0x10(%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
935 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
939 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
941 movaps -0x07(%rsi), %xmm1
943 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
949 prefetchnta 0x1c0(%rsi)
952 movaps 0x09(%rsi), %xmm2
953 movaps 0x19(%rsi), %xmm3
954 movaps 0x29(%rsi), %xmm4
955 movaps 0x39(%rsi), %xmm5
957 palignr $7, %xmm4, %xmm5
959 palignr $7, %xmm3, %xmm4
960 palignr $7, %xmm2, %xmm3
962 palignr $7, %xmm1, %xmm2
964 movdqa %xmm2, -0x40(%rdi)
965 movaps %xmm3, -0x30(%rdi)
967 movaps %xmm4, -0x20(%rdi)
968 movaps %xmm5, -0x10(%rdi)
972 movaps %xmm4, -0x20(%rdi)
974 movaps %xmm5, -0x10(%rdi)
978 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
982 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
984 movaps -0x07(%rsi), %xmm1
986 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_7 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
991 L(shl_7_bwd_loop_L2):
992 prefetchnta -0x1c0(%rsi)
993 L(shl_7_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x17 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
994 movaps -0x17(%rsi), %xmm2
996 movaps -0x27(%rsi), %xmm3
997 movaps -0x37(%rsi), %xmm4
998 movaps -0x47(%rsi), %xmm5
999 lea -0x40(%rsi), %rsi
/* palignr $7, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 7 bytes and keeps the low 16
   bytes in DST.  */
1000 palignr $7, %xmm2, %xmm1
1001 palignr $7, %xmm3, %xmm2
1002 palignr $7, %xmm4, %xmm3
1003 palignr $7, %xmm5, %xmm4
1005 movaps %xmm1, -0x10(%rdi)
1008 movaps %xmm2, -0x20(%rdi)
1009 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1011 movaps %xmm3, 0x10(%rdi)
1013 movaps %xmm4, (%rdi)
/* NOTE(review): this second, identical store presumably belongs to a
   separate exit path whose label line is absent from this excerpt.  */
1017 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1020 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1024 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1026 movaps -0x08(%rsi), %xmm1
1028 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1033 prefetchnta 0x1c0(%rsi)
1036 movaps 0x08(%rsi), %xmm2
1037 movaps 0x18(%rsi), %xmm3
1038 movaps 0x28(%rsi), %xmm4
1039 movaps 0x38(%rsi), %xmm5
1041 palignr $8, %xmm4, %xmm5
1043 palignr $8, %xmm3, %xmm4
1044 palignr $8, %xmm2, %xmm3
1046 palignr $8, %xmm1, %xmm2
1048 movdqa %xmm2, -0x40(%rdi)
1049 movaps %xmm3, -0x30(%rdi)
1051 movaps %xmm4, -0x20(%rdi)
1052 movaps %xmm5, -0x10(%rdi)
1058 movaps %xmm4, -0x20(%rdi)
1060 movaps %xmm5, -0x10(%rdi)
1063 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1067 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1069 movaps -0x08(%rsi), %xmm1
1071 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_8 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1076 L(shl_8_bwd_loop_L2):
1077 prefetchnta -0x1c0(%rsi)
1078 L(shl_8_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x18 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1079 movaps -0x18(%rsi), %xmm2
1081 movaps -0x28(%rsi), %xmm3
1082 movaps -0x38(%rsi), %xmm4
1083 movaps -0x48(%rsi), %xmm5
1084 lea -0x40(%rsi), %rsi
/* palignr $8, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 8 bytes and keeps the low 16
   bytes in DST.  */
1085 palignr $8, %xmm2, %xmm1
1086 palignr $8, %xmm3, %xmm2
1087 palignr $8, %xmm4, %xmm3
1088 palignr $8, %xmm5, %xmm4
1090 movaps %xmm1, -0x10(%rdi)
1093 movaps %xmm2, -0x20(%rdi)
1094 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1096 movaps %xmm3, 0x10(%rdi)
1098 movaps %xmm4, (%rdi)
/* NOTE(review): this second, identical store presumably belongs to a
   separate exit path whose label line is absent from this excerpt.  */
1102 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1105 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1109 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1111 movaps -0x09(%rsi), %xmm1
1113 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1119 prefetchnta 0x1c0(%rsi)
1122 movaps 0x07(%rsi), %xmm2
1123 movaps 0x17(%rsi), %xmm3
1124 movaps 0x27(%rsi), %xmm4
1125 movaps 0x37(%rsi), %xmm5
1127 palignr $9, %xmm4, %xmm5
1129 palignr $9, %xmm3, %xmm4
1130 palignr $9, %xmm2, %xmm3
1132 palignr $9, %xmm1, %xmm2
1134 movdqa %xmm2, -0x40(%rdi)
1135 movaps %xmm3, -0x30(%rdi)
1137 movaps %xmm4, -0x20(%rdi)
1138 movaps %xmm5, -0x10(%rdi)
1142 movaps %xmm4, -0x20(%rdi)
1144 movaps %xmm5, -0x10(%rdi)
1148 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1152 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1154 movaps -0x09(%rsi), %xmm1
1156 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_9 backward path.  Two entry labels: _L2 issues
   a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1, which
   starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1161 L(shl_9_bwd_loop_L2):
1162 prefetchnta -0x1c0(%rsi)
1163 L(shl_9_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x19 is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1164 movaps -0x19(%rsi), %xmm2
1166 movaps -0x29(%rsi), %xmm3
1167 movaps -0x39(%rsi), %xmm4
1168 movaps -0x49(%rsi), %xmm5
1169 lea -0x40(%rsi), %rsi
/* palignr $9, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 9 bytes and keeps the low 16
   bytes in DST.  */
1170 palignr $9, %xmm2, %xmm1
1171 palignr $9, %xmm3, %xmm2
1172 palignr $9, %xmm4, %xmm3
1173 palignr $9, %xmm5, %xmm4
1175 movaps %xmm1, -0x10(%rdi)
1178 movaps %xmm2, -0x20(%rdi)
1179 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1181 movaps %xmm3, 0x10(%rdi)
1183 movaps %xmm4, (%rdi)
/* NOTE(review): this second, identical store presumably belongs to a
   separate exit path whose label line is absent from this excerpt.  */
1187 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1190 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1194 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1196 movaps -0x0a(%rsi), %xmm1
1198 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1204 prefetchnta 0x1c0(%rsi)
1207 movaps 0x06(%rsi), %xmm2
1208 movaps 0x16(%rsi), %xmm3
1209 movaps 0x26(%rsi), %xmm4
1210 movaps 0x36(%rsi), %xmm5
1212 palignr $10, %xmm4, %xmm5
1214 palignr $10, %xmm3, %xmm4
1215 palignr $10, %xmm2, %xmm3
1217 palignr $10, %xmm1, %xmm2
1219 movdqa %xmm2, -0x40(%rdi)
1220 movaps %xmm3, -0x30(%rdi)
1222 movaps %xmm4, -0x20(%rdi)
1223 movaps %xmm5, -0x10(%rdi)
1227 movaps %xmm4, -0x20(%rdi)
1229 movaps %xmm5, -0x10(%rdi)
1233 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1237 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1239 movaps -0x0a(%rsi), %xmm1
1241 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_10 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1246 L(shl_10_bwd_loop_L2):
1247 prefetchnta -0x1c0(%rsi)
1248 L(shl_10_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1a is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1249 movaps -0x1a(%rsi), %xmm2
1251 movaps -0x2a(%rsi), %xmm3
1252 movaps -0x3a(%rsi), %xmm4
1253 movaps -0x4a(%rsi), %xmm5
1254 lea -0x40(%rsi), %rsi
/* palignr $10, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 10 bytes and keeps the low 16
   bytes in DST.  */
1255 palignr $10, %xmm2, %xmm1
1256 palignr $10, %xmm3, %xmm2
1257 palignr $10, %xmm4, %xmm3
1258 palignr $10, %xmm5, %xmm4
1260 movaps %xmm1, -0x10(%rdi)
1263 movaps %xmm2, -0x20(%rdi)
1264 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1266 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1267 jb L(shl_10_bwd_end)
1268 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_10_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1272 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1275 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1279 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1281 movaps -0x0b(%rsi), %xmm1
1283 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1289 prefetchnta 0x1c0(%rsi)
1292 movaps 0x05(%rsi), %xmm2
1293 movaps 0x15(%rsi), %xmm3
1294 movaps 0x25(%rsi), %xmm4
1295 movaps 0x35(%rsi), %xmm5
1297 palignr $11, %xmm4, %xmm5
1299 palignr $11, %xmm3, %xmm4
1300 palignr $11, %xmm2, %xmm3
1302 palignr $11, %xmm1, %xmm2
1304 movdqa %xmm2, -0x40(%rdi)
1305 movaps %xmm3, -0x30(%rdi)
1307 movaps %xmm4, -0x20(%rdi)
1308 movaps %xmm5, -0x10(%rdi)
1312 movaps %xmm4, -0x20(%rdi)
1314 movaps %xmm5, -0x10(%rdi)
1318 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1322 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1324 movaps -0x0b(%rsi), %xmm1
1326 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_11 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1331 L(shl_11_bwd_loop_L2):
1332 prefetchnta -0x1c0(%rsi)
1333 L(shl_11_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1b is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1334 movaps -0x1b(%rsi), %xmm2
1336 movaps -0x2b(%rsi), %xmm3
1337 movaps -0x3b(%rsi), %xmm4
1338 movaps -0x4b(%rsi), %xmm5
1339 lea -0x40(%rsi), %rsi
/* palignr $11, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 11 bytes and keeps the low 16
   bytes in DST.  */
1340 palignr $11, %xmm2, %xmm1
1341 palignr $11, %xmm3, %xmm2
1342 palignr $11, %xmm4, %xmm3
1343 palignr $11, %xmm5, %xmm4
1345 movaps %xmm1, -0x10(%rdi)
1348 movaps %xmm2, -0x20(%rdi)
1349 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1351 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1352 jb L(shl_11_bwd_end)
1353 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_11_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1357 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1360 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1364 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1366 movaps -0x0c(%rsi), %xmm1
1368 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1374 prefetchnta 0x1c0(%rsi)
1377 movaps 0x04(%rsi), %xmm2
1378 movaps 0x14(%rsi), %xmm3
1379 movaps 0x24(%rsi), %xmm4
1380 movaps 0x34(%rsi), %xmm5
1382 palignr $12, %xmm4, %xmm5
1384 palignr $12, %xmm3, %xmm4
1385 palignr $12, %xmm2, %xmm3
1387 palignr $12, %xmm1, %xmm2
1389 movdqa %xmm2, -0x40(%rdi)
1390 movaps %xmm3, -0x30(%rdi)
1392 movaps %xmm4, -0x20(%rdi)
1393 movaps %xmm5, -0x10(%rdi)
1397 movaps %xmm4, -0x20(%rdi)
1399 movaps %xmm5, -0x10(%rdi)
1403 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1407 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1409 movaps -0x0c(%rsi), %xmm1
1411 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_12 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1416 L(shl_12_bwd_loop_L2):
1417 prefetchnta -0x1c0(%rsi)
1418 L(shl_12_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1c is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1419 movaps -0x1c(%rsi), %xmm2
1421 movaps -0x2c(%rsi), %xmm3
1422 movaps -0x3c(%rsi), %xmm4
1423 movaps -0x4c(%rsi), %xmm5
1424 lea -0x40(%rsi), %rsi
/* palignr $12, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 12 bytes and keeps the low 16
   bytes in DST.  */
1425 palignr $12, %xmm2, %xmm1
1426 palignr $12, %xmm3, %xmm2
1427 palignr $12, %xmm4, %xmm3
1428 palignr $12, %xmm5, %xmm4
1430 movaps %xmm1, -0x10(%rdi)
1433 movaps %xmm2, -0x20(%rdi)
1434 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1436 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1437 jb L(shl_12_bwd_end)
1438 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_12_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1442 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1445 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1449 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1451 movaps -0x0d(%rsi), %xmm1
1453 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1459 prefetchnta 0x1c0(%rsi)
1462 movaps 0x03(%rsi), %xmm2
1463 movaps 0x13(%rsi), %xmm3
1464 movaps 0x23(%rsi), %xmm4
1465 movaps 0x33(%rsi), %xmm5
1467 palignr $13, %xmm4, %xmm5
1469 palignr $13, %xmm3, %xmm4
1470 palignr $13, %xmm2, %xmm3
1472 palignr $13, %xmm1, %xmm2
1474 movdqa %xmm2, -0x40(%rdi)
1475 movaps %xmm3, -0x30(%rdi)
1477 movaps %xmm4, -0x20(%rdi)
1478 movaps %xmm5, -0x10(%rdi)
1482 movaps %xmm4, -0x20(%rdi)
1484 movaps %xmm5, -0x10(%rdi)
1488 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1492 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1494 movaps -0x0d(%rsi), %xmm1
1496 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_13 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1501 L(shl_13_bwd_loop_L2):
1502 prefetchnta -0x1c0(%rsi)
1503 L(shl_13_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1d is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1504 movaps -0x1d(%rsi), %xmm2
1506 movaps -0x2d(%rsi), %xmm3
1507 movaps -0x3d(%rsi), %xmm4
1508 movaps -0x4d(%rsi), %xmm5
1509 lea -0x40(%rsi), %rsi
/* palignr $13, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 13 bytes and keeps the low 16
   bytes in DST.  */
1510 palignr $13, %xmm2, %xmm1
1511 palignr $13, %xmm3, %xmm2
1512 palignr $13, %xmm4, %xmm3
1513 palignr $13, %xmm5, %xmm4
1515 movaps %xmm1, -0x10(%rdi)
1518 movaps %xmm2, -0x20(%rdi)
1519 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1521 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1522 jb L(shl_13_bwd_end)
1523 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_13_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1527 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1530 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1534 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1536 movaps -0x0e(%rsi), %xmm1
1538 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1544 prefetchnta 0x1c0(%rsi)
1547 movaps 0x02(%rsi), %xmm2
1548 movaps 0x12(%rsi), %xmm3
1549 movaps 0x22(%rsi), %xmm4
1550 movaps 0x32(%rsi), %xmm5
1552 palignr $14, %xmm4, %xmm5
1554 palignr $14, %xmm3, %xmm4
1555 palignr $14, %xmm2, %xmm3
1557 palignr $14, %xmm1, %xmm2
1559 movdqa %xmm2, -0x40(%rdi)
1560 movaps %xmm3, -0x30(%rdi)
1562 movaps %xmm4, -0x20(%rdi)
1563 movaps %xmm5, -0x10(%rdi)
1567 movaps %xmm4, -0x20(%rdi)
1569 movaps %xmm5, -0x10(%rdi)
1573 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1577 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1579 movaps -0x0e(%rsi), %xmm1
1581 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_14 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1586 L(shl_14_bwd_loop_L2):
1587 prefetchnta -0x1c0(%rsi)
1588 L(shl_14_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1e is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1589 movaps -0x1e(%rsi), %xmm2
1591 movaps -0x2e(%rsi), %xmm3
1592 movaps -0x3e(%rsi), %xmm4
1593 movaps -0x4e(%rsi), %xmm5
1594 lea -0x40(%rsi), %rsi
/* palignr $14, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 14 bytes and keeps the low 16
   bytes in DST.  */
1595 palignr $14, %xmm2, %xmm1
1596 palignr $14, %xmm3, %xmm2
1597 palignr $14, %xmm4, %xmm3
1598 palignr $14, %xmm5, %xmm4
1600 movaps %xmm1, -0x10(%rdi)
1603 movaps %xmm2, -0x20(%rdi)
1604 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1606 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1607 jb L(shl_14_bwd_end)
1608 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_14_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1612 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1615 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1619 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1621 movaps -0x0f(%rsi), %xmm1
1623 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1629 prefetchnta 0x1c0(%rsi)
1632 movaps 0x01(%rsi), %xmm2
1633 movaps 0x11(%rsi), %xmm3
1634 movaps 0x21(%rsi), %xmm4
1635 movaps 0x31(%rsi), %xmm5
1637 palignr $15, %xmm4, %xmm5
1639 palignr $15, %xmm3, %xmm4
1640 palignr $15, %xmm2, %xmm3
1642 palignr $15, %xmm1, %xmm2
1644 movdqa %xmm2, -0x40(%rdi)
1645 movaps %xmm3, -0x30(%rdi)
1647 movaps %xmm4, -0x20(%rdi)
1648 movaps %xmm5, -0x10(%rdi)
1652 movaps %xmm4, -0x20(%rdi)
1654 movaps %xmm5, -0x10(%rdi)
1658 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1662 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1664 movaps -0x0f(%rsi), %xmm1
1666 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
/* Inner loop body of the shl_15 backward path.  Two entry labels: _L2
   issues a prefetchnta 0x1c0 bytes below %rsi and falls through into _L1,
   which starts directly at the loads.
   NOTE(review): several lines (flag-setting arithmetic, the loop-back
   jump, the exit label) are absent from this excerpt.  */
1671 L(shl_15_bwd_loop_L2):
1672 prefetchnta -0x1c0(%rsi)
1673 L(shl_15_bwd_loop_L1):
/* Four 16-byte movaps loads.  movaps faults on unaligned addresses, so
   %rsi - 0x1f is presumably 16-byte aligned here -- TODO confirm against
   the setup code, which is not visible.  */
1674 movaps -0x1f(%rsi), %xmm2
1676 movaps -0x2f(%rsi), %xmm3
1677 movaps -0x3f(%rsi), %xmm4
1678 movaps -0x4f(%rsi), %xmm5
1679 lea -0x40(%rsi), %rsi
/* palignr $15, SRC, DST (AT&T order) forms the 32-byte value DST:SRC with
   SRC in the low half, shifts it right by 15 bytes and keeps the low 16
   bytes in DST.  */
1680 palignr $15, %xmm2, %xmm1
1681 palignr $15, %xmm3, %xmm2
1682 palignr $15, %xmm4, %xmm3
1683 palignr $15, %xmm5, %xmm4
1685 movaps %xmm1, -0x10(%rdi)
1688 movaps %xmm2, -0x20(%rdi)
1689 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the slot at -0x30
   relative to the previous %rdi and (%rdi) is the slot at -0x40.  */
1691 movaps %xmm3, 0x10(%rdi)
/* The flags tested here are set by an instruction absent from this
   excerpt; lea, movaps and palignr leave the flags untouched.  */
1692 jb L(shl_15_bwd_end)
1693 movaps %xmm4, (%rdi)
/* NOTE(review): presumably the start of L(shl_15_bwd_end), the jb target
   above; its label line is absent from this excerpt.  */
1697 movaps %xmm4, (%rdi)
/* Dispatch through table_less_80bytes, indexed by %rdx (4-byte entries).  */
1700 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1704 movdqu -72(%rsi), %xmm0
1705 movdqu -56(%rsi), %xmm1
1711 movdqu %xmm0, -72(%rdi)
1712 movdqu %xmm1, -56(%rdi)
1722 movdqu -64(%rsi), %xmm0
1729 movdqu %xmm0, -64(%rdi)
1740 movdqu -56(%rsi), %xmm0
1746 movdqu %xmm0, -56(%rdi)
1823 movdqu -73(%rsi), %xmm0
1824 movdqu -57(%rsi), %xmm1
1831 movdqu %xmm0, -73(%rdi)
1832 movdqu %xmm1, -57(%rdi)
1843 movdqu -65(%rsi), %xmm0
1844 movdqu -49(%rsi), %xmm1
1850 movdqu %xmm0, -65(%rdi)
1851 movdqu %xmm1, -49(%rdi)
1861 movdqu -57(%rsi), %xmm0
1868 movdqu %xmm0, -57(%rdi)
1879 movdqu -49(%rsi), %xmm0
1885 movdqu %xmm0, -49(%rdi)
1961 movdqu -74(%rsi), %xmm0
1962 movdqu -58(%rsi), %xmm1
1969 movdqu %xmm0, -74(%rdi)
1970 movdqu %xmm1, -58(%rdi)
1981 movdqu -66(%rsi), %xmm0
1982 movdqu -50(%rsi), %xmm1
1989 movdqu %xmm0, -66(%rdi)
1990 movdqu %xmm1, -50(%rdi)
2001 movdqu -58(%rsi), %xmm1
2008 movdqu %xmm1, -58(%rdi)
2019 movdqu -50(%rsi), %xmm0
2025 movdqu %xmm0, -50(%rdi)
2101 movdqu -75(%rsi), %xmm0
2102 movdqu -59(%rsi), %xmm1
2109 movdqu %xmm0, -75(%rdi)
2110 movdqu %xmm1, -59(%rdi)
2121 movdqu -67(%rsi), %xmm0
2122 movdqu -59(%rsi), %xmm1
2129 movdqu %xmm0, -67(%rdi)
2130 movdqu %xmm1, -59(%rdi)
2141 movdqu -59(%rsi), %xmm0
2148 movdqu %xmm0, -59(%rdi)
2159 movdqu -51(%rsi), %xmm0
2165 movdqu %xmm0, -51(%rdi)
2243 movdqu -76(%rsi), %xmm0
2244 movdqu -60(%rsi), %xmm1
2251 movdqu %xmm0, -76(%rdi)
2252 movdqu %xmm1, -60(%rdi)
2263 movdqu -68(%rsi), %xmm0
2264 movdqu -52(%rsi), %xmm1
2270 movdqu %xmm0, -68(%rdi)
2271 movdqu %xmm1, -52(%rdi)
2281 movdqu -60(%rsi), %xmm0
2288 movdqu %xmm0, -60(%rdi)
2299 movdqu -52(%rsi), %xmm0
2305 movdqu %xmm0, -52(%rdi)
2381 movdqu -77(%rsi), %xmm0
2382 movdqu -61(%rsi), %xmm1
2389 movdqu %xmm0, -77(%rdi)
2390 movdqu %xmm1, -61(%rdi)
2401 movdqu -69(%rsi), %xmm0
2402 movdqu -53(%rsi), %xmm1
2408 movdqu %xmm0, -69(%rdi)
2409 movdqu %xmm1, -53(%rdi)
2419 movdqu -61(%rsi), %xmm0
2426 movdqu %xmm0, -61(%rdi)
2437 movdqu -53(%rsi), %xmm0
2444 movdqu %xmm0, -53(%rdi)
2522 movdqu -78(%rsi), %xmm0
2523 movdqu -62(%rsi), %xmm1
2530 movdqu %xmm0, -78(%rdi)
2531 movdqu %xmm1, -62(%rdi)
2542 movdqu -70(%rsi), %xmm0
2543 movdqu -54(%rsi), %xmm1
2549 movdqu %xmm0, -70(%rdi)
2550 movdqu %xmm1, -54(%rdi)
2560 movdqu -62(%rsi), %xmm0
2567 movdqu %xmm0, -62(%rdi)
2578 movdqu -54(%rsi), %xmm0
2584 movdqu %xmm0, -54(%rdi)
2662 movdqu -79(%rsi), %xmm0
2663 movdqu -63(%rsi), %xmm1
2670 movdqu %xmm0, -79(%rdi)
2671 movdqu %xmm1, -63(%rdi)
2682 movdqu -71(%rsi), %xmm0
2683 movdqu -55(%rsi), %xmm1
2689 movdqu %xmm0, -71(%rdi)
2690 movdqu %xmm1, -55(%rdi)
2700 movdqu -63(%rsi), %xmm0
2707 movdqu %xmm0, -63(%rdi)
2718 movdqu -55(%rsi), %xmm0
2724 movdqu %xmm0, -55(%rdi)
2802 movdqu (%rsi), %xmm1
2805 movntdq %xmm1, (%rdi)
2807 lea -0x90(%rdx), %rdx
2808 #ifdef USE_AS_MEMMOVE
2812 jae L(memmove_is_memcpy_fwd)
2815 jb L(ll_cache_copy_fwd_start)
/* Ascending bulk copy using non-temporal stores.  Each pass loads 0x80
   bytes from %rsi with unaligned movdqu loads, advances %rsi, writes them
   to %rdi with movntdq (which requires a 16-byte-aligned destination) and
   advances %rdi.
   NOTE(review): L(large_page_loop), the target of the jae below, and the
   instructions that set the flags for jae/jl are absent from this
   excerpt.  */
2816 L(memmove_is_memcpy_fwd):
2819 movdqu (%rsi), %xmm0
2820 movdqu 0x10(%rsi), %xmm1
2821 movdqu 0x20(%rsi), %xmm2
2822 movdqu 0x30(%rsi), %xmm3
2823 movdqu 0x40(%rsi), %xmm4
2824 movdqu 0x50(%rsi), %xmm5
2825 movdqu 0x60(%rsi), %xmm6
2826 movdqu 0x70(%rsi), %xmm7
2827 lea 0x80(%rsi), %rsi
2830 movntdq %xmm0, (%rdi)
2831 movntdq %xmm1, 0x10(%rdi)
2832 movntdq %xmm2, 0x20(%rdi)
2833 movntdq %xmm3, 0x30(%rdi)
2834 movntdq %xmm4, 0x40(%rdi)
2835 movntdq %xmm5, 0x50(%rdi)
2836 movntdq %xmm6, 0x60(%rdi)
2837 movntdq %xmm7, 0x70(%rdi)
2838 lea 0x80(%rdi), %rdi
2839 jae L(large_page_loop)
/* lea adds 0x80 back to %rdx without disturbing the flags that the jl
   below consumes.  */
2841 lea 0x80(%rdx), %rdx
2842 jl L(large_page_less_64bytes)
/* One further 0x40-byte block, same load/store pattern.  */
2844 movdqu (%rsi), %xmm0
2845 movdqu 0x10(%rsi), %xmm1
2846 movdqu 0x20(%rsi), %xmm2
2847 movdqu 0x30(%rsi), %xmm3
2848 lea 0x40(%rsi), %rsi
2850 movntdq %xmm0, (%rdi)
2851 movntdq %xmm1, 0x10(%rdi)
2852 movntdq %xmm2, 0x20(%rdi)
2853 movntdq %xmm3, 0x30(%rdi)
2854 lea 0x40(%rdi), %rdi
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
2856 L(large_page_less_64bytes):
2860 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2862 #ifdef USE_AS_MEMMOVE
/* Ascending bulk copy with ordinary (cached) stores.  Each pass prefetches
   0x1c0 and 0x200 bytes ahead of %rsi, loads 0x80 bytes with unaligned
   movdqu loads, advances %rsi, writes them with movaps (destination must
   be 16-byte aligned) and advances %rdi.
   NOTE(review): the instructions that set the flags for jae/jl are absent
   from this excerpt.  */
2864 L(ll_cache_copy_fwd_start):
2865 prefetcht0 0x1c0(%rsi)
2866 prefetcht0 0x200(%rsi)
2867 movdqu (%rsi), %xmm0
2868 movdqu 0x10(%rsi), %xmm1
2869 movdqu 0x20(%rsi), %xmm2
2870 movdqu 0x30(%rsi), %xmm3
2871 movdqu 0x40(%rsi), %xmm4
2872 movdqu 0x50(%rsi), %xmm5
2873 movdqu 0x60(%rsi), %xmm6
2874 movdqu 0x70(%rsi), %xmm7
2875 lea 0x80(%rsi), %rsi
2878 movaps %xmm0, (%rdi)
2879 movaps %xmm1, 0x10(%rdi)
2880 movaps %xmm2, 0x20(%rdi)
2881 movaps %xmm3, 0x30(%rdi)
2882 movaps %xmm4, 0x40(%rdi)
2883 movaps %xmm5, 0x50(%rdi)
2884 movaps %xmm6, 0x60(%rdi)
2885 movaps %xmm7, 0x70(%rdi)
2886 lea 0x80(%rdi), %rdi
2887 jae L(ll_cache_copy_fwd_start)
/* lea adds 0x80 back to %rdx without disturbing the flags that the jl
   below consumes.  */
2889 lea 0x80(%rdx), %rdx
2890 jl L(large_page_ll_less_fwd_64bytes)
/* One further 0x40-byte block, same load/store pattern.  */
2892 movdqu (%rsi), %xmm0
2893 movdqu 0x10(%rsi), %xmm1
2894 movdqu 0x20(%rsi), %xmm2
2895 movdqu 0x30(%rsi), %xmm3
2896 lea 0x40(%rsi), %rsi
2898 movaps %xmm0, (%rdi)
2899 movaps %xmm1, 0x10(%rdi)
2900 movaps %xmm2, 0x20(%rdi)
2901 movaps %xmm3, 0x30(%rdi)
2902 lea 0x40(%rdi), %rdi
/* Finish through the table_less_80bytes jump table, indexed by %rdx.  */
2904 L(large_page_ll_less_fwd_64bytes):
2907 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward copy (descending addresses) that writes the destination with
   non-temporal stores.  %rsi and %rdi point just past the data still to
   be copied and are decremented as the copy proceeds.
   NOTE(review): the entry label of this routine, the matching #endif and
   the flag-setting instructions the branches below depend on are not
   visible in this excerpt.  */
/* Copy the 16 bytes just below %rsi to just below %rdi.  movdqa needs a
   16-byte-aligned memory operand, so %rdi is assumed to be aligned --
   TODO confirm against the entry code.  */
2912 movdqu -0x10(%rsi), %xmm1
2915 movdqa %xmm1, -0x10(%rdi)
/* rdx -= 0x90.  lea does not modify the flags.  */
2917 lea -0x90(%rdx), %rdx
2918 #ifdef USE_AS_MEMMOVE
/* memmove build only: choose between the non-temporal loop below and
   L(ll_cache_copy_bwd_start).  The comparisons that set the flags for
   these two branches are not visible in this excerpt.  */
2922 jae L(memmove_is_memcpy_bwd)
2924 jb L(ll_cache_copy_bwd_start)
2925 L(memmove_is_memcpy_bwd):
2927 L(large_page_bwd_loop):
/* Load the 128 bytes below %rsi (8 x 16, unaligned loads), then move
   %rsi down by 0x80.  */
2928 movdqu -0x10(%rsi), %xmm0
2929 movdqu -0x20(%rsi), %xmm1
2930 movdqu -0x30(%rsi), %xmm2
2931 movdqu -0x40(%rsi), %xmm3
2932 movdqu -0x50(%rsi), %xmm4
2933 movdqu -0x60(%rsi), %xmm5
2934 movdqu -0x70(%rsi), %xmm6
2935 movdqu -0x80(%rsi), %xmm7
2936 lea -0x80(%rsi), %rsi
/* Store them below %rdi with non-temporal stores, then move %rdi down by
   0x80.  */
2939 movntdq %xmm0, -0x10(%rdi)
2940 movntdq %xmm1, -0x20(%rdi)
2941 movntdq %xmm2, -0x30(%rdi)
2942 movntdq %xmm3, -0x40(%rdi)
2943 movntdq %xmm4, -0x50(%rdi)
2944 movntdq %xmm5, -0x60(%rdi)
2945 movntdq %xmm6, -0x70(%rdi)
2946 movntdq %xmm7, -0x80(%rdi)
2947 lea -0x80(%rdi), %rdi
/* Loop back on a condition set by an instruction not visible here (none
   of the loads, stores or lea above modifies the flags).  */
2948 jae L(large_page_bwd_loop)
/* rdx += 0x80 (flags untouched), then skip the 64-byte step when the
   (not visible) preceding comparison yields "less".  */
2950 lea 0x80(%rdx), %rdx
2951 jl L(large_page_less_bwd_64bytes)
/* Copy one further 64-byte chunk backwards and move both pointers down
   by 0x40.  */
2953 movdqu -0x10(%rsi), %xmm0
2954 movdqu -0x20(%rsi), %xmm1
2955 movdqu -0x30(%rsi), %xmm2
2956 movdqu -0x40(%rsi), %xmm3
2957 lea -0x40(%rsi), %rsi
2959 movntdq %xmm0, -0x10(%rdi)
2960 movntdq %xmm1, -0x20(%rdi)
2961 movntdq %xmm2, -0x30(%rdi)
2962 movntdq %xmm3, -0x40(%rdi)
2963 lea -0x40(%rdi), %rdi
2965 L(large_page_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (4-byte entries);
   %rdx is assumed to be in 0..79 here -- TODO confirm.  */
2967 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2969 #ifdef USE_AS_MEMMOVE
/* memmove build only: backward copy loop that uses ordinary aligned
   stores (movaps) instead of non-temporal stores; reached from the
   `jb L(ll_cache_copy_bwd_start)' branch.
   NOTE(review): the matching #endif and the flag-setting instructions
   that the branches below depend on are not visible in this excerpt.  */
2971 L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below %rsi.  */
2972 prefetcht0 -0x1c0(%rsi)
2973 prefetcht0 -0x200(%rsi)
/* Load the 128 bytes below %rsi (8 x 16, unaligned loads), then move
   %rsi down by 0x80.  */
2974 movdqu -0x10(%rsi), %xmm0
2975 movdqu -0x20(%rsi), %xmm1
2976 movdqu -0x30(%rsi), %xmm2
2977 movdqu -0x40(%rsi), %xmm3
2978 movdqu -0x50(%rsi), %xmm4
2979 movdqu -0x60(%rsi), %xmm5
2980 movdqu -0x70(%rsi), %xmm6
2981 movdqu -0x80(%rsi), %xmm7
2982 lea -0x80(%rsi), %rsi
/* Store them below %rdi with movaps, which needs 16-byte-aligned memory
   operands; %rdi is assumed to be aligned here -- TODO confirm.  Then
   move %rdi down by 0x80.  */
2985 movaps %xmm0, -0x10(%rdi)
2986 movaps %xmm1, -0x20(%rdi)
2987 movaps %xmm2, -0x30(%rdi)
2988 movaps %xmm3, -0x40(%rdi)
2989 movaps %xmm4, -0x50(%rdi)
2990 movaps %xmm5, -0x60(%rdi)
2991 movaps %xmm6, -0x70(%rdi)
2992 movaps %xmm7, -0x80(%rdi)
2993 lea -0x80(%rdi), %rdi
/* Loop back on a condition set by an instruction not visible here (none
   of the loads, stores or lea above modifies the flags).  */
2994 jae L(ll_cache_copy_bwd_start)
/* rdx += 0x80 (flags untouched), then skip the 64-byte step when the
   (not visible) preceding comparison yields "less".  */
2996 lea 0x80(%rdx), %rdx
2997 jl L(large_page_ll_less_bwd_64bytes)
/* Copy one further 64-byte chunk backwards and move both pointers down
   by 0x40.  */
2999 movdqu -0x10(%rsi), %xmm0
3000 movdqu -0x20(%rsi), %xmm1
3001 movdqu -0x30(%rsi), %xmm2
3002 movdqu -0x40(%rsi), %xmm3
3003 lea -0x40(%rsi), %rsi
3005 movaps %xmm0, -0x10(%rdi)
3006 movaps %xmm1, -0x20(%rdi)
3007 movaps %xmm2, -0x30(%rdi)
3008 movaps %xmm3, -0x40(%rdi)
3009 lea -0x40(%rdi), %rdi
3011 L(large_page_ll_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (4-byte entries);
   %rdx is assumed to be in 0..79 here -- TODO confirm.  */
3012 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Jump tables are placed in a read-only data section ("a": allocatable,
   neither writable nor executable).  */
3017 .section .rodata.ssse3,"a",@progbits
/* Jump table with one entry per index 0..79: entry N refers to
   L(write_Nbytes).  JMPTBL (I, B) expands to I - B, so each .int holds
   the handler's offset from the start of this table rather than an
   absolute address.  Users index the table with a scale of 4 and add the
   sign-extended entry back to the table address
   (see BRANCH_TO_JMPTBL_ENTRY).  */
3019 L(table_less_80bytes):
3020 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3021 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3022 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3023 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3024 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3025 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3026 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3027 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3028 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3029 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3030 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3031 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
/* Jump table entries for L(shl_0) .. L(shl_15), each stored as the
   handler's offset from L(shl_table) (JMPTBL (I, B) expands to I - B).
   The table is used via BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
   near the top of the file.
   NOTE(review): the `L(shl_table):' label line itself is not visible in
   this excerpt; the index is presumably a 0..15 misalignment amount --
   confirm against the code that computes %r9.  */
3103 .int JMPTBL (L(shl_0), L(shl_table))
3104 .int JMPTBL (L(shl_1), L(shl_table))
3105 .int JMPTBL (L(shl_2), L(shl_table))
3106 .int JMPTBL (L(shl_3), L(shl_table))
3107 .int JMPTBL (L(shl_4), L(shl_table))
3108 .int JMPTBL (L(shl_5), L(shl_table))
3109 .int JMPTBL (L(shl_6), L(shl_table))
3110 .int JMPTBL (L(shl_7), L(shl_table))
3111 .int JMPTBL (L(shl_8), L(shl_table))
3112 .int JMPTBL (L(shl_9), L(shl_table))
3113 .int JMPTBL (L(shl_10), L(shl_table))
3114 .int JMPTBL (L(shl_11), L(shl_table))
3115 .int JMPTBL (L(shl_12), L(shl_table))
3116 .int JMPTBL (L(shl_13), L(shl_table))
3117 .int JMPTBL (L(shl_14), L(shl_table))
3118 .int JMPTBL (L(shl_15), L(shl_table))
/* Jump table entries for L(shl_0_bwd) .. L(shl_15_bwd), each stored as
   the handler's offset from L(shl_table_bwd) (JMPTBL (I, B) expands to
   I - B).
   NOTE(review): the `L(shl_table_bwd):' label line and the code that
   dispatches through this table are not visible in this excerpt; it is
   presumably the backward-copy counterpart of L(shl_table) -- confirm.  */
3122 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3123 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3124 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3125 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3126 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3127 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3128 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3129 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3130 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3131 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3132 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3133 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3134 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3135 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))