2 Copyright (C) 2010-2016 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
27 #include "asm-syntax.h"
30 # define MEMCPY __memcpy_ssse3
31 # define MEMCPY_CHK __memcpy_chk_ssse3
32 # define MEMPCPY __mempcpy_ssse3
33 # define MEMPCPY_CHK __mempcpy_chk_ssse3
/* Jump-table entry for target label I in the table whose base label
   is B: the entry is the difference I - B, i.e. an offset relative to
   the table base rather than an absolute address.  */
36 #define JMPTBL(I, B) I - B
38 /* Branch to an entry in a jump table. TABLE is a jump table with
39 relative offsets. INDEX is a register that contains the index into
40 the jump table. SCALE is the scale of INDEX. */
41 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
42 lea TABLE(%rip), %r11; \
43 movslq (%r11, INDEX, SCALE), INDEX; \
44 lea (%r11, INDEX), INDEX; \
48 .section .text.ssse3,"ax",@progbits
49 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
52 jb HIDDEN_JUMPTARGET (__chk_fail)
62 #if !defined USE_AS_BCOPY
65 jb HIDDEN_JUMPTARGET (__chk_fail)
86 lea L(table_less_80bytes)(%rip), %r11
88 movslq (%r11, %rdx, 4), %r9
97 #ifndef USE_AS_MEMMOVE
111 #ifdef SHARED_CACHE_SIZE_HALF
112 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
114 mov __x86_shared_cache_size_half(%rip), %RCX_LP
121 #ifdef DATA_CACHE_SIZE_HALF
122 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
124 mov __x86_data_cache_size_half(%rip), %RCX_LP
126 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
130 movdqu -16(%rsi, %rdx), %xmm0
132 lea -16(%rdi, %rdx), %r8
141 #ifdef SHARED_CACHE_SIZE_HALF
142 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
144 mov __x86_shared_cache_size_half(%rip), %RCX_LP
152 #ifdef DATA_CACHE_SIZE_HALF
153 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
155 mov __x86_data_cache_size_half(%rip), %RCX_LP
157 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
170 jb L(shl_0_less_64bytes)
172 movaps 16(%rsi), %xmm1
173 movaps 32(%rsi), %xmm2
174 movaps 48(%rsi), %xmm3
176 movaps %xmm1, 16(%rdi)
177 movaps %xmm2, 32(%rdi)
178 movaps %xmm3, 48(%rdi)
182 L(shl_0_less_64bytes):
185 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
189 #ifdef DATA_CACHE_SIZE_HALF
190 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
192 cmp __x86_data_cache_size_half(%rip), %RDX_LP
195 jae L(shl_0_gobble_mem_loop)
/* Copy loop for the case where source and destination are both
   16-byte aligned: every access below is movaps, which faults on an
   unaligned address.  The visible body moves the chunks at offsets
   0x10..0x70 in two groups, each loaded into xmm registers and then
   stored at the same offsets from %rdi.
   NOTE(review): the copy of offset 0x00, the length update that sets
   the flags tested by jae, and the pointer advances are not visible
   in this excerpt -- confirm against the full file.  */
196 L(shl_0_gobble_cache_loop):
/* First group: offsets 0x10..0x30.  */
198 movaps 0x10(%rsi), %xmm1
199 movaps 0x20(%rsi), %xmm2
200 movaps 0x30(%rsi), %xmm3
203 movaps %xmm1, 0x10(%rdi)
204 movaps %xmm2, 0x20(%rdi)
205 movaps %xmm3, 0x30(%rdi)
/* Second group: offsets 0x40..0x70.  */
208 movaps 0x40(%rsi), %xmm4
209 movaps 0x50(%rsi), %xmm5
210 movaps 0x60(%rsi), %xmm6
211 movaps 0x70(%rsi), %xmm7
213 movaps %xmm4, 0x40(%rdi)
214 movaps %xmm5, 0x50(%rdi)
215 movaps %xmm6, 0x60(%rdi)
216 movaps %xmm7, 0x70(%rdi)
/* None of the moves above changes flags, so this tests flags set by
   an instruction that is not shown here.  */
219 jae L(shl_0_gobble_cache_loop)
222 jl L(shl_0_cache_less_64bytes)
226 movdqa 0x10(%rsi), %xmm1
229 movdqa %xmm1, 0x10(%rdi)
231 movdqa 0x20(%rsi), %xmm4
232 movdqa 0x30(%rsi), %xmm1
235 movdqa %xmm4, 0x20(%rdi)
236 movdqa %xmm1, 0x30(%rdi)
238 L(shl_0_cache_less_64bytes):
241 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Aligned copy loop with software prefetch.  Each pass requests the
   source lines 0x1c0 and 0x280 bytes ahead with prefetcht0, then
   loads the chunks at offsets 0x10..0x70 with movdqa (aligned
   accesses only) and stores them at the same offsets from %rdi.
   NOTE(review): the copy of offset 0x00, the length update that sets
   the flags tested by jae, and the pointer advances are not visible
   in this excerpt -- confirm against the full file.  */
244 L(shl_0_gobble_mem_loop):
245 prefetcht0 0x1c0(%rsi)
246 prefetcht0 0x280(%rsi)
/* Load seven 16-byte chunks before storing any of them.  */
249 movdqa 0x10(%rsi), %xmm1
250 movdqa 0x20(%rsi), %xmm2
251 movdqa 0x30(%rsi), %xmm3
252 movdqa 0x40(%rsi), %xmm4
253 movdqa 0x50(%rsi), %xmm5
254 movdqa 0x60(%rsi), %xmm6
255 movdqa 0x70(%rsi), %xmm7
259 movdqa %xmm1, 0x10(%rdi)
260 movdqa %xmm2, 0x20(%rdi)
261 movdqa %xmm3, 0x30(%rdi)
262 movdqa %xmm4, 0x40(%rdi)
263 movdqa %xmm5, 0x50(%rdi)
264 movdqa %xmm6, 0x60(%rdi)
265 movdqa %xmm7, 0x70(%rdi)
/* The moves above leave flags untouched; the condition comes from an
   instruction that is not shown here.  */
268 jae L(shl_0_gobble_mem_loop)
271 jl L(shl_0_mem_less_64bytes)
275 movdqa 0x10(%rsi), %xmm1
278 movdqa %xmm1, 0x10(%rdi)
280 movdqa 0x20(%rsi), %xmm0
281 movdqa 0x30(%rsi), %xmm1
284 movdqa %xmm0, 0x20(%rdi)
285 movdqa %xmm1, 0x30(%rdi)
287 L(shl_0_mem_less_64bytes):
289 jb L(shl_0_mem_less_32bytes)
292 movdqa 0x10(%rsi), %xmm1
295 movdqa %xmm1, 0x10(%rdi)
297 L(shl_0_mem_less_32bytes):
300 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
305 movdqa -0x10(%rsi), %xmm1
307 movdqa %xmm1, -0x10(%rdi)
311 ja L(shl_0_gobble_bwd)
313 jb L(shl_0_less_64bytes_bwd)
314 movaps -0x10(%rsi), %xmm0
315 movaps -0x20(%rsi), %xmm1
316 movaps -0x30(%rsi), %xmm2
317 movaps -0x40(%rsi), %xmm3
318 movaps %xmm0, -0x10(%rdi)
319 movaps %xmm1, -0x20(%rdi)
320 movaps %xmm2, -0x30(%rdi)
321 movaps %xmm3, -0x40(%rdi)
325 L(shl_0_less_64bytes_bwd):
326 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
330 #ifdef DATA_CACHE_SIZE_HALF
331 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
333 cmp __x86_data_cache_size_half(%rip), %RDX_LP
336 jae L(shl_0_gobble_mem_bwd_loop)
/* Backward aligned copy loop: each pass copies the eight 16-byte
   chunks lying directly below %rsi to the matching slots below %rdi
   (0x80 bytes in total) and then moves both pointers down by 0x80.
   All accesses are movaps/movdqa, so both pointers must be 16-byte
   aligned here.
   NOTE(review): the length update that sets the flags tested by jae
   is not visible in this excerpt -- confirm against the full file.  */
337 L(shl_0_gobble_bwd_loop):
/* Upper half: offsets -0x10..-0x40.  */
338 movdqa -0x10(%rsi), %xmm0
339 movaps -0x20(%rsi), %xmm1
340 movaps -0x30(%rsi), %xmm2
341 movaps -0x40(%rsi), %xmm3
343 movdqa %xmm0, -0x10(%rdi)
344 movaps %xmm1, -0x20(%rdi)
345 movaps %xmm2, -0x30(%rdi)
346 movaps %xmm3, -0x40(%rdi)
/* Lower half: offsets -0x50..-0x80.  */
349 movaps -0x50(%rsi), %xmm4
350 movaps -0x60(%rsi), %xmm5
351 movaps -0x70(%rsi), %xmm6
352 movaps -0x80(%rsi), %xmm7
/* lea is used for the pointer updates because it does not modify
   flags, so the pending condition survives until the jae.  */
353 lea -0x80(%rsi), %rsi
354 movaps %xmm4, -0x50(%rdi)
355 movaps %xmm5, -0x60(%rdi)
356 movaps %xmm6, -0x70(%rdi)
357 movaps %xmm7, -0x80(%rdi)
358 lea -0x80(%rdi), %rdi
360 jae L(shl_0_gobble_bwd_loop)
363 jl L(shl_0_gobble_bwd_less_64bytes)
365 movdqa -0x10(%rsi), %xmm0
367 movdqa -0x20(%rsi), %xmm1
369 movdqa %xmm0, -0x10(%rdi)
370 movdqa %xmm1, -0x20(%rdi)
372 movdqa -0x30(%rsi), %xmm0
373 movdqa -0x40(%rsi), %xmm1
376 movdqa %xmm0, -0x30(%rdi)
377 movdqa %xmm1, -0x40(%rdi)
379 L(shl_0_gobble_bwd_less_64bytes):
380 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward aligned copy loop with software prefetch.  Each pass
   requests the source lines 0x1c0 and 0x280 bytes below %rsi with
   prefetcht0, loads the eight 16-byte chunks directly below %rsi,
   stores them to the matching slots below %rdi, and moves both
   pointers down by 0x80.  movdqa is used throughout, so both
   pointers must be 16-byte aligned here.
   NOTE(review): the length update that sets the flags tested by jae
   is not visible in this excerpt -- confirm against the full file.  */
383 L(shl_0_gobble_mem_bwd_loop):
384 prefetcht0 -0x1c0(%rsi)
385 prefetcht0 -0x280(%rsi)
/* All 0x80 bytes are loaded before the first store.  */
386 movdqa -0x10(%rsi), %xmm0
387 movdqa -0x20(%rsi), %xmm1
388 movdqa -0x30(%rsi), %xmm2
389 movdqa -0x40(%rsi), %xmm3
390 movdqa -0x50(%rsi), %xmm4
391 movdqa -0x60(%rsi), %xmm5
392 movdqa -0x70(%rsi), %xmm6
393 movdqa -0x80(%rsi), %xmm7
/* lea updates the pointer without touching flags.  */
394 lea -0x80(%rsi), %rsi
396 movdqa %xmm0, -0x10(%rdi)
397 movdqa %xmm1, -0x20(%rdi)
398 movdqa %xmm2, -0x30(%rdi)
399 movdqa %xmm3, -0x40(%rdi)
400 movdqa %xmm4, -0x50(%rdi)
401 movdqa %xmm5, -0x60(%rdi)
402 movdqa %xmm6, -0x70(%rdi)
403 movdqa %xmm7, -0x80(%rdi)
404 lea -0x80(%rdi), %rdi
406 jae L(shl_0_gobble_mem_bwd_loop)
409 jl L(shl_0_mem_bwd_less_64bytes)
411 movdqa -0x10(%rsi), %xmm0
413 movdqa -0x20(%rsi), %xmm1
415 movdqa %xmm0, -0x10(%rdi)
416 movdqa %xmm1, -0x20(%rdi)
418 movdqa -0x30(%rsi), %xmm0
419 movdqa -0x40(%rsi), %xmm1
422 movdqa %xmm0, -0x30(%rdi)
423 movdqa %xmm1, -0x40(%rdi)
425 L(shl_0_mem_bwd_less_64bytes):
427 jb L(shl_0_mem_bwd_less_32bytes)
428 movdqa -0x10(%rsi), %xmm0
430 movdqa -0x20(%rsi), %xmm1
432 movdqa %xmm0, -0x10(%rdi)
433 movdqa %xmm1, -0x20(%rdi)
435 L(shl_0_mem_bwd_less_32bytes):
436 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
440 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
442 movaps -0x01(%rsi), %xmm1
444 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
450 prefetchnta 0x1c0(%rsi)
453 movaps 0x0f(%rsi), %xmm2
454 movaps 0x1f(%rsi), %xmm3
455 movaps 0x2f(%rsi), %xmm4
456 movaps 0x3f(%rsi), %xmm5
458 palignr $1, %xmm4, %xmm5
460 palignr $1, %xmm3, %xmm4
461 palignr $1, %xmm2, %xmm3
463 palignr $1, %xmm1, %xmm2
465 movdqa %xmm2, -0x40(%rdi)
466 movaps %xmm3, -0x30(%rdi)
468 movaps %xmm4, -0x20(%rdi)
469 movaps %xmm5, -0x10(%rdi)
473 movaps %xmm4, -0x20(%rdi)
475 movaps %xmm5, -0x10(%rdi)
479 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
483 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
485 movaps -0x01(%rsi), %xmm1
487 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 1 byte above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
492 L(shl_1_bwd_loop_L2):
493 prefetchnta -0x1c0(%rsi)
494 L(shl_1_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
495 movaps -0x11(%rsi), %xmm2
497 movaps -0x21(%rsi), %xmm3
498 movaps -0x31(%rsi), %xmm4
499 movaps -0x41(%rsi), %xmm5
500 lea -0x40(%rsi), %rsi
/* palignr $1, lo, hi leaves in hi bytes 1..16 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
501 palignr $1, %xmm2, %xmm1
502 palignr $1, %xmm3, %xmm2
503 palignr $1, %xmm4, %xmm3
504 palignr $1, %xmm5, %xmm4
506 movaps %xmm1, -0x10(%rdi)
509 movaps %xmm2, -0x20(%rdi)
510 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
512 movaps %xmm3, 0x10(%rdi)
521 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
525 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
527 movaps -0x02(%rsi), %xmm1
529 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
535 prefetchnta 0x1c0(%rsi)
538 movaps 0x0e(%rsi), %xmm2
539 movaps 0x1e(%rsi), %xmm3
540 movaps 0x2e(%rsi), %xmm4
541 movaps 0x3e(%rsi), %xmm5
543 palignr $2, %xmm4, %xmm5
545 palignr $2, %xmm3, %xmm4
546 palignr $2, %xmm2, %xmm3
548 palignr $2, %xmm1, %xmm2
550 movdqa %xmm2, -0x40(%rdi)
551 movaps %xmm3, -0x30(%rdi)
553 movaps %xmm4, -0x20(%rdi)
554 movaps %xmm5, -0x10(%rdi)
558 movaps %xmm4, -0x20(%rdi)
560 movaps %xmm5, -0x10(%rdi)
564 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
568 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
570 movaps -0x02(%rsi), %xmm1
572 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 2 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
577 L(shl_2_bwd_loop_L2):
578 prefetchnta -0x1c0(%rsi)
579 L(shl_2_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
580 movaps -0x12(%rsi), %xmm2
582 movaps -0x22(%rsi), %xmm3
583 movaps -0x32(%rsi), %xmm4
584 movaps -0x42(%rsi), %xmm5
585 lea -0x40(%rsi), %rsi
/* palignr $2, lo, hi leaves in hi bytes 2..17 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
586 palignr $2, %xmm2, %xmm1
587 palignr $2, %xmm3, %xmm2
588 palignr $2, %xmm4, %xmm3
589 palignr $2, %xmm5, %xmm4
591 movaps %xmm1, -0x10(%rdi)
594 movaps %xmm2, -0x20(%rdi)
595 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
597 movaps %xmm3, 0x10(%rdi)
606 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
610 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
612 movaps -0x03(%rsi), %xmm1
614 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
620 prefetchnta 0x1c0(%rsi)
623 movaps 0x0d(%rsi), %xmm2
624 movaps 0x1d(%rsi), %xmm3
625 movaps 0x2d(%rsi), %xmm4
626 movaps 0x3d(%rsi), %xmm5
628 palignr $3, %xmm4, %xmm5
630 palignr $3, %xmm3, %xmm4
631 palignr $3, %xmm2, %xmm3
633 palignr $3, %xmm1, %xmm2
635 movdqa %xmm2, -0x40(%rdi)
636 movaps %xmm3, -0x30(%rdi)
638 movaps %xmm4, -0x20(%rdi)
639 movaps %xmm5, -0x10(%rdi)
643 movaps %xmm4, -0x20(%rdi)
645 movaps %xmm5, -0x10(%rdi)
649 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
653 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
655 movaps -0x03(%rsi), %xmm1
657 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 3 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
662 L(shl_3_bwd_loop_L2):
663 prefetchnta -0x1c0(%rsi)
664 L(shl_3_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
665 movaps -0x13(%rsi), %xmm2
667 movaps -0x23(%rsi), %xmm3
668 movaps -0x33(%rsi), %xmm4
669 movaps -0x43(%rsi), %xmm5
670 lea -0x40(%rsi), %rsi
/* palignr $3, lo, hi leaves in hi bytes 3..18 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
671 palignr $3, %xmm2, %xmm1
672 palignr $3, %xmm3, %xmm2
673 palignr $3, %xmm4, %xmm3
674 palignr $3, %xmm5, %xmm4
676 movaps %xmm1, -0x10(%rdi)
679 movaps %xmm2, -0x20(%rdi)
680 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
682 movaps %xmm3, 0x10(%rdi)
691 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
695 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
697 movaps -0x04(%rsi), %xmm1
699 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
705 prefetchnta 0x1c0(%rsi)
708 movaps 0x0c(%rsi), %xmm2
709 movaps 0x1c(%rsi), %xmm3
710 movaps 0x2c(%rsi), %xmm4
711 movaps 0x3c(%rsi), %xmm5
713 palignr $4, %xmm4, %xmm5
715 palignr $4, %xmm3, %xmm4
716 palignr $4, %xmm2, %xmm3
718 palignr $4, %xmm1, %xmm2
720 movdqa %xmm2, -0x40(%rdi)
721 movaps %xmm3, -0x30(%rdi)
723 movaps %xmm4, -0x20(%rdi)
724 movaps %xmm5, -0x10(%rdi)
728 movaps %xmm4, -0x20(%rdi)
730 movaps %xmm5, -0x10(%rdi)
734 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
738 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
740 movaps -0x04(%rsi), %xmm1
742 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 4 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
747 L(shl_4_bwd_loop_L2):
748 prefetchnta -0x1c0(%rsi)
749 L(shl_4_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
750 movaps -0x14(%rsi), %xmm2
752 movaps -0x24(%rsi), %xmm3
753 movaps -0x34(%rsi), %xmm4
754 movaps -0x44(%rsi), %xmm5
755 lea -0x40(%rsi), %rsi
/* palignr $4, lo, hi leaves in hi bytes 4..19 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
756 palignr $4, %xmm2, %xmm1
757 palignr $4, %xmm3, %xmm2
758 palignr $4, %xmm4, %xmm3
759 palignr $4, %xmm5, %xmm4
761 movaps %xmm1, -0x10(%rdi)
764 movaps %xmm2, -0x20(%rdi)
765 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
767 movaps %xmm3, 0x10(%rdi)
776 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
780 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
782 movaps -0x05(%rsi), %xmm1
784 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
790 prefetchnta 0x1c0(%rsi)
793 movaps 0x0b(%rsi), %xmm2
794 movaps 0x1b(%rsi), %xmm3
795 movaps 0x2b(%rsi), %xmm4
796 movaps 0x3b(%rsi), %xmm5
798 palignr $5, %xmm4, %xmm5
800 palignr $5, %xmm3, %xmm4
801 palignr $5, %xmm2, %xmm3
803 palignr $5, %xmm1, %xmm2
805 movdqa %xmm2, -0x40(%rdi)
806 movaps %xmm3, -0x30(%rdi)
808 movaps %xmm4, -0x20(%rdi)
809 movaps %xmm5, -0x10(%rdi)
813 movaps %xmm4, -0x20(%rdi)
815 movaps %xmm5, -0x10(%rdi)
819 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
823 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
825 movaps -0x05(%rsi), %xmm1
827 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 5 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
832 L(shl_5_bwd_loop_L2):
833 prefetchnta -0x1c0(%rsi)
834 L(shl_5_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
835 movaps -0x15(%rsi), %xmm2
837 movaps -0x25(%rsi), %xmm3
838 movaps -0x35(%rsi), %xmm4
839 movaps -0x45(%rsi), %xmm5
840 lea -0x40(%rsi), %rsi
/* palignr $5, lo, hi leaves in hi bytes 5..20 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
841 palignr $5, %xmm2, %xmm1
842 palignr $5, %xmm3, %xmm2
843 palignr $5, %xmm4, %xmm3
844 palignr $5, %xmm5, %xmm4
846 movaps %xmm1, -0x10(%rdi)
849 movaps %xmm2, -0x20(%rdi)
850 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
852 movaps %xmm3, 0x10(%rdi)
861 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
865 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
867 movaps -0x06(%rsi), %xmm1
869 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
875 prefetchnta 0x1c0(%rsi)
878 movaps 0x0a(%rsi), %xmm2
879 movaps 0x1a(%rsi), %xmm3
880 movaps 0x2a(%rsi), %xmm4
881 movaps 0x3a(%rsi), %xmm5
883 palignr $6, %xmm4, %xmm5
885 palignr $6, %xmm3, %xmm4
886 palignr $6, %xmm2, %xmm3
888 palignr $6, %xmm1, %xmm2
890 movdqa %xmm2, -0x40(%rdi)
891 movaps %xmm3, -0x30(%rdi)
893 movaps %xmm4, -0x20(%rdi)
894 movaps %xmm5, -0x10(%rdi)
898 movaps %xmm4, -0x20(%rdi)
900 movaps %xmm5, -0x10(%rdi)
904 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
908 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
910 movaps -0x06(%rsi), %xmm1
912 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 6 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
917 L(shl_6_bwd_loop_L2):
918 prefetchnta -0x1c0(%rsi)
919 L(shl_6_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
920 movaps -0x16(%rsi), %xmm2
922 movaps -0x26(%rsi), %xmm3
923 movaps -0x36(%rsi), %xmm4
924 movaps -0x46(%rsi), %xmm5
925 lea -0x40(%rsi), %rsi
/* palignr $6, lo, hi leaves in hi bytes 6..21 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
926 palignr $6, %xmm2, %xmm1
927 palignr $6, %xmm3, %xmm2
928 palignr $6, %xmm4, %xmm3
929 palignr $6, %xmm5, %xmm4
931 movaps %xmm1, -0x10(%rdi)
934 movaps %xmm2, -0x20(%rdi)
935 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
937 movaps %xmm3, 0x10(%rdi)
946 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
950 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
952 movaps -0x07(%rsi), %xmm1
954 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
960 prefetchnta 0x1c0(%rsi)
963 movaps 0x09(%rsi), %xmm2
964 movaps 0x19(%rsi), %xmm3
965 movaps 0x29(%rsi), %xmm4
966 movaps 0x39(%rsi), %xmm5
968 palignr $7, %xmm4, %xmm5
970 palignr $7, %xmm3, %xmm4
971 palignr $7, %xmm2, %xmm3
973 palignr $7, %xmm1, %xmm2
975 movdqa %xmm2, -0x40(%rdi)
976 movaps %xmm3, -0x30(%rdi)
978 movaps %xmm4, -0x20(%rdi)
979 movaps %xmm5, -0x10(%rdi)
983 movaps %xmm4, -0x20(%rdi)
985 movaps %xmm5, -0x10(%rdi)
989 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
993 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
995 movaps -0x07(%rsi), %xmm1
997 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 7 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1002 L(shl_7_bwd_loop_L2):
1003 prefetchnta -0x1c0(%rsi)
1004 L(shl_7_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1005 movaps -0x17(%rsi), %xmm2
1007 movaps -0x27(%rsi), %xmm3
1008 movaps -0x37(%rsi), %xmm4
1009 movaps -0x47(%rsi), %xmm5
1010 lea -0x40(%rsi), %rsi
/* palignr $7, lo, hi leaves in hi bytes 7..22 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1011 palignr $7, %xmm2, %xmm1
1012 palignr $7, %xmm3, %xmm2
1013 palignr $7, %xmm4, %xmm3
1014 palignr $7, %xmm5, %xmm4
1016 movaps %xmm1, -0x10(%rdi)
1019 movaps %xmm2, -0x20(%rdi)
1020 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1022 movaps %xmm3, 0x10(%rdi)
1024 movaps %xmm4, (%rdi)
1028 movaps %xmm4, (%rdi)
1031 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1035 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1037 movaps -0x08(%rsi), %xmm1
1039 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1044 prefetchnta 0x1c0(%rsi)
1047 movaps 0x08(%rsi), %xmm2
1048 movaps 0x18(%rsi), %xmm3
1049 movaps 0x28(%rsi), %xmm4
1050 movaps 0x38(%rsi), %xmm5
1052 palignr $8, %xmm4, %xmm5
1054 palignr $8, %xmm3, %xmm4
1055 palignr $8, %xmm2, %xmm3
1057 palignr $8, %xmm1, %xmm2
1059 movdqa %xmm2, -0x40(%rdi)
1060 movaps %xmm3, -0x30(%rdi)
1062 movaps %xmm4, -0x20(%rdi)
1063 movaps %xmm5, -0x10(%rdi)
1069 movaps %xmm4, -0x20(%rdi)
1071 movaps %xmm5, -0x10(%rdi)
1074 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1078 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1080 movaps -0x08(%rsi), %xmm1
1082 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 8 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1087 L(shl_8_bwd_loop_L2):
1088 prefetchnta -0x1c0(%rsi)
1089 L(shl_8_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1090 movaps -0x18(%rsi), %xmm2
1092 movaps -0x28(%rsi), %xmm3
1093 movaps -0x38(%rsi), %xmm4
1094 movaps -0x48(%rsi), %xmm5
1095 lea -0x40(%rsi), %rsi
/* palignr $8, lo, hi leaves in hi bytes 8..23 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1096 palignr $8, %xmm2, %xmm1
1097 palignr $8, %xmm3, %xmm2
1098 palignr $8, %xmm4, %xmm3
1099 palignr $8, %xmm5, %xmm4
1101 movaps %xmm1, -0x10(%rdi)
1104 movaps %xmm2, -0x20(%rdi)
1105 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1107 movaps %xmm3, 0x10(%rdi)
1109 movaps %xmm4, (%rdi)
1113 movaps %xmm4, (%rdi)
1116 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1120 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1122 movaps -0x09(%rsi), %xmm1
1124 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1130 prefetchnta 0x1c0(%rsi)
1133 movaps 0x07(%rsi), %xmm2
1134 movaps 0x17(%rsi), %xmm3
1135 movaps 0x27(%rsi), %xmm4
1136 movaps 0x37(%rsi), %xmm5
1138 palignr $9, %xmm4, %xmm5
1140 palignr $9, %xmm3, %xmm4
1141 palignr $9, %xmm2, %xmm3
1143 palignr $9, %xmm1, %xmm2
1145 movdqa %xmm2, -0x40(%rdi)
1146 movaps %xmm3, -0x30(%rdi)
1148 movaps %xmm4, -0x20(%rdi)
1149 movaps %xmm5, -0x10(%rdi)
1153 movaps %xmm4, -0x20(%rdi)
1155 movaps %xmm5, -0x10(%rdi)
1159 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1163 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1165 movaps -0x09(%rsi), %xmm1
1167 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 9 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1172 L(shl_9_bwd_loop_L2):
1173 prefetchnta -0x1c0(%rsi)
1174 L(shl_9_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1175 movaps -0x19(%rsi), %xmm2
1177 movaps -0x29(%rsi), %xmm3
1178 movaps -0x39(%rsi), %xmm4
1179 movaps -0x49(%rsi), %xmm5
1180 lea -0x40(%rsi), %rsi
/* palignr $9, lo, hi leaves in hi bytes 9..24 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1181 palignr $9, %xmm2, %xmm1
1182 palignr $9, %xmm3, %xmm2
1183 palignr $9, %xmm4, %xmm3
1184 palignr $9, %xmm5, %xmm4
1186 movaps %xmm1, -0x10(%rdi)
1189 movaps %xmm2, -0x20(%rdi)
1190 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1192 movaps %xmm3, 0x10(%rdi)
1194 movaps %xmm4, (%rdi)
1198 movaps %xmm4, (%rdi)
1201 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1205 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1207 movaps -0x0a(%rsi), %xmm1
1209 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1215 prefetchnta 0x1c0(%rsi)
1218 movaps 0x06(%rsi), %xmm2
1219 movaps 0x16(%rsi), %xmm3
1220 movaps 0x26(%rsi), %xmm4
1221 movaps 0x36(%rsi), %xmm5
1223 palignr $10, %xmm4, %xmm5
1225 palignr $10, %xmm3, %xmm4
1226 palignr $10, %xmm2, %xmm3
1228 palignr $10, %xmm1, %xmm2
1230 movdqa %xmm2, -0x40(%rdi)
1231 movaps %xmm3, -0x30(%rdi)
1233 movaps %xmm4, -0x20(%rdi)
1234 movaps %xmm5, -0x10(%rdi)
1238 movaps %xmm4, -0x20(%rdi)
1240 movaps %xmm5, -0x10(%rdi)
1244 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1248 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1250 movaps -0x0a(%rsi), %xmm1
1252 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 10 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1257 L(shl_10_bwd_loop_L2):
1258 prefetchnta -0x1c0(%rsi)
1259 L(shl_10_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1260 movaps -0x1a(%rsi), %xmm2
1262 movaps -0x2a(%rsi), %xmm3
1263 movaps -0x3a(%rsi), %xmm4
1264 movaps -0x4a(%rsi), %xmm5
1265 lea -0x40(%rsi), %rsi
/* palignr $10, lo, hi leaves in hi bytes 10..25 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1266 palignr $10, %xmm2, %xmm1
1267 palignr $10, %xmm3, %xmm2
1268 palignr $10, %xmm4, %xmm3
1269 palignr $10, %xmm5, %xmm4
1271 movaps %xmm1, -0x10(%rdi)
1274 movaps %xmm2, -0x20(%rdi)
1275 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1277 movaps %xmm3, 0x10(%rdi)
1278 jb L(shl_10_bwd_end)
1279 movaps %xmm4, (%rdi)
1283 movaps %xmm4, (%rdi)
1286 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1290 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1292 movaps -0x0b(%rsi), %xmm1
1294 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1300 prefetchnta 0x1c0(%rsi)
1303 movaps 0x05(%rsi), %xmm2
1304 movaps 0x15(%rsi), %xmm3
1305 movaps 0x25(%rsi), %xmm4
1306 movaps 0x35(%rsi), %xmm5
1308 palignr $11, %xmm4, %xmm5
1310 palignr $11, %xmm3, %xmm4
1311 palignr $11, %xmm2, %xmm3
1313 palignr $11, %xmm1, %xmm2
1315 movdqa %xmm2, -0x40(%rdi)
1316 movaps %xmm3, -0x30(%rdi)
1318 movaps %xmm4, -0x20(%rdi)
1319 movaps %xmm5, -0x10(%rdi)
1323 movaps %xmm4, -0x20(%rdi)
1325 movaps %xmm5, -0x10(%rdi)
1329 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1333 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1335 movaps -0x0b(%rsi), %xmm1
1337 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 11 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1342 L(shl_11_bwd_loop_L2):
1343 prefetchnta -0x1c0(%rsi)
1344 L(shl_11_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1345 movaps -0x1b(%rsi), %xmm2
1347 movaps -0x2b(%rsi), %xmm3
1348 movaps -0x3b(%rsi), %xmm4
1349 movaps -0x4b(%rsi), %xmm5
1350 lea -0x40(%rsi), %rsi
/* palignr $11, lo, hi leaves in hi bytes 11..26 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1351 palignr $11, %xmm2, %xmm1
1352 palignr $11, %xmm3, %xmm2
1353 palignr $11, %xmm4, %xmm3
1354 palignr $11, %xmm5, %xmm4
1356 movaps %xmm1, -0x10(%rdi)
1359 movaps %xmm2, -0x20(%rdi)
1360 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1362 movaps %xmm3, 0x10(%rdi)
1363 jb L(shl_11_bwd_end)
1364 movaps %xmm4, (%rdi)
1368 movaps %xmm4, (%rdi)
1371 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1375 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1377 movaps -0x0c(%rsi), %xmm1
1379 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1385 prefetchnta 0x1c0(%rsi)
1388 movaps 0x04(%rsi), %xmm2
1389 movaps 0x14(%rsi), %xmm3
1390 movaps 0x24(%rsi), %xmm4
1391 movaps 0x34(%rsi), %xmm5
1393 palignr $12, %xmm4, %xmm5
1395 palignr $12, %xmm3, %xmm4
1396 palignr $12, %xmm2, %xmm3
1398 palignr $12, %xmm1, %xmm2
1400 movdqa %xmm2, -0x40(%rdi)
1401 movaps %xmm3, -0x30(%rdi)
1403 movaps %xmm4, -0x20(%rdi)
1404 movaps %xmm5, -0x10(%rdi)
1408 movaps %xmm4, -0x20(%rdi)
1410 movaps %xmm5, -0x10(%rdi)
1414 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1418 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1420 movaps -0x0c(%rsi), %xmm1
1422 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 12 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1427 L(shl_12_bwd_loop_L2):
1428 prefetchnta -0x1c0(%rsi)
1429 L(shl_12_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1430 movaps -0x1c(%rsi), %xmm2
1432 movaps -0x2c(%rsi), %xmm3
1433 movaps -0x3c(%rsi), %xmm4
1434 movaps -0x4c(%rsi), %xmm5
1435 lea -0x40(%rsi), %rsi
/* palignr $12, lo, hi leaves in hi bytes 12..27 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1436 palignr $12, %xmm2, %xmm1
1437 palignr $12, %xmm3, %xmm2
1438 palignr $12, %xmm4, %xmm3
1439 palignr $12, %xmm5, %xmm4
1441 movaps %xmm1, -0x10(%rdi)
1444 movaps %xmm2, -0x20(%rdi)
1445 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1447 movaps %xmm3, 0x10(%rdi)
1448 jb L(shl_12_bwd_end)
1449 movaps %xmm4, (%rdi)
1453 movaps %xmm4, (%rdi)
1456 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1460 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1462 movaps -0x0d(%rsi), %xmm1
1464 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1470 prefetchnta 0x1c0(%rsi)
1473 movaps 0x03(%rsi), %xmm2
1474 movaps 0x13(%rsi), %xmm3
1475 movaps 0x23(%rsi), %xmm4
1476 movaps 0x33(%rsi), %xmm5
1478 palignr $13, %xmm4, %xmm5
1480 palignr $13, %xmm3, %xmm4
1481 palignr $13, %xmm2, %xmm3
1483 palignr $13, %xmm1, %xmm2
1485 movdqa %xmm2, -0x40(%rdi)
1486 movaps %xmm3, -0x30(%rdi)
1488 movaps %xmm4, -0x20(%rdi)
1489 movaps %xmm5, -0x10(%rdi)
1493 movaps %xmm4, -0x20(%rdi)
1495 movaps %xmm5, -0x10(%rdi)
1499 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1503 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1505 movaps -0x0d(%rsi), %xmm1
1507 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 13 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1512 L(shl_13_bwd_loop_L2):
1513 prefetchnta -0x1c0(%rsi)
1514 L(shl_13_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1515 movaps -0x1d(%rsi), %xmm2
1517 movaps -0x2d(%rsi), %xmm3
1518 movaps -0x3d(%rsi), %xmm4
1519 movaps -0x4d(%rsi), %xmm5
1520 lea -0x40(%rsi), %rsi
/* palignr $13, lo, hi leaves in hi bytes 13..28 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1521 palignr $13, %xmm2, %xmm1
1522 palignr $13, %xmm3, %xmm2
1523 palignr $13, %xmm4, %xmm3
1524 palignr $13, %xmm5, %xmm4
1526 movaps %xmm1, -0x10(%rdi)
1529 movaps %xmm2, -0x20(%rdi)
1530 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1532 movaps %xmm3, 0x10(%rdi)
1533 jb L(shl_13_bwd_end)
1534 movaps %xmm4, (%rdi)
1538 movaps %xmm4, (%rdi)
1541 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1545 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1547 movaps -0x0e(%rsi), %xmm1
1549 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1555 prefetchnta 0x1c0(%rsi)
1558 movaps 0x02(%rsi), %xmm2
1559 movaps 0x12(%rsi), %xmm3
1560 movaps 0x22(%rsi), %xmm4
1561 movaps 0x32(%rsi), %xmm5
1563 palignr $14, %xmm4, %xmm5
1565 palignr $14, %xmm3, %xmm4
1566 palignr $14, %xmm2, %xmm3
1568 palignr $14, %xmm1, %xmm2
1570 movdqa %xmm2, -0x40(%rdi)
1571 movaps %xmm3, -0x30(%rdi)
1573 movaps %xmm4, -0x20(%rdi)
1574 movaps %xmm5, -0x10(%rdi)
1578 movaps %xmm4, -0x20(%rdi)
1580 movaps %xmm5, -0x10(%rdi)
1584 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1588 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1590 movaps -0x0e(%rsi), %xmm1
1592 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 14 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1597 L(shl_14_bwd_loop_L2):
1598 prefetchnta -0x1c0(%rsi)
1599 L(shl_14_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1600 movaps -0x1e(%rsi), %xmm2
1602 movaps -0x2e(%rsi), %xmm3
1603 movaps -0x3e(%rsi), %xmm4
1604 movaps -0x4e(%rsi), %xmm5
1605 lea -0x40(%rsi), %rsi
/* palignr $14, lo, hi leaves in hi bytes 14..29 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1606 palignr $14, %xmm2, %xmm1
1607 palignr $14, %xmm3, %xmm2
1608 palignr $14, %xmm4, %xmm3
1609 palignr $14, %xmm5, %xmm4
1611 movaps %xmm1, -0x10(%rdi)
1614 movaps %xmm2, -0x20(%rdi)
1615 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1617 movaps %xmm3, 0x10(%rdi)
1618 jb L(shl_14_bwd_end)
1619 movaps %xmm4, (%rdi)
1623 movaps %xmm4, (%rdi)
1626 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1630 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1632 movaps -0x0f(%rsi), %xmm1
1634 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1640 prefetchnta 0x1c0(%rsi)
1643 movaps 0x01(%rsi), %xmm2
1644 movaps 0x11(%rsi), %xmm3
1645 movaps 0x21(%rsi), %xmm4
1646 movaps 0x31(%rsi), %xmm5
1648 palignr $15, %xmm4, %xmm5
1650 palignr $15, %xmm3, %xmm4
1651 palignr $15, %xmm2, %xmm3
1653 palignr $15, %xmm1, %xmm2
1655 movdqa %xmm2, -0x40(%rdi)
1656 movaps %xmm3, -0x30(%rdi)
1658 movaps %xmm4, -0x20(%rdi)
1659 movaps %xmm5, -0x10(%rdi)
1663 movaps %xmm4, -0x20(%rdi)
1665 movaps %xmm5, -0x10(%rdi)
1669 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1673 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1675 movaps -0x0f(%rsi), %xmm1
1677 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
/* Backward copy loop for a source pointer that sits 15 bytes above a
   16-byte boundary while the destination is 16-byte aligned (both
   follow from movaps faulting on unaligned addresses).  Both
   pointers move down by 0x40 per pass.  The L2 entry issues a
   non-temporal prefetch 0x1c0 bytes below the source and falls
   through into the L1 entry.  */
1682 L(shl_15_bwd_loop_L2):
1683 prefetchnta -0x1c0(%rsi)
1684 L(shl_15_bwd_loop_L1):
/* Four aligned chunks below the source position.  */
1685 movaps -0x1f(%rsi), %xmm2
1687 movaps -0x2f(%rsi), %xmm3
1688 movaps -0x3f(%rsi), %xmm4
1689 movaps -0x4f(%rsi), %xmm5
1690 lea -0x40(%rsi), %rsi
/* palignr $15, lo, hi leaves in hi bytes 15..30 of the 32-byte value
   hi:lo, i.e. the unaligned 16 source bytes that straddle two
   aligned chunks.  %xmm1 must already hold the chunk just above
   %xmm2.
   NOTE(review): the code that carries it between passes is not
   visible in this excerpt -- confirm.  */
1691 palignr $15, %xmm2, %xmm1
1692 palignr $15, %xmm3, %xmm2
1693 palignr $15, %xmm4, %xmm3
1694 palignr $15, %xmm5, %xmm4
1696 movaps %xmm1, -0x10(%rdi)
1699 movaps %xmm2, -0x20(%rdi)
1700 lea -0x40(%rdi), %rdi
/* After the lea, 0x10(%rdi) is the slot at -0x30 from the old %rdi.
   The %xmm4 result is not stored within these lines.  */
1702 movaps %xmm3, 0x10(%rdi)
1703 jb L(shl_15_bwd_end)
1704 movaps %xmm4, (%rdi)
1708 movaps %xmm4, (%rdi)
1711 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1715 movdqu -72(%rsi), %xmm0
1716 movdqu -56(%rsi), %xmm1
1722 movdqu %xmm0, -72(%rdi)
1723 movdqu %xmm1, -56(%rdi)
1733 movdqu -64(%rsi), %xmm0
1740 movdqu %xmm0, -64(%rdi)
1751 movdqu -56(%rsi), %xmm0
1757 movdqu %xmm0, -56(%rdi)
1834 movdqu -73(%rsi), %xmm0
1835 movdqu -57(%rsi), %xmm1
1842 movdqu %xmm0, -73(%rdi)
1843 movdqu %xmm1, -57(%rdi)
1854 movdqu -65(%rsi), %xmm0
1855 movdqu -49(%rsi), %xmm1
1861 movdqu %xmm0, -65(%rdi)
1862 movdqu %xmm1, -49(%rdi)
1872 movdqu -57(%rsi), %xmm0
1879 movdqu %xmm0, -57(%rdi)
1890 movdqu -49(%rsi), %xmm0
1896 movdqu %xmm0, -49(%rdi)
1972 movdqu -74(%rsi), %xmm0
1973 movdqu -58(%rsi), %xmm1
1980 movdqu %xmm0, -74(%rdi)
1981 movdqu %xmm1, -58(%rdi)
1992 movdqu -66(%rsi), %xmm0
1993 movdqu -50(%rsi), %xmm1
2000 movdqu %xmm0, -66(%rdi)
2001 movdqu %xmm1, -50(%rdi)
2012 movdqu -58(%rsi), %xmm1
2019 movdqu %xmm1, -58(%rdi)
2030 movdqu -50(%rsi), %xmm0
2036 movdqu %xmm0, -50(%rdi)
2112 movdqu -75(%rsi), %xmm0
2113 movdqu -59(%rsi), %xmm1
2120 movdqu %xmm0, -75(%rdi)
2121 movdqu %xmm1, -59(%rdi)
2132 movdqu -67(%rsi), %xmm0
2133 movdqu -59(%rsi), %xmm1
2140 movdqu %xmm0, -67(%rdi)
2141 movdqu %xmm1, -59(%rdi)
2152 movdqu -59(%rsi), %xmm0
2159 movdqu %xmm0, -59(%rdi)
2170 movdqu -51(%rsi), %xmm0
2176 movdqu %xmm0, -51(%rdi)
2254 movdqu -76(%rsi), %xmm0
2255 movdqu -60(%rsi), %xmm1
2262 movdqu %xmm0, -76(%rdi)
2263 movdqu %xmm1, -60(%rdi)
2274 movdqu -68(%rsi), %xmm0
2275 movdqu -52(%rsi), %xmm1
2281 movdqu %xmm0, -68(%rdi)
2282 movdqu %xmm1, -52(%rdi)
2292 movdqu -60(%rsi), %xmm0
2299 movdqu %xmm0, -60(%rdi)
2310 movdqu -52(%rsi), %xmm0
2316 movdqu %xmm0, -52(%rdi)
2392 movdqu -77(%rsi), %xmm0
2393 movdqu -61(%rsi), %xmm1
2400 movdqu %xmm0, -77(%rdi)
2401 movdqu %xmm1, -61(%rdi)
2412 movdqu -69(%rsi), %xmm0
2413 movdqu -53(%rsi), %xmm1
2419 movdqu %xmm0, -69(%rdi)
2420 movdqu %xmm1, -53(%rdi)
2430 movdqu -61(%rsi), %xmm0
2437 movdqu %xmm0, -61(%rdi)
2448 movdqu -53(%rsi), %xmm0
2455 movdqu %xmm0, -53(%rdi)
2533 movdqu -78(%rsi), %xmm0
2534 movdqu -62(%rsi), %xmm1
2541 movdqu %xmm0, -78(%rdi)
2542 movdqu %xmm1, -62(%rdi)
2553 movdqu -70(%rsi), %xmm0
2554 movdqu -54(%rsi), %xmm1
2560 movdqu %xmm0, -70(%rdi)
2561 movdqu %xmm1, -54(%rdi)
2571 movdqu -62(%rsi), %xmm0
2578 movdqu %xmm0, -62(%rdi)
2589 movdqu -54(%rsi), %xmm0
2595 movdqu %xmm0, -54(%rdi)
2673 movdqu -79(%rsi), %xmm0
2674 movdqu -63(%rsi), %xmm1
2681 movdqu %xmm0, -79(%rdi)
2682 movdqu %xmm1, -63(%rdi)
2693 movdqu -71(%rsi), %xmm0
2694 movdqu -55(%rsi), %xmm1
2700 movdqu %xmm0, -71(%rdi)
2701 movdqu %xmm1, -55(%rdi)
2711 movdqu -63(%rsi), %xmm0
2718 movdqu %xmm0, -63(%rdi)
2729 movdqu -55(%rsi), %xmm0
2735 movdqu %xmm0, -55(%rdi)
2813 movdqu (%rsi), %xmm1
2816 movntdq %xmm1, (%rdi)
2818 lea -0x90(%rdx), %rdx
2819 #ifdef USE_AS_MEMMOVE
2823 jae L(memmove_is_memcpy_fwd)
2826 jb L(ll_cache_copy_fwd_start)
2827 L(memmove_is_memcpy_fwd):
/* Forward bulk copy, 0x80 bytes per pass: eight unaligned loads
   (movdqu) from the source followed by eight non-temporal stores
   (movntdq) to the destination.  movntdq requires a 16-byte aligned
   address, so %rdi must be aligned here; the source need not be.
   Both pointers advance by 0x80 via lea, which leaves flags intact.
   NOTE(review): the length update that sets the flags tested by jae
   and the L(large_page_loop) label it targets are not visible in
   this excerpt -- presumably the label sits at the top of this body;
   confirm against the full file.  */
2830 movdqu (%rsi), %xmm0
2831 movdqu 0x10(%rsi), %xmm1
2832 movdqu 0x20(%rsi), %xmm2
2833 movdqu 0x30(%rsi), %xmm3
2834 movdqu 0x40(%rsi), %xmm4
2835 movdqu 0x50(%rsi), %xmm5
2836 movdqu 0x60(%rsi), %xmm6
2837 movdqu 0x70(%rsi), %xmm7
2838 lea 0x80(%rsi), %rsi
/* Non-temporal stores: written with a hint to bypass the caches.  */
2841 movntdq %xmm0, (%rdi)
2842 movntdq %xmm1, 0x10(%rdi)
2843 movntdq %xmm2, 0x20(%rdi)
2844 movntdq %xmm3, 0x30(%rdi)
2845 movntdq %xmm4, 0x40(%rdi)
2846 movntdq %xmm5, 0x50(%rdi)
2847 movntdq %xmm6, 0x60(%rdi)
2848 movntdq %xmm7, 0x70(%rdi)
2849 lea 0x80(%rdi), %rdi
2850 jae L(large_page_loop)
2852 lea 0x80(%rdx), %rdx
2853 jl L(large_page_less_64bytes)
2855 movdqu (%rsi), %xmm0
2856 movdqu 0x10(%rsi), %xmm1
2857 movdqu 0x20(%rsi), %xmm2
2858 movdqu 0x30(%rsi), %xmm3
2859 lea 0x40(%rsi), %rsi
2861 movntdq %xmm0, (%rdi)
2862 movntdq %xmm1, 0x10(%rdi)
2863 movntdq %xmm2, 0x20(%rdi)
2864 movntdq %xmm3, 0x30(%rdi)
2865 lea 0x40(%rdi), %rdi
2867 L(large_page_less_64bytes):
2871 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2873 #ifdef USE_AS_MEMMOVE
/* Forward bulk copy, 0x80 bytes per pass, using ordinary (cacheable)
   stores.  Each pass requests the source lines 0x1c0 and 0x200 bytes
   ahead with prefetcht0, performs eight unaligned loads (movdqu),
   then eight aligned stores (movaps), so %rdi must be 16-byte
   aligned while %rsi need not be.  Both pointers advance by 0x80 via
   lea, which leaves flags intact.
   NOTE(review): the length update that sets the flags tested by jae
   is not visible in this excerpt -- confirm against the full file.  */
2875 L(ll_cache_copy_fwd_start):
2876 prefetcht0 0x1c0(%rsi)
2877 prefetcht0 0x200(%rsi)
2878 movdqu (%rsi), %xmm0
2879 movdqu 0x10(%rsi), %xmm1
2880 movdqu 0x20(%rsi), %xmm2
2881 movdqu 0x30(%rsi), %xmm3
2882 movdqu 0x40(%rsi), %xmm4
2883 movdqu 0x50(%rsi), %xmm5
2884 movdqu 0x60(%rsi), %xmm6
2885 movdqu 0x70(%rsi), %xmm7
2886 lea 0x80(%rsi), %rsi
/* All loads are done before the first store.  */
2889 movaps %xmm0, (%rdi)
2890 movaps %xmm1, 0x10(%rdi)
2891 movaps %xmm2, 0x20(%rdi)
2892 movaps %xmm3, 0x30(%rdi)
2893 movaps %xmm4, 0x40(%rdi)
2894 movaps %xmm5, 0x50(%rdi)
2895 movaps %xmm6, 0x60(%rdi)
2896 movaps %xmm7, 0x70(%rdi)
2897 lea 0x80(%rdi), %rdi
2898 jae L(ll_cache_copy_fwd_start)
2900 lea 0x80(%rdx), %rdx
2901 jl L(large_page_ll_less_fwd_64bytes)
2903 movdqu (%rsi), %xmm0
2904 movdqu 0x10(%rsi), %xmm1
2905 movdqu 0x20(%rsi), %xmm2
2906 movdqu 0x30(%rsi), %xmm3
2907 lea 0x40(%rsi), %rsi
2909 movaps %xmm0, (%rdi)
2910 movaps %xmm1, 0x10(%rdi)
2911 movaps %xmm2, 0x20(%rdi)
2912 movaps %xmm3, 0x30(%rdi)
2913 lea 0x40(%rdi), %rdi
2915 L(large_page_ll_less_fwd_64bytes):
2918 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward copy using non-temporal stores.  %rsi and %rdi are addressed
   with negative displacements and decremented, so data is copied from
   higher to lower addresses: 128 bytes per loop iteration, then an
   optional 64-byte step and a jump-table dispatch for the rest.
   movdqa/movntdq require 16-byte aligned memory operands, so %rdi is
   expected to be 16-byte aligned (NOTE(review): established outside
   this excerpt -- confirm).
   NOTE(review): the entry label, pointer adjustments, the closing
   #endif and the instructions that set the flags tested by the
   branches are not visible in this excerpt.  */
/* Copy the 16 bytes just below the current pointers; note the store is
   an aligned movdqa rather than a non-temporal store.  */
2923 movdqu -0x10(%rsi), %xmm1
2926 movdqa %xmm1, -0x10(%rdi)
/* %rdx -= 0x90; lea leaves the flags untouched.  */
2928 lea -0x90(%rdx), %rdx
/* memmove-only selection between the non-temporal loop below and the
   movaps-based loop at L(ll_cache_copy_bwd_start); the comparisons
   feeding these branches are not shown here.  */
2929 #ifdef USE_AS_MEMMOVE
2933 jae L(memmove_is_memcpy_bwd)
2935 jb L(ll_cache_copy_bwd_start)
2936 L(memmove_is_memcpy_bwd):
2938 L(large_page_bwd_loop):
/* Load the 128 bytes below %rsi into %xmm0-%xmm7, then move %rsi down.  */
2939 movdqu -0x10(%rsi), %xmm0
2940 movdqu -0x20(%rsi), %xmm1
2941 movdqu -0x30(%rsi), %xmm2
2942 movdqu -0x40(%rsi), %xmm3
2943 movdqu -0x50(%rsi), %xmm4
2944 movdqu -0x60(%rsi), %xmm5
2945 movdqu -0x70(%rsi), %xmm6
2946 movdqu -0x80(%rsi), %xmm7
2947 lea -0x80(%rsi), %rsi
/* Stream them to the 128 bytes below %rdi, then move %rdi down.  */
2950 movntdq %xmm0, -0x10(%rdi)
2951 movntdq %xmm1, -0x20(%rdi)
2952 movntdq %xmm2, -0x30(%rdi)
2953 movntdq %xmm3, -0x40(%rdi)
2954 movntdq %xmm4, -0x50(%rdi)
2955 movntdq %xmm5, -0x60(%rdi)
2956 movntdq %xmm6, -0x70(%rdi)
2957 movntdq %xmm7, -0x80(%rdi)
2958 lea -0x80(%rdi), %rdi
/* Loop back; flags come from an instruction not shown in this excerpt.  */
2959 jae L(large_page_bwd_loop)
/* %rdx += 0x80 without touching flags; jl skips the 64-byte step.  */
2961 lea 0x80(%rdx), %rdx
2962 jl L(large_page_less_bwd_64bytes)
/* Optional 64-byte step, moving both pointers down by 0x40.  */
2964 movdqu -0x10(%rsi), %xmm0
2965 movdqu -0x20(%rsi), %xmm1
2966 movdqu -0x30(%rsi), %xmm2
2967 movdqu -0x40(%rsi), %xmm3
2968 lea -0x40(%rsi), %rsi
2970 movntdq %xmm0, -0x10(%rdi)
2971 movntdq %xmm1, -0x20(%rdi)
2972 movntdq %xmm2, -0x30(%rdi)
2973 movntdq %xmm3, -0x40(%rdi)
2974 lea -0x40(%rdi), %rdi
2976 L(large_page_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (entries are 4-byte
   relative offsets); clobbers %r11 and %rdx.  */
2978 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* memmove-only backward loop that writes with ordinary aligned stores
   (movaps) instead of non-temporal stores.  Reached from the jb in the
   backward large-copy path.  Each iteration prefetches below the source
   pointer, copies 128 bytes from higher to lower addresses and moves
   both pointers down by 0x80.
   movaps requires a 16-byte aligned memory operand, so %rdi is expected
   to be 16-byte aligned here (NOTE(review): established outside this
   excerpt -- confirm).
   NOTE(review): the instructions that set the flags tested by jae/jl,
   and the closing #endif, are not visible in this excerpt.  */
2980 #ifdef USE_AS_MEMMOVE
2982 L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below %rsi.  */
2983 prefetcht0 -0x1c0(%rsi)
2984 prefetcht0 -0x200(%rsi)
/* Load the 128 bytes below %rsi (unaligned), then move %rsi down.  */
2985 movdqu -0x10(%rsi), %xmm0
2986 movdqu -0x20(%rsi), %xmm1
2987 movdqu -0x30(%rsi), %xmm2
2988 movdqu -0x40(%rsi), %xmm3
2989 movdqu -0x50(%rsi), %xmm4
2990 movdqu -0x60(%rsi), %xmm5
2991 movdqu -0x70(%rsi), %xmm6
2992 movdqu -0x80(%rsi), %xmm7
2993 lea -0x80(%rsi), %rsi
/* Store them below %rdi with aligned stores, then move %rdi down.  */
2996 movaps %xmm0, -0x10(%rdi)
2997 movaps %xmm1, -0x20(%rdi)
2998 movaps %xmm2, -0x30(%rdi)
2999 movaps %xmm3, -0x40(%rdi)
3000 movaps %xmm4, -0x50(%rdi)
3001 movaps %xmm5, -0x60(%rdi)
3002 movaps %xmm6, -0x70(%rdi)
3003 movaps %xmm7, -0x80(%rdi)
3004 lea -0x80(%rdi), %rdi
/* Loop back; flags come from an instruction not shown in this excerpt.  */
3005 jae L(ll_cache_copy_bwd_start)
/* %rdx += 0x80 without touching flags; jl skips the 64-byte step.  */
3007 lea 0x80(%rdx), %rdx
3008 jl L(large_page_ll_less_bwd_64bytes)
/* Optional 64-byte step, moving both pointers down by 0x40.  */
3010 movdqu -0x10(%rsi), %xmm0
3011 movdqu -0x20(%rsi), %xmm1
3012 movdqu -0x30(%rsi), %xmm2
3013 movdqu -0x40(%rsi), %xmm3
3014 lea -0x40(%rsi), %rsi
3016 movaps %xmm0, -0x10(%rdi)
3017 movaps %xmm1, -0x20(%rdi)
3018 movaps %xmm2, -0x30(%rdi)
3019 movaps %xmm3, -0x40(%rdi)
3020 lea -0x40(%rdi), %rdi
3022 L(large_page_ll_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (entries are 4-byte
   relative offsets); clobbers %r11 and %rdx.  */
3023 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Jump tables live in a separate allocatable, non-executable data
   section (flags "a", @progbits).  */
3028 .section .rodata.ssse3,"a",@progbits
/* Jump table for the final 0..79 bytes of a copy.  Entry N is a 32-bit
   value holding the offset of L(write_Nbytes) relative to the start of
   this table (JMPTBL (I, B) expands to I - B), so the table is
   position independent.  It is consumed by BRANCH_TO_JMPTBL_ENTRY with
   a scale of 4; the index must therefore be in the range 0..79.  */
3030 L(table_less_80bytes):
3031 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3108 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3109 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3110 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
/* Jump table with 16 entries, L(shl_0) .. L(shl_15), each stored as a
   32-bit offset relative to L(shl_table) (JMPTBL (I, B) expands to
   I - B).  The file dispatches into it with
   BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4), so the index must be
   in the range 0..15.
   NOTE(review): the L(shl_table): label definition itself is not
   visible in this excerpt; it must immediately precede the first entry
   for the relative offsets to be correct -- confirm in the full file.  */
3114 .int JMPTBL (L(shl_0), L(shl_table))
3115 .int JMPTBL (L(shl_1), L(shl_table))
3116 .int JMPTBL (L(shl_2), L(shl_table))
3117 .int JMPTBL (L(shl_3), L(shl_table))
3118 .int JMPTBL (L(shl_4), L(shl_table))
3119 .int JMPTBL (L(shl_5), L(shl_table))
3120 .int JMPTBL (L(shl_6), L(shl_table))
3121 .int JMPTBL (L(shl_7), L(shl_table))
3122 .int JMPTBL (L(shl_8), L(shl_table))
3123 .int JMPTBL (L(shl_9), L(shl_table))
3124 .int JMPTBL (L(shl_10), L(shl_table))
3125 .int JMPTBL (L(shl_11), L(shl_table))
3126 .int JMPTBL (L(shl_12), L(shl_table))
3127 .int JMPTBL (L(shl_13), L(shl_table))
3128 .int JMPTBL (L(shl_14), L(shl_table))
3129 .int JMPTBL (L(shl_15), L(shl_table))
/* Jump table with 16 entries, L(shl_0_bwd) .. L(shl_15_bwd), each
   stored as a 32-bit offset relative to L(shl_table_bwd) (JMPTBL (I, B)
   expands to I - B).  Any index used with it must be in the range
   0..15.
   NOTE(review): the L(shl_table_bwd): label definition and the code
   that dispatches through this table are not visible in this excerpt;
   the label must immediately precede the first entry for the relative
   offsets to be correct -- confirm in the full file.  */
3133 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3134 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3135 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3146 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3147 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3148 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))