2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 #include <init-arch.h>
/* Pick the symbol names used throughout this file for the variant being
   built: the public name (STRCPY, only set here for plain strcpy), the two
   per-ISA implementation names and the hidden __GI_ aliases.
   NOTE(review): several conditional lines (else and endif, and presumably an
   outer USE_AS_STPCPY test) are not visible in this excerpt, so the nesting
   cannot be confirmed from here.  */
24 #if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
26 # define STRCPY strcpy
/* Names for the stpncpy build.  */
31 # ifdef USE_AS_STRNCPY
32 # define STRCPY_SSSE3 __stpncpy_ssse3
33 # define STRCPY_SSE2 __stpncpy_sse2
34 # define __GI_STRCPY __GI_stpncpy
/* Names for the stpcpy build.  */
36 # define STRCPY_SSSE3 __stpcpy_ssse3
37 # define STRCPY_SSE2 __stpcpy_sse2
38 # define __GI_STRCPY __GI_stpcpy
39 # define __GI___STRCPY __GI___stpcpy
/* Names for the strncpy build.  */
42 # ifdef USE_AS_STRNCPY
43 # define STRCPY_SSSE3 __strncpy_ssse3
44 # define STRCPY_SSE2 __strncpy_sse2
45 # define __GI_STRCPY __GI_strncpy
/* Names for the plain strcpy build.  */
47 # define STRCPY_SSSE3 __strcpy_ssse3
48 # define STRCPY_SSE2 __strcpy_sse2
49 # define __GI_STRCPY __GI_strcpy
57 /* Define multiple versions only for the definition in libc. */
/* IFUNC resolver fragment for STRCPY: the symbol is typed as a GNU indirect
   function, so the address left in rax selects the implementation.
   NOTE(review): the conditional branches and the return between these lines
   are not visible in this excerpt. Presumably the call is skipped when the
   KIND field is already non-zero, and the SSSE3 address replaces the SSE2
   one only when the feature bit is set. Confirm against the full file.  */
61 .type STRCPY, @gnu_indirect_function
/* Compare the KIND field of __cpu_features with zero.  */
62 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
64 call __init_cpu_features
/* rax = address of the SSE2 variant.  */
65 1: leaq STRCPY_SSE2(%rip), %rax
/* Test the SSSE3 bit in the cached CPUID words.  */
66 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
/* rax = address of the SSSE3 variant.  */
68 leaq STRCPY_SSSE3(%rip), %rax
72 .section .text.ssse3,"ax",@progbits
78 * This implementation uses SSE to copy up to 16 bytes at a time.
82 jz LABEL(strncpy_exitz)
/* Entry scan: look for a null byte in the aligned 16-byte chunk that holds
   the start of the source, then in the following chunk.
   NOTE(review): the instructions that set rcx, r8, r9 and r10, and the
   compare or test instructions feeding jle and jz below, are not visible in
   this excerpt. The existing comments imply rcx is the offset of the source
   inside its 16-byte chunk. Confirm against the full file.  */
88 and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
90 mov %rdi, %rax /*store return parameter*/
93 pxor %xmm0, %xmm0 /* clear %xmm0 */
94 pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
95 pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
/* Drop the mask bits belonging to bytes that precede the start of the
   string inside the aligned chunk.  */
96 shr %cl, %edx /* get real bits left in edx*/
97 test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
98 jnz LABEL(less16bytes)
100 #ifdef USE_AS_STRNCPY
/* r11 = r8 + rcx - 16.  */
101 lea -16(%r8,%rcx), %r11
/* NOTE(review): jle is a signed condition. If r8 is an unsigned byte count,
   a very large count would be seen as negative here. Confirm that this is
   acceptable or handled elsewhere.  */
103 jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
110 jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
112 neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
114 pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
115 pcmpeqb 16(%rsi), %xmm0 /* compare the 16 bytes at 16(%rsi) with %xmm0 for equality, try to find null char*/
118 jnz LABEL(less32bytes)
120 * at least 16 bytes available to fill destination rdi
122 #ifdef USE_AS_STRNCPY
124 jbe LABEL(less32bytes_strncpy_truncation)
126 mov (%rsi, %r9), %rdx
128 mov 8(%rsi, %r9), %rdx
132 * so far destination rdi may be aligned by 16, re-calculate rsi to jump
134 * rcx is offset of rsi
135 * rax is offset of rdi
/* Align the destination to 16 bytes, re-derive the source pointer and its
   offset inside a 16-byte chunk, then turn that offset into the address of
   the matching ashr_N handler through unaligned_table.
   NOTE(review): the body of the conditional opened below, the update of r9
   and the final indirect jump are not visible in this excerpt.  */
138 and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
139 mov %rax, %rdx /* rax store original rdi */
140 xor %rdi, %rdx /* equal to and $15, %rdx */
141 #ifdef USE_AS_STRNCPY
145 add $16, %rdi /* next 16 bytes for rdi */
148 lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
149 mov %esi, %ecx /*store offset of rsi */
150 and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
152 and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
/* r11 = table base. Each entry is a 32-bit offset from the table base, so
   sign-extend entry number rcx and add the base to get the handler
   address in rcx.  */
158 lea LABEL(unaligned_table)(%rip), %r11
159 movslq (%r11, %rcx,4), %rcx
160 lea (%r11, %rcx), %rcx
164 * The following cases will be handled by ashr_0 & ashr_0_start
165 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
167 * n(1~15) n(1~15) 0 ashr_0_start
172 #ifdef USE_AS_STRNCPY
174 jbe LABEL(strncpy_truncation_aligned)
/* Aligned case: source and destination share the same offset inside a
   16-byte chunk, so whole chunks move with aligned loads and stores. The
   first chunk is copied directly, then four chunks are handled per loop
   pass, each followed by a null-byte check.
   NOTE(review): the pointer and index updates, the pmovmskb and test
   instructions behind the jnz and jz branches, the compares behind the jbe
   branches and the ashr_0_loop label itself are not visible in this
   excerpt.  */
176 movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
177 movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
180 pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
181 pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
183 test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
184 jnz LABEL(aligned_16bytes)
/* Unrolled step 1: copy one chunk at index rcx, then compare a chunk
   against xmm0 to look for a null byte.  */
187 #ifdef USE_AS_STRNCPY
189 jbe LABEL(strncpy_truncation_aligned)
191 movdqa (%rsi, %rcx), %xmm1
192 movdqa %xmm1, (%rdi, %rcx)
194 pcmpeqb (%rsi, %rcx), %xmm0
197 jnz LABEL(aligned_exit)
/* Unrolled step 2.  */
199 #ifdef USE_AS_STRNCPY
201 jbe LABEL(strncpy_truncation_aligned)
203 movdqa (%rsi, %rcx), %xmm1
204 movdqa %xmm1, (%rdi, %rcx)
206 pcmpeqb (%rsi, %rcx), %xmm0
209 jnz LABEL(aligned_exit)
/* Unrolled step 3.  */
211 #ifdef USE_AS_STRNCPY
213 jbe LABEL(strncpy_truncation_aligned)
215 movdqa (%rsi, %rcx), %xmm1
216 movdqa %xmm1, (%rdi, %rcx)
218 pcmpeqb (%rsi, %rcx), %xmm0
221 jnz LABEL(aligned_exit)
/* Unrolled step 4: loop again when no null byte was seen, otherwise leave
   through aligned_exit.  */
223 #ifdef USE_AS_STRNCPY
225 jbe LABEL(strncpy_truncation_aligned)
227 movdqa (%rsi, %rcx), %xmm1
228 movdqa %xmm1, (%rdi, %rcx)
230 pcmpeqb (%rsi, %rcx), %xmm0
233 jz LABEL(ashr_0_loop)
235 jmp LABEL(aligned_exit)
239 * The following cases will be handled by ashr_15
240 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
241 * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
243 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
247 xor %ecx, %ecx /*clear ecx */
248 #ifdef USE_AS_STRNCPY
250 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 15 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
254 LABEL(ashr_15_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
255 movdqa 16(%rsi, %rcx), %xmm3
259 jnz LABEL(unaligned_exit)
260 #ifdef USE_AS_STRNCPY
262 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 15 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 15.  */
265 palignr $15, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
266 movdqa %xmm3, (%rdi, %rcx)
269 #ifdef USE_AS_STRNCPY
271 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
274 movdqa 16(%rsi, %rcx), %xmm3
278 jnz LABEL(unaligned_exit)
279 #ifdef USE_AS_STRNCPY
281 jbe LABEL(strncpy_truncation_unaligned)
284 palignr $15, (%rsi, %rcx), %xmm3
285 movdqa %xmm3, (%rdi, %rcx)
288 #ifdef USE_AS_STRNCPY
290 jbe LABEL(unaligned_exit)
292 jmp LABEL(ashr_15_use_ssse3)
295 * The following cases will be handled by ashr_14
296 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
297 * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
299 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
303 xor %ecx, %ecx /*clear ecx */
304 #ifdef USE_AS_STRNCPY
306 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 14 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
310 LABEL(ashr_14_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
311 movdqa 16(%rsi, %rcx), %xmm3
315 jnz LABEL(unaligned_exit)
316 #ifdef USE_AS_STRNCPY
318 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 14 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 14.  */
321 palignr $14, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
322 movdqa %xmm3, (%rdi, %rcx)
325 #ifdef USE_AS_STRNCPY
327 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
330 movdqa 16(%rsi, %rcx), %xmm3
334 jnz LABEL(unaligned_exit)
335 #ifdef USE_AS_STRNCPY
337 jbe LABEL(strncpy_truncation_unaligned)
340 palignr $14, (%rsi, %rcx), %xmm3
341 movdqa %xmm3, (%rdi, %rcx)
344 #ifdef USE_AS_STRNCPY
346 jbe LABEL(unaligned_exit)
348 jmp LABEL(ashr_14_use_ssse3)
351 * The following cases will be handled by ashr_13
352 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
353 * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
355 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
359 xor %ecx, %ecx /*clear ecx */
360 #ifdef USE_AS_STRNCPY
362 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 13 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
366 LABEL(ashr_13_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
367 movdqa 16(%rsi, %rcx), %xmm3
371 jnz LABEL(unaligned_exit)
372 #ifdef USE_AS_STRNCPY
374 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 13 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 13.  */
377 palignr $13, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
378 movdqa %xmm3, (%rdi, %rcx)
381 #ifdef USE_AS_STRNCPY
383 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
386 movdqa 16(%rsi, %rcx), %xmm3
390 jnz LABEL(unaligned_exit)
391 #ifdef USE_AS_STRNCPY
393 jbe LABEL(strncpy_truncation_unaligned)
396 palignr $13, (%rsi, %rcx), %xmm3
397 movdqa %xmm3, (%rdi, %rcx)
400 #ifdef USE_AS_STRNCPY
402 jbe LABEL(unaligned_exit)
404 jmp LABEL(ashr_13_use_ssse3)
407 * The following cases will be handled by ashr_12
408 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
409 * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
411 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
415 xor %ecx, %ecx /*clear ecx */
416 #ifdef USE_AS_STRNCPY
418 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 12 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
422 LABEL(ashr_12_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
423 movdqa 16(%rsi, %rcx), %xmm3
427 jnz LABEL(unaligned_exit)
428 #ifdef USE_AS_STRNCPY
430 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 12 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 12.  */
433 palignr $12, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
434 movdqa %xmm3, (%rdi, %rcx)
437 #ifdef USE_AS_STRNCPY
439 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
442 movdqa 16(%rsi, %rcx), %xmm3
446 jnz LABEL(unaligned_exit)
447 #ifdef USE_AS_STRNCPY
449 jbe LABEL(strncpy_truncation_unaligned)
452 palignr $12, (%rsi, %rcx), %xmm3
453 movdqa %xmm3, (%rdi, %rcx)
456 #ifdef USE_AS_STRNCPY
458 jbe LABEL(unaligned_exit)
460 jmp LABEL(ashr_12_use_ssse3)
463 * The following cases will be handled by ashr_11
464 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
465 * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
467 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
471 xor %ecx, %ecx /*clear ecx */
472 #ifdef USE_AS_STRNCPY
474 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 11 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
478 LABEL(ashr_11_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
479 movdqa 16(%rsi, %rcx), %xmm3
483 jnz LABEL(unaligned_exit)
484 #ifdef USE_AS_STRNCPY
486 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 11 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 11.  */
489 palignr $11, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
490 movdqa %xmm3, (%rdi, %rcx)
493 #ifdef USE_AS_STRNCPY
495 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
498 movdqa 16(%rsi, %rcx), %xmm3
502 jnz LABEL(unaligned_exit)
503 #ifdef USE_AS_STRNCPY
505 jbe LABEL(strncpy_truncation_unaligned)
508 palignr $11, (%rsi, %rcx), %xmm3
509 movdqa %xmm3, (%rdi, %rcx)
512 #ifdef USE_AS_STRNCPY
514 jbe LABEL(unaligned_exit)
516 jmp LABEL(ashr_11_use_ssse3)
519 * The following cases will be handled by ashr_10
520 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
521 * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
523 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
527 xor %ecx, %ecx /*clear ecx */
528 #ifdef USE_AS_STRNCPY
530 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 10 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
534 LABEL(ashr_10_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
535 movdqa 16(%rsi, %rcx), %xmm3
539 jnz LABEL(unaligned_exit)
540 #ifdef USE_AS_STRNCPY
542 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 10 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 10.  */
545 palignr $10, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
546 movdqa %xmm3, (%rdi, %rcx)
549 #ifdef USE_AS_STRNCPY
551 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
554 movdqa 16(%rsi, %rcx), %xmm3
558 jnz LABEL(unaligned_exit)
559 #ifdef USE_AS_STRNCPY
561 jbe LABEL(strncpy_truncation_unaligned)
564 palignr $10, (%rsi, %rcx), %xmm3
565 movdqa %xmm3, (%rdi, %rcx)
568 #ifdef USE_AS_STRNCPY
570 jbe LABEL(unaligned_exit)
572 jmp LABEL(ashr_10_use_ssse3)
575 * The following cases will be handled by ashr_9
576 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
577 * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
579 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
583 xor %ecx, %ecx /*clear ecx */
584 #ifdef USE_AS_STRNCPY
586 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 9 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
590 LABEL(ashr_9_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
591 movdqa 16(%rsi, %rcx), %xmm3
595 jnz LABEL(unaligned_exit)
596 #ifdef USE_AS_STRNCPY
598 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 9 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 9.  */
601 palignr $9, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
602 movdqa %xmm3, (%rdi, %rcx)
605 #ifdef USE_AS_STRNCPY
607 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
610 movdqa 16(%rsi, %rcx), %xmm3
614 jnz LABEL(unaligned_exit)
615 #ifdef USE_AS_STRNCPY
617 jbe LABEL(strncpy_truncation_unaligned)
620 palignr $9, (%rsi, %rcx), %xmm3
621 movdqa %xmm3, (%rdi, %rcx)
624 #ifdef USE_AS_STRNCPY
626 jbe LABEL(unaligned_exit)
628 jmp LABEL(ashr_9_use_ssse3)
631 * The following cases will be handled by ashr_8
632 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
633 * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
635 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
639 xor %ecx, %ecx /*clear ecx */
640 #ifdef USE_AS_STRNCPY
642 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 8 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
646 LABEL(ashr_8_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
647 movdqa 16(%rsi, %rcx), %xmm3
651 jnz LABEL(unaligned_exit)
652 #ifdef USE_AS_STRNCPY
654 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 8 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 8.  */
657 palignr $8, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
658 movdqa %xmm3, (%rdi, %rcx)
661 #ifdef USE_AS_STRNCPY
663 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
666 movdqa 16(%rsi, %rcx), %xmm3
670 jnz LABEL(unaligned_exit)
671 #ifdef USE_AS_STRNCPY
673 jbe LABEL(strncpy_truncation_unaligned)
676 palignr $8, (%rsi, %rcx), %xmm3
677 movdqa %xmm3, (%rdi, %rcx)
680 #ifdef USE_AS_STRNCPY
682 jbe LABEL(unaligned_exit)
684 jmp LABEL(ashr_8_use_ssse3)
687 * The following cases will be handled by ashr_7
688 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
689 * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7
691 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
695 xor %ecx, %ecx /*clear ecx */
696 #ifdef USE_AS_STRNCPY
698 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 7 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
702 LABEL(ashr_7_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
703 movdqa 16(%rsi, %rcx), %xmm3
707 jnz LABEL(unaligned_exit)
708 #ifdef USE_AS_STRNCPY
710 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 7 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 7.  */
713 palignr $7, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
714 movdqa %xmm3, (%rdi, %rcx)
717 #ifdef USE_AS_STRNCPY
719 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
722 movdqa 16(%rsi, %rcx), %xmm3
726 jnz LABEL(unaligned_exit)
727 #ifdef USE_AS_STRNCPY
729 jbe LABEL(strncpy_truncation_unaligned)
732 palignr $7, (%rsi, %rcx), %xmm3
733 movdqa %xmm3, (%rdi, %rcx)
736 #ifdef USE_AS_STRNCPY
738 jbe LABEL(unaligned_exit)
740 jmp LABEL(ashr_7_use_ssse3)
743 * The following cases will be handled by ashr_6
744 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
745 * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6
747 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
751 xor %ecx, %ecx /*clear ecx */
752 #ifdef USE_AS_STRNCPY
754 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 6 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
758 LABEL(ashr_6_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
759 movdqa 16(%rsi, %rcx), %xmm3
763 jnz LABEL(unaligned_exit)
764 #ifdef USE_AS_STRNCPY
766 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 6 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 6.  */
769 palignr $6, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
770 movdqa %xmm3, (%rdi, %rcx)
773 #ifdef USE_AS_STRNCPY
775 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
778 movdqa 16(%rsi, %rcx), %xmm3
782 jnz LABEL(unaligned_exit)
783 #ifdef USE_AS_STRNCPY
785 jbe LABEL(strncpy_truncation_unaligned)
788 palignr $6, (%rsi, %rcx), %xmm3
789 movdqa %xmm3, (%rdi, %rcx)
792 #ifdef USE_AS_STRNCPY
794 jbe LABEL(unaligned_exit)
796 jmp LABEL(ashr_6_use_ssse3)
799 * The following cases will be handled by ashr_5
800 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
801 * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5
803 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
807 xor %ecx, %ecx /*clear ecx */
808 #ifdef USE_AS_STRNCPY
810 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 5 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
814 LABEL(ashr_5_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
815 movdqa 16(%rsi, %rcx), %xmm3
819 jnz LABEL(unaligned_exit)
820 #ifdef USE_AS_STRNCPY
822 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 5 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 5.  */
825 palignr $5, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
826 movdqa %xmm3, (%rdi, %rcx)
829 #ifdef USE_AS_STRNCPY
831 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
834 movdqa 16(%rsi, %rcx), %xmm3
838 jnz LABEL(unaligned_exit)
839 #ifdef USE_AS_STRNCPY
841 jbe LABEL(strncpy_truncation_unaligned)
844 palignr $5, (%rsi, %rcx), %xmm3
845 movdqa %xmm3, (%rdi, %rcx)
848 #ifdef USE_AS_STRNCPY
850 jbe LABEL(unaligned_exit)
852 jmp LABEL(ashr_5_use_ssse3)
856 * The following cases will be handled by ashr_4
857 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
858 * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4
860 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
864 xor %ecx, %ecx /*clear ecx */
865 #ifdef USE_AS_STRNCPY
867 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 4 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
871 LABEL(ashr_4_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
872 movdqa 16(%rsi, %rcx), %xmm3
876 jnz LABEL(unaligned_exit)
877 #ifdef USE_AS_STRNCPY
879 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 4 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 4.  */
882 palignr $4, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
883 movdqa %xmm3, (%rdi, %rcx)
886 #ifdef USE_AS_STRNCPY
888 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
891 movdqa 16(%rsi, %rcx), %xmm3
895 jnz LABEL(unaligned_exit)
896 #ifdef USE_AS_STRNCPY
898 jbe LABEL(strncpy_truncation_unaligned)
901 palignr $4, (%rsi, %rcx), %xmm3
902 movdqa %xmm3, (%rdi, %rcx)
905 #ifdef USE_AS_STRNCPY
907 jbe LABEL(unaligned_exit)
909 jmp LABEL(ashr_4_use_ssse3)
913 * The following cases will be handled by ashr_3
914 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
915 * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3
917 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
921 xor %ecx, %ecx /*clear ecx */
922 #ifdef USE_AS_STRNCPY
924 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 3 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt.  */
928 LABEL(ashr_3_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
929 movdqa 16(%rsi, %rcx), %xmm3
933 jnz LABEL(unaligned_exit)
934 #ifdef USE_AS_STRNCPY
936 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 3 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 3.  */
939 palignr $3, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
940 movdqa %xmm3, (%rdi, %rcx)
943 #ifdef USE_AS_STRNCPY
945 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
948 movdqa 16(%rsi, %rcx), %xmm3
952 jnz LABEL(unaligned_exit)
953 #ifdef USE_AS_STRNCPY
955 jbe LABEL(strncpy_truncation_unaligned)
958 palignr $3, (%rsi, %rcx), %xmm3
959 movdqa %xmm3, (%rdi, %rcx)
962 #ifdef USE_AS_STRNCPY
964 jbe LABEL(unaligned_exit)
966 jmp LABEL(ashr_3_use_ssse3)
970 * The following cases will be handled by ashr_2
971 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
972 * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2
974 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
978 xor %ecx, %ecx /*clear ecx */
979 #ifdef USE_AS_STRNCPY
981 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 2 bytes. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the instructions that set the flags tested by jnz and jbe
   and that advance rcx are not visible in this excerpt. For the first
   chunk the compare and mask extraction are also not visible.  */
985 LABEL(ashr_2_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
986 movdqa 16(%rsi, %rcx), %xmm3
990 jnz LABEL(unaligned_exit)
991 #ifdef USE_AS_STRNCPY
993 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 2 bytes, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 2.  */
996 palignr $2, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
997 movdqa %xmm3, (%rdi, %rcx)
1000 #ifdef USE_AS_STRNCPY
1002 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
1005 movdqa 16(%rsi, %rcx), %xmm3
/* Mark the bytes of xmm3 that equal the matching byte of xmm0 (presumably
   all zero here, which makes this a null-byte search) and gather one mask
   bit per byte into edx.  */
1006 pcmpeqb %xmm3, %xmm0
1007 pmovmskb %xmm0, %edx
1009 jnz LABEL(unaligned_exit)
1010 #ifdef USE_AS_STRNCPY
1012 jbe LABEL(strncpy_truncation_unaligned)
1015 palignr $2, (%rsi, %rcx), %xmm3
1016 movdqa %xmm3, (%rdi, %rcx)
1019 #ifdef USE_AS_STRNCPY
1021 jbe LABEL(unaligned_exit)
1023 jmp LABEL(ashr_2_use_ssse3)
1027 * The following cases will be handled by ashr_1
1028 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1029 * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1
1031 * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte
1035 xor %ecx, %ecx /*clear ecx */
1036 #ifdef USE_AS_STRNCPY
1038 jbe LABEL(unaligned_exit)
/* Copy loop for a relative source/destination offset of 1 byte. Each pass
   handles two 16-byte chunks using aligned loads and stores only.
   NOTE(review): the test instructions feeding jnz, the compares feeding jbe
   and the updates of rcx are not visible in this excerpt.  */
1042 LABEL(ashr_1_use_ssse3):
/* xmm3 = the aligned source chunk after the one at rsi + rcx.  */
1043 movdqa 16(%rsi, %rcx), %xmm3
/* Mark the bytes of xmm3 that equal the matching byte of xmm0 (presumably
   all zero here, which makes this a null-byte search) and gather one mask
   bit per byte into edx.  */
1044 pcmpeqb %xmm3, %xmm0
1045 pmovmskb %xmm0, %edx
1047 jnz LABEL(unaligned_exit)
1048 #ifdef USE_AS_STRNCPY
1050 jbe LABEL(strncpy_truncation_unaligned)
/* palignr joins xmm3 (upper half) with the chunk at rsi + rcx (lower half)
   and shifts right by 1 byte, leaving in xmm3 the 16 source bytes that
   start at rsi + rcx + 1.  */
1053 palignr $1, (%rsi, %rcx), %xmm3
/* Aligned 16-byte store to the destination.  */
1054 movdqa %xmm3, (%rdi, %rcx)
1056 #ifdef USE_AS_STRNCPY
1058 jbe LABEL(unaligned_exit)
/* Second chunk of the pass, same steps as above.  */
1061 movdqa 16(%rsi, %rcx), %xmm3
1062 pcmpeqb %xmm3, %xmm0
1063 pmovmskb %xmm0, %edx
1065 jnz LABEL(unaligned_exit)
1066 #ifdef USE_AS_STRNCPY
1068 jbe LABEL(strncpy_truncation_unaligned)
1070 palignr $1, (%rsi, %rcx), %xmm3
1071 movdqa %xmm3, (%rdi, %rcx)
1074 #ifdef USE_AS_STRNCPY
1076 jbe LABEL(unaligned_exit)
1078 jmp LABEL(ashr_1_use_ssse3)
/* Common exits of the copy loops. unaligned_exit is the target of the
   ashr_N loops and falls into aligned_exit, which is also the target of the
   aligned loop. Both end by turning the position of the lowest set bit of
   the null-byte mask in rdx into the address of a tail_N handler.
   NOTE(review): the instructions that load cl before the shift, the
   strncpy compare feeding ja and the final indirect jump are not visible in
   this excerpt.  */
1083 LABEL(unaligned_exit):
1084 add %r9, %rsi /* r9 stores original offset of rsi*/
1087 shl %cl, %edx /* after shl, calculate the exact number to be filled*/
1090 LABEL(aligned_exit):
1091 add %rcx, %rdi /*locate exact address for rdi */
1093 add %rcx, %rsi /*locate exact address for rsi */
1094 LABEL(aligned_16bytes):
1095 #ifdef USE_AS_STRNCPY
1100 ja LABEL(strncpy_tail)
1102 LABEL(strncpy_tail):
1104 bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
/* r11 = table base. Each entry is a 32-bit offset from the table base, so
   sign-extend entry number rcx and add the base to get the handler
   address in rcx.  */
1105 lea LABEL(tail_table)(%rip), %r11
1106 movslq (%r11, %rcx,4), %rcx
1107 lea (%r11, %rcx), %rcx
1110 #ifdef USE_AS_STRNCPY
/* Entry points taken by the jbe branches of the strncpy build when the
   remaining count runs out before a null byte is found.
   NOTE(review): the instructions that belong to each individual label and
   that compute the index in rcx, as well as the final indirect jump, are
   not visible in this excerpt.  */
1112 LABEL(less32bytes_strncpy_truncation):
1114 LABEL(strncpy_truncation_unaligned):
1116 LABEL(strncpy_truncation_aligned):
/* Convert index rcx into the address of a tail_N handler: load the signed
   32-bit offset stored in tail_table and add the table base.  */
1121 lea LABEL(tail_table)(%rip), %r11
1122 movslq (%r11, %rcx,4), %rcx
1123 lea (%r11, %rcx), %rcx
1126 LABEL(strncpy_exitz):
1131 #ifdef USE_AS_STRNCPY
/* Fill path of the strncpy build, entered from the tail handlers through
   their jnz branches.
   NOTE(review): almost all instructions of this routine are missing from
   this excerpt, including everything that sets the flags for the branches
   below and the value held in rax. Presumably rax is zero and the loop pads
   the rest of the destination with null bytes. Confirm against the full
   file.  */
1133 LABEL(strncpy_fill_tail):
1140 jz LABEL(strncpy_fill_less_8)
1143 LABEL(strncpy_fill_less_8):
1146 jz LABEL(strncpy_fill_return)
1147 LABEL(strncpy_fill_less_7):
/* Store the low byte of rax at rdi + rcx, one byte per loop pass.  */
1149 mov %al, (%rdi, %rcx)
1150 jnz LABEL(strncpy_fill_less_7)
1151 LABEL(strncpy_fill_return):
1152 #ifdef USE_AS_STPCPY
1163 #ifdef USE_AS_STPCPY
1166 #ifdef USE_AS_STRNCPY
1169 jnz LABEL(strncpy_fill_tail)
1170 #ifdef USE_AS_STPCPY
1180 #ifdef USE_AS_STPCPY
1183 #ifdef USE_AS_STRNCPY
1186 jnz LABEL(strncpy_fill_tail)
1187 #ifdef USE_AS_STPCPY
1199 #ifdef USE_AS_STPCPY
1202 #ifdef USE_AS_STRNCPY
1205 jnz LABEL(strncpy_fill_tail)
1206 #ifdef USE_AS_STPCPY
1216 #ifdef USE_AS_STPCPY
1219 #ifdef USE_AS_STRNCPY
1222 jnz LABEL(strncpy_fill_tail)
1223 #ifdef USE_AS_STPCPY
1235 #ifdef USE_AS_STPCPY
1238 #ifdef USE_AS_STRNCPY
1241 jnz LABEL(strncpy_fill_tail)
1242 #ifdef USE_AS_STPCPY
1254 #ifdef USE_AS_STPCPY
1257 #ifdef USE_AS_STRNCPY
1260 jnz LABEL(strncpy_fill_tail)
1261 #ifdef USE_AS_STPCPY
1273 #ifdef USE_AS_STPCPY
1276 #ifdef USE_AS_STRNCPY
1279 jnz LABEL(strncpy_fill_tail)
1280 #ifdef USE_AS_STPCPY
1291 #ifdef USE_AS_STPCPY
1294 #ifdef USE_AS_STRNCPY
1297 jnz LABEL(strncpy_fill_tail)
1298 #ifdef USE_AS_STPCPY
1312 #ifdef USE_AS_STPCPY
1315 #ifdef USE_AS_STRNCPY
1318 jnz LABEL(strncpy_fill_tail)
1319 #ifdef USE_AS_STPCPY
1332 #ifdef USE_AS_STPCPY
1335 #ifdef USE_AS_STRNCPY
1338 jnz LABEL(strncpy_fill_tail)
1339 #ifdef USE_AS_STPCPY
1352 #ifdef USE_AS_STPCPY
1355 #ifdef USE_AS_STRNCPY
1358 jnz LABEL(strncpy_fill_tail)
1359 #ifdef USE_AS_STPCPY
1371 #ifdef USE_AS_STPCPY
1374 #ifdef USE_AS_STRNCPY
1377 jnz LABEL(strncpy_fill_tail)
1378 #ifdef USE_AS_STPCPY
1390 #ifdef USE_AS_STPCPY
1393 #ifdef USE_AS_STRNCPY
1396 jnz LABEL(strncpy_fill_tail)
1397 #ifdef USE_AS_STPCPY
1410 #ifdef USE_AS_STPCPY
1413 #ifdef USE_AS_STRNCPY
1416 jnz LABEL(strncpy_fill_tail)
1417 #ifdef USE_AS_STPCPY
1430 #ifdef USE_AS_STPCPY
1433 #ifdef USE_AS_STRNCPY
1436 jnz LABEL(strncpy_fill_tail)
1437 #ifdef USE_AS_STPCPY
1449 #ifdef USE_AS_STPCPY
1452 #ifdef USE_AS_STRNCPY
1455 jnz LABEL(strncpy_fill_tail)
1456 #ifdef USE_AS_STPCPY
1472 #ifdef USE_AS_STPCPY
1475 #ifdef USE_AS_STRNCPY
1478 jnz LABEL(strncpy_fill_tail)
1479 #ifdef USE_AS_STPCPY
1493 #ifdef USE_AS_STPCPY
1496 #ifdef USE_AS_STRNCPY
1499 jnz LABEL(strncpy_fill_tail)
1500 #ifdef USE_AS_STPCPY
1515 #ifdef USE_AS_STPCPY
1518 #ifdef USE_AS_STRNCPY
1521 jnz LABEL(strncpy_fill_tail)
1522 #ifdef USE_AS_STPCPY
1537 #ifdef USE_AS_STPCPY
1540 #ifdef USE_AS_STRNCPY
1543 jnz LABEL(strncpy_fill_tail)
1544 #ifdef USE_AS_STPCPY
1558 #ifdef USE_AS_STPCPY
1561 #ifdef USE_AS_STRNCPY
1564 jnz LABEL(strncpy_fill_tail)
1565 #ifdef USE_AS_STPCPY
1579 #ifdef USE_AS_STPCPY
1582 #ifdef USE_AS_STRNCPY
1585 jnz LABEL(strncpy_fill_tail)
1586 #ifdef USE_AS_STPCPY
1601 #ifdef USE_AS_STPCPY
1604 #ifdef USE_AS_STRNCPY
1607 jnz LABEL(strncpy_fill_tail)
1608 #ifdef USE_AS_STPCPY
1623 #ifdef USE_AS_STPCPY
1626 #ifdef USE_AS_STRNCPY
1629 jnz LABEL(strncpy_fill_tail)
1630 #ifdef USE_AS_STPCPY
1648 #ifdef USE_AS_STPCPY
1651 #ifdef USE_AS_STRNCPY
1654 jnz LABEL(strncpy_fill_tail)
1655 #ifdef USE_AS_STPCPY
1672 #ifdef USE_AS_STPCPY
1675 #ifdef USE_AS_STRNCPY
1678 jnz LABEL(strncpy_fill_tail)
1679 #ifdef USE_AS_STPCPY
1696 #ifdef USE_AS_STPCPY
1699 #ifdef USE_AS_STRNCPY
1702 jnz LABEL(strncpy_fill_tail)
1703 #ifdef USE_AS_STPCPY
1720 #ifdef USE_AS_STPCPY
1723 #ifdef USE_AS_STRNCPY
1726 jnz LABEL(strncpy_fill_tail)
1727 #ifdef USE_AS_STPCPY
1743 #ifdef USE_AS_STPCPY
1746 #ifdef USE_AS_STRNCPY
1749 jnz LABEL(strncpy_fill_tail)
1750 #ifdef USE_AS_STPCPY
1768 #ifdef USE_AS_STPCPY
1771 #ifdef USE_AS_STRNCPY
1774 jnz LABEL(strncpy_fill_tail)
1775 #ifdef USE_AS_STPCPY
1794 #ifdef USE_AS_STPCPY
1797 #ifdef USE_AS_STRNCPY
1800 jnz LABEL(strncpy_fill_tail)
1801 #ifdef USE_AS_STPCPY
1818 #ifdef USE_AS_STPCPY
1821 #ifdef USE_AS_STRNCPY
1824 jnz LABEL(strncpy_fill_tail)
1825 #ifdef USE_AS_STPCPY
1832 .size STRCPY_SSSE3, .-STRCPY_SSSE3
/* Read-only dispatch data for the SSSE3 variant. The 32 entries below are
   signed 32-bit offsets of the handlers tail_0 to tail_31 measured from
   tail_table, so the table stays position independent. The exit code loads
   entry number rcx with movslq and adds the table base to get the handler
   address.
   NOTE(review): the line defining the tail_table label itself is not
   visible in this excerpt.  */
1835 .section .rodata.ssse3,"a",@progbits
1837 .int LABEL(tail_0) - LABEL(tail_table)
1838 .int LABEL(tail_1) - LABEL(tail_table)
1839 .int LABEL(tail_2) - LABEL(tail_table)
1840 .int LABEL(tail_3) - LABEL(tail_table)
1841 .int LABEL(tail_4) - LABEL(tail_table)
1842 .int LABEL(tail_5) - LABEL(tail_table)
1843 .int LABEL(tail_6) - LABEL(tail_table)
1844 .int LABEL(tail_7) - LABEL(tail_table)
1845 .int LABEL(tail_8) - LABEL(tail_table)
1846 .int LABEL(tail_9) - LABEL(tail_table)
1847 .int LABEL(tail_10) - LABEL(tail_table)
1848 .int LABEL(tail_11) - LABEL(tail_table)
1849 .int LABEL(tail_12) - LABEL(tail_table)
1850 .int LABEL(tail_13) - LABEL(tail_table)
1851 .int LABEL(tail_14) - LABEL(tail_table)
1852 .int LABEL(tail_15) - LABEL(tail_table)
1853 .int LABEL(tail_16) - LABEL(tail_table)
1854 .int LABEL(tail_17) - LABEL(tail_table)
1855 .int LABEL(tail_18) - LABEL(tail_table)
1856 .int LABEL(tail_19) - LABEL(tail_table)
1857 .int LABEL(tail_20) - LABEL(tail_table)
1858 .int LABEL(tail_21) - LABEL(tail_table)
1859 .int LABEL(tail_22) - LABEL(tail_table)
1860 .int LABEL(tail_23) - LABEL(tail_table)
1861 .int LABEL(tail_24) - LABEL(tail_table)
1862 .int LABEL(tail_25) - LABEL(tail_table)
1863 .int LABEL(tail_26) - LABEL(tail_table)
1864 .int LABEL(tail_27) - LABEL(tail_table)
1865 .int LABEL(tail_28) - LABEL(tail_table)
1866 .int LABEL(tail_29) - LABEL(tail_table)
1867 .int LABEL(tail_30) - LABEL(tail_table)
1868 .int LABEL(tail_31) - LABEL(tail_table)
/* Dispatch table for the copy loops: entry N is the signed 32-bit offset of
   the ashr_N handler measured from unaligned_table. The dispatch code
   indexes it with the source offset masked to 0..15, loads the entry with
   movslq and adds the table base to get the handler address.  */
1871 LABEL(unaligned_table):
1872 .int LABEL(ashr_0) - LABEL(unaligned_table)
1873 .int LABEL(ashr_1) - LABEL(unaligned_table)
1874 .int LABEL(ashr_2) - LABEL(unaligned_table)
1875 .int LABEL(ashr_3) - LABEL(unaligned_table)
1876 .int LABEL(ashr_4) - LABEL(unaligned_table)
1877 .int LABEL(ashr_5) - LABEL(unaligned_table)
1878 .int LABEL(ashr_6) - LABEL(unaligned_table)
1879 .int LABEL(ashr_7) - LABEL(unaligned_table)
1880 .int LABEL(ashr_8) - LABEL(unaligned_table)
1881 .int LABEL(ashr_9) - LABEL(unaligned_table)
1882 .int LABEL(ashr_10) - LABEL(unaligned_table)
1883 .int LABEL(ashr_11) - LABEL(unaligned_table)
1884 .int LABEL(ashr_12) - LABEL(unaligned_table)
1885 .int LABEL(ashr_13) - LABEL(unaligned_table)
1886 .int LABEL(ashr_14) - LABEL(unaligned_table)
1887 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* The macros below are redefined before the generic implementation is
   included at the end of this block. ENTRY and END ignore their argument
   and always open and close a function named STRCPY_SSE2.
   NOTE(review): that the included file wraps its code in ENTRY and END, and
   so ends up emitted as STRCPY_SSE2, is an assumption. The included file
   and some continuation lines of these macros are not visible in this
   excerpt.  */
1890 # define ENTRY(name) \
1891 .type STRCPY_SSE2, @function; \
1893 STRCPY_SSE2: cfi_startproc; \
1896 # define END(name) \
1897 cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
/* Replace the hidden-alias helpers so that the internal __GI_ names are
   plain global aliases of STRCPY_SSE2.  */
1898 # undef libc_hidden_builtin_def
1899 /* It doesn't make sense to send libc-internal strcpy calls through a PLT.
1900 The speedup we get from using SSSE3 instruction is likely eaten away
1901 by the indirect call in the PLT. */
1902 # define libc_hidden_builtin_def(name) \
1903 .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
/* Same aliasing for the double-underscore internal name.  */
1904 # undef libc_hidden_def
1905 # define libc_hidden_def(name) \
1906 .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
/* The generic file is included for every build except strncpy.  */
1909 #ifndef USE_AS_STRNCPY
1910 #include "../strcpy.S"