2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 #include <ifunc-defines.h>
/* UPDATE_STRNCMP_COUNTER is expanded right after the first, partial
   16-byte compare of each alignment case.  The first definition below
   (grouped with the __strncmp_* names) computes
   %r9 = %rcx + %r11 - 16 and branches to strcmp_exitz_sse4_2 on
   "below" and on "equal"; the second definition is empty.
   NOTE(review): the embedded line numbers skip 30, 32 and 34, so the
   instructions that set the flags for the two branches are not visible
   in this excerpt.  */
25 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
26 if the new counter > the old one or is 0. */
27 #define UPDATE_STRNCMP_COUNTER \
28 /* compute the number of bytes left to compare */ \
29 lea -16(%rcx, %r11), %r9; \
31 jb LABEL(strcmp_exitz_sse4_2); \
33 je LABEL(strcmp_exitz_sse4_2); \
36 #define STRCMP_SSE42 __strncmp_sse42
37 #define STRCMP_SSSE3 __strncmp_ssse3
38 #define STRCMP_SSE2 __strncmp_sse2
39 #define __GI_STRCMP __GI_strncmp
/* Second set of definitions: the counter macro expands to nothing and the
   symbol names are the __strcmp_* ones.  Presumably this is the
   non-USE_AS_STRNCMP branch; the #ifdef/#else lines are not visible.  */
41 #define UPDATE_STRNCMP_COUNTER
44 #define STRCMP_SSE42 __strcmp_sse42
45 #define STRCMP_SSSE3 __strcmp_ssse3
46 #define STRCMP_SSE2 __strcmp_sse2
47 #define __GI_STRCMP __GI_strcmp
55 /* Define multiple versions only for the definition in libc. Don't
56 define multiple versions for strncmp in the static library, since
57 strncmp is needed before the initialization has happened. */
58 #if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
/* Indirect-function (IFUNC) resolver for STRCMP.  It loads the address of
   a candidate implementation into %rax, trying STRCMP_SSE42, then
   STRCMP_SSSE3, then STRCMP_SSE2.  The two testl instructions probe bit 20
   (SSE4.2) and bit 9 (SSSE3) of the CPUID leaf-1 ECX word stored in
   __cpu_features.
   NOTE(review): the conditional branches and the ret between the visible
   lines are missing from this excerpt.  */
61 .type STRCMP, @gnu_indirect_function
/* Check the "kind" field of __cpu_features against 0; presumably a zero
   value means the structure is not initialized yet and the call below is
   skipped otherwise (the branch is not visible).  */
62 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
64 call __init_cpu_features
/* First choice: the SSE4.2 variant, gated by CPUID.1:ECX bit 20.  */
66 leaq STRCMP_SSE42(%rip), %rax
67 testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
/* Second choice: the SSSE3 variant, gated by CPUID.1:ECX bit 9.  */
69 leaq STRCMP_SSSE3(%rip), %rax
70 testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
/* Last candidate: the SSE2 variant.  */
72 leaq STRCMP_SSE2(%rip), %rax
78 | _SIDD_CMP_EQUAL_EACH
79 | _SIDD_NEGATIVE_POLARITY
80 | _SIDD_LEAST_SIGNIFICANT
81 on pcmpistri to find out if two 16byte data elements are the same
82 and the offset of the first different byte. There are 4 cases:
84 1. Both 16byte data elements are valid and identical.
85 2. Both 16byte data elements have EOS and are identical.
86 3. Both 16byte data elements are valid and they differ at offset X.
87 4. At least one 16byte data element has EOS at offset X. Two 16byte
88 data elements must differ at or before offset X.
90 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
92 case ECX CFlag ZFlag SFlag
98 We exit from the loop for cases 2, 3 and 4 with jbe which branches
99 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
102 /* Put all SSE 4.2 functions together. */
103 .section .text.sse4.2,"ax",@progbits
105 .type STRCMP_SSE42, @function
111 * This implementation uses SSE to compare up to 16 bytes at a time.
/* Entry of STRCMP_SSE42.  The visible code uses %rdi and %rsi as the two
   string pointers, %rax/%rcx as their alignment offsets and, in the
   strncmp build, %r11 as the remaining byte count (see
   UPDATE_STRNCMP_COUNTER).
   NOTE(review): the embedded numbering has gaps throughout this routine;
   the label itself, the loads into %xmm1/%xmm2, and the compares that
   feed most of the conditional jumps are not visible in this excerpt.  */
113 #ifdef USE_AS_STRNCMP
/* strncmp only: two early exits, to strcmp_exitz_sse4_2 and to
   Byte0_sse4_2; the tests that select them are not visible here.  */
115 je LABEL(strcmp_exitz_sse4_2)
117 je LABEL(Byte0_sse4_2)
122 /* Use 64bit AND here to avoid long NOP padding. */
123 and $0x3f, %rcx /* rsi alignment in cache line */
124 and $0x3f, %rax /* rdi alignment in cache line */
126 ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
128 ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
/* Fast path: neither 16-byte load crosses a cache line.  %xmm0 flags NUL
   bytes of %xmm1, %xmm1 then flags bytes equal to %xmm2, and the
   subtraction merges both results.  */
131 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
132 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
133 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
134 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
136 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
137 jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
138 #ifdef USE_AS_STRNCMP
140 jbe LABEL(strcmp_exitz_sse4_2)/* finish comparison */
142 add $16, %rsi /* prepare to search next 16 bytes */
143 add $16, %rdi /* prepare to search next 16 bytes */
146 * Determine source and destination string offsets from 16-byte alignment.
147 * Use relative offset difference between the two to determine which case
151 LABEL(crosscache_sse4_2):
/* Round both pointers down to a 16-byte boundary and keep the low four
   bits of each original address as offsets in %ecx (rsi) and %eax (rdi).
   The relation between the two offsets selects the handler: equal offsets
   go straight to ashr_0, everything else goes through the jump table.
   NOTE(review): the compare that feeds je/ja, the computation of the
   table index in %r9 and any pointer swap are not visible in this
   excerpt.  */
152 and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
153 and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
154 mov $0xffff, %edx /* for equivalent offset */
156 and $0xf, %ecx /* offset of rsi */
157 and $0xf, %eax /* offset of rdi */
159 je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
160 ja LABEL(bigger_sse4_2)
161 mov %edx, %r8d /* r8d is offset flag for exit tail */
164 LABEL(bigger_sse4_2):
/* Table dispatch: entry %r9 of unaligned_table_sse4_2 is a signed 32-bit
   offset relative to the table; sign-extend it, add the table address and
   jump there.  */
167 lea LABEL(unaligned_table_sse4_2)(%rip), %r10
168 movslq (%r10, %r9,4), %r9
169 lea (%r10, %r9), %r10
170 jmp *%r10 /* jump to corresponding case */
173 * The following cases will be handled by ashr_0
174 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
175 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
178 LABEL(ashr_0_sse4_2):
/* Both pointers have the same offset within their 16-byte block.  First
   compare the partial leading block: NUL bytes and equal bytes are
   flagged, the masks in %edx and %r9d are shifted right by %cl to discard
   the bytes in front of the string start, and any difference goes to
   less32bytes_sse4_2.
   NOTE(review): the load into %xmm1 and the instructions that produce
   %r9d and the flags for the jne are not visible in this excerpt.  */
181 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
182 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
183 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
184 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
186 shr %cl, %edx /* adjust 0xffff for offset */
187 shr %cl, %r9d /* adjust for 16-byte offset */
190 * edx must be the same with r9d if in left byte (16-rcx) is equal to
191 * the start from (16-rax) and no null char was seen.
193 jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
194 UPDATE_STRNCMP_COUNTER
197 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
200 * Now both strings are aligned at 16-byte boundary. Loop over strings
201 * checking 32-bytes per iteration.
203 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
205 LABEL(ashr_0_use_sse4_2):
/* Aligned loop, unrolled twice.  Each step loads 16 bytes from
   (%rdi,%rdx) and lets pcmpistri $0x1a compare them with (%rsi,%rdx);
   jbe leaves the loop when CF or ZF is set.  In the strncmp build a
   second jbe exits through strcmp_exitz_sse4_2; the instructions that
   advance %rdx and update the counter are not visible in this excerpt.  */
206 movdqa (%rdi,%rdx), %xmm0
207 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
209 jbe LABEL(ashr_0_use_sse4_2_exit)
210 #ifdef USE_AS_STRNCMP
212 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
215 movdqa (%rdi,%rdx), %xmm0
216 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
218 jbe LABEL(ashr_0_use_sse4_2_exit)
219 #ifdef USE_AS_STRNCMP
221 jbe LABEL(strcmp_exitz_sse4_2)
223 jmp LABEL(ashr_0_use_sse4_2)
227 LABEL(ashr_0_use_sse4_2_exit):
/* Loop exit.  With CF clear control goes to strcmp_exitz_sse4_2.
   Otherwise %rcx = %rdx + %rcx - 16 becomes the byte offset from which
   one byte of each string is loaded, zero-extended, into %eax (rdi side)
   and %edx (rsi side).  The final subtraction and ret are not visible in
   this excerpt.  */
228 jnc LABEL(strcmp_exitz_sse4_2)
229 #ifdef USE_AS_STRNCMP
231 jbe LABEL(strcmp_exitz_sse4_2)
233 lea -16(%rdx, %rcx), %rcx
234 movzbl (%rdi, %rcx), %eax
235 movzbl (%rsi, %rcx), %edx
243 * The following cases will be handled by ashr_1
244 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
245 * n(15) n -15 0(15 +(n-15) - n) ashr_1
248 LABEL(ashr_1_sse4_2):
/* ashr_1: entered through unaligned_table_sse4_2.  The first, partial
   block is compared after shifting %xmm2 left by 15 bytes so that it
   lines up with %xmm1; the masks are then shifted right by %cl and any
   mismatch or NUL goes to less32bytes_sse4_2.
   NOTE(review): the loads into %xmm1/%xmm2 and the instructions that set
   the flags for the jnz are not visible in this excerpt.  */
252 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
253 pslldq $15, %xmm2 /* shift first string to align with second */
254 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
255 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
257 shr %cl, %edx /* adjust 0xffff for offset */
258 shr %cl, %r9d /* adjust for 16-byte offset */
260 jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */
262 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
265 mov $16, %rcx /* index for loads*/
266 mov $1, %r9d /* byte position left over from less32bytes case */
268 * Setup %r10 value allows us to detect crossing a page boundary.
269 * When %r10 goes positive we have crossed a page boundary and
270 * need to do a nibble.
273 and $0xfff, %r10 /* offset into 4K page */
274 sub $0x1000, %r10 /* subtract 4K pagesize */
275 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
278 LABEL(loop_ashr_1_use_sse4_2):
/* Main loop, unrolled twice.  palignr $1 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
280 jg LABEL(nibble_ashr_1_use_sse4_2)
282 movdqa (%rdi, %rdx), %xmm0
283 palignr $1, -16(%rdi, %rdx), %xmm0
284 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
285 jbe LABEL(use_sse4_2_exit)
286 #ifdef USE_AS_STRNCMP
288 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
293 jg LABEL(nibble_ashr_1_use_sse4_2)
295 movdqa (%rdi, %rdx), %xmm0
296 palignr $1, -16(%rdi, %rdx), %xmm0
297 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
298 jbe LABEL(use_sse4_2_exit)
299 #ifdef USE_AS_STRNCMP
301 jbe LABEL(strcmp_exitz_sse4_2)
304 jmp LABEL(loop_ashr_1_use_sse4_2)
307 LABEL(nibble_ashr_1_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
309 movdqa -16(%rdi, %rdx), %xmm0
311 pcmpistri $0x3a,%xmm0, %xmm0
312 #ifdef USE_AS_STRNCMP
314 jae LABEL(nibble_ashr_use_sse4_2_exit)
317 ja LABEL(loop_ashr_1_use_sse4_2)
319 jmp LABEL(nibble_ashr_use_sse4_2_exit)
322 * The following cases will be handled by ashr_2
323 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
324 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
327 LABEL(ashr_2_sse4_2):
/* ashr_2: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
339 jnz LABEL(less32bytes_sse4_2)
341 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
344 mov $16, %rcx /* index for loads */
345 mov $2, %r9d /* byte position left over from less32bytes case */
347 * Setup %r10 value allows us to detect crossing a page boundary.
348 * When %r10 goes positive we have crossed a page boundary and
349 * need to do a nibble.
352 and $0xfff, %r10 /* offset into 4K page */
353 sub $0x1000, %r10 /* subtract 4K pagesize */
354 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
357 LABEL(loop_ashr_2_use_sse4_2):
/* Main loop, unrolled twice.  palignr $2 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
359 jg LABEL(nibble_ashr_2_use_sse4_2)
361 movdqa (%rdi, %rdx), %xmm0
362 palignr $2, -16(%rdi, %rdx), %xmm0
363 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
364 jbe LABEL(use_sse4_2_exit)
365 #ifdef USE_AS_STRNCMP
367 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
372 jg LABEL(nibble_ashr_2_use_sse4_2)
374 movdqa (%rdi, %rdx), %xmm0
375 palignr $2, -16(%rdi, %rdx), %xmm0
376 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
377 jbe LABEL(use_sse4_2_exit)
378 #ifdef USE_AS_STRNCMP
380 jbe LABEL(strcmp_exitz_sse4_2)
383 jmp LABEL(loop_ashr_2_use_sse4_2)
386 LABEL(nibble_ashr_2_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
388 movdqa -16(%rdi, %rdx), %xmm0
390 pcmpistri $0x3a,%xmm0, %xmm0
391 #ifdef USE_AS_STRNCMP
393 jae LABEL(nibble_ashr_use_sse4_2_exit)
396 ja LABEL(loop_ashr_2_use_sse4_2)
398 jmp LABEL(nibble_ashr_use_sse4_2_exit)
401 * The following cases will be handled by ashr_3
402 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
403 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
406 LABEL(ashr_3_sse4_2):
/* ashr_3: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
418 jnz LABEL(less32bytes_sse4_2)
421 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
424 mov $16, %rcx /* index for loads */
425 mov $3, %r9d /* byte position left over from less32bytes case */
427 * Setup %r10 value allows us to detect crossing a page boundary.
428 * When %r10 goes positive we have crossed a page boundary and
429 * need to do a nibble.
432 and $0xfff, %r10 /* offset into 4K page */
433 sub $0x1000, %r10 /* subtract 4K pagesize */
434 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
436 LABEL(loop_ashr_3_use_sse4_2):
/* Main loop, unrolled twice.  palignr $3 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
438 jg LABEL(nibble_ashr_3_use_sse4_2)
440 movdqa (%rdi, %rdx), %xmm0
441 palignr $3, -16(%rdi, %rdx), %xmm0
442 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
443 jbe LABEL(use_sse4_2_exit)
444 #ifdef USE_AS_STRNCMP
446 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
451 jg LABEL(nibble_ashr_3_use_sse4_2)
453 movdqa (%rdi, %rdx), %xmm0
454 palignr $3, -16(%rdi, %rdx), %xmm0
455 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
456 jbe LABEL(use_sse4_2_exit)
457 #ifdef USE_AS_STRNCMP
459 jbe LABEL(strcmp_exitz_sse4_2)
462 jmp LABEL(loop_ashr_3_use_sse4_2)
465 LABEL(nibble_ashr_3_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
467 movdqa -16(%rdi, %rdx), %xmm0
469 pcmpistri $0x3a,%xmm0, %xmm0
470 #ifdef USE_AS_STRNCMP
472 jae LABEL(nibble_ashr_use_sse4_2_exit)
475 ja LABEL(loop_ashr_3_use_sse4_2)
477 jmp LABEL(nibble_ashr_use_sse4_2_exit)
480 * The following cases will be handled by ashr_4
481 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
482 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
485 LABEL(ashr_4_sse4_2):
/* ashr_4: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
497 jnz LABEL(less32bytes_sse4_2)
500 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
503 mov $16, %rcx /* index for loads */
504 mov $4, %r9d /* byte position left over from less32bytes case */
506 * Setup %r10 value allows us to detect crossing a page boundary.
507 * When %r10 goes positive we have crossed a page boundary and
508 * need to do a nibble.
511 and $0xfff, %r10 /* offset into 4K page */
512 sub $0x1000, %r10 /* subtract 4K pagesize */
513 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
516 LABEL(loop_ashr_4_use_sse4_2):
/* Main loop, unrolled twice.  palignr $4 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
518 jg LABEL(nibble_ashr_4_use_sse4_2)
520 movdqa (%rdi, %rdx), %xmm0
521 palignr $4, -16(%rdi, %rdx), %xmm0
522 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
523 jbe LABEL(use_sse4_2_exit)
524 #ifdef USE_AS_STRNCMP
526 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
531 jg LABEL(nibble_ashr_4_use_sse4_2)
533 movdqa (%rdi, %rdx), %xmm0
534 palignr $4, -16(%rdi, %rdx), %xmm0
535 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
536 jbe LABEL(use_sse4_2_exit)
537 #ifdef USE_AS_STRNCMP
539 jbe LABEL(strcmp_exitz_sse4_2)
542 jmp LABEL(loop_ashr_4_use_sse4_2)
545 LABEL(nibble_ashr_4_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
547 movdqa -16(%rdi, %rdx), %xmm0
549 pcmpistri $0x3a,%xmm0, %xmm0
550 #ifdef USE_AS_STRNCMP
552 jae LABEL(nibble_ashr_use_sse4_2_exit)
555 ja LABEL(loop_ashr_4_use_sse4_2)
557 jmp LABEL(nibble_ashr_use_sse4_2_exit)
560 * The following cases will be handled by ashr_5
561 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
562 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
565 LABEL(ashr_5_sse4_2):
/* ashr_5: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
577 jnz LABEL(less32bytes_sse4_2)
580 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
583 mov $16, %rcx /* index for loads */
584 mov $5, %r9d /* byte position left over from less32bytes case */
586 * Setup %r10 value allows us to detect crossing a page boundary.
587 * When %r10 goes positive we have crossed a page boundary and
588 * need to do a nibble.
591 and $0xfff, %r10 /* offset into 4K page */
592 sub $0x1000, %r10 /* subtract 4K pagesize */
593 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
596 LABEL(loop_ashr_5_use_sse4_2):
/* Main loop, unrolled twice.  palignr $5 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
598 jg LABEL(nibble_ashr_5_use_sse4_2)
600 movdqa (%rdi, %rdx), %xmm0
601 palignr $5, -16(%rdi, %rdx), %xmm0
602 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
603 jbe LABEL(use_sse4_2_exit)
604 #ifdef USE_AS_STRNCMP
606 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
611 jg LABEL(nibble_ashr_5_use_sse4_2)
613 movdqa (%rdi, %rdx), %xmm0
615 palignr $5, -16(%rdi, %rdx), %xmm0
616 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
617 jbe LABEL(use_sse4_2_exit)
618 #ifdef USE_AS_STRNCMP
620 jbe LABEL(strcmp_exitz_sse4_2)
623 jmp LABEL(loop_ashr_5_use_sse4_2)
626 LABEL(nibble_ashr_5_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
628 movdqa -16(%rdi, %rdx), %xmm0
630 pcmpistri $0x3a,%xmm0, %xmm0
631 #ifdef USE_AS_STRNCMP
633 jae LABEL(nibble_ashr_use_sse4_2_exit)
636 ja LABEL(loop_ashr_5_use_sse4_2)
638 jmp LABEL(nibble_ashr_use_sse4_2_exit)
641 * The following cases will be handled by ashr_6
642 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
643 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
646 LABEL(ashr_6_sse4_2):
/* ashr_6: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
658 jnz LABEL(less32bytes_sse4_2)
661 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
664 mov $16, %rcx /* index for loads */
665 mov $6, %r9d /* byte position left over from less32bytes case */
667 * Setup %r10 value allows us to detect crossing a page boundary.
668 * When %r10 goes positive we have crossed a page boundary and
669 * need to do a nibble.
672 and $0xfff, %r10 /* offset into 4K page */
673 sub $0x1000, %r10 /* subtract 4K pagesize */
674 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
677 LABEL(loop_ashr_6_use_sse4_2):
/* Main loop, unrolled twice.  palignr $6 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
679 jg LABEL(nibble_ashr_6_use_sse4_2)
681 movdqa (%rdi, %rdx), %xmm0
682 palignr $6, -16(%rdi, %rdx), %xmm0
683 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
684 jbe LABEL(use_sse4_2_exit)
685 #ifdef USE_AS_STRNCMP
687 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
692 jg LABEL(nibble_ashr_6_use_sse4_2)
694 movdqa (%rdi, %rdx), %xmm0
695 palignr $6, -16(%rdi, %rdx), %xmm0
696 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
697 jbe LABEL(use_sse4_2_exit)
698 #ifdef USE_AS_STRNCMP
700 jbe LABEL(strcmp_exitz_sse4_2)
703 jmp LABEL(loop_ashr_6_use_sse4_2)
706 LABEL(nibble_ashr_6_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
708 movdqa -16(%rdi, %rdx), %xmm0
710 pcmpistri $0x3a,%xmm0, %xmm0
711 #ifdef USE_AS_STRNCMP
713 jae LABEL(nibble_ashr_use_sse4_2_exit)
716 ja LABEL(loop_ashr_6_use_sse4_2)
718 jmp LABEL(nibble_ashr_use_sse4_2_exit)
721 * The following cases will be handled by ashr_7
722 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
723 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
726 LABEL(ashr_7_sse4_2):
/* ashr_7: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
738 jnz LABEL(less32bytes_sse4_2)
741 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
744 mov $16, %rcx /* index for loads */
745 mov $7, %r9d /* byte position left over from less32bytes case */
747 * Setup %r10 value allows us to detect crossing a page boundary.
748 * When %r10 goes positive we have crossed a page boundary and
749 * need to do a nibble.
752 and $0xfff, %r10 /* offset into 4K page */
753 sub $0x1000, %r10 /* subtract 4K pagesize */
754 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
757 LABEL(loop_ashr_7_use_sse4_2):
/* Main loop, unrolled twice.  palignr $7 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
759 jg LABEL(nibble_ashr_7_use_sse4_2)
761 movdqa (%rdi, %rdx), %xmm0
762 palignr $7, -16(%rdi, %rdx), %xmm0
763 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
764 jbe LABEL(use_sse4_2_exit)
765 #ifdef USE_AS_STRNCMP
767 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
772 jg LABEL(nibble_ashr_7_use_sse4_2)
774 movdqa (%rdi, %rdx), %xmm0
775 palignr $7, -16(%rdi, %rdx), %xmm0
776 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
777 jbe LABEL(use_sse4_2_exit)
778 #ifdef USE_AS_STRNCMP
780 jbe LABEL(strcmp_exitz_sse4_2)
783 jmp LABEL(loop_ashr_7_use_sse4_2)
786 LABEL(nibble_ashr_7_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
788 movdqa -16(%rdi, %rdx), %xmm0
790 pcmpistri $0x3a,%xmm0, %xmm0
791 #ifdef USE_AS_STRNCMP
793 jae LABEL(nibble_ashr_use_sse4_2_exit)
796 ja LABEL(loop_ashr_7_use_sse4_2)
798 jmp LABEL(nibble_ashr_use_sse4_2_exit)
801 * The following cases will be handled by ashr_8
802 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
803 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
806 LABEL(ashr_8_sse4_2):
/* ashr_8: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
818 jnz LABEL(less32bytes_sse4_2)
821 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
824 mov $16, %rcx /* index for loads */
825 mov $8, %r9d /* byte position left over from less32bytes case */
827 * Setup %r10 value allows us to detect crossing a page boundary.
828 * When %r10 goes positive we have crossed a page boundary and
829 * need to do a nibble.
832 and $0xfff, %r10 /* offset into 4K page */
833 sub $0x1000, %r10 /* subtract 4K pagesize */
834 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
837 LABEL(loop_ashr_8_use_sse4_2):
/* Main loop, unrolled twice.  palignr $8 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
839 jg LABEL(nibble_ashr_8_use_sse4_2)
841 movdqa (%rdi, %rdx), %xmm0
842 palignr $8, -16(%rdi, %rdx), %xmm0
843 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
844 jbe LABEL(use_sse4_2_exit)
845 #ifdef USE_AS_STRNCMP
847 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
852 jg LABEL(nibble_ashr_8_use_sse4_2)
854 movdqa (%rdi, %rdx), %xmm0
855 palignr $8, -16(%rdi, %rdx), %xmm0
856 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
857 jbe LABEL(use_sse4_2_exit)
858 #ifdef USE_AS_STRNCMP
860 jbe LABEL(strcmp_exitz_sse4_2)
863 jmp LABEL(loop_ashr_8_use_sse4_2)
866 LABEL(nibble_ashr_8_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
868 movdqa -16(%rdi, %rdx), %xmm0
870 pcmpistri $0x3a,%xmm0, %xmm0
871 #ifdef USE_AS_STRNCMP
873 jae LABEL(nibble_ashr_use_sse4_2_exit)
876 ja LABEL(loop_ashr_8_use_sse4_2)
878 jmp LABEL(nibble_ashr_use_sse4_2_exit)
881 * The following cases will be handled by ashr_9
882 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
883 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
886 LABEL(ashr_9_sse4_2):
/* ashr_9: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
898 jnz LABEL(less32bytes_sse4_2)
901 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
904 mov $16, %rcx /* index for loads */
905 mov $9, %r9d /* byte position left over from less32bytes case */
907 * Setup %r10 value allows us to detect crossing a page boundary.
908 * When %r10 goes positive we have crossed a page boundary and
909 * need to do a nibble.
912 and $0xfff, %r10 /* offset into 4K page */
913 sub $0x1000, %r10 /* subtract 4K pagesize */
914 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
917 LABEL(loop_ashr_9_use_sse4_2):
/* Main loop, unrolled twice.  palignr $9 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
919 jg LABEL(nibble_ashr_9_use_sse4_2)
921 movdqa (%rdi, %rdx), %xmm0
923 palignr $9, -16(%rdi, %rdx), %xmm0
924 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
925 jbe LABEL(use_sse4_2_exit)
926 #ifdef USE_AS_STRNCMP
928 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
933 jg LABEL(nibble_ashr_9_use_sse4_2)
935 movdqa (%rdi, %rdx), %xmm0
936 palignr $9, -16(%rdi, %rdx), %xmm0
937 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
938 jbe LABEL(use_sse4_2_exit)
939 #ifdef USE_AS_STRNCMP
941 jbe LABEL(strcmp_exitz_sse4_2)
944 jmp LABEL(loop_ashr_9_use_sse4_2)
947 LABEL(nibble_ashr_9_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
949 movdqa -16(%rdi, %rdx), %xmm0
951 pcmpistri $0x3a,%xmm0, %xmm0
952 #ifdef USE_AS_STRNCMP
954 jae LABEL(nibble_ashr_use_sse4_2_exit)
957 ja LABEL(loop_ashr_9_use_sse4_2)
959 jmp LABEL(nibble_ashr_use_sse4_2_exit)
962 * The following cases will be handled by ashr_10
963 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
964 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
967 LABEL(ashr_10_sse4_2):
/* ashr_10: entered through unaligned_table_sse4_2.
   NOTE(review): the embedded line numbers jump here, so the first
   16-byte compare that sets the flags for the jnz below is not visible
   in this excerpt.  */
979 jnz LABEL(less32bytes_sse4_2)
982 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
985 mov $16, %rcx /* index for loads */
986 mov $10, %r9d /* byte position left over from less32bytes case */
988 * Setup %r10 value allows us to detect crossing a page boundary.
989 * When %r10 goes positive we have crossed a page boundary and
990 * need to do a nibble.
993 and $0xfff, %r10 /* offset into 4K page */
994 sub $0x1000, %r10 /* subtract 4K pagesize */
995 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
998 LABEL(loop_ashr_10_use_sse4_2):
/* Main loop, unrolled twice.  palignr $10 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1000 jg LABEL(nibble_ashr_10_use_sse4_2)
1002 movdqa (%rdi, %rdx), %xmm0
1003 palignr $10, -16(%rdi, %rdx), %xmm0
1004 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1005 jbe LABEL(use_sse4_2_exit)
1006 #ifdef USE_AS_STRNCMP
1008 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1013 jg LABEL(nibble_ashr_10_use_sse4_2)
1015 movdqa (%rdi, %rdx), %xmm0
1016 palignr $10, -16(%rdi, %rdx), %xmm0
1017 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1018 jbe LABEL(use_sse4_2_exit)
1019 #ifdef USE_AS_STRNCMP
1021 jbe LABEL(strcmp_exitz_sse4_2)
1024 jmp LABEL(loop_ashr_10_use_sse4_2)
1027 LABEL(nibble_ashr_10_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1029 movdqa -16(%rdi, %rdx), %xmm0
1031 pcmpistri $0x3a,%xmm0, %xmm0
1032 #ifdef USE_AS_STRNCMP
1034 jae LABEL(nibble_ashr_use_sse4_2_exit)
1037 ja LABEL(loop_ashr_10_use_sse4_2)
1039 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1042 * The following cases will be handled by ashr_11
1043 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1044 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1047 LABEL(ashr_11_sse4_2):
/* ashr_11: entered through unaligned_table_sse4_2.  Load the first
   aligned 16 bytes from both pointers, compare them bytewise and move
   the byte mask of %xmm2 into %r9d.
   NOTE(review): several lines of this sequence, including whatever sets
   the flags for the jnz, are missing from this excerpt.  */
1049 movdqa (%rdi), %xmm2
1050 movdqa (%rsi), %xmm1
1051 pcmpeqb %xmm1, %xmm0
1053 pcmpeqb %xmm1, %xmm2
1055 pmovmskb %xmm2, %r9d
1059 jnz LABEL(less32bytes_sse4_2)
1060 movdqa (%rdi), %xmm3
1062 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
1065 mov $16, %rcx /* index for loads */
1066 mov $11, %r9d /* byte position left over from less32bytes case */
1068 * Setup %r10 value allows us to detect crossing a page boundary.
1069 * When %r10 goes positive we have crossed a page boundary and
1070 * need to do a nibble.
1073 and $0xfff, %r10 /* offset into 4K page */
1074 sub $0x1000, %r10 /* subtract 4K pagesize */
1075 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1078 LABEL(loop_ashr_11_use_sse4_2):
/* Main loop, unrolled twice.  palignr $11 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1080 jg LABEL(nibble_ashr_11_use_sse4_2)
1082 movdqa (%rdi, %rdx), %xmm0
1083 palignr $11, -16(%rdi, %rdx), %xmm0
1084 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1085 jbe LABEL(use_sse4_2_exit)
1086 #ifdef USE_AS_STRNCMP
1088 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1093 jg LABEL(nibble_ashr_11_use_sse4_2)
1095 movdqa (%rdi, %rdx), %xmm0
1096 palignr $11, -16(%rdi, %rdx), %xmm0
1097 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1098 jbe LABEL(use_sse4_2_exit)
1099 #ifdef USE_AS_STRNCMP
1101 jbe LABEL(strcmp_exitz_sse4_2)
1104 jmp LABEL(loop_ashr_11_use_sse4_2)
1107 LABEL(nibble_ashr_11_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1109 movdqa -16(%rdi, %rdx), %xmm0
1111 pcmpistri $0x3a,%xmm0, %xmm0
1112 #ifdef USE_AS_STRNCMP
1114 jae LABEL(nibble_ashr_use_sse4_2_exit)
1117 ja LABEL(loop_ashr_11_use_sse4_2)
1119 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1122 * The following cases will be handled by ashr_12
1123 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1124 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1127 LABEL(ashr_12_sse4_2):
/* ashr_12: entered through unaligned_table_sse4_2.  Load the first
   aligned 16 bytes from both pointers, compare them bytewise and move
   the byte mask of %xmm2 into %r9d.
   NOTE(review): several lines of this sequence, including whatever sets
   the flags for the jnz, are missing from this excerpt.  */
1129 movdqa (%rdi), %xmm2
1130 movdqa (%rsi), %xmm1
1131 pcmpeqb %xmm1, %xmm0
1133 pcmpeqb %xmm1, %xmm2
1135 pmovmskb %xmm2, %r9d
1139 jnz LABEL(less32bytes_sse4_2)
1140 movdqa (%rdi), %xmm3
1142 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
1145 mov $16, %rcx /* index for loads */
1146 mov $12, %r9d /* byte position left over from less32bytes case */
1148 * Setup %r10 value allows us to detect crossing a page boundary.
1149 * When %r10 goes positive we have crossed a page boundary and
1150 * need to do a nibble.
1153 and $0xfff, %r10 /* offset into 4K page */
1154 sub $0x1000, %r10 /* subtract 4K pagesize */
1155 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1158 LABEL(loop_ashr_12_use_sse4_2):
/* Main loop, unrolled twice.  palignr $12 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1160 jg LABEL(nibble_ashr_12_use_sse4_2)
1162 movdqa (%rdi, %rdx), %xmm0
1163 palignr $12, -16(%rdi, %rdx), %xmm0
1164 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1165 jbe LABEL(use_sse4_2_exit)
1166 #ifdef USE_AS_STRNCMP
1168 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1173 jg LABEL(nibble_ashr_12_use_sse4_2)
1175 movdqa (%rdi, %rdx), %xmm0
1176 palignr $12, -16(%rdi, %rdx), %xmm0
1177 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1178 jbe LABEL(use_sse4_2_exit)
1179 #ifdef USE_AS_STRNCMP
1181 jbe LABEL(strcmp_exitz_sse4_2)
1184 jmp LABEL(loop_ashr_12_use_sse4_2)
1187 LABEL(nibble_ashr_12_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1189 movdqa -16(%rdi, %rdx), %xmm0
1191 pcmpistri $0x3a,%xmm0, %xmm0
1192 #ifdef USE_AS_STRNCMP
1194 jae LABEL(nibble_ashr_use_sse4_2_exit)
1197 ja LABEL(loop_ashr_12_use_sse4_2)
1199 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1202 * The following cases will be handled by ashr_13
1203 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1204 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1207 LABEL(ashr_13_sse4_2):
/* ashr_13: entered through unaligned_table_sse4_2.  Load the first
   aligned 16 bytes from both pointers, compare them bytewise and move
   the byte mask of %xmm2 into %r9d.
   NOTE(review): several lines of this sequence, including whatever sets
   the flags for the jnz, are missing from this excerpt.  */
1209 movdqa (%rdi), %xmm2
1210 movdqa (%rsi), %xmm1
1211 pcmpeqb %xmm1, %xmm0
1213 pcmpeqb %xmm1, %xmm2
1215 pmovmskb %xmm2, %r9d
1219 jnz LABEL(less32bytes_sse4_2)
1220 movdqa (%rdi), %xmm3
1222 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
1225 mov $16, %rcx /* index for loads */
1226 mov $13, %r9d /* byte position left over from less32bytes case */
1228 * Setup %r10 value allows us to detect crossing a page boundary.
1229 * When %r10 goes positive we have crossed a page boundary and
1230 * need to do a nibble.
1233 and $0xfff, %r10 /* offset into 4K page */
1234 sub $0x1000, %r10 /* subtract 4K pagesize */
1236 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1239 LABEL(loop_ashr_13_use_sse4_2):
/* Main loop, unrolled twice.  palignr $13 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1241 jg LABEL(nibble_ashr_13_use_sse4_2)
1243 movdqa (%rdi, %rdx), %xmm0
1244 palignr $13, -16(%rdi, %rdx), %xmm0
1245 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1246 jbe LABEL(use_sse4_2_exit)
1247 #ifdef USE_AS_STRNCMP
1249 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1254 jg LABEL(nibble_ashr_13_use_sse4_2)
1256 movdqa (%rdi, %rdx), %xmm0
1257 palignr $13, -16(%rdi, %rdx), %xmm0
1258 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1259 jbe LABEL(use_sse4_2_exit)
1260 #ifdef USE_AS_STRNCMP
1262 jbe LABEL(strcmp_exitz_sse4_2)
1265 jmp LABEL(loop_ashr_13_use_sse4_2)
1268 LABEL(nibble_ashr_13_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1270 movdqa -16(%rdi, %rdx), %xmm0
1272 pcmpistri $0x3a,%xmm0, %xmm0
1273 #ifdef USE_AS_STRNCMP
1275 jae LABEL(nibble_ashr_use_sse4_2_exit)
1278 ja LABEL(loop_ashr_13_use_sse4_2)
1280 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1283 * The following cases will be handled by ashr_14
1284 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1285 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1288 LABEL(ashr_14_sse4_2):
/* ashr_14: entered through unaligned_table_sse4_2.  Load the first
   aligned 16 bytes from both pointers, compare them bytewise and move
   the byte mask of %xmm2 into %r9d.
   NOTE(review): several lines of this sequence, including whatever sets
   the flags for the jnz, are missing from this excerpt.  */
1290 movdqa (%rdi), %xmm2
1291 movdqa (%rsi), %xmm1
1292 pcmpeqb %xmm1, %xmm0
1294 pcmpeqb %xmm1, %xmm2
1296 pmovmskb %xmm2, %r9d
1300 jnz LABEL(less32bytes_sse4_2)
1301 movdqa (%rdi), %xmm3
1303 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
1306 mov $16, %rcx /* index for loads */
1307 mov $14, %r9d /* byte position left over from less32bytes case */
1309 * Setup %r10 value allows us to detect crossing a page boundary.
1310 * When %r10 goes positive we have crossed a page boundary and
1311 * need to do a nibble.
1314 and $0xfff, %r10 /* offset into 4K page */
1315 sub $0x1000, %r10 /* subtract 4K pagesize */
1317 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1320 LABEL(loop_ashr_14_use_sse4_2):
/* Main loop, unrolled twice.  palignr $14 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1322 jg LABEL(nibble_ashr_14_use_sse4_2)
1324 movdqa (%rdi, %rdx), %xmm0
1325 palignr $14, -16(%rdi, %rdx), %xmm0
1326 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1327 jbe LABEL(use_sse4_2_exit)
1328 #ifdef USE_AS_STRNCMP
1330 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1335 jg LABEL(nibble_ashr_14_use_sse4_2)
1337 movdqa (%rdi, %rdx), %xmm0
1338 palignr $14, -16(%rdi, %rdx), %xmm0
1339 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1340 jbe LABEL(use_sse4_2_exit)
1341 #ifdef USE_AS_STRNCMP
1343 jbe LABEL(strcmp_exitz_sse4_2)
1346 jmp LABEL(loop_ashr_14_use_sse4_2)
1349 LABEL(nibble_ashr_14_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   otherwise control goes to nibble_ashr_use_sse4_2_exit.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1351 movdqa -16(%rdi, %rdx), %xmm0
1353 pcmpistri $0x3a,%xmm0, %xmm0
1354 #ifdef USE_AS_STRNCMP
1356 jae LABEL(nibble_ashr_use_sse4_2_exit)
1359 ja LABEL(loop_ashr_14_use_sse4_2)
1361 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1364 * The following cases will be handled by ashr_15
1365 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1366 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1369 LABEL(ashr_15_sse4_2):
/* ashr_15: entered through unaligned_table_sse4_2.  Load the first
   aligned 16 bytes from both pointers, compare them bytewise and move
   the byte mask of %xmm2 into %r9d.
   NOTE(review): several lines of this sequence, including whatever sets
   the flags for the jnz, are missing from this excerpt.  */
1371 movdqa (%rdi), %xmm2
1372 movdqa (%rsi), %xmm1
1373 pcmpeqb %xmm1, %xmm0
1375 pcmpeqb %xmm1, %xmm2
1377 pmovmskb %xmm2, %r9d
1381 jnz LABEL(less32bytes_sse4_2)
1383 movdqa (%rdi), %xmm3
1385 UPDATE_STRNCMP_COUNTER
/* %rcx and %rdx index the aligned loads; %r9d holds the same shift
   count that palignr uses below and is added to %rdi again in
   use_sse4_2_exit.  */
1388 mov $16, %rcx /* index for loads */
1389 mov $15, %r9d /* byte position left over from less32bytes case */
1391 * Setup %r10 value allows us to detect crossing a page boundary.
1392 * When %r10 goes positive we have crossed a page boundary and
1393 * need to do a nibble.
1396 and $0xfff, %r10 /* offset into 4K page */
1398 sub $0x1000, %r10 /* subtract 4K pagesize */
1400 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1403 LABEL(loop_ashr_15_use_sse4_2):
/* Main loop, unrolled twice.  palignr $15 joins the aligned block at
   (%rdi,%rdx) with the one 16 bytes below it into a single chunk, and
   pcmpistri $0x1a compares that chunk with (%rsi,%rdx).  jbe leaves the
   loop when CF or ZF is set; jg diverts to the nibble path.  The
   instructions that set the flags for jg and for the strncmp-only jbe
   are not visible in this excerpt.  */
1405 jg LABEL(nibble_ashr_15_use_sse4_2)
1407 movdqa (%rdi, %rdx), %xmm0
1408 palignr $15, -16(%rdi, %rdx), %xmm0
1409 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1410 jbe LABEL(use_sse4_2_exit)
1411 #ifdef USE_AS_STRNCMP
1413 jbe LABEL(strcmp_exitz_sse4_2)
/* Second, unrolled copy of the loop body.  */
1418 jg LABEL(nibble_ashr_15_use_sse4_2)
1420 movdqa (%rdi, %rdx), %xmm0
1421 palignr $15, -16(%rdi, %rdx), %xmm0
1422 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1423 jbe LABEL(use_sse4_2_exit)
1424 #ifdef USE_AS_STRNCMP
1426 jbe LABEL(strcmp_exitz_sse4_2)
1429 jmp LABEL(loop_ashr_15_use_sse4_2)
1432 LABEL(nibble_ashr_15_use_sse4_2):
/* Nibble path: load the aligned block at -16(%rdi,%rdx) and run
   pcmpistri $0x3a on it against itself.  The ja resumes the main loop;
   when it is not taken, execution falls through into
   nibble_ashr_use_sse4_2_exit, which follows directly.
   NOTE(review): the lines between the visible ones, including the
   compares that feed jae/ja, are missing from this excerpt.  */
1434 movdqa -16(%rdi, %rdx), %xmm0
1436 pcmpistri $0x3a,%xmm0, %xmm0
1437 #ifdef USE_AS_STRNCMP
1439 jae LABEL(nibble_ashr_use_sse4_2_exit)
1442 ja LABEL(loop_ashr_15_use_sse4_2)
1444 LABEL(nibble_ashr_use_sse4_2_exit):
/* Common exit of all nibble paths: redo the 16-byte compare of %xmm0
   against (%rsi,%rdx) and fall into the shared loop exit below.  */
1445 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1447 LABEL(use_sse4_2_exit):
/* Shared exit of the ashr_1 .. ashr_15 loops.  With CF clear control
   goes to strcmp_exitz_sse4_2.  Otherwise %rdi is moved to
   %rdi + %r9 - 16 (%r9 is the per-case shift count) and one byte from
   each string at index %rdx is loaded, zero-extended, into %eax and
   %edx.
   NOTE(review): the instructions that adjust %rdx, feed the jz and
   compute the return value are not visible in this excerpt.  */
1448 jnc LABEL(strcmp_exitz_sse4_2)
1449 #ifdef USE_AS_STRNCMP
1451 jbe LABEL(strcmp_exitz_sse4_2)
1454 lea -16(%rdi, %r9), %rdi
1455 movzbl (%rdi, %rdx), %eax
1456 movzbl (%rsi, %rdx), %edx
1458 jz LABEL(use_sse4_2_ret_sse4_2)
1460 LABEL(use_sse4_2_ret_sse4_2):
1464 LABEL(less32bytes_sse4_2):
/* Tail for a mismatch or NUL found in the first, partial block of an
   alignment case: add the saved offsets (%rax, %rcx) back onto the
   aligned pointers, and swap the pointers back if they had been
   exchanged.
   NOTE(review): the test that feeds the jz, the ret_sse4_2 label and the
   rest of this tail are not visible in this excerpt.  */
1465 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1466 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1468 jz LABEL(ret_sse4_2)
1469 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1473 LABEL(less16bytes_sse4_2):
/* %rdx holds a bit mask on entry; bsf turns it into the index of its
   lowest set bit, and the bytes at that index are loaded, zero-extended,
   into %ecx (rsi side) and %eax (rdi side).  The final subtraction and
   ret are not visible in this excerpt.  */
1474 bsf %rdx, %rdx /* find and store bit index in %rdx */
1476 #ifdef USE_AS_STRNCMP
1478 jbe LABEL(strcmp_exitz_sse4_2)
1480 movzbl (%rsi, %rdx), %ecx
1481 movzbl (%rdi, %rdx), %eax
/* Exit labels.  NOTE(review): their bodies are missing from this
   excerpt; strcmp_exitz_sse4_2 is presumably the return-zero exit.  */
1486 LABEL(strcmp_exitz_sse4_2):
1491 LABEL(Byte0_sse4_2):
1498 .size STRCMP_SSE42, .-STRCMP_SSE42
1500 /* Put all SSE 4.2 functions together. */
1501 .section .rodata.sse4.2,"a",@progbits
/* Jump table used by crosscache_sse4_2/bigger_sse4_2.  Each entry is the
   32-bit distance from the start of the table to one alignment handler;
   the dispatcher sign-extends the selected entry with movslq and adds the
   table address before the indirect jump.  Entries 0..14 select ashr_1
   .. ashr_15 and entry 15 selects ashr_0.  */
1503 LABEL(unaligned_table_sse4_2):
1504 .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
1505 .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
1506 .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
1507 .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
1508 .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
1509 .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
1510 .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
1511 .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
1512 .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
1513 .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
1514 .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
1515 .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
1516 .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
1517 .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
1518 .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
1519 .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
/* Override ENTRY/END so that the code bracketed by them is emitted under
   the symbol STRCMP_SSE2, whatever name is passed as the macro argument.
   NOTE(review): the embedded numbering skips lines inside ENTRY, so the
   macro may contain more directives than are visible in this excerpt.  */
1523 # define ENTRY(name) \
1524 .type STRCMP_SSE2, @function; \
1526 STRCMP_SSE2: cfi_startproc; \
1529 # define END(name) \
1530 cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
1531 # undef libc_hidden_builtin_def
1532 /* It doesn't make sense to send libc-internal strcmp calls through a PLT.
1533 The speedup we get from using SSE4.2 instructions is likely eaten away
1534 by the indirect call in the PLT. */
/* Hence the hidden alias __GI_STRCMP is bound directly to STRCMP_SSE2.  */
1535 # define libc_hidden_builtin_def(name) \
1536 .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
1539 #include "../strcmp.S"