2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 #include <ifunc-defines.h>
/* Helper macro and symbol names shared by the strcmp and strncmp builds.
   The non-empty UPDATE_STRNCMP_COUNTER computes %r9 = %rcx + %r11 - 16
   and jumps to strcmp_exitz on the two conditions described in the
   comment below.  The first group of names selects the strncmp entry
   points; the second group defines the macro as empty and selects the
   strcmp entry points.
   NOTE(review): the preprocessor conditionals that choose between the two
   groups, and the instructions that sit between the lea and each jump
   inside the macro, are not visible in this view.  */
25 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
26 if the new counter > the old one or is 0. */
27 #define UPDATE_STRNCMP_COUNTER \
28 /* calculate left number to compare */ \
29 lea -16(%rcx, %r11), %r9; \
31 jb LABEL(strcmp_exitz_sse4_2); \
33 je LABEL(strcmp_exitz_sse4_2); \
36 #define STRCMP_SSE42 __strncmp_sse42
37 #define STRCMP_SSE2 __strncmp_sse2
38 #define __GI_STRCMP __GI_strncmp
40 #define UPDATE_STRNCMP_COUNTER
43 #define STRCMP_SSE42 __strcmp_sse42
44 #define STRCMP_SSE2 __strcmp_sse2
45 #define __GI_STRCMP __GI_strcmp
53 /* Define multiple versions only for the definition in libc. Don't
54 define multiple versions for strncmp in static library since we
55 need strncmp before the initialization happened. */
56 #if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
/* STRCMP is declared as a GNU indirect function: the code below is the
   resolver and leaves the address of the chosen implementation in %rax.
   NOTE(review): the conditional jumps and the return of the resolver are
   not visible in this view.  */
59 .type STRCMP, @gnu_indirect_function
/* Check the "kind" field of __cpu_features against zero;
   __init_cpu_features is the initialiser called on this path.  */
60 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
62 call __init_cpu_features
/* Default choice: the SSE2 implementation.  */
63 1: leaq STRCMP_SSE2(%rip), %rax
/* Test bit 20 of the cached CPUID leaf 1 ECX word (the SSE4.2 feature
   bit in the CPUID specification).  */
64 testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
/* Alternative choice: the SSE4.2 implementation.  */
66 leaq STRCMP_SSE42(%rip), %rax
72 | _SIDD_CMP_EQUAL_EACH
73 | _SIDD_NEGATIVE_POLARITY
74 | _SIDD_LEAST_SIGNIFICANT
75 on pcmpistri to find out if two 16byte data elements are the same
76 and the offset of the first different byte. There are 4 cases:
78 1. Both 16byte data elements are valid and identical.
79 2. Both 16byte data elements have EOS and identical.
80 3. Both 16byte data elements are valid and they differ at offset X.
81 4. At least one 16byte data element has EOS at offset X. Two 16byte
82 data elements must differ at or before offset X.
84 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
86 case ECX CFlag ZFlag SFlag
92 We exit from the loop for cases 2, 3 and 4 with jbe which branches
93 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
96 /* Put all SSE 4.2 functions together. */
97 .section .text.sse4.2,"ax",@progbits
/* Entry of the SSE4.2 implementation and its first-16-bytes fast path.
   NOTE(review): several instructions of this block are not visible in
   this view (the symbol label, the loads into %xmm1/%xmm2, the move of
   the compare mask into %edx, and the compares feeding the je/ja/jbe
   branches); the comments describe only what is shown.  */
99 .type STRCMP_SSE42, @function
105 * This implementation uses SSE to compare up to 16 bytes at a time.
107 #ifdef USE_AS_STRNCMP
/* strncmp build only: two early exits taken before any data is compared.  */
109 je LABEL(strcmp_exitz_sse4_2)
111 je LABEL(Byte0_sse4_2)
116 /* Use 64bit AND here to avoid long NOP padding. */
117 and $0x3f, %rcx /* rsi alignment in cache line */
118 and $0x3f, %rax /* rdi alignment in cache line */
/* If either pointer is too close to the end of its 64-byte cache line
   for a 16-byte load, take the slower aligned path instead.  */
120 ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
122 ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
/* After the psubb below, a byte lane of %xmm1 has its high bit set only
   when the two input bytes matched and the byte was not a null.  */
125 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
126 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
127 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
128 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
/* %edx becomes zero only when all 16 lanes matched without a null.  */
130 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
131 jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
132 #ifdef USE_AS_STRNCMP
134 jbe LABEL(strcmp_exitz_sse4_2)/* finish comparison */
136 add $16, %rsi /* prepare to search next 16 bytes */
137 add $16, %rdi /* prepare to search next 16 bytes */
140 * Determine source and destination string offsets from 16-byte alignment.
141 * Use relative offset difference between the two to determine which case
/* Align both pointers down to 16 bytes, keep their low four bits as
   per-string offsets in %ecx/%eax, and dispatch on the relative offset
   through the jump table.
   NOTE(review): the compare feeding the je/ja pair, the operand swap on
   the fall-through path and the computation of the table index in %r9
   are not visible in this view.  */
145 LABEL(crosscache_sse4_2):
146 and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
147 and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
148 mov $0xffff, %edx /* for equivalent offset */
150 and $0xf, %ecx /* offset of rsi */
151 and $0xf, %eax /* offset of rdi */
153 je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
154 ja LABEL(bigger_sse4_2)
155 mov %edx, %r8d /* r8d is offset flag for exit tail */
158 LABEL(bigger_sse4_2):
/* The table holds 32-bit offsets relative to its own address: load the
   sign-extended entry selected by %r9, add the table base back, and jump
   to the resulting address.  */
161 lea LABEL(unaligned_table_sse4_2)(%rip), %r10
162 movslq (%r10, %r9,4), %r9
163 lea (%r10, %r9), %r10
164 jmp *%r10 /* jump to corresponding case */
167 * The following cases will be handled by ashr_0
168 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
169 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
/* ashr_0: both strings have the same offset inside their 16-byte block,
   so no byte shifting is needed.
   NOTE(review): some instructions of this block are not visible in this
   view (the load of %xmm1, the move of the compare mask into %r9d, the
   compare feeding the jne, the offset increments inside the loop and the
   count updates of the strncmp build); the comments describe only what is
   shown.  */
172 LABEL(ashr_0_sse4_2):
175 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
176 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
177 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
178 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
/* Drop the mask bits that belong to bytes before the string start.  */
180 shr %cl, %edx /* adjust 0xffff for offset */
181 shr %cl, %r9d /* adjust for 16-byte offset */
184 * edx must be the same with r9d if in left byte (16-rcx) is equal to
185 * the start from (16-rax) and no null char was seen.
187 jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
188 UPDATE_STRNCMP_COUNTER
191 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
194 * Now both strings are aligned at 16-byte boundary. Loop over strings
195 * checking 32-bytes per iteration.
197 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  Each step compares 16 aligned bytes of
   both strings with pcmpistri in mode 0x1a; jbe leaves the loop when the
   instruction reports CF or ZF (a mismatch or an end of string).  */
199 LABEL(ashr_0_use_sse4_2):
200 movdqa (%rdi,%rdx), %xmm0
201 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
203 jbe LABEL(ashr_0_use_sse4_2_exit)
204 #ifdef USE_AS_STRNCMP
206 jbe LABEL(strcmp_exitz_sse4_2)
209 movdqa (%rdi,%rdx), %xmm0
210 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
212 jbe LABEL(ashr_0_use_sse4_2_exit)
213 #ifdef USE_AS_STRNCMP
215 jbe LABEL(strcmp_exitz_sse4_2)
217 jmp LABEL(ashr_0_use_sse4_2)
/* Loop exit.  CF clear means no differing byte was found, so the zero
   exit is taken.  Otherwise %rcx (the index written by pcmpistri) is
   combined with the running offset to address the differing byte, and
   that byte is loaded from each string.
   NOTE(review): the -16 suggests %rdx was already advanced past the
   compared block by an instruction not visible here - confirm.  */
221 LABEL(ashr_0_use_sse4_2_exit):
222 jnc LABEL(strcmp_exitz_sse4_2)
223 #ifdef USE_AS_STRNCMP
225 jbe LABEL(strcmp_exitz_sse4_2)
227 lea -16(%rdx, %rcx), %rcx
228 movzbl (%rdi, %rcx), %eax
229 movzbl (%rsi, %rcx), %edx
237 * The following cases will be handled by ashr_1
238 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
239 * n(15) n -15 0(15 +(n-15) - n) ashr_1
/* ashr_1: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 1-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads, the move of the compare mask into %r9d, the
   load of %r10, and the instructions that set the flags tested by the
   jg/jae/ja branches and by the strncmp-only jbe); the comments describe
   only what is shown.  */
242 LABEL(ashr_1_sse4_2):
246 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
247 pslldq $15, %xmm2 /* shift first string to align with second */
248 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
249 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
251 shr %cl, %edx /* adjust 0xffff for offset */
252 shr %cl, %r9d /* adjust for 16-byte offset */
254 jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */
256 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
259 mov $16, %rcx /* index for loads*/
260 mov $1, %r9d /* byte position left over from less32bytes case */
262 * Setup %r10 value allows us to detect crossing a page boundary.
263 * When %r10 goes positive we have crossed a page boundary and
264 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
267 and $0xfff, %r10 /* offset into 4K page */
268 sub $0x1000, %r10 /* subtract 4K pagesize */
269 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 1 byte into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
272 LABEL(loop_ashr_1_use_sse4_2):
274 jg LABEL(nibble_ashr_1_use_sse4_2)
276 movdqa (%rdi, %rdx), %xmm0
277 palignr $1, -16(%rdi, %rdx), %xmm0
278 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
279 jbe LABEL(use_sse4_2_exit)
280 #ifdef USE_AS_STRNCMP
282 jbe LABEL(strcmp_exitz_sse4_2)
287 jg LABEL(nibble_ashr_1_use_sse4_2)
289 movdqa (%rdi, %rdx), %xmm0
290 palignr $1, -16(%rdi, %rdx), %xmm0
291 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
292 jbe LABEL(use_sse4_2_exit)
293 #ifdef USE_AS_STRNCMP
295 jbe LABEL(strcmp_exitz_sse4_2)
298 jmp LABEL(loop_ashr_1_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
301 LABEL(nibble_ashr_1_use_sse4_2):
303 movdqa -16(%rdi, %rdx), %xmm0
305 pcmpistri $0x3a,%xmm0, %xmm0
306 #ifdef USE_AS_STRNCMP
308 jae LABEL(nibble_ashr_use_sse4_2_exit)
311 ja LABEL(loop_ashr_1_use_sse4_2)
313 jmp LABEL(nibble_ashr_use_sse4_2_exit)
316 * The following cases will be handled by ashr_2
317 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
318 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
/* ashr_2: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 2-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
321 LABEL(ashr_2_sse4_2):
333 jnz LABEL(less32bytes_sse4_2)
335 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
338 mov $16, %rcx /* index for loads */
339 mov $2, %r9d /* byte position left over from less32bytes case */
341 * Setup %r10 value allows us to detect crossing a page boundary.
342 * When %r10 goes positive we have crossed a page boundary and
343 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
346 and $0xfff, %r10 /* offset into 4K page */
347 sub $0x1000, %r10 /* subtract 4K pagesize */
348 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 2 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
351 LABEL(loop_ashr_2_use_sse4_2):
353 jg LABEL(nibble_ashr_2_use_sse4_2)
355 movdqa (%rdi, %rdx), %xmm0
356 palignr $2, -16(%rdi, %rdx), %xmm0
357 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
358 jbe LABEL(use_sse4_2_exit)
359 #ifdef USE_AS_STRNCMP
361 jbe LABEL(strcmp_exitz_sse4_2)
366 jg LABEL(nibble_ashr_2_use_sse4_2)
368 movdqa (%rdi, %rdx), %xmm0
369 palignr $2, -16(%rdi, %rdx), %xmm0
370 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
371 jbe LABEL(use_sse4_2_exit)
372 #ifdef USE_AS_STRNCMP
374 jbe LABEL(strcmp_exitz_sse4_2)
377 jmp LABEL(loop_ashr_2_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
380 LABEL(nibble_ashr_2_use_sse4_2):
382 movdqa -16(%rdi, %rdx), %xmm0
384 pcmpistri $0x3a,%xmm0, %xmm0
385 #ifdef USE_AS_STRNCMP
387 jae LABEL(nibble_ashr_use_sse4_2_exit)
390 ja LABEL(loop_ashr_2_use_sse4_2)
392 jmp LABEL(nibble_ashr_use_sse4_2_exit)
395 * The following cases will be handled by ashr_3
396 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
397 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
/* ashr_3: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 3-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
400 LABEL(ashr_3_sse4_2):
412 jnz LABEL(less32bytes_sse4_2)
415 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
418 mov $16, %rcx /* index for loads */
419 mov $3, %r9d /* byte position left over from less32bytes case */
421 * Setup %r10 value allows us to detect crossing a page boundary.
422 * When %r10 goes positive we have crossed a page boundary and
423 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
426 and $0xfff, %r10 /* offset into 4K page */
427 sub $0x1000, %r10 /* subtract 4K pagesize */
428 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 3 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
430 LABEL(loop_ashr_3_use_sse4_2):
432 jg LABEL(nibble_ashr_3_use_sse4_2)
434 movdqa (%rdi, %rdx), %xmm0
435 palignr $3, -16(%rdi, %rdx), %xmm0
436 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
437 jbe LABEL(use_sse4_2_exit)
438 #ifdef USE_AS_STRNCMP
440 jbe LABEL(strcmp_exitz_sse4_2)
445 jg LABEL(nibble_ashr_3_use_sse4_2)
447 movdqa (%rdi, %rdx), %xmm0
448 palignr $3, -16(%rdi, %rdx), %xmm0
449 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
450 jbe LABEL(use_sse4_2_exit)
451 #ifdef USE_AS_STRNCMP
453 jbe LABEL(strcmp_exitz_sse4_2)
456 jmp LABEL(loop_ashr_3_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
459 LABEL(nibble_ashr_3_use_sse4_2):
461 movdqa -16(%rdi, %rdx), %xmm0
463 pcmpistri $0x3a,%xmm0, %xmm0
464 #ifdef USE_AS_STRNCMP
466 jae LABEL(nibble_ashr_use_sse4_2_exit)
469 ja LABEL(loop_ashr_3_use_sse4_2)
471 jmp LABEL(nibble_ashr_use_sse4_2_exit)
474 * The following cases will be handled by ashr_4
475 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
476 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
/* ashr_4: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 4-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
479 LABEL(ashr_4_sse4_2):
491 jnz LABEL(less32bytes_sse4_2)
494 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
497 mov $16, %rcx /* index for loads */
498 mov $4, %r9d /* byte position left over from less32bytes case */
500 * Setup %r10 value allows us to detect crossing a page boundary.
501 * When %r10 goes positive we have crossed a page boundary and
502 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
505 and $0xfff, %r10 /* offset into 4K page */
506 sub $0x1000, %r10 /* subtract 4K pagesize */
507 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 4 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
510 LABEL(loop_ashr_4_use_sse4_2):
512 jg LABEL(nibble_ashr_4_use_sse4_2)
514 movdqa (%rdi, %rdx), %xmm0
515 palignr $4, -16(%rdi, %rdx), %xmm0
516 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
517 jbe LABEL(use_sse4_2_exit)
518 #ifdef USE_AS_STRNCMP
520 jbe LABEL(strcmp_exitz_sse4_2)
525 jg LABEL(nibble_ashr_4_use_sse4_2)
527 movdqa (%rdi, %rdx), %xmm0
528 palignr $4, -16(%rdi, %rdx), %xmm0
529 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
530 jbe LABEL(use_sse4_2_exit)
531 #ifdef USE_AS_STRNCMP
533 jbe LABEL(strcmp_exitz_sse4_2)
536 jmp LABEL(loop_ashr_4_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
539 LABEL(nibble_ashr_4_use_sse4_2):
541 movdqa -16(%rdi, %rdx), %xmm0
543 pcmpistri $0x3a,%xmm0, %xmm0
544 #ifdef USE_AS_STRNCMP
546 jae LABEL(nibble_ashr_use_sse4_2_exit)
549 ja LABEL(loop_ashr_4_use_sse4_2)
551 jmp LABEL(nibble_ashr_use_sse4_2_exit)
554 * The following cases will be handled by ashr_5
555 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
556 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
/* ashr_5: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 5-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
559 LABEL(ashr_5_sse4_2):
571 jnz LABEL(less32bytes_sse4_2)
574 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
577 mov $16, %rcx /* index for loads */
578 mov $5, %r9d /* byte position left over from less32bytes case */
580 * Setup %r10 value allows us to detect crossing a page boundary.
581 * When %r10 goes positive we have crossed a page boundary and
582 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
585 and $0xfff, %r10 /* offset into 4K page */
586 sub $0x1000, %r10 /* subtract 4K pagesize */
587 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 5 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
590 LABEL(loop_ashr_5_use_sse4_2):
592 jg LABEL(nibble_ashr_5_use_sse4_2)
594 movdqa (%rdi, %rdx), %xmm0
595 palignr $5, -16(%rdi, %rdx), %xmm0
596 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
597 jbe LABEL(use_sse4_2_exit)
598 #ifdef USE_AS_STRNCMP
600 jbe LABEL(strcmp_exitz_sse4_2)
605 jg LABEL(nibble_ashr_5_use_sse4_2)
607 movdqa (%rdi, %rdx), %xmm0
609 palignr $5, -16(%rdi, %rdx), %xmm0
610 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
611 jbe LABEL(use_sse4_2_exit)
612 #ifdef USE_AS_STRNCMP
614 jbe LABEL(strcmp_exitz_sse4_2)
617 jmp LABEL(loop_ashr_5_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
620 LABEL(nibble_ashr_5_use_sse4_2):
622 movdqa -16(%rdi, %rdx), %xmm0
624 pcmpistri $0x3a,%xmm0, %xmm0
625 #ifdef USE_AS_STRNCMP
627 jae LABEL(nibble_ashr_use_sse4_2_exit)
630 ja LABEL(loop_ashr_5_use_sse4_2)
632 jmp LABEL(nibble_ashr_use_sse4_2_exit)
635 * The following cases will be handled by ashr_6
636 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
637 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
/* ashr_6: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 6-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
640 LABEL(ashr_6_sse4_2):
652 jnz LABEL(less32bytes_sse4_2)
655 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
658 mov $16, %rcx /* index for loads */
659 mov $6, %r9d /* byte position left over from less32bytes case */
661 * Setup %r10 value allows us to detect crossing a page boundary.
662 * When %r10 goes positive we have crossed a page boundary and
663 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
666 and $0xfff, %r10 /* offset into 4K page */
667 sub $0x1000, %r10 /* subtract 4K pagesize */
668 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 6 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
671 LABEL(loop_ashr_6_use_sse4_2):
673 jg LABEL(nibble_ashr_6_use_sse4_2)
675 movdqa (%rdi, %rdx), %xmm0
676 palignr $6, -16(%rdi, %rdx), %xmm0
677 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
678 jbe LABEL(use_sse4_2_exit)
679 #ifdef USE_AS_STRNCMP
681 jbe LABEL(strcmp_exitz_sse4_2)
686 jg LABEL(nibble_ashr_6_use_sse4_2)
688 movdqa (%rdi, %rdx), %xmm0
689 palignr $6, -16(%rdi, %rdx), %xmm0
690 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
691 jbe LABEL(use_sse4_2_exit)
692 #ifdef USE_AS_STRNCMP
694 jbe LABEL(strcmp_exitz_sse4_2)
697 jmp LABEL(loop_ashr_6_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
700 LABEL(nibble_ashr_6_use_sse4_2):
702 movdqa -16(%rdi, %rdx), %xmm0
704 pcmpistri $0x3a,%xmm0, %xmm0
705 #ifdef USE_AS_STRNCMP
707 jae LABEL(nibble_ashr_use_sse4_2_exit)
710 ja LABEL(loop_ashr_6_use_sse4_2)
712 jmp LABEL(nibble_ashr_use_sse4_2_exit)
715 * The following cases will be handled by ashr_7
716 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
717 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
/* ashr_7: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 7-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
720 LABEL(ashr_7_sse4_2):
732 jnz LABEL(less32bytes_sse4_2)
735 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
738 mov $16, %rcx /* index for loads */
739 mov $7, %r9d /* byte position left over from less32bytes case */
741 * Setup %r10 value allows us to detect crossing a page boundary.
742 * When %r10 goes positive we have crossed a page boundary and
743 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
746 and $0xfff, %r10 /* offset into 4K page */
747 sub $0x1000, %r10 /* subtract 4K pagesize */
748 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 7 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
751 LABEL(loop_ashr_7_use_sse4_2):
753 jg LABEL(nibble_ashr_7_use_sse4_2)
755 movdqa (%rdi, %rdx), %xmm0
756 palignr $7, -16(%rdi, %rdx), %xmm0
757 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
758 jbe LABEL(use_sse4_2_exit)
759 #ifdef USE_AS_STRNCMP
761 jbe LABEL(strcmp_exitz_sse4_2)
766 jg LABEL(nibble_ashr_7_use_sse4_2)
768 movdqa (%rdi, %rdx), %xmm0
769 palignr $7, -16(%rdi, %rdx), %xmm0
770 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
771 jbe LABEL(use_sse4_2_exit)
772 #ifdef USE_AS_STRNCMP
774 jbe LABEL(strcmp_exitz_sse4_2)
777 jmp LABEL(loop_ashr_7_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
780 LABEL(nibble_ashr_7_use_sse4_2):
782 movdqa -16(%rdi, %rdx), %xmm0
784 pcmpistri $0x3a,%xmm0, %xmm0
785 #ifdef USE_AS_STRNCMP
787 jae LABEL(nibble_ashr_use_sse4_2_exit)
790 ja LABEL(loop_ashr_7_use_sse4_2)
792 jmp LABEL(nibble_ashr_use_sse4_2_exit)
795 * The following cases will be handled by ashr_8
796 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
797 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
/* ashr_8: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with an 8-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
800 LABEL(ashr_8_sse4_2):
812 jnz LABEL(less32bytes_sse4_2)
815 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
818 mov $16, %rcx /* index for loads */
819 mov $8, %r9d /* byte position left over from less32bytes case */
821 * Setup %r10 value allows us to detect crossing a page boundary.
822 * When %r10 goes positive we have crossed a page boundary and
823 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
826 and $0xfff, %r10 /* offset into 4K page */
827 sub $0x1000, %r10 /* subtract 4K pagesize */
828 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 8 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
831 LABEL(loop_ashr_8_use_sse4_2):
833 jg LABEL(nibble_ashr_8_use_sse4_2)
835 movdqa (%rdi, %rdx), %xmm0
836 palignr $8, -16(%rdi, %rdx), %xmm0
837 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
838 jbe LABEL(use_sse4_2_exit)
839 #ifdef USE_AS_STRNCMP
841 jbe LABEL(strcmp_exitz_sse4_2)
846 jg LABEL(nibble_ashr_8_use_sse4_2)
848 movdqa (%rdi, %rdx), %xmm0
849 palignr $8, -16(%rdi, %rdx), %xmm0
850 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
851 jbe LABEL(use_sse4_2_exit)
852 #ifdef USE_AS_STRNCMP
854 jbe LABEL(strcmp_exitz_sse4_2)
857 jmp LABEL(loop_ashr_8_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
860 LABEL(nibble_ashr_8_use_sse4_2):
862 movdqa -16(%rdi, %rdx), %xmm0
864 pcmpistri $0x3a,%xmm0, %xmm0
865 #ifdef USE_AS_STRNCMP
867 jae LABEL(nibble_ashr_use_sse4_2_exit)
870 ja LABEL(loop_ashr_8_use_sse4_2)
872 jmp LABEL(nibble_ashr_use_sse4_2_exit)
875 * The following cases will be handled by ashr_9
876 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
877 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
/* ashr_9: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 9-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
880 LABEL(ashr_9_sse4_2):
892 jnz LABEL(less32bytes_sse4_2)
895 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
898 mov $16, %rcx /* index for loads */
899 mov $9, %r9d /* byte position left over from less32bytes case */
901 * Setup %r10 value allows us to detect crossing a page boundary.
902 * When %r10 goes positive we have crossed a page boundary and
903 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
906 and $0xfff, %r10 /* offset into 4K page */
907 sub $0x1000, %r10 /* subtract 4K pagesize */
908 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 9 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
911 LABEL(loop_ashr_9_use_sse4_2):
913 jg LABEL(nibble_ashr_9_use_sse4_2)
915 movdqa (%rdi, %rdx), %xmm0
917 palignr $9, -16(%rdi, %rdx), %xmm0
918 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
919 jbe LABEL(use_sse4_2_exit)
920 #ifdef USE_AS_STRNCMP
922 jbe LABEL(strcmp_exitz_sse4_2)
927 jg LABEL(nibble_ashr_9_use_sse4_2)
929 movdqa (%rdi, %rdx), %xmm0
930 palignr $9, -16(%rdi, %rdx), %xmm0
931 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
932 jbe LABEL(use_sse4_2_exit)
933 #ifdef USE_AS_STRNCMP
935 jbe LABEL(strcmp_exitz_sse4_2)
938 jmp LABEL(loop_ashr_9_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
941 LABEL(nibble_ashr_9_use_sse4_2):
943 movdqa -16(%rdi, %rdx), %xmm0
945 pcmpistri $0x3a,%xmm0, %xmm0
946 #ifdef USE_AS_STRNCMP
948 jae LABEL(nibble_ashr_use_sse4_2_exit)
951 ja LABEL(loop_ashr_9_use_sse4_2)
953 jmp LABEL(nibble_ashr_use_sse4_2_exit)
956 * The following cases will be handled by ashr_10
957 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
958 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
/* ashr_10: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 10-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (the initial loads and compares, the load of %r10, and the
   instructions that set the flags tested by the jg/jae/ja branches and by
   the strncmp-only jbe); the comments describe only what is shown.  */
961 LABEL(ashr_10_sse4_2):
973 jnz LABEL(less32bytes_sse4_2)
976 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
979 mov $16, %rcx /* index for loads */
980 mov $10, %r9d /* byte position left over from less32bytes case */
982 * Setup %r10 value allows us to detect crossing a page boundary.
983 * When %r10 goes positive we have crossed a page boundary and
984 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
987 and $0xfff, %r10 /* offset into 4K page */
988 sub $0x1000, %r10 /* subtract 4K pagesize */
989 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 10 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
992 LABEL(loop_ashr_10_use_sse4_2):
994 jg LABEL(nibble_ashr_10_use_sse4_2)
996 movdqa (%rdi, %rdx), %xmm0
997 palignr $10, -16(%rdi, %rdx), %xmm0
998 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
999 jbe LABEL(use_sse4_2_exit)
1000 #ifdef USE_AS_STRNCMP
1002 jbe LABEL(strcmp_exitz_sse4_2)
1007 jg LABEL(nibble_ashr_10_use_sse4_2)
1009 movdqa (%rdi, %rdx), %xmm0
1010 palignr $10, -16(%rdi, %rdx), %xmm0
1011 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1012 jbe LABEL(use_sse4_2_exit)
1013 #ifdef USE_AS_STRNCMP
1015 jbe LABEL(strcmp_exitz_sse4_2)
1018 jmp LABEL(loop_ashr_10_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1021 LABEL(nibble_ashr_10_use_sse4_2):
1023 movdqa -16(%rdi, %rdx), %xmm0
1025 pcmpistri $0x3a,%xmm0, %xmm0
1026 #ifdef USE_AS_STRNCMP
1028 jae LABEL(nibble_ashr_use_sse4_2_exit)
1031 ja LABEL(loop_ashr_10_use_sse4_2)
1033 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1036 * The following cases will be handled by ashr_11
1037 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1038 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
/* ashr_11: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with an 11-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (parts of the initial compare sequence, the load of %r10, and the
   instructions that set the flags tested by the jnz/jg/jae/ja branches and
   by the strncmp-only jbe); the comments describe only what is shown.  */
1041 LABEL(ashr_11_sse4_2):
/* Load the first aligned 16-byte block of each string, mark null bytes of
   the %rsi block in %xmm0, compare the blocks into %xmm2 and move the
   byte sign bits of %xmm2 into %r9d as a bit mask.  */
1043 movdqa (%rdi), %xmm2
1044 movdqa (%rsi), %xmm1
1045 pcmpeqb %xmm1, %xmm0
1047 pcmpeqb %xmm1, %xmm2
1049 pmovmskb %xmm2, %r9d
1053 jnz LABEL(less32bytes_sse4_2)
/* Keep a copy of the first %rdi block in %xmm3; its later use is not
   visible in this view.  */
1054 movdqa (%rdi), %xmm3
1056 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
1059 mov $16, %rcx /* index for loads */
1060 mov $11, %r9d /* byte position left over from less32bytes case */
1062 * Setup %r10 value allows us to detect crossing a page boundary.
1063 * When %r10 goes positive we have crossed a page boundary and
1064 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
1067 and $0xfff, %r10 /* offset into 4K page */
1068 sub $0x1000, %r10 /* subtract 4K pagesize */
1069 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 11 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
1072 LABEL(loop_ashr_11_use_sse4_2):
1074 jg LABEL(nibble_ashr_11_use_sse4_2)
1076 movdqa (%rdi, %rdx), %xmm0
1077 palignr $11, -16(%rdi, %rdx), %xmm0
1078 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1079 jbe LABEL(use_sse4_2_exit)
1080 #ifdef USE_AS_STRNCMP
1082 jbe LABEL(strcmp_exitz_sse4_2)
1087 jg LABEL(nibble_ashr_11_use_sse4_2)
1089 movdqa (%rdi, %rdx), %xmm0
1090 palignr $11, -16(%rdi, %rdx), %xmm0
1091 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1092 jbe LABEL(use_sse4_2_exit)
1093 #ifdef USE_AS_STRNCMP
1095 jbe LABEL(strcmp_exitz_sse4_2)
1098 jmp LABEL(loop_ashr_11_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1101 LABEL(nibble_ashr_11_use_sse4_2):
1103 movdqa -16(%rdi, %rdx), %xmm0
1105 pcmpistri $0x3a,%xmm0, %xmm0
1106 #ifdef USE_AS_STRNCMP
1108 jae LABEL(nibble_ashr_use_sse4_2_exit)
1111 ja LABEL(loop_ashr_11_use_sse4_2)
1113 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1116 * The following cases will be handled by ashr_12
1117 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1118 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
/* ashr_12: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 12-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (parts of the initial compare sequence, the load of %r10, and the
   instructions that set the flags tested by the jnz/jg/jae/ja branches and
   by the strncmp-only jbe); the comments describe only what is shown.  */
1121 LABEL(ashr_12_sse4_2):
/* Load the first aligned 16-byte block of each string, mark null bytes of
   the %rsi block in %xmm0, compare the blocks into %xmm2 and move the
   byte sign bits of %xmm2 into %r9d as a bit mask.  */
1123 movdqa (%rdi), %xmm2
1124 movdqa (%rsi), %xmm1
1125 pcmpeqb %xmm1, %xmm0
1127 pcmpeqb %xmm1, %xmm2
1129 pmovmskb %xmm2, %r9d
1133 jnz LABEL(less32bytes_sse4_2)
/* Keep a copy of the first %rdi block in %xmm3; its later use is not
   visible in this view.  */
1134 movdqa (%rdi), %xmm3
1136 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
1139 mov $16, %rcx /* index for loads */
1140 mov $12, %r9d /* byte position left over from less32bytes case */
1142 * Setup %r10 value allows us to detect crossing a page boundary.
1143 * When %r10 goes positive we have crossed a page boundary and
1144 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
1147 and $0xfff, %r10 /* offset into 4K page */
1148 sub $0x1000, %r10 /* subtract 4K pagesize */
1149 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 12 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
1152 LABEL(loop_ashr_12_use_sse4_2):
1154 jg LABEL(nibble_ashr_12_use_sse4_2)
1156 movdqa (%rdi, %rdx), %xmm0
1157 palignr $12, -16(%rdi, %rdx), %xmm0
1158 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1159 jbe LABEL(use_sse4_2_exit)
1160 #ifdef USE_AS_STRNCMP
1162 jbe LABEL(strcmp_exitz_sse4_2)
1167 jg LABEL(nibble_ashr_12_use_sse4_2)
1169 movdqa (%rdi, %rdx), %xmm0
1170 palignr $12, -16(%rdi, %rdx), %xmm0
1171 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1172 jbe LABEL(use_sse4_2_exit)
1173 #ifdef USE_AS_STRNCMP
1175 jbe LABEL(strcmp_exitz_sse4_2)
1178 jmp LABEL(loop_ashr_12_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1181 LABEL(nibble_ashr_12_use_sse4_2):
1183 movdqa -16(%rdi, %rdx), %xmm0
1185 pcmpistri $0x3a,%xmm0, %xmm0
1186 #ifdef USE_AS_STRNCMP
1188 jae LABEL(nibble_ashr_use_sse4_2_exit)
1191 ja LABEL(loop_ashr_12_use_sse4_2)
1193 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1196 * The following cases will be handled by ashr_13
1197 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1198 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
/* ashr_13: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 13-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (parts of the initial compare sequence, the load of %r10, and the
   instructions that set the flags tested by the jnz/jg/jae/ja branches and
   by the strncmp-only jbe); the comments describe only what is shown.  */
1201 LABEL(ashr_13_sse4_2):
/* Load the first aligned 16-byte block of each string, mark null bytes of
   the %rsi block in %xmm0, compare the blocks into %xmm2 and move the
   byte sign bits of %xmm2 into %r9d as a bit mask.  */
1203 movdqa (%rdi), %xmm2
1204 movdqa (%rsi), %xmm1
1205 pcmpeqb %xmm1, %xmm0
1207 pcmpeqb %xmm1, %xmm2
1209 pmovmskb %xmm2, %r9d
1213 jnz LABEL(less32bytes_sse4_2)
/* Keep a copy of the first %rdi block in %xmm3; its later use is not
   visible in this view.  */
1214 movdqa (%rdi), %xmm3
1216 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
1219 mov $16, %rcx /* index for loads */
1220 mov $13, %r9d /* byte position left over from less32bytes case */
1222 * Setup %r10 value allows us to detect crossing a page boundary.
1223 * When %r10 goes positive we have crossed a page boundary and
1224 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
1227 and $0xfff, %r10 /* offset into 4K page */
1228 sub $0x1000, %r10 /* subtract 4K pagesize */
1230 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 13 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
1233 LABEL(loop_ashr_13_use_sse4_2):
1235 jg LABEL(nibble_ashr_13_use_sse4_2)
1237 movdqa (%rdi, %rdx), %xmm0
1238 palignr $13, -16(%rdi, %rdx), %xmm0
1239 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1240 jbe LABEL(use_sse4_2_exit)
1241 #ifdef USE_AS_STRNCMP
1243 jbe LABEL(strcmp_exitz_sse4_2)
1248 jg LABEL(nibble_ashr_13_use_sse4_2)
1250 movdqa (%rdi, %rdx), %xmm0
1251 palignr $13, -16(%rdi, %rdx), %xmm0
1252 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1253 jbe LABEL(use_sse4_2_exit)
1254 #ifdef USE_AS_STRNCMP
1256 jbe LABEL(strcmp_exitz_sse4_2)
1259 jmp LABEL(loop_ashr_13_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1262 LABEL(nibble_ashr_13_use_sse4_2):
1264 movdqa -16(%rdi, %rdx), %xmm0
1266 pcmpistri $0x3a,%xmm0, %xmm0
1267 #ifdef USE_AS_STRNCMP
1269 jae LABEL(nibble_ashr_use_sse4_2_exit)
1272 ja LABEL(loop_ashr_13_use_sse4_2)
1274 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1277 * The following cases will be handled by ashr_14
1278 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1279 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
/* ashr_14: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 14-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (parts of the initial compare sequence, the load of %r10, and the
   instructions that set the flags tested by the jnz/jg/jae/ja branches and
   by the strncmp-only jbe); the comments describe only what is shown.  */
1282 LABEL(ashr_14_sse4_2):
/* Load the first aligned 16-byte block of each string, mark null bytes of
   the %rsi block in %xmm0, compare the blocks into %xmm2 and move the
   byte sign bits of %xmm2 into %r9d as a bit mask.  */
1284 movdqa (%rdi), %xmm2
1285 movdqa (%rsi), %xmm1
1286 pcmpeqb %xmm1, %xmm0
1288 pcmpeqb %xmm1, %xmm2
1290 pmovmskb %xmm2, %r9d
1294 jnz LABEL(less32bytes_sse4_2)
/* Keep a copy of the first %rdi block in %xmm3; its later use is not
   visible in this view.  */
1295 movdqa (%rdi), %xmm3
1297 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
1300 mov $16, %rcx /* index for loads */
1301 mov $14, %r9d /* byte position left over from less32bytes case */
1303 * Setup %r10 value allows us to detect crossing a page boundary.
1304 * When %r10 goes positive we have crossed a page boundary and
1305 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
1308 and $0xfff, %r10 /* offset into 4K page */
1309 sub $0x1000, %r10 /* subtract 4K pagesize */
1311 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 14 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
1314 LABEL(loop_ashr_14_use_sse4_2):
1316 jg LABEL(nibble_ashr_14_use_sse4_2)
1318 movdqa (%rdi, %rdx), %xmm0
1319 palignr $14, -16(%rdi, %rdx), %xmm0
1320 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1321 jbe LABEL(use_sse4_2_exit)
1322 #ifdef USE_AS_STRNCMP
1324 jbe LABEL(strcmp_exitz_sse4_2)
1329 jg LABEL(nibble_ashr_14_use_sse4_2)
1331 movdqa (%rdi, %rdx), %xmm0
1332 palignr $14, -16(%rdi, %rdx), %xmm0
1333 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1334 jbe LABEL(use_sse4_2_exit)
1335 #ifdef USE_AS_STRNCMP
1337 jbe LABEL(strcmp_exitz_sse4_2)
1340 jmp LABEL(loop_ashr_14_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or goes to the shared exit.
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1343 LABEL(nibble_ashr_14_use_sse4_2):
1345 movdqa -16(%rdi, %rdx), %xmm0
1347 pcmpistri $0x3a,%xmm0, %xmm0
1348 #ifdef USE_AS_STRNCMP
1350 jae LABEL(nibble_ashr_use_sse4_2_exit)
1353 ja LABEL(loop_ashr_14_use_sse4_2)
1355 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1358 * The following cases will be handled by ashr_15
1359 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1360 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
/* ashr_15: one of the cases reached through the unaligned jump table;
   the %rdi string is consumed with a 15-byte shift.
   NOTE(review): some instructions of this block are not visible in this
   view (parts of the initial compare sequence, the load of %r10, and the
   instructions that set the flags tested by the jnz/jg/jae/ja branches and
   by the strncmp-only jbe); the comments describe only what is shown.  */
1363 LABEL(ashr_15_sse4_2):
/* Load the first aligned 16-byte block of each string, mark null bytes of
   the %rsi block in %xmm0, compare the blocks into %xmm2 and move the
   byte sign bits of %xmm2 into %r9d as a bit mask.  */
1365 movdqa (%rdi), %xmm2
1366 movdqa (%rsi), %xmm1
1367 pcmpeqb %xmm1, %xmm0
1369 pcmpeqb %xmm1, %xmm2
1371 pmovmskb %xmm2, %r9d
1375 jnz LABEL(less32bytes_sse4_2)
/* Keep a copy of the first %rdi block in %xmm3; its later use is not
   visible in this view.  */
1377 movdqa (%rdi), %xmm3
1379 UPDATE_STRNCMP_COUNTER
/* %rcx/%rdx hold the running byte offset (starting at 16); %r9 holds the
   byte shift that use_sse4_2_exit later adds back to %rdi.  */
1382 mov $16, %rcx /* index for loads */
1383 mov $15, %r9d /* byte position left over from less32bytes case */
1385 * Setup %r10 value allows us to detect crossing a page boundary.
1386 * When %r10 goes positive we have crossed a page boundary and
1387 * need to do a nibble.
/* %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative value whose magnitude
   is the distance to the end of the 4K page.  */
1390 and $0xfff, %r10 /* offset into 4K page */
1392 sub $0x1000, %r10 /* subtract 4K pagesize */
1394 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop, unrolled twice.  palignr joins the current and the previous
   16-byte block of %rdi and extracts the 16 bytes that start 15 bytes into
   the previous block; pcmpistri (mode 0x1a) compares them with 16 bytes of
   %rsi, and jbe leaves the loop on CF or ZF.  */
1397 LABEL(loop_ashr_15_use_sse4_2):
1399 jg LABEL(nibble_ashr_15_use_sse4_2)
1401 movdqa (%rdi, %rdx), %xmm0
1402 palignr $15, -16(%rdi, %rdx), %xmm0
1403 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1404 jbe LABEL(use_sse4_2_exit)
1405 #ifdef USE_AS_STRNCMP
1407 jbe LABEL(strcmp_exitz_sse4_2)
1412 jg LABEL(nibble_ashr_15_use_sse4_2)
1414 movdqa (%rdi, %rdx), %xmm0
1415 palignr $15, -16(%rdi, %rdx), %xmm0
1416 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1417 jbe LABEL(use_sse4_2_exit)
1418 #ifdef USE_AS_STRNCMP
1420 jbe LABEL(strcmp_exitz_sse4_2)
1423 jmp LABEL(loop_ashr_15_use_sse4_2)
/* Page-boundary ("nibble") path, entered from the jg branches above.
   It reloads the previous block of %rdi and runs pcmpistri on it against
   itself, then either resumes the loop or falls through to the shared
   exit label that follows this block (there is no explicit jmp here).
   NOTE(review): this appears to check for a terminator in the bytes
   already loaded before the next page is touched - confirm.  */
1426 LABEL(nibble_ashr_15_use_sse4_2):
1428 movdqa -16(%rdi, %rdx), %xmm0
1430 pcmpistri $0x3a,%xmm0, %xmm0
1431 #ifdef USE_AS_STRNCMP
1433 jae LABEL(nibble_ashr_use_sse4_2_exit)
1436 ja LABEL(loop_ashr_15_use_sse4_2)
/* Shared exits for the shifted (ashr_1 .. ashr_15) loops.
   The nibble exit first repeats the mode-0x1a pcmpistri on the current
   position and then continues into use_sse4_2_exit.
   NOTE(review): the strncmp count check, the subtraction that forms the
   result and the return are not visible in this view.  */
1438 LABEL(nibble_ashr_use_sse4_2_exit):
1439 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* CF clear means no differing byte was reported, so take the zero exit.  */
1441 LABEL(use_sse4_2_exit):
1442 jnc LABEL(strcmp_exitz_sse4_2)
1443 #ifdef USE_AS_STRNCMP
1445 jbe LABEL(strcmp_exitz_sse4_2)
/* Rebase %rdi by the byte shift kept in %r9 (%rdi = %rdi - 16 + %r9) and
   load one byte from each string at offset %rdx.  */
1448 lea -16(%rdi, %r9), %rdi
1449 movzbl (%rdi, %rdx), %eax
1450 movzbl (%rsi, %rdx), %edx
1452 jz LABEL(use_sse4_2_ret_sse4_2)
1454 LABEL(use_sse4_2_ret_sse4_2):
/* Tail handling for a mismatch or terminator found in the first blocks.
   NOTE(review): several instructions of this tail are not visible in this
   view (the test feeding the jz, the ret_sse4_2 label, the strncmp count
   compare, the final subtraction and the bodies of the two exit labels at
   the end); the comments describe only what is shown.  */
1458 LABEL(less32bytes_sse4_2):
1459 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1460 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1462 jz LABEL(ret_sse4_2)
1463 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
/* %rdx holds a bit mask on entry; bsf replaces it with the index of its
   lowest set bit, which is then used as the byte offset into both
   strings.  */
1467 LABEL(less16bytes_sse4_2):
1468 bsf %rdx, %rdx /* find and store bit index in %rdx */
1470 #ifdef USE_AS_STRNCMP
1472 jbe LABEL(strcmp_exitz_sse4_2)
1474 movzbl (%rsi, %rdx), %ecx
1475 movzbl (%rdi, %rdx), %eax
/* Exit labels.  NOTE(review): presumably strcmp_exitz returns zero (the
   "equal" result), as its name and its callers suggest - confirm against
   the full source.  */
1480 LABEL(strcmp_exitz_sse4_2):
1485 LABEL(Byte0_sse4_2):
1492 .size STRCMP_SSE42, .-STRCMP_SSE42
1494 /* Put all SSE 4.2 functions together. */
1495 .section .rodata.sse4.2,"a",@progbits
/* Jump table used by the crosscache dispatch.  Each entry is a 32-bit
   offset of a case label relative to the start of the table, so the table
   needs no absolute addresses.  Index 0 selects ashr_1, index 14 selects
   ashr_15 and index 15 selects ashr_0.  */
1497 LABEL(unaligned_table_sse4_2):
1498 .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
1499 .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
1500 .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
1501 .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
1502 .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
1503 .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
1504 .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
1505 .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
1506 .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
1507 .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
1508 .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
1509 .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
1510 .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
1511 .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
1512 .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
1513 .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
/* Overrides applied before the generic ../strcmp.S is included at the end
   of this block: ENTRY and END are redefined so that the included code is
   emitted under the STRCMP_SSE2 symbol, whatever name is passed to the
   macros.
   NOTE(review): some lines of these macro bodies, and the preceding
   undefs, are not visible in this view.  */
1517 # define ENTRY(name) \
1518 .type STRCMP_SSE2, @function; \
1520 STRCMP_SSE2: cfi_startproc; \
1523 # define END(name) \
1524 cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
/* Replace the hidden-alias macro so that the libc-internal alias
   __GI_STRCMP is bound directly to the SSE2 implementation.  */
1525 # undef libc_hidden_builtin_def
1526 /* It doesn't make sense to send libc-internal strcmp calls through a PLT.
1527 The speedup we get from using SSE4.2 instruction is likely eaten away
1528 by the indirect call in the PLT. */
1529 # define libc_hidden_builtin_def(name) \
1530 .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
/* Pull in the generic implementation, built with the overrides above.  */
1533 #include "../strcmp.S"