1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
2 Copyright (C) 2018-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
23 # if defined USE_AS_STRCASECMP_L
24 # include "locale-defines.h"
28 # define STRCMP __strcmp_avx2
31 # define PAGE_SIZE 4096
33 /* VEC_SIZE = Number of bytes in a ymm register. */
36 # define VMOVU vmovdqu
37 # define VMOVA vmovdqa
40 /* Compare packed dwords. */
41 # define VPCMPEQ vpcmpeqd
42 /* Compare packed dwords and store minimum. */
43 # define VPMINU vpminud
44 /* 1 dword char == 4 bytes. */
45 # define SIZE_OF_CHAR 4
47 /* Compare packed bytes. */
48 # define VPCMPEQ vpcmpeqb
49 /* Compare packed bytes and store minimum. */
50 # define VPMINU vpminub
51 /* 1 byte char == 1 byte. */
52 # define SIZE_OF_CHAR 1
55 # ifdef USE_AS_STRNCMP
57 # define LOOP_REG64 r9
59 # define OFFSET_REG8 r9b
60 # define OFFSET_REG r9d
61 # define OFFSET_REG64 r9
64 # define LOOP_REG64 rdx
66 # define OFFSET_REG8 dl
67 # define OFFSET_REG edx
68 # define OFFSET_REG64 rdx
72 # define VZEROUPPER vzeroupper
75 # if defined USE_AS_STRNCMP
78 # define VEC_OFFSET (-VEC_SIZE)
81 # ifdef USE_AS_STRCASECMP_L
82 # define BYTE_LOOP_REG OFFSET_REG
84 # define BYTE_LOOP_REG ecx
87 # ifdef USE_AS_STRCASECMP_L
88 # ifdef USE_AS_STRNCMP
89 # define STRCASECMP __strncasecmp_avx2
90 # define LOCALE_REG rcx
91 # define LOCALE_REG_LP RCX_LP
92 # define STRCASECMP_NONASCII __strncasecmp_l_nonascii
94 # define STRCASECMP __strcasecmp_avx2
95 # define LOCALE_REG rdx
96 # define LOCALE_REG_LP RDX_LP
97 # define STRCASECMP_NONASCII __strcasecmp_l_nonascii
101 # define xmmZERO xmm15
102 # define ymmZERO ymm15
104 # define LCASE_MIN_ymm %ymm10
105 # define LCASE_MAX_ymm %ymm11
106 # define CASE_ADD_ymm %ymm12
108 # define LCASE_MIN_xmm %xmm10
109 # define LCASE_MAX_xmm %xmm11
110 # define CASE_ADD_xmm %xmm12
112 /* r11 is never used elsewhere so this is safe to maintain. */
113 # define TOLOWER_BASE %r11
116 # define SECTION(p) p##.avx
119 # ifdef USE_AS_STRCASECMP_L
120 # define REG(x, y) x ## y
121 # define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
122 vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
123 vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
124 vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
125 vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
126 vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
127 vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
128 vpaddb REG(%ext, 8), reg1_in, reg1_out; \
129 vpaddb REG(%ext, 9), reg2_in, reg2_out
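/* Illustrative scalar sketch (an assumption, not part of the build) of the
   branchless case conversion the TOLOWER macro above performs per byte.
   It relies on the constants defined at L(lcase_min)/L(lcase_max)/
   L(case_add) below, which hold 0x3f, 0x99 and 0x20 replicated across
   every byte; the helper name is made up:

	static unsigned char
	tolower_ascii_byte (unsigned char c)
	{
	  // vpaddb: c + 0x3f maps 'A'..'Z' (0x41..0x5a) onto 0x80..0x99,
	  // the only inputs whose biased value is <= 0x99 as a signed byte.
	  signed char biased = (signed char) ((c + 0x3f) & 0xff);
	  // vpcmpgtb: 0xff for every byte that is NOT uppercase.
	  unsigned char not_upper = (biased > (signed char) 0x99) ? 0xff : 0;
	  // vpandn + vpaddb: add 0x20 (to-lowercase) only to uppercase bytes.
	  return c + (~not_upper & 0x20);
	}  */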
131 # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
132 # define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
133 # define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
135 # define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
136 TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
137 VPCMPEQ scratch_reg, s2_reg, reg_out
139 # define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
140 VMOVU s2_mem, reg_out; \
141 CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
143 # define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
144 # define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
146 # define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
147 # define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
150 # define TOLOWER_gpr(...)
151 # define TOLOWER_ymm(...)
152 # define TOLOWER_xmm(...)
154 # define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
155 VPCMPEQ s2_reg, s1_reg, reg_out
157 # define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
159 # define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
160 # define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
164 wcscmp/wcsncmp have to use SIGNED comparison for elements.
165 strcmp/strncmp have to use UNSIGNED comparison for elements.
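/* Minimal sketch (illustrative assumption, with c1/c2 and w1/w2 standing
   for the differing elements) of what the return-value computation below
   amounts to under these rules:

	// strcmp/strncmp: elements compared as unsigned bytes.
	int ret_bytes = (int) (unsigned char) c1 - (int) (unsigned char) c2;

	// wcscmp/wcsncmp: elements compared as signed 32-bit values; the
	// result is reduced to -1/0/1 because the raw difference of two
	// wchar_t values could overflow an int.
	int ret_wchar = (w1 == w2) ? 0 : (w1 < w2 ? -1 : 1);  */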
168 /* The main idea of the string comparison (byte or dword) using AVX2
169 consists of comparing (VPCMPEQ) two ymm vectors. The comparison can be
170 on either packed bytes or dwords depending on USE_AS_WCSCMP. In order
171 to check the null char, the algorithm keeps the matched bytes/dwords,
172 requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general,
173 the cost of comparing VEC_SIZE bytes (32 bytes) is two VPCMPEQ and
174 one VPMINU instruction, together with vmovdqu and testl instructions.
175 The main loop (away from a page boundary) compares 4 vectors at a time,
176 effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each iteration.
178 The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP)
179 is the same as strcmp, except that a maximum offset is tracked. If
180 the maximum offset is reached before a difference is found, zero is returned. */
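/* Rough C equivalent of a single VEC_SIZE (byte) comparison step, written
   with AVX2 intrinsics. Illustrative only and not part of the build; the
   helper name is made up:

	#include <immintrin.h>

	// Returns -1 if all 32 bytes match and contain no null terminator,
	// otherwise the index of the first mismatch or null byte.
	static int
	check_vec32 (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);		// VPCMPEQ
	  __m256i nul = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  // vpandn: 0xff where the bytes are equal AND not the null CHAR.
	  __m256i ok = _mm256_andnot_si256 (nul, eq);
	  unsigned int mask = _mm256_movemask_epi8 (ok);	// vpmovmskb
	  if (mask == 0xffffffffU)
	    return -1;			// all equal, no null: keep going
	  return __builtin_ctz (~mask);	// first mismatch or null
	}  */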
183 .section SECTION(.text), "ax", @progbits
185 .type STRCMP, @function
189 # define GLABEL(...) __VA_ARGS__
192 # ifdef USE_AS_STRCASECMP_L
193 ENTRY (GLABEL(STRCASECMP))
194 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
195 mov %fs:(%rax), %LOCALE_REG_LP
197 /* Either 1 or 5 bytes (depending on whether CET is enabled). */
199 END (GLABEL(STRCASECMP))
200 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
209 # if defined USE_AS_STRCASECMP_L
210 /* We have to fall back on the C implementation for locales with
211 encodings not matching ASCII for single bytes. */
212 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
213 mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
215 mov (%LOCALE_REG), %RAX_LP
217 testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
218 jne STRCASECMP_NONASCII
219 leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
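/* Sketch of the scalar TOLOWER_gpr lookup set up above (an assumption about
   the table layout): the tolower data is an array of int32 entries covering
   characters -128..255, so TOLOWER_BASE points at the entry for character 0
   and the per-character conversion is simply

	// TOLOWER_gpr (%rax, %eax):  movl (TOLOWER_BASE, %rax, 4), %eax
	int32_t lowered = tolower_table[(unsigned char) c];  // hypothetical name
   */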
222 # ifdef USE_AS_STRNCMP
223 /* Don't overwrite LOCALE_REG (rcx) until we have passed
224 L(one_or_less). Otherwise we might use the wrong locale in
225 the OVERFLOW_STRCMP (strcasecmp_l). */
227 /* Clear the upper 32 bits. */
231 /* Signed comparison intentional. We use this branch to also
232 test cases where length >= 2^63. These very large sizes can be
233 handled with strcmp as there is no way for that length to
234 actually bound the buffer. */
236 # ifdef USE_AS_WCSCMP
239 /* Multiplying length by sizeof(wchar_t) can result in overflow.
240 Check if that is possible. All cases where overflow is possible
241 are cases where the length is large enough that it can never be a
242 bound on valid memory, so just use wcscmp. */
246 leaq (, %rdx, 4), %rdx
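/* Sketch (an assumption; the actual guard instructions are elided in this
   excerpt) of the overflow handling this block performs before scaling the
   length, with len/s1/s2 as in the C prototype:

	if (len > (SIZE_MAX >> 2))	// 4 * len would not fit in a size_t;
	  return wcscmp (s1, s2);	// such a len cannot bound real memory
	len *= sizeof (wchar_t);	// leaq (, %rdx, 4), %rdx
   */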
249 vpxor %xmmZERO, %xmmZERO, %xmmZERO
250 # if defined USE_AS_STRCASECMP_L
251 .section .rodata.cst32, "aM", @progbits, 32
252 .align 32
253 L(lcase_min):
254 .quad 0x3f3f3f3f3f3f3f3f
255 .quad 0x3f3f3f3f3f3f3f3f
256 .quad 0x3f3f3f3f3f3f3f3f
257 .quad 0x3f3f3f3f3f3f3f3f
258 L(lcase_max):
259 .quad 0x9999999999999999
260 .quad 0x9999999999999999
261 .quad 0x9999999999999999
262 .quad 0x9999999999999999
263 L(case_add):
264 .quad 0x2020202020202020
265 .quad 0x2020202020202020
266 .quad 0x2020202020202020
267 .quad 0x2020202020202020
270 vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
271 vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
272 vmovdqa L(case_add)(%rip), CASE_ADD_ymm
277 /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
278 cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
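/* Illustrative form of this check (an assumption about the elided setup):
   the low page-offset bits of both pointers are OR'd together and shifted
   into the top of a 32-bit register, so one unsigned compare flags either
   string being within 4 * VEC_SIZE bytes of its page end. The test is
   conservative and can report false positives (see the "true positive"
   re-check further down):

	unsigned int off = ((unsigned int) (uintptr_t) s1
			    | (unsigned int) (uintptr_t) s2) << 20;
	if (off > ((PAGE_SIZE - VEC_SIZE * 4) << 20))
	  goto page_cross;	// cannot safely read 4x VEC from both
   */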
282 /* Safe to compare 4x vectors. */
284 /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
285 Otherwise convert ymm0 and the load from rsi to lowercase. ymm2 is
286 scratch and ymm1 is the return. */
287 CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
288 /* 1s at null CHAR. */
289 VPCMPEQ %ymm0, %ymmZERO, %ymm2
290 /* 1s where s1 and s2 equal AND not null CHAR. */
291 vpandn %ymm1, %ymm2, %ymm1
293 /* All 1s -> keep going, any 0s -> return. */
294 vpmovmskb %ymm1, %ecx
295 # ifdef USE_AS_STRNCMP
297 jbe L(vec_0_test_len)
300 /* All 1s represent all-equals. incl will overflow to zero in the
301 all-equals case. Otherwise the 1s will carry until the position of the first mismatch. */
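/* The carry trick in C (illustrative; mask is the vpmovmskb result):

	unsigned int m = mask + 1;	// incl %ecx
	if (m == 0)
	  ;				// all 32 positions equal and non-null
	else
	  idx = __builtin_ctz (m);	// index of first mismatch or null
   */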
309 # ifdef USE_AS_WCSCMP
310 movl (%rdi, %rcx), %edx
312 cmpl (%rsi, %rcx), %edx
318 movzbl (%rdi, %rcx), %eax
319 movzbl (%rsi, %rcx), %ecx
320 TOLOWER_gpr (%rax, %eax)
321 TOLOWER_gpr (%rcx, %ecx)
325 L(return_vzeroupper):
326 ZERO_UPPER_VEC_REGISTERS_RETURN
328 # ifdef USE_AS_STRNCMP
332 bzhil %edx, %ecx, %eax
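/* bzhi zeroes every bit at a position >= edx, roughly (illustrative C):

	eax = (edx >= 32) ? ecx : (ecx & ((1U << edx) - 1));

   so bits at or beyond the length limit are ignored. */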
334 /* Align if we will cross a fetch block. */
342 # ifdef USE_AS_STRCASECMP_L
343 /* Set locale argument for strcasecmp. */
344 movq %LOCALE_REG, %rdx
347 /* 'nbe' covers the case where length is negative (large unsigned). */
350 # ifdef USE_AS_WCSCMP
361 TOLOWER_gpr (%rax, %eax)
362 TOLOWER_gpr (%rcx, %ecx)
372 # ifdef USE_AS_STRNCMP
373 /* rdx must be > CHAR_PER_VEC so it is safe to subtract without fear of underflow. */
375 addq $-VEC_SIZE, %rdx
379 # ifdef USE_AS_WCSCMP
380 movl VEC_SIZE(%rdi, %rcx), %edx
382 cmpl VEC_SIZE(%rsi, %rcx), %edx
388 movzbl VEC_SIZE(%rdi, %rcx), %eax
389 movzbl VEC_SIZE(%rsi, %rcx), %ecx
390 TOLOWER_gpr (%rax, %eax)
391 TOLOWER_gpr (%rcx, %ecx)
398 # ifdef USE_AS_STRNCMP
404 # ifndef USE_AS_STRNCMP
412 # ifdef USE_AS_WCSCMP
413 movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
415 cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
421 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
422 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
423 TOLOWER_gpr (%rax, %eax)
424 TOLOWER_gpr (%rcx, %ecx)
430 # ifndef USE_AS_STRNCMP
434 # ifdef USE_AS_WCSCMP
435 movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
437 cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
443 movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
444 movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
445 TOLOWER_gpr (%rax, %eax)
446 TOLOWER_gpr (%rcx, %ecx)
455 /* Safe to compare 4x vectors. */
456 VMOVU VEC_SIZE(%rdi), %ymm0
457 CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
458 VPCMPEQ %ymm0, %ymmZERO, %ymm2
459 vpandn %ymm1, %ymm2, %ymm1
460 vpmovmskb %ymm1, %ecx
464 # ifdef USE_AS_STRNCMP
465 subq $(VEC_SIZE * 2), %rdx
469 VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
470 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
471 VPCMPEQ %ymm0, %ymmZERO, %ymm2
472 vpandn %ymm1, %ymm2, %ymm1
473 vpmovmskb %ymm1, %ecx
477 VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
478 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
479 VPCMPEQ %ymm0, %ymmZERO, %ymm2
480 vpandn %ymm1, %ymm2, %ymm1
481 vpmovmskb %ymm1, %ecx
485 # ifdef USE_AS_STRNCMP
486 cmpq $(VEC_SIZE * 2), %rdx
490 # ifdef USE_AS_WCSCMP
491 /* Any non-zero positive value that doesn't interfere with 0x1. */
499 /* The prepare labels are various entry points from the page cross logic. */
503 # ifdef USE_AS_STRNCMP
504 /* Store N + (VEC_SIZE * 4) and place the check at the beginning of the loop. */
506 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
508 L(prepare_loop_no_len):
510 /* Align s1 and adjust s2 accordingly. */
512 andq $-(VEC_SIZE * 4), %rdi
515 # ifdef USE_AS_STRNCMP
519 L(prepare_loop_aligned):
520 /* eax stores the distance from rsi to the next page cross. These cases
521 need to be handled specially as the 4x loop could potentially
522 read memory past the length of s1 or s2 and across a page boundary. */
524 movl $-(VEC_SIZE * 4), %eax
526 andl $(PAGE_SIZE - 1), %eax
528 /* Loop 4x comparisons at a time. */
532 /* End condition for strncmp. */
533 # ifdef USE_AS_STRNCMP
534 subq $(VEC_SIZE * 4), %rdx
538 subq $-(VEC_SIZE * 4), %rdi
539 subq $-(VEC_SIZE * 4), %rsi
541 /* Check if rsi loads will cross a page boundary. */
542 addl $-(VEC_SIZE * 4), %eax
543 jnb L(page_cross_during_loop)
545 /* Loop entry after handling page cross during loop. */
546 L(loop_skip_page_cross_check):
547 VMOVA (VEC_SIZE * 0)(%rdi), %ymm0
548 VMOVA (VEC_SIZE * 1)(%rdi), %ymm2
549 VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
550 VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
552 /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
553 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
554 CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
555 CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
556 CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
558 /* If any mismatches or null CHAR then 0 CHAR, otherwise non-zero. */
560 vpand %ymm0, %ymm1, %ymm1
563 vpand %ymm2, %ymm3, %ymm3
564 vpand %ymm4, %ymm5, %ymm5
565 vpand %ymm6, %ymm7, %ymm7
567 VPMINU %ymm1, %ymm3, %ymm3
568 VPMINU %ymm5, %ymm7, %ymm7
570 /* Reduce all 0 CHARs for the 4x VEC into ymm7. */
571 VPMINU %ymm3, %ymm7, %ymm7
573 /* If any 0 CHAR then done. */
574 VPCMPEQ %ymm7, %ymmZERO, %ymm7
575 vpmovmskb %ymm7, %LOOP_REG
576 testl %LOOP_REG, %LOOP_REG
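/* Rough intrinsics equivalent of the 4x reduction above (illustrative
   only; vN holds the s1 vectors, wN the corresponding s2 data):

	// vpand: keep the source byte where s1 == s2, else 0, so a zero
	// byte means "mismatch or null terminator".
	__m256i m0 = _mm256_and_si256 (v0, _mm256_cmpeq_epi8 (v0, w0));
	__m256i m1 = _mm256_and_si256 (v1, _mm256_cmpeq_epi8 (v1, w1));
	__m256i m2 = _mm256_and_si256 (v2, _mm256_cmpeq_epi8 (v2, w2));
	__m256i m3 = _mm256_and_si256 (v3, _mm256_cmpeq_epi8 (v3, w3));
	// VPMINU: the minimum has a zero byte iff any input vector did.
	__m256i all = _mm256_min_epu8 (_mm256_min_epu8 (m0, m1),
				       _mm256_min_epu8 (m2, m3));
	unsigned int mask = _mm256_movemask_epi8 (
	    _mm256_cmpeq_epi8 (all, _mm256_setzero_si256 ()));
	if (mask != 0)
	  break;	// locate which VEC holds the mismatch or null
   */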
579 /* Find which VEC has the mismatch or end of string. */
580 VPCMPEQ %ymm1, %ymmZERO, %ymm1
581 vpmovmskb %ymm1, %ecx
583 jnz L(return_vec_0_end)
586 VPCMPEQ %ymm3, %ymmZERO, %ymm3
587 vpmovmskb %ymm3, %ecx
589 jnz L(return_vec_1_end)
591 L(return_vec_2_3_end):
592 # ifdef USE_AS_STRNCMP
593 subq $(VEC_SIZE * 2), %rdx
597 VPCMPEQ %ymm5, %ymmZERO, %ymm5
598 vpmovmskb %ymm5, %ecx
600 jnz L(return_vec_2_end)
602 /* LOOP_REG contains matches for null/mismatch from the loop. If
603 VEC 0, 1, and 2 all have no null and no mismatches then the mismatch
604 must entirely be from VEC 3, which is fully represented by LOOP_REG. */
606 tzcntl %LOOP_REG, %LOOP_REG
608 # ifdef USE_AS_STRNCMP
609 subl $-(VEC_SIZE), %LOOP_REG
610 cmpq %LOOP_REG64, %rdx
614 # ifdef USE_AS_WCSCMP
615 movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
617 cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
623 movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
624 movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
625 TOLOWER_gpr (%rax, %eax)
626 TOLOWER_gpr (%rcx, %ecx)
634 # ifdef USE_AS_STRNCMP
642 /* The L(return_vec_N_end) labels differ from L(return_vec_N) in that
643 they use the value of `r8` to negate the return value. This is
644 because the page cross logic can swap `rdi` and `rsi`. */
646 # ifdef USE_AS_STRNCMP
651 # ifndef USE_AS_STRNCMP
659 # ifdef USE_AS_WCSCMP
660 movl (%rdi, %rcx), %edx
662 cmpl (%rsi, %rcx), %edx
668 movzbl (%rdi, %rcx), %eax
669 movzbl (%rsi, %rcx), %ecx
670 TOLOWER_gpr (%rax, %eax)
671 TOLOWER_gpr (%rcx, %ecx)
679 # ifndef USE_AS_STRNCMP
683 # ifdef USE_AS_WCSCMP
684 movl VEC_SIZE(%rdi, %rcx), %edx
686 cmpl VEC_SIZE(%rsi, %rcx), %edx
692 movzbl VEC_SIZE(%rdi, %rcx), %eax
693 movzbl VEC_SIZE(%rsi, %rcx), %ecx
694 TOLOWER_gpr (%rax, %eax)
695 TOLOWER_gpr (%rcx, %ecx)
707 # ifdef USE_AS_STRNCMP
709 jbe L(ret_zero_page_cross)
711 # ifdef USE_AS_WCSCMP
712 movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
714 cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
720 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
721 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
722 TOLOWER_gpr (%rax, %eax)
723 TOLOWER_gpr (%rcx, %ecx)
732 /* Page cross in rsi in next 4x VEC. */
734 /* TODO: Improve logic here. */
736 L(page_cross_during_loop):
737 /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
739 /* Optimistically rsi and rdi are both aligned, in which case we
740 don't need any logic here. */
741 cmpl $-(VEC_SIZE * 4), %eax
742 /* Don't adjust eax before jumping back to the loop; this way we will
743 never hit the page cross case again. */
744 je L(loop_skip_page_cross_check)
746 /* Check if we can safely load a VEC. */
747 cmpl $-(VEC_SIZE * 3), %eax
748 jle L(less_1x_vec_till_page_cross)
751 CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
752 VPCMPEQ %ymm0, %ymmZERO, %ymm2
753 vpandn %ymm1, %ymm2, %ymm1
754 vpmovmskb %ymm1, %ecx
756 jnz L(return_vec_0_end)
758 /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
759 cmpl $-(VEC_SIZE * 2), %eax
760 jg L(more_2x_vec_till_page_cross)
763 L(less_1x_vec_till_page_cross):
764 subl $-(VEC_SIZE * 4), %eax
765 /* Guaranteed safe to read from rdi - VEC_SIZE here. The only
766 concerning case is the first iteration, if incoming s1 was near the start
767 of a page and s2 near the end. If s1 was near the start of the page
768 we already aligned up to the nearest VEC_SIZE * 4, so it is guaranteed safe
769 to read back -VEC_SIZE. If rdi is truly at the start of a page
770 here, it means the previous page (rdi - VEC_SIZE) has already
771 been loaded earlier so it must be valid. */
772 VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
773 CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
774 VPCMPEQ %ymm0, %ymmZERO, %ymm2
775 vpandn %ymm1, %ymm2, %ymm1
776 vpmovmskb %ymm1, %ecx
778 /* Mask of potentially valid bits. The lower bits can be out-of-
779 range comparisons (but are safe regarding page crosses). */
781 shlxl %esi, %r10d, %r10d
784 # ifdef USE_AS_STRNCMP
786 jbe L(return_page_cross_end_check)
788 movl %eax, %OFFSET_REG
789 addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
792 jz L(loop_skip_page_cross_check)
795 L(return_page_cross_end):
798 # ifdef USE_AS_STRNCMP
799 leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
800 L(return_page_cross_cmp_mem):
802 addl %OFFSET_REG, %ecx
804 # ifdef USE_AS_WCSCMP
805 movl VEC_OFFSET(%rdi, %rcx), %edx
807 cmpl VEC_OFFSET(%rsi, %rcx), %edx
813 movzbl VEC_OFFSET(%rdi, %rcx), %eax
814 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
815 TOLOWER_gpr (%rax, %eax)
816 TOLOWER_gpr (%rcx, %ecx)
824 # ifdef USE_AS_STRNCMP
826 L(return_page_cross_end_check):
829 leal -VEC_SIZE(%rax, %rcx), %ecx
831 ja L(return_page_cross_cmp_mem)
838 L(more_2x_vec_till_page_cross):
839 /* If more than 2x VEC till page cross we will complete a full loop iteration here. */
842 VMOVU VEC_SIZE(%rdi), %ymm0
843 CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
844 VPCMPEQ %ymm0, %ymmZERO, %ymm2
845 vpandn %ymm1, %ymm2, %ymm1
846 vpmovmskb %ymm1, %ecx
848 jnz L(return_vec_1_end)
850 # ifdef USE_AS_STRNCMP
851 cmpq $(VEC_SIZE * 2), %rdx
852 jbe L(ret_zero_in_loop_page_cross)
855 subl $-(VEC_SIZE * 4), %eax
857 /* Safe to include comparisons from lower bytes. */
858 VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
859 CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
860 VPCMPEQ %ymm0, %ymmZERO, %ymm2
861 vpandn %ymm1, %ymm2, %ymm1
862 vpmovmskb %ymm1, %ecx
864 jnz L(return_vec_page_cross_0)
866 VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
867 CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
868 VPCMPEQ %ymm0, %ymmZERO, %ymm2
869 vpandn %ymm1, %ymm2, %ymm1
870 vpmovmskb %ymm1, %ecx
872 jnz L(return_vec_page_cross_1)
874 # ifdef USE_AS_STRNCMP
875 /* Must check length here as length might preclude reading the next page. */
878 jbe L(ret_zero_in_loop_page_cross)
881 /* Finish the loop. */
882 VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
883 VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
885 CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
886 CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
887 vpand %ymm4, %ymm5, %ymm5
888 vpand %ymm6, %ymm7, %ymm7
889 VPMINU %ymm5, %ymm7, %ymm7
890 VPCMPEQ %ymm7, %ymmZERO, %ymm7
891 vpmovmskb %ymm7, %LOOP_REG
892 testl %LOOP_REG, %LOOP_REG
893 jnz L(return_vec_2_3_end)
895 /* Best for code size to use an unconditional jmp here. It would be
896 faster, if this case is hot, to duplicate the L(return_vec_2_3_end) code
897 as fall-through and jump back to the loop on a mismatch comparison. */
899 subq $-(VEC_SIZE * 4), %rdi
900 subq $-(VEC_SIZE * 4), %rsi
901 addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
902 # ifdef USE_AS_STRNCMP
903 subq $(VEC_SIZE * 4), %rdx
904 ja L(loop_skip_page_cross_check)
905 L(ret_zero_in_loop_page_cross):
909 jmp L(loop_skip_page_cross_check)
914 L(return_vec_page_cross_0):
915 addl $-VEC_SIZE, %eax
916 L(return_vec_page_cross_1):
918 # ifdef USE_AS_STRNCMP
919 leal -VEC_SIZE(%rax, %rcx), %ecx
921 jbe L(ret_zero_in_loop_page_cross)
926 # ifdef USE_AS_WCSCMP
927 movl VEC_OFFSET(%rdi, %rcx), %edx
929 cmpl VEC_OFFSET(%rsi, %rcx), %edx
935 movzbl VEC_OFFSET(%rdi, %rcx), %eax
936 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
937 TOLOWER_gpr (%rax, %eax)
938 TOLOWER_gpr (%rcx, %ecx)
949 # ifndef USE_AS_STRNCMP
950 /* If both are VEC aligned we don't need any special logic here.
951 Only valid for strcmp, where the stop condition is guaranteed to be
952 reachable by just reading memory. */
953 testl $((VEC_SIZE - 1) << 20), %eax
959 andl $(PAGE_SIZE - 1), %eax
960 andl $(PAGE_SIZE - 1), %ecx
962 xorl %OFFSET_REG, %OFFSET_REG
964 /* Check which is closer to page cross, s1 or s2. */
968 /* The previous page cross check has false positives. Check for
969 a true positive, as the page cross logic is very expensive. */
970 subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
973 /* Set r8 to not interfere with the normal return value (rdi and rsi did not swap). */
975 # ifdef USE_AS_WCSCMP
976 /* Any non-zero positive value that doesn't interfere with 0x1. */
983 /* Check if less than 1x VEC till page cross. */
984 subl $(VEC_SIZE * 3), %eax
985 jg L(less_1x_vec_till_page)
987 /* If more than 1x VEC till page cross, loop through safely
988 loadable memory until within 1x VEC of page cross. */
993 VMOVU (%rdi, %OFFSET_REG64), %ymm0
994 CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
995 VPCMPEQ %ymm0, %ymmZERO, %ymm2
996 vpandn %ymm1, %ymm2, %ymm1
997 vpmovmskb %ymm1, %ecx
1000 jnz L(check_ret_vec_page_cross)
1001 addl $VEC_SIZE, %OFFSET_REG
1002 # ifdef USE_AS_STRNCMP
1003 cmpq %OFFSET_REG64, %rdx
1004 jbe L(ret_zero_page_cross)
1006 addl $VEC_SIZE, %eax
1007 jl L(page_cross_loop)
1009 subl %eax, %OFFSET_REG
1010 /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
1011 to not cross the page so it is safe to load. Since we have already
1012 loaded at least 1 VEC from rsi it is also guaranteed to be safe. */
1015 VMOVU (%rdi, %OFFSET_REG64), %ymm0
1016 CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
1017 VPCMPEQ %ymm0, %ymmZERO, %ymm2
1018 vpandn %ymm1, %ymm2, %ymm1
1019 vpmovmskb %ymm1, %ecx
1021 # ifdef USE_AS_STRNCMP
1022 leal VEC_SIZE(%OFFSET_REG64), %eax
1024 jbe L(check_ret_vec_page_cross2)
1028 jz L(prepare_loop_no_len)
1031 L(ret_vec_page_cross):
1032 # ifndef USE_AS_STRNCMP
1033 L(check_ret_vec_page_cross):
1036 addl %OFFSET_REG, %ecx
1037 L(ret_vec_page_cross_cont):
1038 # ifdef USE_AS_WCSCMP
1039 movl (%rdi, %rcx), %edx
1041 cmpl (%rsi, %rcx), %edx
1047 movzbl (%rdi, %rcx), %eax
1048 movzbl (%rsi, %rcx), %ecx
1049 TOLOWER_gpr (%rax, %eax)
1050 TOLOWER_gpr (%rcx, %ecx)
1058 # ifdef USE_AS_STRNCMP
1060 L(check_ret_vec_page_cross2):
1062 L(check_ret_vec_page_cross):
1064 addl %OFFSET_REG, %ecx
1066 ja L(ret_vec_page_cross_cont)
1068 L(ret_zero_page_cross):
1075 /* Ensure this is a true page cross. */
1076 subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1077 jbe L(no_page_cross)
1085 /* Set r8 to negate the return value, as rdi and rsi were swapped. */
1086 # ifdef USE_AS_WCSCMP
1091 xorl %OFFSET_REG, %OFFSET_REG
1093 /* Check if more than 1x VEC till page cross. */
1094 subl $(VEC_SIZE * 3), %eax
1095 jle L(page_cross_loop)
1098 L(less_1x_vec_till_page):
1099 /* Find largest load size we can use. */
1101 ja L(less_16_till_page)
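/* Strategy sketch (illustrative; the helper names are hypothetical): with
   less than 1x VEC left before the page boundary, progressively smaller
   loads that still cannot cross the page are used:

	if (dist_to_page_end >= 16)
	  compare_16_bytes ();	// xmm compare directly below
	else if (dist_to_page_end >= 8)
	  compare_8_bytes ();	// vmovq path, L(less_16_till_page)
	else if (dist_to_page_end >= 4)
	  compare_4_bytes ();	// vmovd path, L(less_8_till_page)
	else
	  byte_loop ();		// L(less_4_till_page)
   */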
1104 CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
1105 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1106 vpandn %xmm1, %xmm2, %xmm1
1107 vpmovmskb %ymm1, %ecx
1109 jnz L(check_ret_vec_page_cross)
1110 movl $16, %OFFSET_REG
1111 # ifdef USE_AS_STRNCMP
1112 cmpq %OFFSET_REG64, %rdx
1113 jbe L(ret_zero_page_cross_slow_case0)
1114 subl %eax, %OFFSET_REG
1116 /* Explicit check for 16 byte alignment. */
1117 subl %eax, %OFFSET_REG
1121 VMOVU (%rdi, %OFFSET_REG64), %xmm0
1122 CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
1123 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1124 vpandn %xmm1, %xmm2, %xmm1
1125 vpmovmskb %ymm1, %ecx
1127 jnz L(check_ret_vec_page_cross)
1129 # ifdef USE_AS_STRNCMP
1130 addl $16, %OFFSET_REG
1131 subq %OFFSET_REG64, %rdx
1132 jbe L(ret_zero_page_cross_slow_case0)
1133 subq $-(VEC_SIZE * 4), %rdx
1135 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1136 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1138 leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1139 leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1141 jmp L(prepare_loop_aligned)
1143 # ifdef USE_AS_STRNCMP
1145 L(ret_zero_page_cross_slow_case0):
1152 L(less_16_till_page):
1153 /* Find largest load size we can use. */
1155 ja L(less_8_till_page)
1159 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1160 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1161 vpandn %xmm1, %xmm2, %xmm1
1162 vpmovmskb %ymm1, %ecx
1164 jnz L(check_ret_vec_page_cross)
1167 # ifdef USE_AS_STRNCMP
1169 jbe L(ret_zero_page_cross_slow_case0)
1171 movl $24, %OFFSET_REG
1172 /* Explicit check for 16 byte alignment. */
1173 subl %eax, %OFFSET_REG
1177 vmovq (%rdi, %OFFSET_REG64), %xmm0
1178 vmovq (%rsi, %OFFSET_REG64), %xmm1
1179 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1180 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1181 vpandn %xmm1, %xmm2, %xmm1
1182 vpmovmskb %ymm1, %ecx
1184 jnz L(check_ret_vec_page_cross)
1186 # ifdef USE_AS_STRNCMP
1187 addl $8, %OFFSET_REG
1188 subq %OFFSET_REG64, %rdx
1189 jbe L(ret_zero_page_cross_slow_case0)
1190 subq $-(VEC_SIZE * 4), %rdx
1192 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1193 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1195 leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1196 leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1198 jmp L(prepare_loop_aligned)
1202 L(less_8_till_page):
1203 # ifdef USE_AS_WCSCMP
1204 /* If using wchar then this is the only check before we reach
1205 the page boundary. */
1209 jnz L(ret_less_8_wcs)
1210 # ifdef USE_AS_STRNCMP
1212 /* We already checked for len <= 1 so we cannot hit that case here. */
1216 jnz L(prepare_loop_no_len)
1223 movl %OFFSET_REG, %eax
1229 /* Find largest load size we can use. */
1231 ja L(less_4_till_page)
1235 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1236 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1237 vpandn %xmm1, %xmm2, %xmm1
1238 vpmovmskb %ymm1, %ecx
1240 jnz L(check_ret_vec_page_cross)
1242 # ifdef USE_AS_STRNCMP
1244 jbe L(ret_zero_page_cross_slow_case1)
1246 movl $28, %OFFSET_REG
1247 /* Explicit check for 16 byte alignment. */
1248 subl %eax, %OFFSET_REG
1252 vmovd (%rdi, %OFFSET_REG64), %xmm0
1253 vmovd (%rsi, %OFFSET_REG64), %xmm1
1254 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1255 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1256 vpandn %xmm1, %xmm2, %xmm1
1257 vpmovmskb %ymm1, %ecx
1259 jnz L(check_ret_vec_page_cross)
1261 # ifdef USE_AS_STRNCMP
1262 addl $4, %OFFSET_REG
1263 subq %OFFSET_REG64, %rdx
1264 jbe L(ret_zero_page_cross_slow_case1)
1265 subq $-(VEC_SIZE * 4), %rdx
1267 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1268 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1270 leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1271 leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1273 jmp L(prepare_loop_aligned)
1275 # ifdef USE_AS_STRNCMP
1277 L(ret_zero_page_cross_slow_case1):
1283 L(less_4_till_page):
1285 /* Extremely slow byte comparison loop. */
1288 movzbl (%rsi, %rdi), %ecx
1289 TOLOWER_gpr (%rax, %eax)
1290 TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1291 subl %BYTE_LOOP_REG, %eax
1292 jnz L(ret_less_4_loop)
1294 jz L(ret_zero_4_loop)
1295 # ifdef USE_AS_STRNCMP
1297 jz L(ret_zero_4_loop)
1300 /* End condition is reaching the page boundary (rdi is aligned). */
1303 leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1304 addq $-(VEC_SIZE * 4), %rdi
1305 # ifdef USE_AS_STRNCMP
1306 subq $-(VEC_SIZE * 4), %rdx
1308 jmp L(prepare_loop_aligned)
1319 .size STRCMP, .-STRCMP