/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "x86-evex256-vecs.h"

# define STRCMP_ISA _evex
# include "strcmp-naming.h"

# if defined USE_AS_STRCASECMP_L
# include "locale-defines.h"

# define STRCMP __strcmp_evex

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
/* Compare packed dwords.  */
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
# define VPTESTM vptestmd
# define VPTESTNM vptestnmd
/* 1 dword char == 4 bytes.  */
# define SIZE_OF_CHAR 4

# define TESTEQ sub $((1 << CHAR_PER_VEC) - 1),

# define USE_WIDE_CHAR
/* Compare packed bytes.  */
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
# define VPTESTM vptestmb
# define VPTESTNM vptestnmb
/* 1 byte char == 1 byte.  */
# define SIZE_OF_CHAR 1

# include "reg-macros.h"

# define RODATA_SECTION rodata.cst64

# define RODATA_SECTION rodata.cst32

# if CHAR_PER_VEC == 64
# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 3)

# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 2)
# ifdef USE_AS_STRNCMP

# define LOOP_REG64 r9

# define OFFSET_REG8 r9b
# define OFFSET_REG r9d
# define OFFSET_REG64 r9

# define LOOP_REG VRDX
# define LOOP_REG64 rdx

# define OFFSET_REG8 dl
# define OFFSET_REG edx
# define OFFSET_REG64 rdx

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
# define VEC_OFFSET 0

# define VEC_OFFSET (-VEC_SIZE)
# ifdef USE_AS_STRCASECMP_L
# define BYTE_LOOP_REG OFFSET_REG

# define BYTE_LOOP_REG ecx

# ifdef USE_AS_STRCASECMP_L
# ifdef USE_AS_STRNCMP
# define LOCALE_REG rcx
# define LOCALE_REG_LP RCX_LP

# define LOCALE_REG rdx
# define LOCALE_REG_LP RDX_LP

# define LCASE_MIN_V VMM(12)
# define LCASE_MAX_V VMM(13)
# define CASE_ADD_V VMM(14)

# define LCASE_MIN_YMM VMM_256(12)
# define LCASE_MAX_YMM VMM_256(13)
# define CASE_ADD_YMM VMM_256(14)

# define LCASE_MIN_XMM VMM_128(12)
# define LCASE_MAX_XMM VMM_128(13)
# define CASE_ADD_XMM VMM_128(14)
/* NB: wcsncmp uses r11 but strcasecmp is never used in
   conjunction with wcscmp.  */
# define TOLOWER_BASE %r11
# ifdef USE_AS_STRCASECMP_L
# define _REG(x, y) x ## y
# define REG(x, y) _REG(x, y)
# define TOLOWER(reg1, reg2, ext, vec_macro) \
        vpsubb  %REG(LCASE_MIN_, ext), reg1, %vec_macro(10); \
        vpsubb  %REG(LCASE_MIN_, ext), reg2, %vec_macro(11); \
        vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
        vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
        vpaddb  reg1, %REG(CASE_ADD_, ext), reg1{%k5}; \
        vpaddb  reg2, %REG(CASE_ADD_, ext), reg2{%k6}

# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
# define TOLOWER_VMM(...) TOLOWER(__VA_ARGS__, V, VMM)
# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM, VMM_256)
# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM, VMM_128)

# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro) \
        TOLOWER (s1_reg, s2_reg, ext, vec_macro); \
        VPCMPEQ s1_reg, s2_reg, reg_out

# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro) \
        VMOVU   s2_mem, s2_reg; \
        CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)

# define CMP_R1_R2_VMM(...) CMP_R1_R2(__VA_ARGS__, V, VMM)
# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)

# define CMP_R1_S2_VMM(...) CMP_R1_S2(__VA_ARGS__, V, VMM)
# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
# define TOLOWER_gpr(...)
# define TOLOWER_VMM(...)
# define TOLOWER_YMM(...)
# define TOLOWER_XMM(...)

# define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out) \
        VPCMPEQ s2_reg, s1_reg, reg_out

# define CMP_R1_R2_YMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
# define CMP_R1_R2_XMM(...) CMP_R1_R2_VMM(__VA_ARGS__)

# define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out) \
        VPCMPEQ s2_mem, s1_reg, reg_out
# define CMP_R1_S2_YMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
# define CMP_R1_S2_XMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
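
/* The TOLOWER macro above implements branchless ASCII lowercasing with
   the unsigned range trick, using the 0x41/0x1a/0x20 constants kept in
   the L(lcase_min)/L(lcase_max)/L(case_add) tables defined below. A
   rough per-character C model (illustrative only, not part of the
   build):

       static inline unsigned char
       tolower_ascii (unsigned char c)
       {
         if ((unsigned char) (c - 0x41) < 0x1a)  // c is in ['A', 'Z']
           c += 0x20;                            // map into ['a', 'z']
         return c;
       }

   vpsubb computes c - 0x41 in every lane, vpcmpub $1 (unsigned
   less-than) builds the mask of uppercase lanes, and the masked vpaddb
   adds 0x20 only in those lanes.  */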
/* wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
   comparison can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP. In order to check the null CHAR, the algorithm keeps
   track of the matched bytes/dwords, requiring 5 EVEX instructions (3
   VPCMP and 2 KORD). In general, the costs of comparing VEC_SIZE bytes
   (32 bytes) are 3 VPCMP and 2 KORD instructions, together with VMOVU
   and ktestd instructions. The main loop (away from the page boundary)
   compares 4 vectors at a time, effectively comparing 4 x VEC_SIZE
   bytes (128 bytes) per iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked. If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
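
/* A rough model of a single VEC comparison using 256-bit EVEX
   intrinsics (illustrative only, not the build code; compile with
   -mavx512vl -mavx512bw):

       #include <immintrin.h>

       // Returns a 32-bit mask with a 0 bit at each position that is
       // either a mismatch or a null byte in s1, mirroring the
       // VPTESTM + masked VPCMPEQ pair used throughout this file.
       static inline unsigned int
       cmp_vec (const char *s1, const char *s2)
       {
         __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
         __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
         // k2: bit set for each non-null byte of s1 (VPTESTM).
         __mmask32 k2 = _mm256_test_epi8_mask (v1, v1);
         // k1: within k2, bit set where the two strings match.
         __mmask32 k1 = _mm256_mask_cmpeq_epi8_mask (k2, v1, v2);
         return (unsigned int) k1;  // all ones iff no mismatch/null
       }

   The caller then only needs to test whether the mask is all ones,
   which TESTEQ does with a single incl/subl.  */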
        .section SECTION(.text), "ax", @progbits

        .type   STRCMP, @function

# ifdef USE_AS_STRCASECMP_L

        movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
        mov     %fs:(%rax), %LOCALE_REG_LP

        /* Either 1 or 5 bytes (depending on whether CET is enabled).  */

        /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# if defined USE_AS_STRCASECMP_L
        /* We have to fall back on the C implementation for locales with
           encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
        mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP

        mov     (%LOCALE_REG), %RAX_LP

        testb   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
        jne     STRCASECMP_L_NONASCII
        leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# ifdef USE_AS_STRNCMP
        /* Don't overwrite LOCALE_REG (rcx) until we have passed
           L(one_or_less). Otherwise we might use the wrong locale in
           the OVERFLOW_STRCMP (strcasecmp_l).  */

        /* Clear the upper 32 bits.  */

        /* Signed comparison intentional. We use this branch to also
           test cases where length >= 2^63. These very large sizes can
           be handled with strcmp as there is no way for that length to
           actually bound the buffer.  */
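
/* A rough C model of the strncmp length dispatch (illustrative only;
   the structure below is hypothetical pseudo-code, not the exact
   control flow):

       if ((int64_t) n <= 1)   // catches n == 0, n == 1, and n >= 2^63
         {
           if (n == 0)
             return 0;
           if ((int64_t) n < 0)  // length cannot bound the buffer,
             return strcmp_variant (s1, s2);  // so strcmp is safe
           // else n == 1: compare a single character
         }
 */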
# if defined USE_AS_STRCASECMP_L
        .section RODATA_SECTION, "aM", @progbits, VEC_SIZE
L(lcase_min):
        .quad   0x4141414141414141
        .quad   0x4141414141414141
        .quad   0x4141414141414141
        .quad   0x4141414141414141

        .quad   0x4141414141414141
        .quad   0x4141414141414141
        .quad   0x4141414141414141
        .quad   0x4141414141414141

L(lcase_max):
        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a

        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a
        .quad   0x1a1a1a1a1a1a1a1a

L(case_add):
        .quad   0x2020202020202020
        .quad   0x2020202020202020
        .quad   0x2020202020202020
        .quad   0x2020202020202020

        .quad   0x2020202020202020
        .quad   0x2020202020202020
        .quad   0x2020202020202020
        .quad   0x2020202020202020
        VMOVA   L(lcase_min)(%rip), %LCASE_MIN_V
        VMOVA   L(lcase_max)(%rip), %LCASE_MAX_V
        VMOVA   L(case_add)(%rip), %CASE_ADD_V

        /* Shift out the bits irrelevant to page boundary ([63:12]).  */

        /* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
        cmpl    $((PAGE_SIZE - (VEC_SIZE * 4)) << 20), %eax
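
/* A rough C model of this entry check (illustrative only):

       #include <stdint.h>
       #define PAGE_SIZE 4096
       #define VEC_SIZE  32

       static int
       may_cross_page (const char *s1, const char *s2)
       {
         // Shifting left by 20 keeps only the page-offset bits [11:0],
         // now in [31:20], so one unsigned compare tests whether either
         // pointer is within 4 * VEC_SIZE bytes of its page end.
         uint32_t off = ((uint32_t) (uintptr_t) s1
                         | (uint32_t) (uintptr_t) s2) << 20;
         return off > ((uint32_t) (PAGE_SIZE - VEC_SIZE * 4) << 20);
       }

   Because the two pointers are or-ed together the check is
   conservative: it can flag a cross when neither string actually
   crosses, which the slow path then re-checks.  */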
        /* Safe to compare 4x vectors.  */
        VMOVU   (%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        /* Each bit cleared in K1 represents a mismatch or a null CHAR
           in YMM0 and 32 bytes at (%rsi).  */
        CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}

# ifdef USE_AS_STRNCMP
        cmpq    $CHAR_PER_VEC, %rdx
        jbe     L(vec_0_test_len)
        /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
           wcscmp/wcsncmp.  */

        /* All 1s represents all equals. TESTEQ will overflow to zero in
           the all-equals case. Otherwise 1s will carry until the
           position of the first mismatch.  */
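
/* A rough C model of the TESTEQ trick for the strcmp variant with
   CHAR_PER_VEC == 32 (illustrative only):

       #include <stdint.h>

       static inline int
       first_diff_index (uint32_t k1)
       {
         uint32_t m = k1 + 1;  // incl: 0 iff every character matched
         // The carry stops at the first clear bit of k1, so the first
         // set bit of m is the index of the first mismatch or null.
         return m == 0 ? -1 : __builtin_ctz (m);
       }
 */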
# ifdef USE_AS_WCSCMP
        movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  (%rdi, %rcx), %eax
        /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
           and keep logic for len <= VEC_SIZE (common) in just the
           first cache line. NB: No evex512 processor has partial-
           register stalls. If that changes this ifdef can be disabled
           without affecting correctness.  */
# if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
        movb    (%rsi, %rcx), %cl

        movzbl  (%rsi, %rcx), %ecx

        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
# ifdef USE_AS_STRNCMP

        bzhi    %VRDX, %VRCX, %VRAX

        /* Align if it will cross a fetch block.  */

# ifdef USE_AS_STRCASECMP_L
        /* Set locale argument for strcasecmp.  */
        movq    %LOCALE_REG, %rdx

        /* 'nbe' covers the case where length is negative (large
           unsigned value).  */

# ifdef USE_AS_WCSCMP
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

# ifdef USE_AS_STRNCMP
        /* rdx must be > CHAR_PER_VEC so it's safe to subtract without
           worrying about underflow.  */
        addq    $-CHAR_PER_VEC, %rdx

# ifdef USE_AS_WCSCMP
        movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  VEC_SIZE(%rdi, %rcx), %eax
        movzbl  VEC_SIZE(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
# ifdef USE_AS_STRNCMP

# if CHAR_PER_VEC <= 32
        /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_3) without
           additional branches by adjusting the bit positions from
           VEC3. We can't do this for CHAR_PER_VEC == 64.  */
# if CHAR_PER_VEC <= 16
        sall    $CHAR_PER_VEC, %ecx

        salq    $CHAR_PER_VEC, %rcx

        /* If CHAR_PER_VEC == 64 we can't shift the return GPR, so just
           offset the position and compare it against the length.  */
        addl    $(CHAR_PER_VEC), %ecx

        ja      L(ret_vec_3_finish)

        /* If CHAR_PER_VEC == 64 we can't combine matches from the last
           2x VEC so we need a separate return label.  */

# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
# ifdef USE_AS_STRNCMP

# ifdef USE_AS_WCSCMP
        movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

# ifndef USE_AS_STRNCMP

# ifdef USE_AS_WCSCMP
        movl    (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        /* 32 byte align here ensures the main loop is ideally
           aligned.  */

        /* Safe to compare 4x vectors.  */
        VMOVU   (VEC_SIZE)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}

# ifdef USE_AS_STRNCMP
        subq    $(CHAR_PER_VEC * 2), %rdx

        VMOVU   (VEC_SIZE * 2)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}

        VMOVU   (VEC_SIZE * 3)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}

# ifdef USE_AS_STRNCMP
        cmpq    $(CHAR_PER_VEC * 2), %rdx
# ifdef USE_AS_WCSCMP
        /* Any non-zero positive value that doesn't interfere with
           0x1.  */

        /* The prepare labels are various entry points from the page
           cross logic.  */
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):

        andl    $(VEC_SIZE * 4 - 1), %ecx

        leaq    (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx

        /* Store N + (VEC_SIZE * 4) and place check at the beginning of
           the loop.  */
        leaq    (VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):

L(prepare_loop_no_len):
        /* Align s1 and adjust s2 accordingly.  */

        andq    $-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):

# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)

L(prepare_loop_aligned):
        /* eax stores distance from rsi to next page cross. These cases
           need to be handled specially as the 4x loop could potentially
           read memory past the length of s1 or s2 and across a page
           boundary.  */
        movl    $-(VEC_SIZE * 4), %eax

        andl    $(PAGE_SIZE - 1), %eax
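
/* A rough C model of the distance computation (illustrative only):

       // Budget of bytes rsi may advance before a 4 * VEC_SIZE block
       // of loads would touch the next page.
       unsigned int budget = (0u - (uint32_t) (uintptr_t) s2
                              - (VEC_SIZE * 4)) & (PAGE_SIZE - 1);

   Each loop iteration subtracts VEC_SIZE * 4 from this budget; once it
   is exhausted, the loop branches to the page-cross handling below.  */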
        /* Loop 4x comparisons at a time.  */

        /* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
        subq    $(CHAR_PER_VEC * 4), %rdx

        subq    $-(VEC_SIZE * 4), %rdi
        subq    $-(VEC_SIZE * 4), %rsi

        /* Check if rsi loads will cross a page boundary.  */
        addl    $-(VEC_SIZE * 4), %eax
        jnb     L(page_cross_during_loop)

        /* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
        VMOVA   (VEC_SIZE * 0)(%rdi), %VMM(0)
        VMOVA   (VEC_SIZE * 1)(%rdi), %VMM(2)
        VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
        VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)

        VPMINU  %VMM(0), %VMM(2), %VMM(8)
        VPMINU  %VMM(4), %VMM(6), %VMM(9)

        /* A zero CHAR in YMM9 means that there is a null CHAR.  */
        VPMINU  %VMM(8), %VMM(9), %VMM(9)

        /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
        VPTESTM %VMM(9), %VMM(9), %k1
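
/* The VPMINU reduction works because the unsigned minimum of two
   vectors has a zero lane exactly where either input does. A rough
   per-lane C model (illustrative only):

       static inline unsigned char
       min_u8 (unsigned char x, unsigned char y)
       {
         return x < y ? x : y;
       }

       // Zero iff at least one of a, b, c, d is zero, so one VPTESTM
       // on the reduced vector finds null CHARs in all four VECs.
       unsigned char m = min_u8 (min_u8 (a, b), min_u8 (c, d));
 */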
# ifndef USE_AS_STRCASECMP_L
        vpxorq  (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
        vpxorq  (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
        vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
        /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
           oring with YMM1. Result is stored in YMM6.  */
        vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)

        VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(1)
        TOLOWER_VMM (%VMM(0), %VMM(1))
        VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
        TOLOWER_VMM (%VMM(2), %VMM(3))
        VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
        TOLOWER_VMM (%VMM(4), %VMM(5))
        VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
        TOLOWER_VMM (%VMM(6), %VMM(7))
        vpxorq  %VMM(0), %VMM(1), %VMM(1)
        vpxorq  %VMM(2), %VMM(3), %VMM(3)
        vpxorq  %VMM(4), %VMM(5), %VMM(5)
        vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)

        /* Or together YMM3, YMM5, and YMM6.  */
        vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
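
/* vpternlogd's immediate byte is the truth table of an arbitrary
   three-input boolean function: for each bit position, the inputs
   (a, b, c) select bit ((a << 2) | (b << 1) | c) of the immediate.
   0xde encodes b | (a ^ c) and 0xfe encodes a | b | c. A rough C model
   (illustrative only):

       #include <stdint.h>

       static uint32_t
       ternlog (uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
       {
         uint32_t r = 0;
         for (int i = 0; i < 32; i++)
           {
             unsigned int idx = (((a >> i) & 1) << 2)
                                | (((b >> i) & 1) << 1)
                                | ((c >> i) & 1);
             r |= (uint32_t) ((imm >> idx) & 1) << i;
           }
         return r;
       }

   So the single $0xde instruction above both xors the last vector of
   s2 into YMM6 and ors in the mismatches accumulated in YMM1, and the
   $0xfe instruction ors YMM3, YMM5, and YMM6 together.  */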
        /* A non-zero CHAR in YMM6 represents a mismatch.  */
        VPTESTNM %VMM(6), %VMM(6), %k0{%k1}

        /* Find which VEC has the mismatch or end of string.  */
        VPTESTM %VMM(0), %VMM(0), %k1
        VPTESTNM %VMM(1), %VMM(1), %k0{%k1}

        jnz     L(return_vec_0_end)

        VPTESTM %VMM(2), %VMM(2), %k1
        VPTESTNM %VMM(3), %VMM(3), %k0{%k1}

        jnz     L(return_vec_1_end)
        /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.  */

L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
        subq    $(CHAR_PER_VEC * 2), %rdx

        VPTESTM %VMM(4), %VMM(4), %k1
        VPTESTNM %VMM(5), %VMM(5), %k0{%k1}

# if CHAR_PER_VEC <= 16
        sall    $CHAR_PER_VEC, %LOOP_REG

# elif CHAR_PER_VEC <= 32
        salq    $CHAR_PER_VEC, %LOOP_REG64
        orq     %rcx, %LOOP_REG64

        /* We aren't combining the last 2x VEC so branch on the second
           to last.  */
        jnz     L(return_vec_2_end)
        /* LOOP_REG contains matches for null/mismatch from the loop. If
           VEC 0, 1, and 2 all have no null and no mismatches then the
           mismatch must entirely be from VEC 3 which is fully
           represented by LOOP_REG.  */
# if CHAR_PER_VEC <= 16
        bsf     %LOOP_REG, %LOOP_REG

        bsfq    %LOOP_REG64, %LOOP_REG64

# ifdef USE_AS_STRNCMP

        /* If CHAR_PER_VEC == 64 we can't combine the last 2x VEC so we
           need to adjust the length before the last comparison.  */
# if CHAR_PER_VEC == 64
        subq    $CHAR_PER_VEC, %rdx

        cmpq    %LOOP_REG64, %rdx

# ifdef USE_AS_WCSCMP
        movl    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx

        cmpl    (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx

        movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
        movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
# ifdef USE_AS_STRNCMP

        /* The L(return_vec_N_end) labels differ from L(return_vec_N) in
           that they use the value of `r8` to negate the return value.
           This is because the page cross logic can swap `rdi` and
           `rsi`.  */

# ifdef USE_AS_STRNCMP

# if CHAR_PER_VEC <= 32
        /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
           without additional branches by adjusting the bit positions
           from VEC1. We can't do this for CHAR_PER_VEC == 64.  */
# if CHAR_PER_VEC <= 16
        sall    $CHAR_PER_VEC, %ecx

        salq    $CHAR_PER_VEC, %rcx

        /* If CHAR_PER_VEC == 64 we can't shift the return GPR, so just
           offset the position and compare it against the length.  */
        addl    $(CHAR_PER_VEC), %ecx

        ja      L(ret_vec_0_end_finish)
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)

# ifdef USE_AS_STRNCMP

L(ret_vec_0_end_finish):
# ifdef USE_AS_WCSCMP
        movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx

        /* This is the non-zero case for `eax` so just xorl with `r8d`
           to flip the sign if `rdi` and `rsi` were swapped.  */

        movzbl  (%rdi, %rcx), %eax
        movzbl  (%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

        /* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
           logic. Subtract `r8d` after the xor for the zero case.  */
# ifndef USE_AS_STRNCMP

# ifdef USE_AS_WCSCMP
        movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  VEC_SIZE(%rdi, %rcx), %eax
        movzbl  VEC_SIZE(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

        /* If CHAR_PER_VEC == 64 we can't combine matches from the last
           2x VEC so we need a separate return label.  */
# if CHAR_PER_VEC == 64

# ifdef USE_AS_STRNCMP

# ifdef USE_AS_WCSCMP
        movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx

        /* This is the non-zero case for `eax` so just xorl with `r8d`
           to flip the sign if `rdi` and `rsi` were swapped.  */

        movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

        /* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
           logic. Subtract `r8d` after the xor for the zero case.  */
        /* Page cross in rsi in next 4x VEC.  */

        /* TODO: Improve logic here.  */

L(page_cross_during_loop):
        /* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

        /* Optimistically rsi and rdi are both aligned, in which case we
           don't need any logic here.  */
        cmpl    $-(VEC_SIZE * 4), %eax
        /* Don't adjust eax before jumping back to the loop; we will
           never hit the page cross case again.  */
        je      L(loop_skip_page_cross_check)

        /* Check if we can safely load a VEC.  */
        cmpl    $-(VEC_SIZE * 3), %eax
        jle     L(less_1x_vec_till_page_cross)

        VMOVA   (%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}

        jnz     L(return_vec_0_end)

        /* If distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
        cmpl    $-(VEC_SIZE * 2), %eax
        jg      L(more_2x_vec_till_page_cross)
L(less_1x_vec_till_page_cross):
        subl    $-(VEC_SIZE * 4), %eax
        /* Guaranteed safe to read from rdi - VEC_SIZE here. The only
           concerning case is the first iteration if incoming s1 was
           near the start of a page and s2 was near the end. If s1 was
           near the start of the page we already aligned up to the
           nearest VEC_SIZE * 4 so it is guaranteed safe to read back
           -VEC_SIZE. If rdi is truly at the start of a page here, it
           means the previous page (rdi - VEC_SIZE) has already been
           loaded earlier so it must be valid.  */
        VMOVU   -VEC_SIZE(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
        /* Mask of potentially valid bits. The lower bits can correspond
           to out-of-range comparisons (but are safe regarding page
           crosses).  */

# ifdef USE_AS_WCSCMP

        andl    $(VEC_SIZE - 1), %ecx

        shlxl   %ecx, %r10d, %ecx
        /* Depending on CHAR_PER_VEC extract mask for possible in-bound
           matches.  */
# if CHAR_PER_VEC == 16

# elif CHAR_PER_VEC == 8

# error "Invalid CHAR_SIZE or VEC_SIZE"

        shlx    %VRSI, %VRCX, %VR10
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
        /* NB: strcasecmp not used with WCSCMP so this access to r11 is
           safe.  */

        jbe     L(return_page_cross_end_check)

        movl    %eax, %OFFSET_REG

        /* Readjust eax before potentially returning to the loop.  */
        addl    $(PAGE_SIZE - VEC_SIZE * 4), %eax

        jz      L(loop_skip_page_cross_check)

# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
        leal    -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):

        addl    %OFFSET_REG, %ecx

# ifdef USE_AS_WCSCMP
        movl    VEC_OFFSET(%rdi, %rcx), %edx

        cmpl    VEC_OFFSET(%rsi, %rcx), %edx

        movzbl  VEC_OFFSET(%rdi, %rcx), %eax
        movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

# ifdef USE_AS_STRNCMP

L(return_page_cross_end_check):

        /* Need to use tzcnt here as VRCX may be zero. If VRCX is zero
           tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length
           (edx) is guaranteed to be <= CHAR_PER_VEC, so we will only
           use the return idx if VRCX was non-zero.  */
        leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_WCSCMP

        ja      L(return_page_cross_cmp_mem)
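
/* A rough model of the tzcnt reasoning (illustrative only): unlike
   bsf, tzcnt is defined on a zero input and returns the operand
   width, e.g.

       // idx == CHAR_PER_VEC when mask == 0, else first set bit
       unsigned int idx = mask ? (unsigned int) __builtin_ctz (mask)
                               : CHAR_PER_VEC;

   Since the remaining length is <= CHAR_PER_VEC here, the idx is only
   ever used when the mask was non-zero.  */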
L(more_2x_vec_till_page_cross):
        /* If more than 2x VEC till page cross, we will complete a full
           loop iteration here.  */

        VMOVA   VEC_SIZE(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}

        jnz     L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
        cmpq    $(CHAR_PER_VEC * 2), %rdx
        jbe     L(ret_zero_in_loop_page_cross)

        subl    $-(VEC_SIZE * 4), %eax

        /* Safe to include comparisons from lower bytes.  */
        VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}

        jnz     L(return_vec_page_cross_0)

        VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}

        jnz     L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
        /* Must check length here as it might preclude reading the next
           page.  */
# ifdef USE_AS_WCSCMP
        /* NB: strcasecmp not used with WCSCMP so this access to r11 is
           safe.  */

        jbe     L(ret_zero_in_loop_page_cross)
        /* Finish the loop.  */
        VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
        VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
        VPMINU  %VMM(4), %VMM(6), %VMM(9)
        VPTESTM %VMM(9), %VMM(9), %k1
# ifndef USE_AS_STRCASECMP_L
        vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
        /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
        vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)

        VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
        TOLOWER_VMM (%VMM(4), %VMM(5))
        VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
        TOLOWER_VMM (%VMM(6), %VMM(7))
        vpxorq  %VMM(4), %VMM(5), %VMM(5)
        vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)

        VPTESTNM %VMM(6), %VMM(6), %k0{%k1}

        jnz     L(return_vec_2_3_end)

        /* Best for code size to include an unconditional jmp here. If
           this case is hot it would be faster to duplicate the
           L(return_vec_2_3_end) code as fall-through and jump back to
           the loop on mismatch.  */
        subq    $-(VEC_SIZE * 4), %rdi
        subq    $-(VEC_SIZE * 4), %rsi
        addl    $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
        subq    $(CHAR_PER_VEC * 4), %rdx
        ja      L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):

        jmp     L(loop_skip_page_cross_check)
L(return_vec_page_cross_0):
        addl    $-VEC_SIZE, %eax
L(return_vec_page_cross_1):

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
        leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
        /* Must divide ecx instead of multiplying rdx due to
           overflow.  */

        jbe     L(ret_zero_in_loop_page_cross)
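
/* A rough C model of the divide-vs-multiply point (illustrative only):
   to compare a byte offset against a wide-character count, scale the
   offset down rather than the count up, since n * SIZE_OF_CHAR could
   wrap for very large n:

       if ((uint64_t) byte_off / SIZE_OF_CHAR >= n)  // not: n * 4
         return 0;
 */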
# ifdef USE_AS_WCSCMP
        movl    VEC_OFFSET(%rdi, %rcx), %edx

        cmpl    VEC_OFFSET(%rsi, %rcx), %edx

        movzbl  VEC_OFFSET(%rdi, %rcx), %eax
        movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
# ifndef USE_AS_STRNCMP
        /* If both are VEC aligned we don't need any special logic here.
           Only valid for strcmp where the stop condition is guaranteed
           to be reachable by just reading memory.  */
        testl   $((VEC_SIZE - 1) << 20), %eax

        andl    $(PAGE_SIZE - 1), %eax
        andl    $(PAGE_SIZE - 1), %ecx

        xorl    %OFFSET_REG, %OFFSET_REG

        /* Check which is closer to page cross, s1 or s2.  */

        /* The previous page cross check has false positives. Check for
           a true positive as the page cross logic is very expensive.  */
        subl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
        jbe     L(no_page_cross)

        /* Set r8 to not interfere with the normal return value (rdi and
           rsi did not swap).  */
# ifdef USE_AS_WCSCMP
        /* Any non-zero positive value that doesn't interfere with
           0x1.  */

        /* Check if less than 1x VEC till page cross.  */
        subl    $(VEC_SIZE * 3), %eax
        jg      L(less_1x_vec_till_page)

        /* If more than 1x VEC till page cross, loop through safely
           loadable memory until within 1x VEC of page cross.  */
        VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}

        jnz     L(check_ret_vec_page_cross)
        addl    $CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
        cmpq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross)

        addl    $VEC_SIZE, %eax
        jl      L(page_cross_loop)

# ifdef USE_AS_WCSCMP

        subl    %eax, %OFFSET_REG
        /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
           to not cross page so it is safe to load. Since we have
           already loaded at least 1 VEC from rsi it is also guaranteed
           to be safe.  */
        VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}

# ifdef USE_AS_STRNCMP
        leal    CHAR_PER_VEC(%OFFSET_REG64), %eax

        jbe     L(check_ret_vec_page_cross2)
# ifdef USE_AS_WCSCMP
        addq    $-(CHAR_PER_VEC * 2), %rdx

        jz      L(prepare_loop_no_len)
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):

        addl    %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
        movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx

        cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx

        movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
        movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)

# ifdef USE_AS_STRNCMP

L(check_ret_vec_page_cross2):

L(check_ret_vec_page_cross):

        addl    %OFFSET_REG, %ecx

        ja      L(ret_vec_page_cross_cont)

L(ret_zero_page_cross):

        /* Ensure this is a true page cross.  */
        subl    $(PAGE_SIZE - VEC_SIZE * 4), %ecx
        jbe     L(no_page_cross)
        /* Set r8 to negate the return value, as rdi and rsi are
           swapped.  */
# ifdef USE_AS_WCSCMP

        xorl    %OFFSET_REG, %OFFSET_REG

        /* Check if more than 1x VEC till page cross.  */
        subl    $(VEC_SIZE * 3), %eax
        jle     L(page_cross_loop)
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP

        /* Find the largest load size we can use. For VEC_SIZE == 64
           only check if we can do a full ymm load.  */

        cmpl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
        ja      L(less_32_till_page)

        /* Use 32 byte comparison.  */
        VMOVU   (%rdi), %VMM_256(0)
        VPTESTM %VMM_256(0), %VMM_256(0), %k2
        CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)
        movl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
        cmpq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross_slow_case64)
        subl    %eax, %OFFSET_REG

        /* Explicit check for 32 byte alignment.  */
        subl    %eax, %OFFSET_REG

        VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
        VPTESTM %VMM_256(0), %VMM_256(0), %k2
        CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
        addl    $(32 / SIZE_OF_CHAR), %OFFSET_REG
        subq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross_slow_case64)
        subq    $-(CHAR_PER_VEC * 4), %rdx

        leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        leaq    (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        jmp     L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP

L(ret_zero_page_cross_slow_case64):
L(less_32_till_page):

        /* Find largest load size we can use.  */
        cmpl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
        ja      L(less_16_till_page)

        /* Use 16 byte comparison.  */
        vmovdqu (%rdi), %xmm0
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)

        movl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP

        cmpq    %OFFSET_REG64, %rdx

        cmpq    $(16 / SIZE_OF_CHAR), %rdx

        jbe     L(ret_zero_page_cross_slow_case0)
        subl    %eax, %OFFSET_REG

        /* Explicit check for 16 byte alignment.  */
        subl    %eax, %OFFSET_REG
        vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
        addl    $(16 / SIZE_OF_CHAR), %OFFSET_REG
        subq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross_slow_case0)
        subq    $-(CHAR_PER_VEC * 4), %rdx

        leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        leaq    (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        jmp     L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP

L(ret_zero_page_cross_slow_case0):
L(less_16_till_page):
        cmpl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
        ja      L(less_8_till_page)

        /* Use 8 byte comparison.  */

        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
        cmpq    $(8 / SIZE_OF_CHAR), %rdx
        jbe     L(ret_zero_page_cross_slow_case0)

        movl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
        subl    %eax, %OFFSET_REG

        vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}

# ifdef USE_AS_WCSCMP

        jnz     L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
        addl    $(8 / SIZE_OF_CHAR), %OFFSET_REG
        subq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross_slow_case0)
        subq    $-(CHAR_PER_VEC * 4), %rdx

        leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        leaq    (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        jmp     L(prepare_loop_aligned)
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
        /* If using wchar then this is the only check before we reach
           the page boundary.  */

        jnz     L(ret_less_8_wcs)
# ifdef USE_AS_STRNCMP
        addq    $-(CHAR_PER_VEC * 2), %rdx
        /* We already checked for len <= 1 so cannot hit that case
           here.  */

        movl    %OFFSET_REG, %eax

        cmpl    $(VEC_SIZE - 4), %eax
        ja      L(less_4_till_page)

        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}

        jnz     L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP

        jbe     L(ret_zero_page_cross_slow_case1)
        movl    $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
        subl    %eax, %OFFSET_REG

        vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}

        jnz     L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
        addl    $(4 / SIZE_OF_CHAR), %OFFSET_REG
        subq    %OFFSET_REG64, %rdx
        jbe     L(ret_zero_page_cross_slow_case1)
        subq    $-(CHAR_PER_VEC * 4), %rdx

        leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        leaq    (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq    (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi

        jmp     L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP

L(ret_zero_page_cross_slow_case1):
L(less_4_till_page):

        /* Extremely slow byte comparison loop.  */

        movzbl  (%rsi, %rdi), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
        subl    %BYTE_LOOP_REG, %eax
        jnz     L(ret_less_4_loop)

        jz      L(ret_zero_4_loop)
# ifdef USE_AS_STRNCMP

        jz      L(ret_zero_4_loop)

        /* The end condition is reaching the page boundary (rdi is
           aligned).  */
        testb   $(VEC_SIZE - 1), %dil

        leaq    -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
        addq    $-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_STRNCMP
        subq    $-(CHAR_PER_VEC * 4), %rdx

        jmp     L(prepare_loop_aligned)
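
/* A rough C model of the byte fallback loop (illustrative only; the
   tolower_ascii calls collapse to plain loads outside strcasecmp, and
   the strncmp length check is omitted):

       do
         {
           unsigned char a = tolower_ascii (*s1++);
           unsigned char b = tolower_ascii (*s2++);
           if (a != b)
             return a - b;   // mismatch
           if (a == '\0')
             return 0;       // both strings ended together
         }
       while (((uintptr_t) s1 & (VEC_SIZE - 1)) != 0);
       // s1 is now VEC_SIZE-aligned: rewind by 4x VEC and reenter the
       // aligned vector loop at L(prepare_loop_aligned).
 */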
        .size   STRCMP, .-STRCMP