sysdeps/x86_64/multiarch/strcmp-evex.S

   1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (4)
  22
  23 # ifndef VEC_SIZE
  24 #  include "x86-evex256-vecs.h"
  25 # endif
  26
  27 # define STRCMP_ISA     _evex
  28 # include "strcmp-naming.h"
  29
  30 # include <sysdep.h>
  31 # if defined USE_AS_STRCASECMP_L
  32 #  include "locale-defines.h"
  33 # endif
  34
  35 # ifndef STRCMP
  36 #  define STRCMP        __strcmp_evex
  37 # endif
  38
  39 # define PAGE_SIZE      4096
  40
  41         /* VEC_SIZE = Number of bytes in a ymm register.  */
  42 # define CHAR_PER_VEC   (VEC_SIZE       /       SIZE_OF_CHAR)
  43
  44 # ifdef USE_AS_WCSCMP
  45         /* Compare packed dwords.  */
  46 #  define VPCMP vpcmpd
  47 #  define VPCMPEQ       vpcmpeqd
  48 #  define VPMINU        vpminud
  49 #  define VPTESTM       vptestmd
  50 #  define VPTESTNM      vptestnmd
  51         /* 1 dword char == 4 bytes.  */
  52 #  define SIZE_OF_CHAR  4
  53
  54 #  define TESTEQ        sub $((1 << CHAR_PER_VEC) - 1),
  55
  56 #  define USE_WIDE_CHAR
  57 # else
  58         /* Compare packed bytes.  */
  59 #  define VPCMP vpcmpb
  60 #  define VPCMPEQ       vpcmpeqb
  61 #  define VPMINU        vpminub
  62 #  define VPTESTM       vptestmb
  63 #  define VPTESTNM      vptestnmb
  64         /* 1 byte char == 1 byte.  */
  65 #  define SIZE_OF_CHAR  1
  66
  67 #  define TESTEQ        inc
  68 # endif
  69
  70 # include "reg-macros.h"
  71
  72 # if VEC_SIZE == 64
  73 #  define RODATA_SECTION        rodata.cst64
  74 # else
  75 #  define RODATA_SECTION        rodata.cst32
  76 # endif
  77
  78 # if CHAR_PER_VEC == 64
  79 #  define FALLTHROUGH_RETURN_OFFSET     (VEC_SIZE * 3)
  80 # else
  81 #  define FALLTHROUGH_RETURN_OFFSET     (VEC_SIZE * 2)
  82 # endif
  83
  84 # ifdef USE_AS_STRNCMP
  85 #  define LOOP_REG      VR9
  86 #  define LOOP_REG64    r9
  87
  88 #  define OFFSET_REG8   r9b
  89 #  define OFFSET_REG    r9d
  90 #  define OFFSET_REG64  r9
  91 # else
  92 #  define LOOP_REG      VRDX
  93 #  define LOOP_REG64    rdx
  94
  95 #  define OFFSET_REG8   dl
  96 #  define OFFSET_REG    edx
  97 #  define OFFSET_REG64  rdx
  98 # endif
  99
 100 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 101 #  define VEC_OFFSET    0
 102 # else
 103 #  define VEC_OFFSET    (-VEC_SIZE)
 104 # endif
 105
 106 # ifdef USE_AS_STRCASECMP_L
 107 #  define BYTE_LOOP_REG OFFSET_REG
 108 # else
 109 #  define BYTE_LOOP_REG ecx
 110 # endif
 111
 112 # ifdef USE_AS_STRCASECMP_L
 113 #  ifdef USE_AS_STRNCMP
 114 #   define LOCALE_REG   rcx
 115 #   define LOCALE_REG_LP        RCX_LP
 116 #  else
 117 #   define LOCALE_REG   rdx
 118 #   define LOCALE_REG_LP        RDX_LP
 119 #  endif
 120 # endif
 121
 122 # define LCASE_MIN_V    VMM(12)
 123 # define LCASE_MAX_V    VMM(13)
 124 # define CASE_ADD_V     VMM(14)
 125
 126 # if VEC_SIZE == 64
 127 #  define LCASE_MIN_YMM VMM_256(12)
 128 #  define LCASE_MAX_YMM VMM_256(13)
 129 #  define CASE_ADD_YMM  VMM_256(14)
 130 # endif
 131
 132 # define LCASE_MIN_XMM  VMM_128(12)
 133 # define LCASE_MAX_XMM  VMM_128(13)
 134 # define CASE_ADD_XMM   VMM_128(14)
 135
 136         /* NB: wcsncmp uses r11 but strcasecmp is never used in
 137            conjunction with wcscmp.  */
 138 # define TOLOWER_BASE   %r11
 139
 140 # ifdef USE_AS_STRCASECMP_L
 141 #  define _REG(x, y)    x ## y
 142 #  define REG(x, y)     _REG(x, y)
 143 #  define TOLOWER(reg1, reg2, ext, vec_macro)   \
 144         vpsubb  %REG(LCASE_MIN_, ext), reg1, %vec_macro(10);    \
 145         vpsubb  %REG(LCASE_MIN_, ext), reg2, %vec_macro(11);    \
 146         vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
 147         vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
 148         vpaddb  reg1, %REG(CASE_ADD_, ext), reg1{%k5};  \
 149         vpaddb  reg2, %REG(CASE_ADD_, ext), reg2{%k6}
 150
 151 #  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
 152 #  define TOLOWER_VMM(...)      TOLOWER(__VA_ARGS__, V, VMM)
 153 #  define TOLOWER_YMM(...)      TOLOWER(__VA_ARGS__, YMM, VMM_256)
 154 #  define TOLOWER_XMM(...)      TOLOWER(__VA_ARGS__, XMM, VMM_128)
 155
 156 #  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)    \
 157         TOLOWER (s1_reg, s2_reg, ext, vec_macro);       \
 158         VPCMPEQ s1_reg, s2_reg, reg_out
 159
 160 #  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)    \
 161         VMOVU   s2_mem, s2_reg; \
 162         CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
 163
 164 #  define CMP_R1_R2_VMM(...)    CMP_R1_R2(__VA_ARGS__, V, VMM)
 165 #  define CMP_R1_R2_YMM(...)    CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
 166 #  define CMP_R1_R2_XMM(...)    CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
 167
 168 #  define CMP_R1_S2_VMM(...)    CMP_R1_S2(__VA_ARGS__, V, VMM)
 169 #  define CMP_R1_S2_YMM(...)    CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
 170 #  define CMP_R1_S2_XMM(...)    CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
 171
 172 # else
 173 #  define TOLOWER_gpr(...)
 174 #  define TOLOWER_VMM(...)
 175 #  define TOLOWER_YMM(...)
 176 #  define TOLOWER_XMM(...)
 177
 178 #  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)        \
 179         VPCMPEQ s2_reg, s1_reg, reg_out
 180
 181 #  define CMP_R1_R2_YMM(...)    CMP_R1_R2_VMM(__VA_ARGS__)
 182 #  define CMP_R1_R2_XMM(...)    CMP_R1_R2_VMM(__VA_ARGS__)
 183
 184 #  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)        \
 185         VPCMPEQ s2_mem, s1_reg, reg_out
 186 #  define CMP_R1_S2_YMM(...)    CMP_R1_S2_VMM(__VA_ARGS__)
 187 #  define CMP_R1_S2_XMM(...)    CMP_R1_S2_VMM(__VA_ARGS__)
 188 # endif
 189
 190 /* Warning!
 191            wcscmp/wcsncmp have to use SIGNED comparison for elements.
 192            strcmp/strncmp have to use UNSIGNED comparison for elements.
 193 */
 194
 195 /* The main idea of the string comparison (byte or dword) using 256-bit
 196    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
 197    latter can be on either packed bytes or dwords depending on
 198    USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
 199    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
 200    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
 201    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
 202    instructions.  Main loop (away from page boundary) compares 4
 203    vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
 204    bytes) on each loop.
 205
 206    The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
 207    is the same as strcmp, except that a maximum offset is tracked.  If
 208    the maximum offset is reached before a difference is found, zero is
 209    returned.  */
 210
 211         .section SECTION(.text), "ax", @progbits
 212         .align  16
 213         .type   STRCMP, @function
 214         .globl  STRCMP
 215 # ifdef USE_AS_STRCASECMP_L
 216 ENTRY (STRCASECMP)
 217         movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
 218         mov     %fs:(%rax), %LOCALE_REG_LP
 219
 220         /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
 221         .p2align 4
 222 END (STRCASECMP)
 223         /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
 224 # endif
 225
 226         .p2align 4
 227 STRCMP:
 228         cfi_startproc
 229         _CET_ENDBR
 230         CALL_MCOUNT
 231
 232 # if defined USE_AS_STRCASECMP_L
 233         /* We have to fall back on the C implementation for locales with
 234            encodings not matching ASCII for single bytes.  */
 235 #  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 236         mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
 237 #  else
 238         mov     (%LOCALE_REG), %RAX_LP
 239 #  endif
 240         testb   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 241         jne     STRCASECMP_L_NONASCII
 242         leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 243 # endif
 244
 245 # ifdef USE_AS_STRNCMP
 246         /* Don't overwrite LOCALE_REG (rcx) until we have passed
 247            L(one_or_less). Otherwise we might use the wrong locale in
 248            the OVERFLOW_STRCMP (strcasecmp_l).  */
 249 #  ifdef __ILP32__
 250         /* Clear the upper 32 bits.  */
 251         movl    %edx, %edx
 252 #  endif
 253         cmp     $1, %RDX_LP
 254         /* Signed comparison intentional. We use this branch to also
 255            test cases where length >= 2^63. These very large sizes can be
 256            handled with strcmp as there is no way for that length to
 257            actually bound the buffer.  */
 258         jle     L(one_or_less)
 259 # endif
 260
 261 # if defined USE_AS_STRCASECMP_L
 262         .section RODATA_SECTION, "aM", @progbits, VEC_SIZE
 263         .align  VEC_SIZE
 264 L(lcase_min):
 265         .quad   0x4141414141414141
 266         .quad   0x4141414141414141
 267         .quad   0x4141414141414141
 268         .quad   0x4141414141414141
 269 #  if VEC_SIZE == 64
 270         .quad   0x4141414141414141
 271         .quad   0x4141414141414141
 272         .quad   0x4141414141414141
 273         .quad   0x4141414141414141
 274 #  endif
 275 L(lcase_max):
 276         .quad   0x1a1a1a1a1a1a1a1a
 277         .quad   0x1a1a1a1a1a1a1a1a
 278         .quad   0x1a1a1a1a1a1a1a1a
 279         .quad   0x1a1a1a1a1a1a1a1a
 280 #  if VEC_SIZE == 64
 281         .quad   0x1a1a1a1a1a1a1a1a
 282         .quad   0x1a1a1a1a1a1a1a1a
 283         .quad   0x1a1a1a1a1a1a1a1a
 284         .quad   0x1a1a1a1a1a1a1a1a
 285 #  endif
 286 L(case_add):
 287         .quad   0x2020202020202020
 288         .quad   0x2020202020202020
 289         .quad   0x2020202020202020
 290         .quad   0x2020202020202020
 291 #  if VEC_SIZE == 64
 292         .quad   0x2020202020202020
 293         .quad   0x2020202020202020
 294         .quad   0x2020202020202020
 295         .quad   0x2020202020202020
 296 #  endif
 297         .previous
 298
 299         VMOVA   L(lcase_min)(%rip), %LCASE_MIN_V
 300         VMOVA   L(lcase_max)(%rip), %LCASE_MAX_V
 301         VMOVA   L(case_add)(%rip), %CASE_ADD_V
 302 # endif
 303
 304         movl    %edi, %eax
 305         orl     %esi, %eax
 306         /* Shift out the bits irrelevant to page boundary ([63:12]).  */
 307         sall    $20, %eax
 308         /* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
 309         cmpl    $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
 310         ja      L(page_cross)
 311
 312 L(no_page_cross):
 313         /* Safe to compare 4x vectors.  */
 314         VMOVU   (%rdi), %VMM(0)
 315         VPTESTM %VMM(0), %VMM(0), %k2
 316         /* Each bit cleared in K1 represents a mismatch or a null CHAR
 317            in YMM0 and 32 bytes at (%rsi).  */
 318         CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
 319         KMOV    %k1, %VRCX
 320 # ifdef USE_AS_STRNCMP
 321         cmpq    $CHAR_PER_VEC, %rdx
 322         jbe     L(vec_0_test_len)
 323 # endif
 324
 325         /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
 326            wcscmp/wcsncmp.  */
 327
 328         /* All 1s represents all equals. TESTEQ will overflow to zero in
 329            all equals case. Otherwise 1s will carry until position of
 330            first mismatch.  */
 331         TESTEQ  %VRCX
 332         jz      L(more_3x_vec)
 333
 334         .p2align 4,, 4
 335 L(return_vec_0):
 336         bsf     %VRCX, %VRCX
 337 # ifdef USE_AS_WCSCMP
 338         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
 339         xorl    %eax, %eax
 340         cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx
 341         je      L(ret0)
 342         setl    %al
 343         negl    %eax
 344         orl     $1, %eax
 345 # else
 346         movzbl  (%rdi, %rcx), %eax
 347         /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
 348            and keep logic for len <= VEC_SIZE (common) in just the
 349            first cache line.  NB: No evex512 processor has partial-
 350            register stalls. If that changes this ifdef can be disabled
 351            without affecting correctness.  */
 352 #  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
 353         movb    (%rsi, %rcx), %cl
 354 #  else
 355         movzbl  (%rsi, %rcx), %ecx
 356 #  endif
 357         TOLOWER_gpr (%rax, %eax)
 358         TOLOWER_gpr (%rcx, %ecx)
 359         subl    %ecx, %eax
 360 # endif
 361 L(ret0):
 362         ret
 363
 364 # ifdef USE_AS_STRNCMP
 365         .p2align 4,, 4
 366 L(vec_0_test_len):
 367         not     %VRCX
 368         bzhi    %VRDX, %VRCX, %VRAX
 369         jnz     L(return_vec_0)
 370         /* Align if will cross fetch block.  */
 371         .p2align 4,, 2
 372 L(ret_zero):
 373         xorl    %eax, %eax
 374         ret
 375
 376         .p2align 4,, 5
 377 L(one_or_less):
 378 #  ifdef USE_AS_STRCASECMP_L
 379         /* Set locale argument for strcasecmp.  */
 380         movq    %LOCALE_REG, %rdx
 381 #  endif
 382         jb      L(ret_zero)
 383         /* 'nbe' covers the case where length is negative (large
 384            unsigned).  */
 385         jnbe    OVERFLOW_STRCMP
 386 #  ifdef USE_AS_WCSCMP
 387         movl    (%rdi), %edx
 388         xorl    %eax, %eax
 389         cmpl    (%rsi), %edx
 390         je      L(ret1)
 391         setl    %al
 392         negl    %eax
 393         orl     $1, %eax
 394 #  else
 395         movzbl  (%rdi), %eax
 396         movzbl  (%rsi), %ecx
 397         TOLOWER_gpr (%rax, %eax)
 398         TOLOWER_gpr (%rcx, %ecx)
 399         subl    %ecx, %eax
 400 #  endif
 401 L(ret1):
 402         ret
 403 # endif
 404
 405         .p2align 4,, 10
 406 L(return_vec_1):
 407         bsf     %VRCX, %VRCX
 408 # ifdef USE_AS_STRNCMP
 409         /* rdx must be > CHAR_PER_VEC so its safe to subtract without
 410            worrying about underflow.  */
 411         addq    $-CHAR_PER_VEC, %rdx
 412         cmpq    %rcx, %rdx
 413         jbe     L(ret_zero)
 414 # endif
 415 # ifdef USE_AS_WCSCMP
 416         movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 417         xorl    %eax, %eax
 418         cmpl    VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
 419         je      L(ret2)
 420         setl    %al
 421         negl    %eax
 422         orl     $1, %eax
 423 # else
 424         movzbl  VEC_SIZE(%rdi, %rcx), %eax
 425         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
 426         TOLOWER_gpr (%rax, %eax)
 427         TOLOWER_gpr (%rcx, %ecx)
 428         subl    %ecx, %eax
 429 # endif
 430 L(ret2):
 431         ret
 432
 433         .p2align 4,, 10
 434 # ifdef USE_AS_STRNCMP
 435 L(return_vec_3):
 436 #  if CHAR_PER_VEC <= 32
 437         /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
 438            additional branches by adjusting the bit positions from
 439            VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
 440 #   if CHAR_PER_VEC <= 16
 441         sall    $CHAR_PER_VEC, %ecx
 442 #   else
 443         salq    $CHAR_PER_VEC, %rcx
 444 #   endif
 445 #  else
 446         /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
 447            check it.  */
 448         bsf     %VRCX, %VRCX
 449         addl    $(CHAR_PER_VEC), %ecx
 450         cmpq    %rcx, %rdx
 451         ja      L(ret_vec_3_finish)
 452         xorl    %eax, %eax
 453         ret
 454 #  endif
 455 # endif
 456
 457         /* If CHAR_PER_VEC == 64 we can't combine matches from the last
 458            2x VEC so need separate return label.  */
 459 L(return_vec_2):
 460 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
 461         bsf     %VRCX, %VRCX
 462 # else
 463         bsfq    %rcx, %rcx
 464 # endif
 465 # ifdef USE_AS_STRNCMP
 466         cmpq    %rcx, %rdx
 467         jbe     L(ret_zero)
 468 # endif
 469
 470 L(ret_vec_3_finish):
 471 # ifdef USE_AS_WCSCMP
 472         movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 473         xorl    %eax, %eax
 474         cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
 475         je      L(ret3)
 476         setl    %al
 477         negl    %eax
 478         orl     $1, %eax
 479 # else
 480         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
 481         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
 482         TOLOWER_gpr (%rax, %eax)
 483         TOLOWER_gpr (%rcx, %ecx)
 484         subl    %ecx, %eax
 485 # endif
 486 L(ret3):
 487         ret
 488
 489 # ifndef USE_AS_STRNCMP
 490         .p2align 4,, 10
 491 L(return_vec_3):
 492         bsf     %VRCX, %VRCX
 493 #  ifdef USE_AS_WCSCMP
 494         movl    (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 495         xorl    %eax, %eax
 496         cmpl    (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
 497         je      L(ret4)
 498         setl    %al
 499         negl    %eax
 500         orl     $1, %eax
 501 #  else
 502         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
 503         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
 504         TOLOWER_gpr (%rax, %eax)
 505         TOLOWER_gpr (%rcx, %ecx)
 506         subl    %ecx, %eax
 507 #  endif
 508 L(ret4):
 509         ret
 510 # endif
 511
 512         /* 32 byte align here ensures the main loop is ideally aligned
 513            for DSB.  */
 514         .p2align 5
 515 L(more_3x_vec):
 516         /* Safe to compare 4x vectors.  */
 517         VMOVU   (VEC_SIZE)(%rdi), %VMM(0)
 518         VPTESTM %VMM(0), %VMM(0), %k2
 519         CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
 520         KMOV    %k1, %VRCX
 521         TESTEQ  %VRCX
 522         jnz     L(return_vec_1)
 523
 524 # ifdef USE_AS_STRNCMP
 525         subq    $(CHAR_PER_VEC * 2), %rdx
 526         jbe     L(ret_zero)
 527 # endif
 528
 529         VMOVU   (VEC_SIZE * 2)(%rdi), %VMM(0)
 530         VPTESTM %VMM(0), %VMM(0), %k2
 531         CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
 532         KMOV    %k1, %VRCX
 533         TESTEQ  %VRCX
 534         jnz     L(return_vec_2)
 535
 536         VMOVU   (VEC_SIZE * 3)(%rdi), %VMM(0)
 537         VPTESTM %VMM(0), %VMM(0), %k2
 538         CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
 539         KMOV    %k1, %VRCX
 540         TESTEQ  %VRCX
 541         jnz     L(return_vec_3)
 542
 543 # ifdef USE_AS_STRNCMP
 544         cmpq    $(CHAR_PER_VEC * 2), %rdx
 545         jbe     L(ret_zero)
 546 # endif
 547
 548
 549 # ifdef USE_AS_WCSCMP
 550         /* any non-zero positive value that doesn't interfere with 0x1.
 551          */
 552         movl    $2, %r8d
 553
 554 # else
 555         xorl    %r8d, %r8d
 556 # endif
 557
 558         /* The prepare labels are various entry points from the page
 559            cross logic.  */
 560 L(prepare_loop):
 561
 562 # ifdef USE_AS_STRNCMP
 563 #  ifdef USE_AS_WCSCMP
 564 L(prepare_loop_no_len):
 565         movl    %edi, %ecx
 566         andl    $(VEC_SIZE * 4 - 1), %ecx
 567         shrl    $2, %ecx
 568         leaq    (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
 569 #  else
 570         /* Store N + (VEC_SIZE * 4) and place check at the beginning of
 571            the loop.  */
 572         leaq    (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 573 L(prepare_loop_no_len):
 574 #  endif
 575 # else
 576 L(prepare_loop_no_len):
 577 # endif
 578
 579         /* Align s1 and adjust s2 accordingly.  */
 580         subq    %rdi, %rsi
 581         andq    $-(VEC_SIZE * 4), %rdi
 582 L(prepare_loop_readj):
 583         addq    %rdi, %rsi
 584 # if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
 585         subq    %rdi, %rdx
 586 # endif
 587
 588 L(prepare_loop_aligned):
 589         /* eax stores distance from rsi to next page cross. These cases
 590            need to be handled specially as the 4x loop could potentially
 591            read memory past the length of s1 or s2 and across a page
 592            boundary.  */
 593         movl    $-(VEC_SIZE * 4), %eax
 594         subl    %esi, %eax
 595         andl    $(PAGE_SIZE - 1), %eax
 596
 597
 598         /* Loop 4x comparisons at a time.  */
 599         .p2align 4
 600 L(loop):
 601
 602         /* End condition for strncmp.  */
 603 # ifdef USE_AS_STRNCMP
 604         subq    $(CHAR_PER_VEC * 4), %rdx
 605         jbe     L(ret_zero)
 606 # endif
 607
 608         subq    $-(VEC_SIZE * 4), %rdi
 609         subq    $-(VEC_SIZE * 4), %rsi
 610
 611         /* Check if rsi loads will cross a page boundary.  */
 612         addl    $-(VEC_SIZE * 4), %eax
 613         jnb     L(page_cross_during_loop)
 614
 615         /* Loop entry after handling page cross during loop.  */
 616 L(loop_skip_page_cross_check):
 617         VMOVA   (VEC_SIZE * 0)(%rdi), %VMM(0)
 618         VMOVA   (VEC_SIZE * 1)(%rdi), %VMM(2)
 619         VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
 620         VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
 621
 622         VPMINU  %VMM(0), %VMM(2), %VMM(8)
 623         VPMINU  %VMM(4), %VMM(6), %VMM(9)
 624
 625         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
 626         VPMINU  %VMM(8), %VMM(9), %VMM(9)
 627
 628         /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 629         VPTESTM %VMM(9), %VMM(9), %k1
 630 # ifndef USE_AS_STRCASECMP_L
 631         vpxorq  (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
 632         vpxorq  (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
 633         vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 634         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 635            oring with YMM1. Result is stored in YMM6.  */
 636         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
 637 # else
 638         VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(1)
 639         TOLOWER_VMM (%VMM(0), %VMM(1))
 640         VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
 641         TOLOWER_VMM (%VMM(2), %VMM(3))
 642         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
 643         TOLOWER_VMM (%VMM(4), %VMM(5))
 644         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
 645         TOLOWER_VMM (%VMM(6), %VMM(7))
 646         vpxorq  %VMM(0), %VMM(1), %VMM(1)
 647         vpxorq  %VMM(2), %VMM(3), %VMM(3)
 648         vpxorq  %VMM(4), %VMM(5), %VMM(5)
 649         vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
 650 # endif
 651         /* Or together YMM3, YMM5, and YMM6.  */
 652         vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
 653
 654
 655         /* A non-zero CHAR in YMM6 represents a mismatch.  */
 656         VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
 657         KMOV    %k0, %LOOP_REG
 658
 659         TESTEQ  %LOOP_REG
 660         jz      L(loop)
 661
 662
 663         /* Find which VEC has the mismatch or end of string.  */
 664         VPTESTM %VMM(0), %VMM(0), %k1
 665         VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
 666         KMOV    %k0, %VRCX
 667         TESTEQ  %VRCX
 668         jnz     L(return_vec_0_end)
 669
 670         VPTESTM %VMM(2), %VMM(2), %k1
 671         VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
 672         KMOV    %k0, %VRCX
 673         TESTEQ  %VRCX
 674         jnz     L(return_vec_1_end)
 675
 676
 677         /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
 678          */
 679 L(return_vec_2_3_end):
 680 # ifdef USE_AS_STRNCMP
 681         subq    $(CHAR_PER_VEC * 2), %rdx
 682         jbe     L(ret_zero_end)
 683 # endif
 684
 685         VPTESTM %VMM(4), %VMM(4), %k1
 686         VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
 687         KMOV    %k0, %VRCX
 688         TESTEQ  %VRCX
 689 # if CHAR_PER_VEC <= 16
 690         sall    $CHAR_PER_VEC, %LOOP_REG
 691         orl     %ecx, %LOOP_REG
 692 # elif CHAR_PER_VEC <= 32
 693         salq    $CHAR_PER_VEC, %LOOP_REG64
 694         orq     %rcx, %LOOP_REG64
 695 # else
 696         /* We aren't combining last 2x VEC so branch on the second to last.
 697          */
 698         jnz     L(return_vec_2_end)
 699 # endif
 700
 701         /* LOOP_REG contains matches for null/mismatch from the loop. If
 702            VEC 0,1,and 2 all have no null and no mismatches then
 703            mismatch must entirely be from VEC 3 which is fully
 704            represented by LOOP_REG.  */
 705 # if CHAR_PER_VEC <= 16
 706         bsf     %LOOP_REG, %LOOP_REG
 707 # else
 708         bsfq    %LOOP_REG64, %LOOP_REG64
 709 # endif
 710 # ifdef USE_AS_STRNCMP
 711
 712         /* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to
 713            adj length before last comparison.  */
 714 #  if CHAR_PER_VEC == 64
 715         subq    $CHAR_PER_VEC, %rdx
 716         jbe     L(ret_zero_end)
 717 #  endif
 718
 719         cmpq    %LOOP_REG64, %rdx
 720         jbe     L(ret_zero_end)
 721 # endif
 722
 723 # ifdef USE_AS_WCSCMP
 724         movl    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 725         xorl    %eax, %eax
 726         cmpl    (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 727         je      L(ret5)
 728         setl    %al
 729         negl    %eax
 730         xorl    %r8d, %eax
 731 # else
 732         movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
 733         movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
 734         TOLOWER_gpr (%rax, %eax)
 735         TOLOWER_gpr (%rcx, %ecx)
 736         subl    %ecx, %eax
 737         xorl    %r8d, %eax
 738         subl    %r8d, %eax
 739 # endif
 740 L(ret5):
 741         ret
 742
 743 # ifdef USE_AS_STRNCMP
 744         .p2align 4,, 2
 745 L(ret_zero_end):
 746         xorl    %eax, %eax
 747         ret
 748 # endif
 749
 750
 751
 752         /* The L(return_vec_N_end) differ from L(return_vec_N) in that
 753            they use the value of `r8` to negate the return value. This
 754            is because the page cross logic can swap `rdi` and `rsi`.
 755          */
 756         .p2align 4,, 10
 757 # ifdef USE_AS_STRNCMP
 758 L(return_vec_1_end):
 759 #  if CHAR_PER_VEC <= 32
 760         /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
 761            without additional branches by adjusting the bit positions
 762            from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
 763 #   if CHAR_PER_VEC <= 16
 764         sall    $CHAR_PER_VEC, %ecx
 765 #   else
 766         salq    $CHAR_PER_VEC, %rcx
 767 #   endif
 768 #  else
 769         /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
 770            check it.  */
 771         bsf     %VRCX, %VRCX
 772         addl    $(CHAR_PER_VEC), %ecx
 773         cmpq    %rcx, %rdx
 774         ja      L(ret_vec_0_end_finish)
 775         xorl    %eax, %eax
 776         ret
 777 #  endif
 778 # endif
 779 L(return_vec_0_end):
 780 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
 781         bsf     %VRCX, %VRCX
 782 # else
 783         bsfq    %rcx, %rcx
 784 # endif
 785
 786 # ifdef USE_AS_STRNCMP
 787         cmpq    %rcx, %rdx
 788         jbe     L(ret_zero_end)
 789 # endif
 790
 791 L(ret_vec_0_end_finish):
 792 # ifdef USE_AS_WCSCMP
 793         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
 794         xorl    %eax, %eax
 795         cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx
 796         je      L(ret6)
 797         setl    %al
 798         negl    %eax
 799         /* This is the non-zero case for `eax` so just xorl with `r8d`
 800            to flip it if `rdi` and `rsi` were swapped.  */
 801         xorl    %r8d, %eax
 802 # else
 803         movzbl  (%rdi, %rcx), %eax
 804         movzbl  (%rsi, %rcx), %ecx
 805         TOLOWER_gpr (%rax, %eax)
 806         TOLOWER_gpr (%rcx, %ecx)
 807         subl    %ecx, %eax
 808         /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 809            logic. Subtract `r8d` after xor for zero case.  */
 810         xorl    %r8d, %eax
 811         subl    %r8d, %eax
 812 # endif
 813 L(ret6):
 814         ret
 815
 816 # ifndef USE_AS_STRNCMP
 817         .p2align 4,, 10
 818 L(return_vec_1_end):
 819         bsf     %VRCX, %VRCX
 820 #  ifdef USE_AS_WCSCMP
 821         movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 822         xorl    %eax, %eax
 823         cmpl    VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
 824         je      L(ret7)
 825         setl    %al
 826         negl    %eax
 827         xorl    %r8d, %eax
 828 #  else
 829         movzbl  VEC_SIZE(%rdi, %rcx), %eax
 830         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
 831         TOLOWER_gpr (%rax, %eax)
 832         TOLOWER_gpr (%rcx, %ecx)
 833         subl    %ecx, %eax
 834         xorl    %r8d, %eax
 835         subl    %r8d, %eax
 836 #  endif
 837 L(ret7):
 838         ret
 839 # endif
 840
 841
 842         /* If CHAR_PER_VEC == 64 we can't combine matches from the last
 843            2x VEC so VEC 2 needs a separate return label (mask in rcx).  */
 844 # if CHAR_PER_VEC == 64
 845 L(return_vec_2_end):
 846         bsf     %VRCX, %VRCX
 847 #  ifdef USE_AS_STRNCMP
 848         cmpq    %rcx, %rdx
 849         jbe     L(ret_zero_end)
 850 #  endif
 851 #  ifdef USE_AS_WCSCMP
 852         movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 853         xorl    %eax, %eax
 854         cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
 855         je      L(ret13)
 856         setl    %al
 857         negl    %eax
 858         /* This is the non-zero case for `eax` so just xorl with `r8d`
 859            to flip it if `rdi` and `rsi` were swapped.  */
 860         xorl    %r8d, %eax
 861 #  else
 862         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
 863         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
 864         TOLOWER_gpr (%rax, %eax)
 865         TOLOWER_gpr (%rcx, %ecx)
 866         subl    %ecx, %eax
 867         /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 868            logic. Subtract `r8d` after xor for zero case.  */
 869         xorl    %r8d, %eax
 870         subl    %r8d, %eax
 871 #  endif
 872 L(ret13):
 873         ret
 874 # endif
 875
 876
 877         /* Page cross in rsi in next 4x VEC.  */
 878
 879         /* TODO: Improve logic here.  */
 880         .p2align 4,, 10
 881 L(page_cross_during_loop):
 882         /* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
 883
 884         /* Optimistically rsi and rdi are both aligned in which case we
 885            don't need any logic here.  */
 886         cmpl    $-(VEC_SIZE * 4), %eax
 887         /* Don't adjust eax before jumping back to loop and we will
 888            never hit page cross case again.  */
 889         je      L(loop_skip_page_cross_check)
 890
 891         /* Check if we can safely load a VEC.  */
 892         cmpl    $-(VEC_SIZE * 3), %eax
 893         jle     L(less_1x_vec_till_page_cross)
 894
 895         VMOVA   (%rdi), %VMM(0)
 896         VPTESTM %VMM(0), %VMM(0), %k2
 897         CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
 898         KMOV    %k1, %VRCX
 899         TESTEQ  %VRCX
 900         jnz     L(return_vec_0_end)
 901
 902         /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
 903         cmpl    $-(VEC_SIZE * 2), %eax
 904         jg      L(more_2x_vec_till_page_cross)
 905
 906         .p2align 4,, 4
 907 L(less_1x_vec_till_page_cross):
             /* Undo the -(VEC_SIZE * 4) bias so eax holds the distance to the
                page cross.  */
 908         subl    $-(VEC_SIZE * 4), %eax
 909         /* Guaranteed safe to read from rdi - VEC_SIZE here. The only
 910            concerning case is first iteration if incoming s1 was near start
 911            of a page and s2 near end. If s1 was near the start of the page
 912            we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
 913            to read back -VEC_SIZE. If rdi is truly at the start of a page
 914            here, it means the previous page (rdi - VEC_SIZE) has already
 915            been loaded earlier so must be valid.  */
 916         VMOVU   -VEC_SIZE(%rdi, %rax), %VMM(0)
 917         VPTESTM %VMM(0), %VMM(0), %k2
 918         CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
 919         /* Mask of potentially valid bits. The lower bits can be out of
 920            range comparisons (but safe regarding page crosses).  */
 921
 922 # ifdef USE_AS_WCSCMP
             /* r10d = (-1 << ((rsi % VEC_SIZE) / 4)) truncated to the low
                CHAR_PER_VEC bits.  */
 923         movl    $-1, %r10d
 924         movl    %esi, %ecx
 925         andl    $(VEC_SIZE - 1), %ecx
 926         shrl    $2, %ecx
 927         shlxl   %ecx, %r10d, %ecx
 928         /* Depending on CHAR_PER_VEC extract mask for possible in-bound
 929            matches.  */
 930 #  if CHAR_PER_VEC == 16
 931         movzwl  %cx, %r10d
 932 #  elif CHAR_PER_VEC == 8
 933         movzbl  %cl, %r10d
 934 #  else
 935 #   error "Invalid CHAR_SIZE or VEC_SIZE"
 936 #  endif
 937 # else
             /* VR10 = -1 << rsi; shlx only uses the low bits of the shift
                count.  */
 938         mov     $-1, %VRCX
 939         shlx    %VRSI, %VRCX, %VR10
 940 # endif
 941
             /* Invert the compare result so set bits mark the lanes where k1
                was clear.  */
 942         KMOV    %k1, %VRCX
 943         not     %VRCX
 944
 945
 946 # ifdef USE_AS_STRNCMP
             /* If rdx (length, in chars) does not reach past the page cross
                distance, finish in L(return_page_cross_end_check).  */
 947 #  ifdef USE_AS_WCSCMP
 948         /* NB: strcasecmp not used with WCSCMP so this access to r11 is
 949            safe.  */
 950         movl    %eax, %r11d
 951         shrl    $2, %r11d
 952         cmpq    %r11, %rdx
 953 #  else
 954         cmpq    %rax, %rdx
 955 #  endif
 956         jbe     L(return_page_cross_end_check)
 957 # endif
             /* Keep the distance in OFFSET_REG; eax is overwritten below.  */
 958         movl    %eax, %OFFSET_REG
 959
 960         /* Readjust eax before potentially returning to the loop.  */
 961         addl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
 962
             /* Drop the out-of-range low lanes. If nothing is left, resume the
                loop.  */
 963         and     %VR10, %VRCX
 964         jz      L(loop_skip_page_cross_check)
 965
 966         bsf     %VRCX, %VRCX
 967
 968 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
             /* ecx = distance - VEC_SIZE + idx * SIZE_OF_CHAR.  */
 969         leal    -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
 970 L(return_page_cross_cmp_mem):
 971 # else
 972         addl    %OFFSET_REG, %ecx
 973 # endif
             /* Reload the two chars at offset rcx and build the return value.
                VEC_OFFSET is defined outside this chunk.  */
 974 # ifdef USE_AS_WCSCMP
             /* eax = 0 if equal, else -(s1 char < s2 char) xor r8d.  */
 975         movl    VEC_OFFSET(%rdi, %rcx), %edx
 976         xorl    %eax, %eax
 977         cmpl    VEC_OFFSET(%rsi, %rcx), %edx
 978         je      L(ret8)
 979         setl    %al
 980         negl    %eax
 981         xorl    %r8d, %eax
 982 # else
             /* eax = byte difference. The xor/sub pair with r8d is a no-op for
                r8d == 0 and a negation for r8d == -1.  */
 983         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
 984         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
 985         TOLOWER_gpr (%rax, %eax)
 986         TOLOWER_gpr (%rcx, %ecx)
 987         subl    %ecx, %eax
 988         xorl    %r8d, %eax
 989         subl    %r8d, %eax
 990 # endif
 991 L(ret8):
 992         ret
 993
 994 # ifdef USE_AS_STRNCMP
 995         .p2align 4,, 10
             /* Reached from L(less_1x_vec_till_page_cross) when the length in
                rdx does not extend past the page cross distance in eax.  */
 996 L(return_page_cross_end_check):
 997         and     %VR10, %VRCX
 998         /* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
 999            tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is
1000            guaranteed to be <= CHAR_PER_VEC so we will only use the return
1001            idx if VRCX was non-zero.  */
1002         tzcnt   %VRCX, %VRCX
             /* ecx = eax - VEC_SIZE + idx * SIZE_OF_CHAR.  */
1003         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1004 #  ifdef USE_AS_WCSCMP
             /* Scale the length to bytes to match ecx.  */
1005         sall    $2, %edx
1006 #  endif
             /* Only compare memory if the offset is within the length.  */
1007         cmpl    %ecx, %edx
1008         ja      L(return_page_cross_cmp_mem)
1009         xorl    %eax, %eax
1010         ret
1011 # endif
1012
1013
1014         .p2align 4,, 10
1015 L(more_2x_vec_till_page_cross):
1016         /* If more than 2x VEC till page cross we will complete a full
1017            loop iteration here.  */
1018
             /* Second VEC; the first was already checked in
                L(page_cross_during_loop).  */
1019         VMOVA   VEC_SIZE(%rdi), %VMM(0)
1020         VPTESTM %VMM(0), %VMM(0), %k2
1021         CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
1022         KMOV    %k1, %VRCX
1023         TESTEQ  %VRCX
1024         jnz     L(return_vec_1_end)
1025
1026 # ifdef USE_AS_STRNCMP
             /* Length is covered by the two VECs already compared.  */
1027         cmpq    $(CHAR_PER_VEC * 2), %rdx
1028         jbe     L(ret_zero_in_loop_page_cross)
1029 # endif
1030
             /* Undo the -(VEC_SIZE * 4) bias so eax holds the distance to the
                page cross.  */
1031         subl    $-(VEC_SIZE * 4), %eax
1032
1033         /* Safe to include comparisons from lower bytes.  */
1034         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
1035         VPTESTM %VMM(0), %VMM(0), %k2
1036         CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
1037         KMOV    %k1, %VRCX
1038         TESTEQ  %VRCX
1039         jnz     L(return_vec_page_cross_0)
1040
             /* Last VEC before the page cross.  */
1041         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
1042         VPTESTM %VMM(0), %VMM(0), %k2
1043         CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
1044         KMOV    %k1, %VRCX
1045         TESTEQ  %VRCX
1046         jnz     L(return_vec_page_cross_1)
1047
1048 # ifdef USE_AS_STRNCMP
1049         /* Must check length here as length might preclude reading next
1050            page.  */
1051 #  ifdef USE_AS_WCSCMP
1052         /* NB: strcasecmp not used with WCSCMP so this access to r11 is
1053            safe.  */
1054         movl    %eax, %r11d
1055         shrl    $2, %r11d
1056         cmpq    %r11, %rdx
1057 #  else
1058         cmpq    %rax, %rdx
1059 #  endif
1060         jbe     L(ret_zero_in_loop_page_cross)
1061 # endif
1062
1063         /* Finish the loop.  */
             /* VEC 2 and VEC 3 are checked together: k1 marks lanes where both
                are non-null, VMM(6) accumulates the differences against
                rsi.  */
1064         VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
1065         VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
1066         VPMINU  %VMM(4), %VMM(6), %VMM(9)
1067         VPTESTM %VMM(9), %VMM(9), %k1
1068 # ifndef USE_AS_STRCASECMP_L
1069         vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
1070         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
1071         vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
1072 # else
1073         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
1074         TOLOWER_VMM (%VMM(4), %VMM(5))
1075         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
1076         TOLOWER_VMM (%VMM(6), %VMM(7))
1077         vpxorq  %VMM(4), %VMM(5), %VMM(5)
1078         vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
1079 # endif
             /* k0 = lanes of VMM(6) that are zero, restricted to k1.  */
1080         VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
1081         KMOV    %k0, %LOOP_REG
1082         TESTEQ  %LOOP_REG
1083         jnz     L(return_vec_2_3_end)
1084
1085         /* Best for code size to include ucond-jmp here. Would be faster
1086            if this case is hot to duplicate the L(return_vec_2_3_end)
1087            code as fall-through and have jump back to loop on mismatch
1088            comparison.  */
             /* Advance both pointers by 4x VEC and rebias eax to match before
                re-entering the loop.  */
1089         subq    $-(VEC_SIZE * 4), %rdi
1090         subq    $-(VEC_SIZE * 4), %rsi
1091         addl    $(PAGE_SIZE - VEC_SIZE * 8), %eax
1092 # ifdef USE_AS_STRNCMP
1093         subq    $(CHAR_PER_VEC * 4), %rdx
1094         ja      L(loop_skip_page_cross_check)
1095 L(ret_zero_in_loop_page_cross):
1096         xorl    %eax, %eax
1097         ret
1098 # else
1099         jmp     L(loop_skip_page_cross_check)
1100 # endif
1101
1102
1103         .p2align 4,, 10
             /* Return paths for a hit in one of the two unaligned VECs loaded
                in L(more_2x_vec_till_page_cross). Entry 0 is the VEC at
                -(VEC_SIZE * 2); lowering eax by VEC_SIZE lets it share the
                offset computation of entry 1.  */
1104 L(return_vec_page_cross_0):
1105         addl    $-VEC_SIZE, %eax
1106 L(return_vec_page_cross_1):
1107         bsf     %VRCX, %VRCX
1108 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
             /* ecx = eax - VEC_SIZE + idx * SIZE_OF_CHAR.  */
1109         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1110 #  ifdef USE_AS_STRNCMP
1111 #   ifdef USE_AS_WCSCMP
1112         /* Must divide ecx instead of multiply rdx due to overflow.  */
1113         movl    %ecx, %eax
1114         shrl    $2, %eax
1115         cmpq    %rax, %rdx
1116 #   else
1117         cmpq    %rcx, %rdx
1118 #   endif
             /* Offset is at or beyond the length: strings compare equal.  */
1119         jbe     L(ret_zero_in_loop_page_cross)
1120 #  endif
1121 # else
1122         addl    %eax, %ecx
1123 # endif
1124
             /* Reload the two chars at offset rcx and build the return value.
                VEC_OFFSET is defined outside this chunk.  */
1125 # ifdef USE_AS_WCSCMP
             /* eax = 0 if equal, else -(s1 char < s2 char) xor r8d.  */
1126         movl    VEC_OFFSET(%rdi, %rcx), %edx
1127         xorl    %eax, %eax
1128         cmpl    VEC_OFFSET(%rsi, %rcx), %edx
1129         je      L(ret9)
1130         setl    %al
1131         negl    %eax
1132         xorl    %r8d, %eax
1133 # else
             /* eax = byte difference. The xor/sub pair with r8d is a no-op for
                r8d == 0 and a negation for r8d == -1.  */
1134         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
1135         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
1136         TOLOWER_gpr (%rax, %eax)
1137         TOLOWER_gpr (%rcx, %ecx)
1138         subl    %ecx, %eax
1139         xorl    %r8d, %eax
1140         subl    %r8d, %eax
1141 # endif
1142 L(ret9):
1143         ret
1144
1145
1146         .p2align 4,, 10
1147 L(page_cross):
1148 # ifndef USE_AS_STRNCMP
1149         /* If both are VEC aligned we don't need any special logic here.
1150            Only valid for strcmp where stop condition is guaranteed to
1151            be reachable by just reading memory.  */
1152         testl   $((VEC_SIZE - 1) << 20), %eax
1153         jz      L(no_page_cross)
1154 # endif
1155
             /* eax = page offset of rdi, ecx = page offset of rsi.  */
1156         movl    %edi, %eax
1157         movl    %esi, %ecx
1158         andl    $(PAGE_SIZE - 1), %eax
1159         andl    $(PAGE_SIZE - 1), %ecx
1160
1161         xorl    %OFFSET_REG, %OFFSET_REG
1162
1163         /* Check which is closer to page cross, s1 or s2.  */
1164         cmpl    %eax, %ecx
1165         jg      L(page_cross_s2)
1166
1167         /* The previous page cross check has false positives. Check for
1168            true positive as page cross logic is very expensive.  */
1169         subl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
1170         jbe     L(no_page_cross)
1171
1172
1173         /* Set r8 to not interfere with normal return value (rdi and rsi
1174            did not swap).  */
1175 # ifdef USE_AS_WCSCMP
1176         /* Any non-zero positive value that doesn't interfere with 0x1.
1177          */
1178         movl    $2, %r8d
1179 # else
1180         xorl    %r8d, %r8d
1181 # endif
1182
1183         /* Check if less than 1x VEC till page cross.  */
             /* After this eax = page offset - (PAGE_SIZE - VEC_SIZE).  */
1184         subl    $(VEC_SIZE * 3), %eax
1185         jg      L(less_1x_vec_till_page)
1186
1187
1188         /* If more than 1x VEC till page cross, loop through safely
1189            loadable memory until within 1x VEC of page cross.  */
1190         .p2align 4,, 8
1191 L(page_cross_loop):
             /* OFFSET_REG is the current offset in chars; eax is non-positive
                on entry and is stepped by VEC_SIZE until it reaches zero or
                above.  */
1192         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
1193         VPTESTM %VMM(0), %VMM(0), %k2
1194         CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
1195         KMOV    %k1, %VRCX
1196         TESTEQ  %VRCX
1197         jnz     L(check_ret_vec_page_cross)
1198         addl    $CHAR_PER_VEC, %OFFSET_REG
1199 # ifdef USE_AS_STRNCMP
             /* Length exhausted without a mismatch: return 0.  */
1200         cmpq    %OFFSET_REG64, %rdx
1201         jbe     L(ret_zero_page_cross)
1202 # endif
1203         addl    $VEC_SIZE, %eax
1204         jl      L(page_cross_loop)
1205
1206 # ifdef USE_AS_WCSCMP
             /* Convert the byte overshoot in eax to chars.  */
1207         shrl    $2, %eax
1208 # endif
1209
1210
             /* Step back by the overshoot so the next load ends at the page
                boundary.  */
1211         subl    %eax, %OFFSET_REG
1212         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
1213            to not cross page so is safe to load. Since we have already
1214            loaded at least 1 VEC from rsi it is also guaranteed to be
1215            safe.  */
1216         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
1217         VPTESTM %VMM(0), %VMM(0), %k2
1218         CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
1219
1220         KMOV    %k1, %VRCX
1221 # ifdef USE_AS_STRNCMP
             /* If the length ends within this VEC, take the length-checked
                return path.  */
1222         leal    CHAR_PER_VEC(%OFFSET_REG64), %eax
1223         cmpq    %rax, %rdx
1224         jbe     L(check_ret_vec_page_cross2)
             /* NOTE(review): rdx is adjusted here, presumably into the form
                L(prepare_loop_no_len) expects; that label is outside this
                chunk, so confirm there.  */
1225 #  ifdef USE_AS_WCSCMP
1226         addq    $-(CHAR_PER_VEC * 2), %rdx
1227 #  else
1228         addq    %rdi, %rdx
1229 #  endif
1230 # endif
1231         TESTEQ  %VRCX
1232         jz      L(prepare_loop_no_len)
             /* Otherwise fall through to L(ret_vec_page_cross).  */
1233
1234         .p2align 4,, 4
             /* Return the comparison result for the first failing char. VRCX
                holds the TESTEQ-adjusted mask, OFFSET_REG the offset (in
                chars) of the VEC it came from.  */
1235 L(ret_vec_page_cross):
1236 # ifndef USE_AS_STRNCMP
1237 L(check_ret_vec_page_cross):
1238 # endif
1239         tzcnt   %VRCX, %VRCX
1240         addl    %OFFSET_REG, %ecx
1241 L(ret_vec_page_cross_cont):
             /* rcx = char index of the position to compare.  */
1242 # ifdef USE_AS_WCSCMP
             /* eax = 0 if equal, else -(s1 char < s2 char) xor r8d.  */
1243         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
1244         xorl    %eax, %eax
1245         cmpl    (%rsi, %rcx, SIZE_OF_CHAR), %edx
1246         je      L(ret12)
1247         setl    %al
1248         negl    %eax
1249         xorl    %r8d, %eax
1250 # else
             /* eax = byte difference. The xor/sub pair with r8d is a no-op for
                r8d == 0 and a negation for r8d == -1.  */
1251         movzbl  (%rdi, %rcx, SIZE_OF_CHAR), %eax
1252         movzbl  (%rsi, %rcx, SIZE_OF_CHAR), %ecx
1253         TOLOWER_gpr (%rax, %eax)
1254         TOLOWER_gpr (%rcx, %ecx)
1255         subl    %ecx, %eax
1256         xorl    %r8d, %eax
1257         subl    %r8d, %eax
1258 # endif
1259 L(ret12):
1260         ret
1261
1262
1263 # ifdef USE_AS_STRNCMP
1264         .p2align 4,, 10
             /* Length-checked return paths. Entry 2 still has the raw k1 mask
                in VRCX and applies TESTEQ itself; entry 1 expects that to be
                done already.  */
1265 L(check_ret_vec_page_cross2):
1266         TESTEQ  %VRCX
1267 L(check_ret_vec_page_cross):
1268         tzcnt   %VRCX, %VRCX
1269         addl    %OFFSET_REG, %ecx
             /* Only compare memory if the char index is within the length in
                rdx.  */
1270         cmpq    %rcx, %rdx
1271         ja      L(ret_vec_page_cross_cont)
1272         .p2align 4,, 2
1273 L(ret_zero_page_cross):
1274         xorl    %eax, %eax
1275         ret
1276 # endif
1277
1278         .p2align 4,, 4
             /* rsi has the larger page offset (in ecx). Swap rdi and rsi so the
                shared code below can treat rdi as the pointer closer to the
                page end, and record the swap in r8d.  */
1279 L(page_cross_s2):
1280         /* Ensure this is a true page cross.  */
1281         subl    $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1282         jbe     L(no_page_cross)
1283
1284
1285         movl    %ecx, %eax
1286         movq    %rdi, %rcx
1287         movq    %rsi, %rdi
1288         movq    %rcx, %rsi
1289
1290         /* set r8 to negate return value as rdi and rsi swapped.  */
1291 # ifdef USE_AS_WCSCMP
1292         movl    $-4, %r8d
1293 # else
1294         movl    $-1, %r8d
1295 # endif
1296         xorl    %OFFSET_REG, %OFFSET_REG
1297
1298         /* Check if more than 1x VEC till page cross.  */
1299         subl    $(VEC_SIZE * 3), %eax
1300         jle     L(page_cross_loop)
             /* Otherwise fall through to L(less_1x_vec_till_page).  */
1301
1302         .p2align 4,, 6
1303 L(less_1x_vec_till_page):
             /* eax = page offset - (PAGE_SIZE - VEC_SIZE), so a larger eax
                means fewer bytes left before the page end.  */
1304 # ifdef USE_AS_WCSCMP
1305         shrl    $2, %eax
1306 # endif
1307
1308         /* Find largest load size we can use. VEC_SIZE == 64 only check
1309            if we can do a full ymm load.  */
1310 # if VEC_SIZE == 64
1311
1312         cmpl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
1313         ja      L(less_32_till_page)
1314
1315
1316         /* Use 32 byte comparison.  */
1317         VMOVU   (%rdi), %VMM_256(0)
1318         VPTESTM %VMM_256(0), %VMM_256(0), %k2
1319         CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
1320         kmovd   %k1, %ecx
             /* ecx becomes zero only if the mask had every lane set (0xff for
                8 dwords, all 32 bits for bytes).  */
1321 #  ifdef USE_AS_WCSCMP
1322         subl    $0xff, %ecx
1323 #  else
1324         incl    %ecx
1325 #  endif
1326         jnz     L(check_ret_vec_page_cross)
1327         movl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
1328 #  ifdef USE_AS_STRNCMP
1329         cmpq    %OFFSET_REG64, %rdx
1330         jbe     L(ret_zero_page_cross_slow_case64)
1331         subl    %eax, %OFFSET_REG
1332 #  else
1333         /* Explicit check for 32 byte alignment.  */
1334         subl    %eax, %OFFSET_REG
1335         jz      L(prepare_loop)
1336 #  endif
             /* OFFSET_REG = (VEC_SIZE - 32) / SIZE_OF_CHAR - eax, so this
                second 32 byte load ends at the page boundary.  */
1337         VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
1338         VPTESTM %VMM_256(0), %VMM_256(0), %k2
1339         CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
1340         kmovd   %k1, %ecx
1341 #  ifdef USE_AS_WCSCMP
1342         subl    $0xff, %ecx
1343 #  else
1344         incl    %ecx
1345 #  endif
1346         jnz     L(check_ret_vec_page_cross)
             /* Both loads matched. Point rdi/rsi VEC_SIZE * 4 before the first
                char not yet compared and continue at
                L(prepare_loop_aligned).  */
1347 #  ifdef USE_AS_STRNCMP
1348         addl    $(32 / SIZE_OF_CHAR), %OFFSET_REG
1349         subq    %OFFSET_REG64, %rdx
1350         jbe     L(ret_zero_page_cross_slow_case64)
1351         subq    $-(CHAR_PER_VEC * 4), %rdx
1352
1353         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1354         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1355 #  else
1356         leaq    (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1357         leaq    (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1358 #  endif
1359         jmp     L(prepare_loop_aligned)
1360
1361 #  ifdef USE_AS_STRNCMP
1362         .p2align 4,, 2
1363 L(ret_zero_page_cross_slow_case64):
1364         xorl    %eax, %eax
1365         ret
1366 #  endif
1367 L(less_32_till_page):
1368 # endif
1369
1370         /* Find largest load size we can use.  */
1371         cmpl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
1372         ja      L(less_16_till_page)
1373
1374         /* Use 16 byte comparison.  */
1375         vmovdqu (%rdi), %xmm0
1376         VPTESTM %xmm0, %xmm0, %k2
1377         CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
1378         kmovd   %k1, %ecx
             /* ecx (cx for bytes) becomes zero only if the mask had every lane
                set (0xf for 4 dwords, 0xffff for 16 bytes).  */
1379 # ifdef USE_AS_WCSCMP
1380         subl    $0xf, %ecx
1381 # else
1382         incw    %cx
1383 # endif
1384         jnz     L(check_ret_vec_page_cross)
1385
1386         movl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
1387 # ifdef USE_AS_STRNCMP
             /* Return 0 if the length is covered by the 16 bytes just
                compared. For VEC_SIZE == 32, OFFSET_REG equals
                16 / SIZE_OF_CHAR.  */
1388 #  if VEC_SIZE == 32
1389         cmpq    %OFFSET_REG64, %rdx
1390 #  else
1391         cmpq    $(16 / SIZE_OF_CHAR), %rdx
1392 #  endif
1393         jbe     L(ret_zero_page_cross_slow_case0)
1394         subl    %eax, %OFFSET_REG
1395 # else
1396         /* Explicit check for 16 byte alignment.  */
1397         subl    %eax, %OFFSET_REG
1398         jz      L(prepare_loop)
1399 # endif
             /* OFFSET_REG = (VEC_SIZE - 16) / SIZE_OF_CHAR - eax, so this
                second 16 byte load ends at the page boundary.  */
1400         vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1401         VPTESTM %xmm0, %xmm0, %k2
1402         CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
1403         kmovd   %k1, %ecx
1404 # ifdef USE_AS_WCSCMP
1405         subl    $0xf, %ecx
1406 # else
1407         incw    %cx
1408 # endif
1409         jnz     L(check_ret_vec_page_cross)
             /* Both loads matched. Point rdi/rsi VEC_SIZE * 4 before the first
                char not yet compared and continue at
                L(prepare_loop_aligned).  */
1410 # ifdef USE_AS_STRNCMP
1411         addl    $(16 / SIZE_OF_CHAR), %OFFSET_REG
1412         subq    %OFFSET_REG64, %rdx
1413         jbe     L(ret_zero_page_cross_slow_case0)
1414         subq    $-(CHAR_PER_VEC * 4), %rdx
1415
1416         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1417         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1418 # else
1419         leaq    (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1420         leaq    (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1421 # endif
1422         jmp     L(prepare_loop_aligned)
1423
1424 # ifdef USE_AS_STRNCMP
1425         .p2align 4,, 2
1426 L(ret_zero_page_cross_slow_case0):
1427         xorl    %eax, %eax
1428         ret
1429 # endif
1430
1431
1432         .p2align 4,, 10
             /* Fewer than 16 bytes remain before the page end; try two 8 byte
                comparisons.  */
1433 L(less_16_till_page):
1434         cmpl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
1435         ja      L(less_8_till_page)
1436
1437         /* Use 8 byte comparison.  */
1438         vmovq   (%rdi), %xmm0
1439         vmovq   (%rsi), %xmm1
1440         VPTESTM %xmm0, %xmm0, %k2
1441         CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1442         kmovd   %k1, %ecx
             /* ecx (cl for bytes) becomes zero only if the mask had the low
                2 dword lanes (0x3) or low 8 byte lanes (0xff) all set.  */
1443 # ifdef USE_AS_WCSCMP
1444         subl    $0x3, %ecx
1445 # else
1446         incb    %cl
1447 # endif
1448         jnz     L(check_ret_vec_page_cross)
1449
1450
1451 # ifdef USE_AS_STRNCMP
             /* Return 0 if the length is covered by the 8 bytes just
                compared.  */
1452         cmpq    $(8 / SIZE_OF_CHAR), %rdx
1453         jbe     L(ret_zero_page_cross_slow_case0)
1454 # endif
             /* OFFSET_REG = (VEC_SIZE - 8) / SIZE_OF_CHAR - eax, so this second
                8 byte load ends at the page boundary.  */
1455         movl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
1456         subl    %eax, %OFFSET_REG
1457
1458         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1459         vmovq   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1460         VPTESTM %xmm0, %xmm0, %k2
1461         CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1462         kmovd   %k1, %ecx
1463 # ifdef USE_AS_WCSCMP
1464         subl    $0x3, %ecx
1465 # else
1466         incb    %cl
1467 # endif
1468         jnz     L(check_ret_vec_page_cross)
1469
1470
             /* Both loads matched. Point rdi/rsi VEC_SIZE * 4 before the first
                char not yet compared and continue at
                L(prepare_loop_aligned).  */
1471 # ifdef USE_AS_STRNCMP
1472         addl    $(8 / SIZE_OF_CHAR), %OFFSET_REG
1473         subq    %OFFSET_REG64, %rdx
1474         jbe     L(ret_zero_page_cross_slow_case0)
1475         subq    $-(CHAR_PER_VEC * 4), %rdx
1476
1477         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1478         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1479 # else
1480         leaq    (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1481         leaq    (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1482 # endif
1483         jmp     L(prepare_loop_aligned)
1484
1485
1486
1487
1488         .p2align 4,, 10
1489 L(less_8_till_page):
1490 # ifdef USE_AS_WCSCMP
1491         /* If using wchar then this is the only check before we reach
1492            the page boundary.  */
1493         movl    (%rdi), %eax
1494         movl    (%rsi), %ecx
1495         cmpl    %ecx, %eax
1496         jnz     L(ret_less_8_wcs)
1497 #  ifdef USE_AS_STRNCMP
1498         addq    $-(CHAR_PER_VEC * 2), %rdx
1499         /* We already checked for len <= 1 so cannot hit that case here.
1500          */
1501 #  endif
             /* Chars are equal here. A non-null char continues in the main
                path; a null char returns eax == 0.  */
1502         testl   %eax, %eax
1503         jnz     L(prepare_loop)
1504         ret
1505
1506         .p2align 4,, 8
1507 L(ret_less_8_wcs):
             /* Flags are still those of the cmpl above. Result is
                -(s1 char < s2 char) xor r8d. This relies on the upper bits of
                OFFSET_REG being zero; it is cleared in L(page_cross) and
                L(page_cross_s2).  */
1508         setl    %OFFSET_REG8
1509         negl    %OFFSET_REG
1510         movl    %OFFSET_REG, %eax
1511         xorl    %r8d, %eax
1512         ret
1513
1514 # else
1515         cmpl    $(VEC_SIZE - 4), %eax
1516         ja      L(less_4_till_page)
1517
             /* Use 4 byte comparison; an all-equal, non-null result is mask
                0xf.  */
1518         vmovd   (%rdi), %xmm0
1519         vmovd   (%rsi), %xmm1
1520         VPTESTM %xmm0, %xmm0, %k2
1521         CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1522         kmovd   %k1, %ecx
1523         subl    $0xf, %ecx
1524         jnz     L(check_ret_vec_page_cross)
1525
1526 #  ifdef USE_AS_STRNCMP
             /* Return 0 if the length is covered by the 4 bytes just
                compared.  */
1527         cmpq    $4, %rdx
1528         jbe     L(ret_zero_page_cross_slow_case1)
1529 #  endif
             /* OFFSET_REG = (VEC_SIZE - 4) - eax, so this second 4 byte load
                ends at the page boundary.  */
1530         movl    $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
1531         subl    %eax, %OFFSET_REG
1532
1533         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1534         vmovd   (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1535         VPTESTM %xmm0, %xmm0, %k2
1536         CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1537         kmovd   %k1, %ecx
1538         subl    $0xf, %ecx
1539         jnz     L(check_ret_vec_page_cross)
             /* Both loads matched. Point rdi/rsi VEC_SIZE * 4 before the first
                char not yet compared and continue at
                L(prepare_loop_aligned).  */
1540 #  ifdef USE_AS_STRNCMP
1541         addl    $(4 / SIZE_OF_CHAR), %OFFSET_REG
1542         subq    %OFFSET_REG64, %rdx
1543         jbe     L(ret_zero_page_cross_slow_case1)
1544         subq    $-(CHAR_PER_VEC * 4), %rdx
1545
1546         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1547         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1548 #  else
1549         leaq    (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1550         leaq    (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1551 #  endif
1552         jmp     L(prepare_loop_aligned)
1553
1554
1555 #  ifdef USE_AS_STRNCMP
1556         .p2align 4,, 2
1557 L(ret_zero_page_cross_slow_case1):
1558         xorl    %eax, %eax
1559         ret
1560 #  endif
1561
1562         .p2align 4,, 10
1563 L(less_4_till_page):
             /* Make rsi the difference (rsi - rdi) so only rdi has to be
                advanced in the loop.  */
1564         subq    %rdi, %rsi
1565         /* Extremely slow byte comparison loop.  */
1566 L(less_4_loop):
1567         movzbl  (%rdi), %eax
1568         movzbl  (%rsi, %rdi), %ecx
1569         TOLOWER_gpr (%rax, %eax)
1570         TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1571         subl    %BYTE_LOOP_REG, %eax
1572         jnz     L(ret_less_4_loop)
             /* Bytes are equal here; a zero byte loaded into ecx ends the
                comparison with result 0.  */
1573         testl   %ecx, %ecx
1574         jz      L(ret_zero_4_loop)
1575 #  ifdef USE_AS_STRNCMP
1576         decq    %rdx
1577         jz      L(ret_zero_4_loop)
1578 #  endif
1579         incq    %rdi
1580         /* end condition is reach page boundary (rdi is aligned).  */
1581         testb   $(VEC_SIZE - 1), %dil
1582         jnz     L(less_4_loop)
             /* Rebuild rsi as an absolute pointer and bias both pointers by
                -(VEC_SIZE * 4) before continuing at
                L(prepare_loop_aligned).  */
1583         leaq    -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1584         addq    $-(VEC_SIZE * 4), %rdi
1585 #  ifdef USE_AS_STRNCMP
1586         subq    $-(CHAR_PER_VEC * 4), %rdx
1587 #  endif
1588         jmp     L(prepare_loop_aligned)
1589
1590 L(ret_zero_4_loop):
1591         xorl    %eax, %eax
1592         ret
1593 L(ret_less_4_loop):
             /* The xor/sub pair with r8d is a no-op for r8d == 0 and a
                negation for r8d == -1.  */
1594         xorl    %r8d, %eax
1595         subl    %r8d, %eax
1596         ret
1597 # endif
1598         cfi_endproc
1599         .size   STRCMP, .-STRCMP
1600 #endif