sysdeps/x86_64/multiarch/strcmp-avx2.S

   1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   2    Copyright (C) 2018-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (3)
  22
  23 # ifndef STRCMP_ISA
  24 #  define STRCMP_ISA    _avx2
  25 # endif
  26
  27 # include "strcmp-naming.h"
  28
  29 # include <sysdep.h>
  30
  31 # if defined USE_AS_STRCASECMP_L
  32 #  include "locale-defines.h"
  33 # endif
  34
  35 # ifndef STRCMP
  36 #  define STRCMP        __strcmp_avx2
  37 # endif
  38
  39 # define PAGE_SIZE      4096
  40
  41         /* VEC_SIZE = Number of bytes in a ymm register.  */
  42 # define VEC_SIZE       32
  43
  44 # define VMOVU  vmovdqu
  45 # define VMOVA  vmovdqa
  46
  47 # ifdef USE_AS_WCSCMP
  48         /* Compare packed dwords.  */
  49 #  define VPCMPEQ       vpcmpeqd
  50         /* Compare packed dwords and store minimum.  */
  51 #  define VPMINU        vpminud
  52         /* 1 dword char == 4 bytes.  */
  53 #  define SIZE_OF_CHAR  4
  54 # else
  55         /* Compare packed bytes.  */
  56 #  define VPCMPEQ       vpcmpeqb
  57         /* Compare packed bytes and store minimum.  */
  58 #  define VPMINU        vpminub
  59         /* 1 byte char == 1 byte.  */
  60 #  define SIZE_OF_CHAR  1
  61 # endif
  62
  63 # ifdef USE_AS_STRNCMP
  64 #  define LOOP_REG      r9d
  65 #  define LOOP_REG64    r9
  66
  67 #  define OFFSET_REG8   r9b
  68 #  define OFFSET_REG    r9d
  69 #  define OFFSET_REG64  r9
  70 # else
  71 #  define LOOP_REG      edx
  72 #  define LOOP_REG64    rdx
  73
  74 #  define OFFSET_REG8   dl
  75 #  define OFFSET_REG    edx
  76 #  define OFFSET_REG64  rdx
  77 # endif
  78
  79 # ifndef VZEROUPPER
  80 #  define VZEROUPPER    vzeroupper
  81 # endif
  82
  83 # if defined USE_AS_STRNCMP
  84 #  define VEC_OFFSET    0
  85 # else
  86 #  define VEC_OFFSET    (-VEC_SIZE)
  87 # endif
  88
  89 # ifdef USE_AS_STRCASECMP_L
  90 #  define BYTE_LOOP_REG OFFSET_REG
  91 # else
  92 #  define BYTE_LOOP_REG ecx
  93 # endif
  94
  95 # ifdef USE_AS_STRCASECMP_L
  96 #  ifdef USE_AS_STRNCMP
  97 #   define LOCALE_REG   rcx
  98 #   define LOCALE_REG_LP        RCX_LP
  99 #  else
 100 #   define LOCALE_REG   rdx
 101 #   define LOCALE_REG_LP        RDX_LP
 102 #  endif
 103 # endif
 104
 105 # define xmmZERO        xmm15
 106 # define ymmZERO        ymm15
 107
 108 # define LCASE_MIN_ymm  %ymm10
 109 # define LCASE_MAX_ymm  %ymm11
 110 # define CASE_ADD_ymm   %ymm12
 111
 112 # define LCASE_MIN_xmm  %xmm10
 113 # define LCASE_MAX_xmm  %xmm11
 114 # define CASE_ADD_xmm   %xmm12
 115
 116         /* r11 is never use elsewhere so this is safe to maintain.  */
 117 # define TOLOWER_BASE   %r11
 118
 119 # ifndef SECTION
 120 #  define SECTION(p)    p##.avx
 121 # endif
 122
 123 # ifdef USE_AS_STRCASECMP_L
 124 #  define REG(x, y) x ## y
 125 #  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)                    \
 126         vpaddb  REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);                            \
 127         vpaddb  REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);                            \
 128         vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);                      \
 129         vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);                      \
 130         vpandn  REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);                        \
 131         vpandn  REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);                        \
 132         vpaddb  REG(%ext, 8), reg1_in, reg1_out;                                                        \
 133         vpaddb  REG(%ext, 9), reg2_in, reg2_out
 134
 135 #  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
 136 #  define TOLOWER_ymm(...)      TOLOWER(__VA_ARGS__, ymm)
 137 #  define TOLOWER_xmm(...)      TOLOWER(__VA_ARGS__, xmm)
 138
 139 #  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)                  \
 140         TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext);                                     \
 141         VPCMPEQ scratch_reg, s2_reg, reg_out
 142
 143 #  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)                  \
 144         VMOVU   s2_mem, reg_out;                                                                                        \
 145         CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
 146
 147 #  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
 148 #  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
 149
 150 #  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
 151 #  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
 152
 153 # else
 154 #  define TOLOWER_gpr(...)
 155 #  define TOLOWER_ymm(...)
 156 #  define TOLOWER_xmm(...)
 157
 158 #  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)                   \
 159         VPCMPEQ s2_reg, s1_reg, reg_out
 160
 161 #  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
 162
 163 #  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
 164 #  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
 165 # endif
 166
 167 /* Warning!
 168            wcscmp/wcsncmp have to use SIGNED comparison for elements.
 169            strcmp/strncmp have to use UNSIGNED comparison for elements.
 170 */
 171
 172 /* The main idea of the string comparison (byte or dword) using AVX2
 173    consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on
 174    either packed bytes or dwords depending on USE_AS_WCSCMP. In order
 175    to check the null char, algorithm keeps the matched bytes/dwords,
 176    requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general,
 177    the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and
 178    one VPMINU instructions, together with movdqu and testl instructions.
 179    Main loop (away from from page boundary) compares 4 vectors are a time,
 180    effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop.
 181
 182    The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
 183    is the same as strcmp, except that an a maximum offset is tracked.  If
 184    the maximum offset is reached before a difference is found, zero is
 185    returned.  */
 186
 187         .section SECTION(.text), "ax", @progbits
 188         .align  16
 189         .type   STRCMP, @function
 190         .globl  STRCMP
 191
 192 # ifdef USE_AS_STRCASECMP_L
 193 ENTRY (STRCASECMP)
 194         movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
 195         mov     %fs:(%rax), %LOCALE_REG_LP
 196
 197         /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
 198         .p2align 4
 199 END (STRCASECMP)
 200         /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
 201 # endif
 202
 203         .p2align 4
 204 STRCMP:
 205         cfi_startproc
 206         _CET_ENDBR
 207         CALL_MCOUNT
 208
 209 # if defined USE_AS_STRCASECMP_L
 210         /* We have to fall back on the C implementation for locales with
 211            encodings not matching ASCII for single bytes.  */
 212 #  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 213         mov     LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
 214 #  else
 215         mov     (%LOCALE_REG), %RAX_LP
 216 #  endif
 217         testb   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 218         jne     STRCASECMP_L_NONASCII
 219         leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 220 # endif
 221
 222 # ifdef USE_AS_STRNCMP
 223         /* Don't overwrite LOCALE_REG (rcx) until we have pass
 224            L(one_or_less). Otherwise we might use the wrong locale in
 225            the OVERFLOW_STRCMP (strcasecmp_l).  */
 226 #  ifdef __ILP32__
 227         /* Clear the upper 32 bits.  */
 228         movl    %edx, %edx
 229 #  endif
 230         cmp     $1, %RDX_LP
 231         /* Signed comparison intentional. We use this branch to also
 232            test cases where length >= 2^63. These very large sizes can be
 233            handled with strcmp as there is no way for that length to
 234            actually bound the buffer.  */
 235         jle     L(one_or_less)
 236 #  ifdef USE_AS_WCSCMP
 237         movq    %rdx, %rcx
 238
 239         /* Multiplying length by sizeof(wchar_t) can result in overflow.
 240            Check if that is possible. All cases where overflow are possible
 241            are cases where length is large enough that it can never be a
 242            bound on valid memory so just use wcscmp.  */
 243         shrq    $56, %rcx
 244         jnz     OVERFLOW_STRCMP
 245
 246         leaq    (, %rdx, 4), %rdx
 247 #  endif
 248 # endif
 249         vpxor   %xmmZERO, %xmmZERO, %xmmZERO
 250 # if defined USE_AS_STRCASECMP_L
 251         .section .rodata.cst32, "aM", @progbits, 32
 252         .align  32
 253 L(lcase_min):
 254         .quad   0x3f3f3f3f3f3f3f3f
 255         .quad   0x3f3f3f3f3f3f3f3f
 256         .quad   0x3f3f3f3f3f3f3f3f
 257         .quad   0x3f3f3f3f3f3f3f3f
 258 L(lcase_max):
 259         .quad   0x9999999999999999
 260         .quad   0x9999999999999999
 261         .quad   0x9999999999999999
 262         .quad   0x9999999999999999
 263 L(case_add):
 264         .quad   0x2020202020202020
 265         .quad   0x2020202020202020
 266         .quad   0x2020202020202020
 267         .quad   0x2020202020202020
 268         .previous
 269
 270         vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
 271         vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
 272         vmovdqa L(case_add)(%rip), CASE_ADD_ymm
 273 # endif
 274         movl    %edi, %eax
 275         orl     %esi, %eax
 276         sall    $20, %eax
 277         /* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
 278         cmpl    $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
 279         ja      L(page_cross)
 280
 281 L(no_page_cross):
 282         /* Safe to compare 4x vectors.  */
 283         VMOVU   (%rdi), %ymm0
 284         /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
 285            Otherwise converts ymm0 and load from rsi to lower. ymm2 is
 286            scratch and ymm1 is the return.  */
 287         CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 288         /* 1s at null CHAR.  */
 289         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 290         /* 1s where s1 and s2 equal AND not null CHAR.  */
 291         vpandn  %ymm1, %ymm2, %ymm1
 292
 293         /* All 1s -> keep going, any 0s -> return.  */
 294         vpmovmskb %ymm1, %ecx
 295 # ifdef USE_AS_STRNCMP
 296         cmpq    $VEC_SIZE, %rdx
 297         jbe     L(vec_0_test_len)
 298 # endif
 299
 300         /* All 1s represents all equals. incl will overflow to zero in
 301            all equals case. Otherwise 1s will carry until position of first
 302            mismatch.  */
 303         incl    %ecx
 304         jz      L(more_3x_vec)
 305
 306         .p2align 4,, 4
 307 L(return_vec_0):
 308         tzcntl  %ecx, %ecx
 309 # ifdef USE_AS_WCSCMP
 310         movl    (%rdi, %rcx), %edx
 311         xorl    %eax, %eax
 312         cmpl    (%rsi, %rcx), %edx
 313         je      L(ret0)
 314         setl    %al
 315         negl    %eax
 316         orl     $1, %eax
 317 # else
 318         movzbl  (%rdi, %rcx), %eax
 319         movzbl  (%rsi, %rcx), %ecx
 320         TOLOWER_gpr (%rax, %eax)
 321         TOLOWER_gpr (%rcx, %ecx)
 322         subl    %ecx, %eax
 323 # endif
 324 L(ret0):
 325 L(return_vzeroupper):
 326         ZERO_UPPER_VEC_REGISTERS_RETURN
 327
 328 # ifdef USE_AS_STRNCMP
 329         .p2align 4,, 8
 330 L(vec_0_test_len):
 331         notl    %ecx
 332         bzhil   %edx, %ecx, %eax
 333         jnz     L(return_vec_0)
 334         /* Align if will cross fetch block.  */
 335         .p2align 4,, 2
 336 L(ret_zero):
 337         xorl    %eax, %eax
 338         VZEROUPPER_RETURN
 339
 340         .p2align 4,, 5
 341 L(one_or_less):
 342 #  ifdef USE_AS_STRCASECMP_L
 343         /* Set locale argument for strcasecmp.  */
 344         movq    %LOCALE_REG, %rdx
 345 #  endif
 346         jb      L(ret_zero)
 347         /* 'nbe' covers the case where length is negative (large
 348            unsigned).  */
 349         jnbe    OVERFLOW_STRCMP
 350 #  ifdef USE_AS_WCSCMP
 351         movl    (%rdi), %edx
 352         xorl    %eax, %eax
 353         cmpl    (%rsi), %edx
 354         je      L(ret1)
 355         setl    %al
 356         negl    %eax
 357         orl     $1, %eax
 358 #  else
 359         movzbl  (%rdi), %eax
 360         movzbl  (%rsi), %ecx
 361         TOLOWER_gpr (%rax, %eax)
 362         TOLOWER_gpr (%rcx, %ecx)
 363         subl    %ecx, %eax
 364 #  endif
 365 L(ret1):
 366         ret
 367 # endif
 368
 369         .p2align 4,, 10
 370 L(return_vec_1):
 371         tzcntl  %ecx, %ecx
 372 # ifdef USE_AS_STRNCMP
 373         /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
 374            overflow.  */
 375         addq    $-VEC_SIZE, %rdx
 376         cmpq    %rcx, %rdx
 377         jbe     L(ret_zero)
 378 # endif
 379 # ifdef USE_AS_WCSCMP
 380         movl    VEC_SIZE(%rdi, %rcx), %edx
 381         xorl    %eax, %eax
 382         cmpl    VEC_SIZE(%rsi, %rcx), %edx
 383         je      L(ret2)
 384         setl    %al
 385         negl    %eax
 386         orl     $1, %eax
 387 # else
 388         movzbl  VEC_SIZE(%rdi, %rcx), %eax
 389         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
 390         TOLOWER_gpr (%rax, %eax)
 391         TOLOWER_gpr (%rcx, %ecx)
 392         subl    %ecx, %eax
 393 # endif
 394 L(ret2):
 395         VZEROUPPER_RETURN
 396
 397         .p2align 4,, 10
 398 # ifdef USE_AS_STRNCMP
 399 L(return_vec_3):
 400         salq    $32, %rcx
 401 # endif
 402
 403 L(return_vec_2):
 404 # ifndef USE_AS_STRNCMP
 405         tzcntl  %ecx, %ecx
 406 # else
 407         tzcntq  %rcx, %rcx
 408         cmpq    %rcx, %rdx
 409         jbe     L(ret_zero)
 410 # endif
 411
 412 # ifdef USE_AS_WCSCMP
 413         movl    (VEC_SIZE * 2)(%rdi, %rcx), %edx
 414         xorl    %eax, %eax
 415         cmpl    (VEC_SIZE * 2)(%rsi, %rcx), %edx
 416         je      L(ret3)
 417         setl    %al
 418         negl    %eax
 419         orl     $1, %eax
 420 # else
 421         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
 422         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
 423         TOLOWER_gpr (%rax, %eax)
 424         TOLOWER_gpr (%rcx, %ecx)
 425         subl    %ecx, %eax
 426 # endif
 427 L(ret3):
 428         VZEROUPPER_RETURN
 429
 430 # ifndef USE_AS_STRNCMP
 431         .p2align 4,, 10
 432 L(return_vec_3):
 433         tzcntl  %ecx, %ecx
 434 #  ifdef USE_AS_WCSCMP
 435         movl    (VEC_SIZE * 3)(%rdi, %rcx), %edx
 436         xorl    %eax, %eax
 437         cmpl    (VEC_SIZE * 3)(%rsi, %rcx), %edx
 438         je      L(ret4)
 439         setl    %al
 440         negl    %eax
 441         orl     $1, %eax
 442 #  else
 443         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
 444         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
 445         TOLOWER_gpr (%rax, %eax)
 446         TOLOWER_gpr (%rcx, %ecx)
 447         subl    %ecx, %eax
 448 #  endif
 449 L(ret4):
 450         VZEROUPPER_RETURN
 451 # endif
 452
 453         .p2align 4,, 10
 454 L(more_3x_vec):
 455         /* Safe to compare 4x vectors.  */
 456         VMOVU   VEC_SIZE(%rdi), %ymm0
 457         CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 458         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 459         vpandn  %ymm1, %ymm2, %ymm1
 460         vpmovmskb %ymm1, %ecx
 461         incl    %ecx
 462         jnz     L(return_vec_1)
 463
 464 # ifdef USE_AS_STRNCMP
 465         subq    $(VEC_SIZE * 2), %rdx
 466         jbe     L(ret_zero)
 467 # endif
 468
 469         VMOVU   (VEC_SIZE * 2)(%rdi), %ymm0
 470         CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 471         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 472         vpandn  %ymm1, %ymm2, %ymm1
 473         vpmovmskb %ymm1, %ecx
 474         incl    %ecx
 475         jnz     L(return_vec_2)
 476
 477         VMOVU   (VEC_SIZE * 3)(%rdi), %ymm0
 478         CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 479         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 480         vpandn  %ymm1, %ymm2, %ymm1
 481         vpmovmskb %ymm1, %ecx
 482         incl    %ecx
 483         jnz     L(return_vec_3)
 484
 485 # ifdef USE_AS_STRNCMP
 486         cmpq    $(VEC_SIZE * 2), %rdx
 487         jbe     L(ret_zero)
 488 # endif
 489
 490 # ifdef USE_AS_WCSCMP
 491         /* any non-zero positive value that doesn't inference with 0x1.
 492          */
 493         movl    $2, %r8d
 494
 495 # else
 496         xorl    %r8d, %r8d
 497 # endif
 498
 499         /* The prepare labels are various entry points from the page
 500            cross logic.  */
 501 L(prepare_loop):
 502
 503 # ifdef USE_AS_STRNCMP
 504         /* Store N + (VEC_SIZE * 4) and place check at the begining of
 505            the loop.  */
 506         leaq    (VEC_SIZE * 2)(%rdi, %rdx), %rdx
 507 # endif
 508 L(prepare_loop_no_len):
 509
 510         /* Align s1 and adjust s2 accordingly.  */
 511         subq    %rdi, %rsi
 512         andq    $-(VEC_SIZE * 4), %rdi
 513         addq    %rdi, %rsi
 514
 515 # ifdef USE_AS_STRNCMP
 516         subq    %rdi, %rdx
 517 # endif
 518
 519 L(prepare_loop_aligned):
 520         /* eax stores distance from rsi to next page cross. These cases
 521            need to be handled specially as the 4x loop could potentially
 522            read memory past the length of s1 or s2 and across a page
 523            boundary.  */
 524         movl    $-(VEC_SIZE * 4), %eax
 525         subl    %esi, %eax
 526         andl    $(PAGE_SIZE - 1), %eax
 527
 528         /* Loop 4x comparisons at a time.  */
 529         .p2align 4
 530 L(loop):
 531
 532         /* End condition for strncmp.  */
 533 # ifdef USE_AS_STRNCMP
 534         subq    $(VEC_SIZE * 4), %rdx
 535         jbe     L(ret_zero)
 536 # endif
 537
 538         subq    $-(VEC_SIZE * 4), %rdi
 539         subq    $-(VEC_SIZE * 4), %rsi
 540
 541         /* Check if rsi loads will cross a page boundary.  */
 542         addl    $-(VEC_SIZE * 4), %eax
 543         jnb     L(page_cross_during_loop)
 544
 545         /* Loop entry after handling page cross during loop.  */
 546 L(loop_skip_page_cross_check):
 547         VMOVA   (VEC_SIZE * 0)(%rdi), %ymm0
 548         VMOVA   (VEC_SIZE * 1)(%rdi), %ymm2
 549         VMOVA   (VEC_SIZE * 2)(%rdi), %ymm4
 550         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
 551
 552         /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
 553         CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
 554         CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
 555         CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
 556         CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 557
 558         /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 559            zero.  */
 560         vpand   %ymm0, %ymm1, %ymm1
 561
 562
 563         vpand   %ymm2, %ymm3, %ymm3
 564         vpand   %ymm4, %ymm5, %ymm5
 565         vpand   %ymm6, %ymm7, %ymm7
 566
 567         VPMINU  %ymm1, %ymm3, %ymm3
 568         VPMINU  %ymm5, %ymm7, %ymm7
 569
 570         /* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
 571         VPMINU  %ymm3, %ymm7, %ymm7
 572
 573         /* If any 0 CHAR then done.  */
 574         VPCMPEQ %ymm7, %ymmZERO, %ymm7
 575         vpmovmskb %ymm7, %LOOP_REG
 576         testl   %LOOP_REG, %LOOP_REG
 577         jz      L(loop)
 578
 579         /* Find which VEC has the mismatch of end of string.  */
 580         VPCMPEQ %ymm1, %ymmZERO, %ymm1
 581         vpmovmskb %ymm1, %ecx
 582         testl   %ecx, %ecx
 583         jnz     L(return_vec_0_end)
 584
 585
 586         VPCMPEQ %ymm3, %ymmZERO, %ymm3
 587         vpmovmskb %ymm3, %ecx
 588         testl   %ecx, %ecx
 589         jnz     L(return_vec_1_end)
 590
 591 L(return_vec_2_3_end):
 592 # ifdef USE_AS_STRNCMP
 593         subq    $(VEC_SIZE * 2), %rdx
 594         jbe     L(ret_zero_end)
 595 # endif
 596
 597         VPCMPEQ %ymm5, %ymmZERO, %ymm5
 598         vpmovmskb %ymm5, %ecx
 599         testl   %ecx, %ecx
 600         jnz     L(return_vec_2_end)
 601
 602         /* LOOP_REG contains matches for null/mismatch from the loop. If
 603            VEC 0,1,and 2 all have no null and no mismatches then mismatch
 604            must entirely be from VEC 3 which is fully represented by
 605            LOOP_REG.  */
 606         tzcntl  %LOOP_REG, %LOOP_REG
 607
 608 # ifdef USE_AS_STRNCMP
 609         subl    $-(VEC_SIZE), %LOOP_REG
 610         cmpq    %LOOP_REG64, %rdx
 611         jbe     L(ret_zero_end)
 612 # endif
 613
 614 # ifdef USE_AS_WCSCMP
 615         movl    (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
 616         xorl    %eax, %eax
 617         cmpl    (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
 618         je      L(ret5)
 619         setl    %al
 620         negl    %eax
 621         xorl    %r8d, %eax
 622 # else
 623         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 624         movzbl  (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
 625         TOLOWER_gpr (%rax, %eax)
 626         TOLOWER_gpr (%rcx, %ecx)
 627         subl    %ecx, %eax
 628         xorl    %r8d, %eax
 629         subl    %r8d, %eax
 630 # endif
 631 L(ret5):
 632         VZEROUPPER_RETURN
 633
 634 # ifdef USE_AS_STRNCMP
 635         .p2align 4,, 2
 636 L(ret_zero_end):
 637         xorl    %eax, %eax
 638         VZEROUPPER_RETURN
 639 # endif
 640
 641
 642         /* The L(return_vec_N_end) differ from L(return_vec_N) in that
 643            they use the value of `r8` to negate the return value. This is
 644            because the page cross logic can swap `rdi` and `rsi`.  */
 645         .p2align 4,, 10
 646 # ifdef USE_AS_STRNCMP
 647 L(return_vec_1_end):
 648         salq    $32, %rcx
 649 # endif
 650 L(return_vec_0_end):
 651 # ifndef USE_AS_STRNCMP
 652         tzcntl  %ecx, %ecx
 653 # else
 654         tzcntq  %rcx, %rcx
 655         cmpq    %rcx, %rdx
 656         jbe     L(ret_zero_end)
 657 # endif
 658
 659 # ifdef USE_AS_WCSCMP
 660         movl    (%rdi, %rcx), %edx
 661         xorl    %eax, %eax
 662         cmpl    (%rsi, %rcx), %edx
 663         je      L(ret6)
 664         setl    %al
 665         negl    %eax
 666         xorl    %r8d, %eax
 667 # else
 668         movzbl  (%rdi, %rcx), %eax
 669         movzbl  (%rsi, %rcx), %ecx
 670         TOLOWER_gpr (%rax, %eax)
 671         TOLOWER_gpr (%rcx, %ecx)
 672         subl    %ecx, %eax
 673         xorl    %r8d, %eax
 674         subl    %r8d, %eax
 675 # endif
 676 L(ret6):
 677         VZEROUPPER_RETURN
 678
 679 # ifndef USE_AS_STRNCMP
 680         .p2align 4,, 10
 681 L(return_vec_1_end):
 682         tzcntl  %ecx, %ecx
 683 #  ifdef USE_AS_WCSCMP
 684         movl    VEC_SIZE(%rdi, %rcx), %edx
 685         xorl    %eax, %eax
 686         cmpl    VEC_SIZE(%rsi, %rcx), %edx
 687         je      L(ret7)
 688         setl    %al
 689         negl    %eax
 690         xorl    %r8d, %eax
 691 #  else
 692         movzbl  VEC_SIZE(%rdi, %rcx), %eax
 693         movzbl  VEC_SIZE(%rsi, %rcx), %ecx
 694         TOLOWER_gpr (%rax, %eax)
 695         TOLOWER_gpr (%rcx, %ecx)
 696         subl    %ecx, %eax
 697         xorl    %r8d, %eax
 698         subl    %r8d, %eax
 699 #  endif
 700 L(ret7):
 701         VZEROUPPER_RETURN
 702 # endif
 703
 704         .p2align 4,, 10
 705 L(return_vec_2_end):
 706         tzcntl  %ecx, %ecx
 707 # ifdef USE_AS_STRNCMP
 708         cmpq    %rcx, %rdx
 709         jbe     L(ret_zero_page_cross)
 710 # endif
 711 # ifdef USE_AS_WCSCMP
 712         movl    (VEC_SIZE * 2)(%rdi, %rcx), %edx
 713         xorl    %eax, %eax
 714         cmpl    (VEC_SIZE * 2)(%rsi, %rcx), %edx
 715         je      L(ret11)
 716         setl    %al
 717         negl    %eax
 718         xorl    %r8d, %eax
 719 # else
 720         movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
 721         movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
 722         TOLOWER_gpr (%rax, %eax)
 723         TOLOWER_gpr (%rcx, %ecx)
 724         subl    %ecx, %eax
 725         xorl    %r8d, %eax
 726         subl    %r8d, %eax
 727 # endif
 728 L(ret11):
 729         VZEROUPPER_RETURN
 730
 731
 732         /* Page cross in rsi in next 4x VEC.  */
 733
 734         /* TODO: Improve logic here.  */
 735         .p2align 4,, 10
 736 L(page_cross_during_loop):
 737         /* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
 738
 739         /* Optimistically rsi and rdi and both aligned inwhich case we
 740            don't need any logic here.  */
 741         cmpl    $-(VEC_SIZE * 4), %eax
 742         /* Don't adjust eax before jumping back to loop and we will
 743            never hit page cross case again.  */
 744         je      L(loop_skip_page_cross_check)
 745
 746         /* Check if we can safely load a VEC.  */
 747         cmpl    $-(VEC_SIZE * 3), %eax
 748         jle     L(less_1x_vec_till_page_cross)
 749
 750         VMOVA   (%rdi), %ymm0
 751         CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 752         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 753         vpandn  %ymm1, %ymm2, %ymm1
 754         vpmovmskb %ymm1, %ecx
 755         incl    %ecx
 756         jnz     L(return_vec_0_end)
 757
 758         /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
 759         cmpl    $-(VEC_SIZE * 2), %eax
 760         jg      L(more_2x_vec_till_page_cross)
 761
 762         .p2align 4,, 4
 763 L(less_1x_vec_till_page_cross):
 764         subl    $-(VEC_SIZE * 4), %eax
 765         /* Guranteed safe to read from rdi - VEC_SIZE here. The only
 766            concerning case is first iteration if incoming s1 was near start
 767            of a page and s2 near end. If s1 was near the start of the page
 768            we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
 769            to read back -VEC_SIZE. If rdi is truly at the start of a page
 770            here, it means the previous page (rdi - VEC_SIZE) has already
 771            been loaded earlier so must be valid.  */
 772         VMOVU   -VEC_SIZE(%rdi, %rax), %ymm0
 773         CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 774         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 775         vpandn  %ymm1, %ymm2, %ymm1
 776         vpmovmskb %ymm1, %ecx
 777
 778         /* Mask of potentially valid bits. The lower bits can be out of
 779            range comparisons (but safe regarding page crosses).  */
 780         movl    $-1, %r10d
 781         shlxl   %esi, %r10d, %r10d
 782         notl    %ecx
 783
 784 # ifdef USE_AS_STRNCMP
 785         cmpq    %rax, %rdx
 786         jbe     L(return_page_cross_end_check)
 787 # endif
 788         movl    %eax, %OFFSET_REG
 789         addl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
 790
 791         andl    %r10d, %ecx
 792         jz      L(loop_skip_page_cross_check)
 793
 794         .p2align 4,, 3
 795 L(return_page_cross_end):
 796         tzcntl  %ecx, %ecx
 797
 798 # ifdef USE_AS_STRNCMP
 799         leal    -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
 800 L(return_page_cross_cmp_mem):
 801 # else
 802         addl    %OFFSET_REG, %ecx
 803 # endif
 804 # ifdef USE_AS_WCSCMP
 805         movl    VEC_OFFSET(%rdi, %rcx), %edx
 806         xorl    %eax, %eax
 807         cmpl    VEC_OFFSET(%rsi, %rcx), %edx
 808         je      L(ret8)
 809         setl    %al
 810         negl    %eax
 811         xorl    %r8d, %eax
 812 # else
 813         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
 814         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
 815         TOLOWER_gpr (%rax, %eax)
 816         TOLOWER_gpr (%rcx, %ecx)
 817         subl    %ecx, %eax
 818         xorl    %r8d, %eax
 819         subl    %r8d, %eax
 820 # endif
 821 L(ret8):
 822         VZEROUPPER_RETURN
 823
 824 # ifdef USE_AS_STRNCMP
 825         .p2align 4,, 10
 826 L(return_page_cross_end_check):
 827         andl    %r10d, %ecx
 828         tzcntl  %ecx, %ecx
 829         leal    -VEC_SIZE(%rax, %rcx), %ecx
 830         cmpl    %ecx, %edx
 831         ja      L(return_page_cross_cmp_mem)
 832         xorl    %eax, %eax
 833         VZEROUPPER_RETURN
 834 # endif
 835
 836
 837         .p2align 4,, 10
 838 L(more_2x_vec_till_page_cross):
 839         /* If more 2x vec till cross we will complete a full loop
 840            iteration here.  */
 841
 842         VMOVU   VEC_SIZE(%rdi), %ymm0
 843         CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 844         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 845         vpandn  %ymm1, %ymm2, %ymm1
 846         vpmovmskb %ymm1, %ecx
 847         incl    %ecx
 848         jnz     L(return_vec_1_end)
 849
 850 # ifdef USE_AS_STRNCMP
 851         cmpq    $(VEC_SIZE * 2), %rdx
 852         jbe     L(ret_zero_in_loop_page_cross)
 853 # endif
 854
 855         subl    $-(VEC_SIZE * 4), %eax
 856
 857         /* Safe to include comparisons from lower bytes.  */
 858         VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
 859         CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 860         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 861         vpandn  %ymm1, %ymm2, %ymm1
 862         vpmovmskb %ymm1, %ecx
 863         incl    %ecx
 864         jnz     L(return_vec_page_cross_0)
 865
 866         VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
 867         CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 868         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 869         vpandn  %ymm1, %ymm2, %ymm1
 870         vpmovmskb %ymm1, %ecx
 871         incl    %ecx
 872         jnz     L(return_vec_page_cross_1)
 873
 874 # ifdef USE_AS_STRNCMP
 875         /* Must check length here as length might proclude reading next
 876            page.  */
 877         cmpq    %rax, %rdx
 878         jbe     L(ret_zero_in_loop_page_cross)
 879 # endif
 880
 881         /* Finish the loop.  */
 882         VMOVA   (VEC_SIZE * 2)(%rdi), %ymm4
 883         VMOVA   (VEC_SIZE * 3)(%rdi), %ymm6
 884
 885         CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
 886         CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 887         vpand   %ymm4, %ymm5, %ymm5
 888         vpand   %ymm6, %ymm7, %ymm7
 889         VPMINU  %ymm5, %ymm7, %ymm7
 890         VPCMPEQ %ymm7, %ymmZERO, %ymm7
 891         vpmovmskb %ymm7, %LOOP_REG
 892         testl   %LOOP_REG, %LOOP_REG
 893         jnz     L(return_vec_2_3_end)
 894
 895         /* Best for code size to include ucond-jmp here. Would be faster
 896            if this case is hot to duplicate the L(return_vec_2_3_end) code
 897            as fall-through and have jump back to loop on mismatch
 898            comparison.  */
 899         subq    $-(VEC_SIZE * 4), %rdi
 900         subq    $-(VEC_SIZE * 4), %rsi
 901         addl    $(PAGE_SIZE - VEC_SIZE * 8), %eax
 902 # ifdef USE_AS_STRNCMP
 903         subq    $(VEC_SIZE * 4), %rdx
 904         ja      L(loop_skip_page_cross_check)
 905 L(ret_zero_in_loop_page_cross):
 906         xorl    %eax, %eax
 907         VZEROUPPER_RETURN
 908 # else
 909         jmp     L(loop_skip_page_cross_check)
 910 # endif
 911
 912
 913         .p2align 4,, 10
 914 L(return_vec_page_cross_0):
 915         addl    $-VEC_SIZE, %eax
 916 L(return_vec_page_cross_1):
 917         tzcntl  %ecx, %ecx
 918 # ifdef USE_AS_STRNCMP
 919         leal    -VEC_SIZE(%rax, %rcx), %ecx
 920         cmpq    %rcx, %rdx
 921         jbe     L(ret_zero_in_loop_page_cross)
 922 # else
 923         addl    %eax, %ecx
 924 # endif
 925
 926 # ifdef USE_AS_WCSCMP
 927         movl    VEC_OFFSET(%rdi, %rcx), %edx
 928         xorl    %eax, %eax
 929         cmpl    VEC_OFFSET(%rsi, %rcx), %edx
 930         je      L(ret9)
 931         setl    %al
 932         negl    %eax
 933         xorl    %r8d, %eax
 934 # else
 935         movzbl  VEC_OFFSET(%rdi, %rcx), %eax
 936         movzbl  VEC_OFFSET(%rsi, %rcx), %ecx
 937         TOLOWER_gpr (%rax, %eax)
 938         TOLOWER_gpr (%rcx, %ecx)
 939         subl    %ecx, %eax
 940         xorl    %r8d, %eax
 941         subl    %r8d, %eax
 942 # endif
 943 L(ret9):
 944         VZEROUPPER_RETURN
 945
 946
 947         .p2align 4,, 10
 948 L(page_cross):
 949 # ifndef USE_AS_STRNCMP
 950         /* If both are VEC aligned we don't need any special logic here.
 951            Only valid for strcmp where stop condition is guranteed to be
 952            reachable by just reading memory.  */
 953         testl   $((VEC_SIZE - 1) << 20), %eax
 954         jz      L(no_page_cross)
 955 # endif
 956
 957         movl    %edi, %eax
 958         movl    %esi, %ecx
 959         andl    $(PAGE_SIZE - 1), %eax
 960         andl    $(PAGE_SIZE - 1), %ecx
 961
 962         xorl    %OFFSET_REG, %OFFSET_REG
 963
 964         /* Check which is closer to page cross, s1 or s2.  */
 965         cmpl    %eax, %ecx
 966         jg      L(page_cross_s2)
 967
 968         /* The previous page cross check has false positives. Check for
 969            true positive as page cross logic is very expensive.  */
 970         subl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
 971         jbe     L(no_page_cross)
 972
 973         /* Set r8 to not interfere with normal return value (rdi and rsi
 974            did not swap).  */
 975 # ifdef USE_AS_WCSCMP
 976         /* any non-zero positive value that doesn't inference with 0x1.
 977          */
 978         movl    $2, %r8d
 979 # else
 980         xorl    %r8d, %r8d
 981 # endif
 982
 983         /* Check if less than 1x VEC till page cross.  */
 984         subl    $(VEC_SIZE * 3), %eax
 985         jg      L(less_1x_vec_till_page)
 986
 987         /* If more than 1x VEC till page cross, loop throuh safely
 988            loadable memory until within 1x VEC of page cross.  */
 989
 990         .p2align 4,, 10
 991 L(page_cross_loop):
 992
 993         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
 994         CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 995         VPCMPEQ %ymm0, %ymmZERO, %ymm2
 996         vpandn  %ymm1, %ymm2, %ymm1
 997         vpmovmskb %ymm1, %ecx
 998         incl    %ecx
 999
1000         jnz     L(check_ret_vec_page_cross)
1001         addl    $VEC_SIZE, %OFFSET_REG
1002 # ifdef USE_AS_STRNCMP
1003         cmpq    %OFFSET_REG64, %rdx
1004         jbe     L(ret_zero_page_cross)
1005 # endif
1006         addl    $VEC_SIZE, %eax
1007         jl      L(page_cross_loop)
1008
1009         subl    %eax, %OFFSET_REG
1010         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
1011            to not cross page so is safe to load. Since we have already
1012            loaded at least 1 VEC from rsi it is also guranteed to be
1013            safe.  */
1014
1015         VMOVU   (%rdi, %OFFSET_REG64), %ymm0
1016         CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
1017         VPCMPEQ %ymm0, %ymmZERO, %ymm2
1018         vpandn  %ymm1, %ymm2, %ymm1
1019         vpmovmskb %ymm1, %ecx
1020
1021 # ifdef USE_AS_STRNCMP
1022         leal    VEC_SIZE(%OFFSET_REG64), %eax
1023         cmpq    %rax, %rdx
1024         jbe     L(check_ret_vec_page_cross2)
1025         addq    %rdi, %rdx
1026 # endif
1027         incl    %ecx
1028         jz      L(prepare_loop_no_len)
1029
1030         .p2align 4,, 4
1031 L(ret_vec_page_cross):
1032 # ifndef USE_AS_STRNCMP
1033 L(check_ret_vec_page_cross):
1034 # endif
1035         tzcntl  %ecx, %ecx
1036         addl    %OFFSET_REG, %ecx
1037 L(ret_vec_page_cross_cont):
1038 # ifdef USE_AS_WCSCMP
1039         movl    (%rdi, %rcx), %edx
1040         xorl    %eax, %eax
1041         cmpl    (%rsi, %rcx), %edx
1042         je      L(ret12)
1043         setl    %al
1044         negl    %eax
1045         xorl    %r8d, %eax
1046 # else
1047         movzbl  (%rdi, %rcx), %eax
1048         movzbl  (%rsi, %rcx), %ecx
1049         TOLOWER_gpr (%rax, %eax)
1050         TOLOWER_gpr (%rcx, %ecx)
1051         subl    %ecx, %eax
1052         xorl    %r8d, %eax
1053         subl    %r8d, %eax
1054 # endif
1055 L(ret12):
1056         VZEROUPPER_RETURN
1057
1058 # ifdef USE_AS_STRNCMP
1059         .p2align 4,, 10
1060 L(check_ret_vec_page_cross2):
1061         incl    %ecx
1062 L(check_ret_vec_page_cross):
1063         tzcntl  %ecx, %ecx
1064         addl    %OFFSET_REG, %ecx
1065         cmpq    %rcx, %rdx
1066         ja      L(ret_vec_page_cross_cont)
1067         .p2align 4,, 2
1068 L(ret_zero_page_cross):
1069         xorl    %eax, %eax
1070         VZEROUPPER_RETURN
1071 # endif
1072
1073         .p2align 4,, 4
1074 L(page_cross_s2):
1075         /* Ensure this is a true page cross.  */
1076         subl    $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1077         jbe     L(no_page_cross)
1078
1079
1080         movl    %ecx, %eax
1081         movq    %rdi, %rcx
1082         movq    %rsi, %rdi
1083         movq    %rcx, %rsi
1084
1085         /* set r8 to negate return value as rdi and rsi swapped.  */
1086 # ifdef USE_AS_WCSCMP
1087         movl    $-4, %r8d
1088 # else
1089         movl    $-1, %r8d
1090 # endif
1091         xorl    %OFFSET_REG, %OFFSET_REG
1092
1093         /* Check if more than 1x VEC till page cross.  */
1094         subl    $(VEC_SIZE * 3), %eax
1095         jle     L(page_cross_loop)
1096
1097         .p2align 4,, 6
1098 L(less_1x_vec_till_page):
1099         /* Find largest load size we can use.  */
1100         cmpl    $16, %eax
1101         ja      L(less_16_till_page)
1102
1103         VMOVU   (%rdi), %xmm0
1104         CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
1105         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1106         vpandn  %xmm1, %xmm2, %xmm1
1107         vpmovmskb %ymm1, %ecx
1108         incw    %cx
1109         jnz     L(check_ret_vec_page_cross)
1110         movl    $16, %OFFSET_REG
1111 # ifdef USE_AS_STRNCMP
1112         cmpq    %OFFSET_REG64, %rdx
1113         jbe     L(ret_zero_page_cross_slow_case0)
1114         subl    %eax, %OFFSET_REG
1115 # else
1116         /* Explicit check for 16 byte alignment.  */
1117         subl    %eax, %OFFSET_REG
1118         jz      L(prepare_loop)
1119 # endif
1120
1121         VMOVU   (%rdi, %OFFSET_REG64), %xmm0
1122         CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
1123         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1124         vpandn  %xmm1, %xmm2, %xmm1
1125         vpmovmskb %ymm1, %ecx
1126         incw    %cx
1127         jnz     L(check_ret_vec_page_cross)
1128
1129 # ifdef USE_AS_STRNCMP
1130         addl    $16, %OFFSET_REG
1131         subq    %OFFSET_REG64, %rdx
1132         jbe     L(ret_zero_page_cross_slow_case0)
1133         subq    $-(VEC_SIZE * 4), %rdx
1134
1135         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1136         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1137 # else
1138         leaq    (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1139         leaq    (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1140 # endif
1141         jmp     L(prepare_loop_aligned)
1142
1143 # ifdef USE_AS_STRNCMP
1144         .p2align 4,, 2
1145 L(ret_zero_page_cross_slow_case0):
1146         xorl    %eax, %eax
1147         ret
1148 # endif
1149
1150
1151         .p2align 4,, 10
1152 L(less_16_till_page):
1153         /* Find largest load size we can use.  */
1154         cmpl    $24, %eax
1155         ja      L(less_8_till_page)
1156
1157         vmovq   (%rdi), %xmm0
1158         vmovq   (%rsi), %xmm1
1159         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1160         CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1161         vpandn  %xmm1, %xmm2, %xmm1
1162         vpmovmskb %ymm1, %ecx
1163         incb    %cl
1164         jnz     L(check_ret_vec_page_cross)
1165
1166
1167 # ifdef USE_AS_STRNCMP
1168         cmpq    $8, %rdx
1169         jbe     L(ret_zero_page_cross_slow_case0)
1170 # endif
1171         movl    $24, %OFFSET_REG
1172         /* Explicit check for 16 byte alignment.  */
1173         subl    %eax, %OFFSET_REG
1174
1175
1176
1177         vmovq   (%rdi, %OFFSET_REG64), %xmm0
1178         vmovq   (%rsi, %OFFSET_REG64), %xmm1
1179         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1180         CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1181         vpandn  %xmm1, %xmm2, %xmm1
1182         vpmovmskb %ymm1, %ecx
1183         incb    %cl
1184         jnz     L(check_ret_vec_page_cross)
1185
1186 # ifdef USE_AS_STRNCMP
1187         addl    $8, %OFFSET_REG
1188         subq    %OFFSET_REG64, %rdx
1189         jbe     L(ret_zero_page_cross_slow_case0)
1190         subq    $-(VEC_SIZE * 4), %rdx
1191
1192         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1193         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1194 # else
1195         leaq    (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1196         leaq    (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1197 # endif
1198         jmp     L(prepare_loop_aligned)
1199
1200
1201         .p2align 4,, 10
1202 L(less_8_till_page):
1203 # ifdef USE_AS_WCSCMP
1204         /* If using wchar then this is the only check before we reach
1205            the page boundary.  */
1206         movl    (%rdi), %eax
1207         movl    (%rsi), %ecx
1208         cmpl    %ecx, %eax
1209         jnz     L(ret_less_8_wcs)
1210 #  ifdef USE_AS_STRNCMP
1211         addq    %rdi, %rdx
1212         /* We already checked for len <= 1 so cannot hit that case here.
1213          */
1214 #  endif
1215         testl   %eax, %eax
1216         jnz     L(prepare_loop_no_len)
1217         ret
1218
1219         .p2align 4,, 8
1220 L(ret_less_8_wcs):
1221         setl    %OFFSET_REG8
1222         negl    %OFFSET_REG
1223         movl    %OFFSET_REG, %eax
1224         xorl    %r8d, %eax
1225         ret
1226
1227 # else
1228
1229         /* Find largest load size we can use.  */
1230         cmpl    $28, %eax
1231         ja      L(less_4_till_page)
1232
1233         vmovd   (%rdi), %xmm0
1234         vmovd   (%rsi), %xmm1
1235         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1236         CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1237         vpandn  %xmm1, %xmm2, %xmm1
1238         vpmovmskb %ymm1, %ecx
1239         subl    $0xf, %ecx
1240         jnz     L(check_ret_vec_page_cross)
1241
1242 #  ifdef USE_AS_STRNCMP
1243         cmpq    $4, %rdx
1244         jbe     L(ret_zero_page_cross_slow_case1)
1245 #  endif
1246         movl    $28, %OFFSET_REG
1247         /* Explicit check for 16 byte alignment.  */
1248         subl    %eax, %OFFSET_REG
1249
1250
1251
1252         vmovd   (%rdi, %OFFSET_REG64), %xmm0
1253         vmovd   (%rsi, %OFFSET_REG64), %xmm1
1254         VPCMPEQ %xmm0, %xmmZERO, %xmm2
1255         CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1256         vpandn  %xmm1, %xmm2, %xmm1
1257         vpmovmskb %ymm1, %ecx
1258         subl    $0xf, %ecx
1259         jnz     L(check_ret_vec_page_cross)
1260
1261 #  ifdef USE_AS_STRNCMP
1262         addl    $4, %OFFSET_REG
1263         subq    %OFFSET_REG64, %rdx
1264         jbe     L(ret_zero_page_cross_slow_case1)
1265         subq    $-(VEC_SIZE * 4), %rdx
1266
1267         leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1268         leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1269 #  else
1270         leaq    (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1271         leaq    (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1272 #  endif
1273         jmp     L(prepare_loop_aligned)
1274
1275 #  ifdef USE_AS_STRNCMP
1276         .p2align 4,, 2
1277 L(ret_zero_page_cross_slow_case1):
1278         xorl    %eax, %eax
1279         ret
1280 #  endif
1281
1282         .p2align 4,, 10
1283 L(less_4_till_page):
1284         subq    %rdi, %rsi
1285         /* Extremely slow byte comparison loop.  */
1286 L(less_4_loop):
1287         movzbl  (%rdi), %eax
1288         movzbl  (%rsi, %rdi), %ecx
1289         TOLOWER_gpr (%rax, %eax)
1290         TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1291         subl    %BYTE_LOOP_REG, %eax
1292         jnz     L(ret_less_4_loop)
1293         testl   %ecx, %ecx
1294         jz      L(ret_zero_4_loop)
1295 #  ifdef USE_AS_STRNCMP
1296         decq    %rdx
1297         jz      L(ret_zero_4_loop)
1298 #  endif
1299         incq    %rdi
1300         /* end condition is reach page boundary (rdi is aligned).  */
1301         testl   $31, %edi
1302         jnz     L(less_4_loop)
1303         leaq    -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1304         addq    $-(VEC_SIZE * 4), %rdi
1305 #  ifdef USE_AS_STRNCMP
1306         subq    $-(VEC_SIZE * 4), %rdx
1307 #  endif
1308         jmp     L(prepare_loop_aligned)
1309
1310 L(ret_zero_4_loop):
1311         xorl    %eax, %eax
1312         ret
1313 L(ret_less_4_loop):
1314         xorl    %r8d, %eax
1315         subl    %r8d, %eax
1316         ret
1317 # endif
1318         cfi_endproc
1319         .size   STRCMP, .-STRCMP
1320 #endif