sysdeps/x86_64/multiarch/memchr-evex.S

   1 /* memchr/wmemchr optimized with 256-bit EVEX instructions.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20 #include <sysdep.h>
  21
  22 #if ISA_SHOULD_BUILD (4)
  23
  24 # ifndef VEC_SIZE
  25 #  include "x86-evex256-vecs.h"
  26 # endif
  27
  28 # ifndef MEMCHR
  29 #  define MEMCHR        __memchr_evex
  30 # endif
  31
     /* Element-width specific instruction selection: wmemchr operates on
        4-byte CHARs (CHAR_SIZE == 4), memchr on bytes (CHAR_SIZE == 1).
        PC_SHIFT_GPR is the register used as the shift count for the
        match mask in L(page_cross).  */
  32 # ifdef USE_AS_WMEMCHR
  33 #  define PC_SHIFT_GPR  rcx
  34 #  define VPTESTN       vptestnmd
  35 #  define VPBROADCAST   vpbroadcastd
  36 #  define VPMINU        vpminud
  37 #  define VPCMP vpcmpd
  38 #  define VPCMPEQ       vpcmpeqd
  39 #  define CHAR_SIZE     4
  40
  41 #  define USE_WIDE_CHAR
  42 # else
  43 #  define PC_SHIFT_GPR  rdi
  44 #  define VPTESTN       vptestnmb
  45 #  define VPBROADCAST   vpbroadcastb
  46 #  define VPMINU        vpminub
  47 #  define VPCMP vpcmpb
  48 #  define VPCMPEQ       vpcmpeqb
  49 #  define CHAR_SIZE     1
  50 # endif
  51
  52 # include "reg-macros.h"
  53
  54
  55 /* If not built for RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has
  56    no VEX encoding), use VEX encoding in the loop so we can use
  57    vpcmpeq + vpternlog which is more efficient than the EVEX
  58    alternative.  */
  59 # if defined USE_IN_RTM || VEC_SIZE == 64
     /* EVEX-only loop: the vzeroupper macros are redefined to expand to
        nothing (VZEROUPPER_RETURN to a plain `ret`).  */
  60 #  undef COND_VZEROUPPER
  61 #  undef VZEROUPPER_RETURN
  62 #  undef VZEROUPPER
  63
  64 #  define COND_VZEROUPPER
  65 #  define VZEROUPPER_RETURN     ret
  66 #  define VZEROUPPER
  67
  68 #  define USE_TERN_IN_LOOP      0
  69 # else
  70 #  define USE_TERN_IN_LOOP      1
  71 #  undef VZEROUPPER
  72 #  define VZEROUPPER    vzeroupper
  73 # endif
  74
     /* TEST_END() is the exit test of L(loop_4x_vec).  It must leave ZF
        clear (so the following `jnz` is taken) iff one of the 4x VEC
        had a match:
        - wmemchr with vptern: ecx is the vpmovmskb of the zero-masked
          NOT-OR of the compares, all-ones when nothing matched, so
          `inc` wraps it to zero.
        - memchr with vptern: edx is the match mask of the first VEC and
          ecx the vpmovmskb of the OR of the other 3x VEC; their sum is
          non-zero iff either is.
        - EVEX loop: the match masks are in k2 and k3 and are tested
          with KORTEST.  */
  75 # if USE_TERN_IN_LOOP
  76         /* Resulting bitmask for vpmovmskb has 4 bits set for each wchar
  77            so we don't want to multiply the resulting index.  */
  78 #  define TERN_CHAR_MULT        1
  79
  80 #  ifdef USE_AS_WMEMCHR
  81 #   define TEST_END()   inc %VRCX
  82 #  else
  83 #   define TEST_END()   add %rdx, %rcx
  84 #  endif
  85 # else
  86 #  define TERN_CHAR_MULT        CHAR_SIZE
  87 #  define TEST_END()    KORTEST %k2, %k3
  88 # endif
  89
     /* GPR_X0 is the register holding the mask of the first VEC of a
        loop iteration when L(last_vec_x0) is reached: rax when
        L(loop_vec_ret) moves k1 into rax, rdx when the memchr vptern
        loop has already moved k1 into edx.  GPR_X0_IS_RET is 1 when
        that register is rax and CHAR_SIZE == 1, so the bit index can
        simply be added to rdi to form the return value.  */
  90 # if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
  91 #  ifndef USE_AS_WMEMCHR
  92 #   define GPR_X0_IS_RET        1
  93 #  else
  94 #   define GPR_X0_IS_RET        0
  95 #  endif
  96 #  define GPR_X0        rax
  97 # else
  98 #  define GPR_X0_IS_RET 0
  99 #  define GPR_X0        rdx
 100 # endif
 101
 102 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
 103
     /* Base offset used by the final `lea` of L(loop_vec_ret).  With
        CHAR_PER_VEC == 64 the last VEC (offset VEC_SIZE * 3) is tested
        on its own; otherwise the masks of the VECs at VEC_SIZE * 2 and
        VEC_SIZE * 3 are combined in one GPR based at VEC_SIZE * 2.  */
 104 # if CHAR_PER_VEC == 64
 105 #  define LAST_VEC_OFFSET       (VEC_SIZE * 3)
 106 # else
 107 #  define LAST_VEC_OFFSET       (VEC_SIZE * 2)
 108 # endif
     /* MASK_GPR(reg) is the view of reg used for the `inc` in
        L(loop_vec_ret).  NOTE(review): VGPR/VGPR_SZ come from
        reg-macros.h; presumably they select the register width that
        matches the number of mask bits (CHAR_PER_VEC) -- confirm
        there.  */
 109 # if CHAR_PER_VEC >= 32
 110 #  define MASK_GPR(...) VGPR(__VA_ARGS__)
 111 # elif CHAR_PER_VEC == 16
 112 #  define MASK_GPR(reg) VGPR_SZ(reg, 16)
 113 # else
 114 #  define MASK_GPR(reg) VGPR_SZ(reg, 8)
 115 # endif
 116
     /* VMATCH holds the broadcast search CHAR; VMATCH_LO is the low
        register it is copied to before the vptern loop.  */
 117 # define VMATCH VMM(0)
 118 # define VMATCH_LO      VMM_lo(0)
 119
 120 # define PAGE_SIZE      4096
 121
 122
 123         .section SECTION(.text), "ax", @progbits
 124 ENTRY_P2ALIGN (MEMCHR, 6)
 125         /* In: rdi = buffer, esi = CHAR to search for, rdx = length in
                CHARs.  Out: rax = address of the first matching CHAR, or
                zero (NULL) if there is none within the length.
                Check for zero length.  */
 126         test    %RDX_LP, %RDX_LP
 127         jz      L(zero_0)
 128
 129 # ifdef __ILP32__
 130         /* Clear the upper 32 bits.  */
 131         movl    %edx, %edx
 132 # endif
 133         VPBROADCAST %esi, %VMATCH
 134         /* Check if we may cross page boundary with one vector load.  */
 135         movl    %edi, %eax
 136         andl    $(PAGE_SIZE - 1), %eax
 137         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
 138         ja      L(page_cross)
 139
             /* rax = match mask of the first (unaligned) VEC.  */
 140         VPCMPEQ (%rdi), %VMATCH, %k0
 141         KMOV    %k0, %VRAX
 142 # ifndef USE_AS_WMEMCHR
 143         /* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
 144            already a dependency between rax and rsi so no worries about
 145            false-dep here.  */
 146         tzcnt   %VRAX, %VRSI
 147         /* If rdx <= rsi then either 1) rax was non-zero (there was a
 148            match) but it was out of bounds or 2) rax was zero and rdx
 149            was <= VEC_SIZE so we are done scanning.  */
 150         cmpq    %rsi, %rdx
 151         /* NB: Use branch to return zero/non-zero.  Common usage will
 152            branch on result of function (if return is null/non-null).
 153            This branch can be used to predict the ensuing one so there
 154            is no reason to extend the data-dependency with cmovcc.  */
 155         jbe     L(zero_0)
 156
 157         /* If rax is zero then len must be > VEC_SIZE, otherwise since
 158            we already tested len vs tzcnt(rax) (in rsi) we are good to
 159            return this match.  */
 160         test    %VRAX, %VRAX
 161         jz      L(more_1x_vec)
 162         leaq    (%rdi, %rsi), %rax
 163 # else
 164
 165         /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
 166            > 1 so if rax is zero tzcnt != CHAR_PER_VEC.  */
 167         cmpq    $CHAR_PER_VEC, %rdx
 168         ja      L(more_1x_vec)
             /* len <= CHAR_PER_VEC: a match counts only if its index is
                below len (a zero mask yields an index >= len).  */
 169         tzcnt   %VRAX, %VRAX
 170         cmpl    %eax, %edx
 171         jbe     L(zero_0)
 172 L(first_vec_x0_ret):
 173         leaq    (%rdi, %rax, CHAR_SIZE), %rax
 174 # endif
 175         ret
 176
 177         /* Only fits in first cache line for VEC_SIZE == 32.  */
 178 # if VEC_SIZE == 32
 179         .p2align 4,, 2
 180 L(zero_0):
 181         xorl    %eax, %eax
 182         ret
 183 # endif
 184
 185         .p2align 4,, 9
 186 L(more_1x_vec):
 187 # ifdef USE_AS_WMEMCHR
 188         /* For wmemchr we still need to test if there was a match in
 189            the first VEC.  Use bsf to test here so we can reuse
 190            L(first_vec_x0_ret).  */
 191         bsf     %VRAX, %VRAX
 192         jnz     L(first_vec_x0_ret)
 193 # endif
 194
 195 L(page_cross_continue):
             /* Align rdi down to VEC_SIZE and compute in rax the length
                that remains counted from the VEC at VEC_SIZE(%rdi).  */
 196 # ifdef USE_AS_WMEMCHR
 197         /* We can't use end of the buffer to re-calculate length for
 198            wmemchr as len * CHAR_SIZE may overflow.  */
 199         leaq    -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
 200         andq    $(VEC_SIZE * -1), %rdi
 201         subq    %rdi, %rax
 202         sarq    $2, %rax
 203         addq    %rdx, %rax
 204 # else
 205         leaq    -(VEC_SIZE + 1)(%rdx, %rdi), %rax
 206         andq    $(VEC_SIZE * -1), %rdi
 207         subq    %rdi, %rax
 208 # endif
 209
 210         /* rax contains remaining length (in CHARs) - 1.  -1 so we can
 211            get imm8 encoding in a few additional places saving code size.  */
 212
 213         /* Needed regardless of remaining length.  */
 214         VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
 215         KMOV    %k0, %VRDX
 216
 217         /* We cannot fold the above `sub %rdi, %rax` with the `cmp
 218            $(CHAR_PER_VEC * 2 - 1), %rax` because it's possible for a
 219            very large length to overflow and cause the subtract to
 220            carry despite length being above CHAR_PER_VEC * 2.  */
 221         cmpq    $(CHAR_PER_VEC * 2 - 1), %rax
 222         ja      L(more_2x_vec)
 223 L(last_2x_vec):
             /* Here al = remaining length - 1 (at most CHAR_PER_VEC * 2 - 1)
                and edx = match mask of the VEC at VEC_SIZE * 1.  */
 224
 225         test    %VRDX, %VRDX
 226         jnz     L(first_vec_x1_check)
 227
 228         /* Check the end of data.  NB: use 8-bit operations to save code
 229            size.  We no longer need the full-width of eax and will
 230            perform a write-only operation over eax so there will be no
 231            partial-register stalls.  After the subtract al is the length
                left for the VEC at VEC_SIZE * 2; it may be <= 0, hence the
                signed `jle`.  */
 232         subb    $(CHAR_PER_VEC * 1 - 1), %al
 233         jle     L(zero_0)
 234
 235         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
 236         KMOV    %k0, %VRCX
 237 # ifdef USE_AS_WMEMCHR
 238         /* For wmemchr again we can't take advantage of tzcnt(0) ==
 239            VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
 240         test    %VRCX, %VRCX
 241         jz      L(zero_0)
 242 # endif
 243         tzcnt   %VRCX, %VRCX
 244         cmp     %cl, %al
 245
 246         /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
 247            fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
 248            not enough space before the next cache line to fit the `lea`
 249            for return.  The match is in bounds iff al > cl.  */
 250 # if VEC_SIZE == 64
 251         ja      L(first_vec_x2_ret)
 252 L(zero_0):
 253         xorl    %eax, %eax
 254         ret
 255 # else
 256         jbe     L(zero_0)
 257         leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 258         ret
 259 # endif
 260
 261         .p2align 4,, 5
 262 L(first_vec_x1_check):
             /* edx = non-zero match mask of the VEC at VEC_SIZE * 1, al =
                remaining length - 1.  Return the match only if its index
                is <= al, otherwise return zero.  */
 263         bsf     %VRDX, %VRDX
 264         cmpb    %dl, %al
 265         jb      L(zero_4)
 266         leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 267         ret
 268
 269         /* Fits at the end of the cache line here for VEC_SIZE == 32.
 270          */
 271 # if VEC_SIZE == 32
 272 L(zero_4):
 273         xorl    %eax, %eax
 274         ret
 275 # endif
 276
 277
 278         .p2align 4,, 4
 279 L(first_vec_x2):
             /* ecx = non-zero match mask of the VEC at VEC_SIZE * 2; no
                length check is done here.  L(first_vec_x2_ret) expects
                the match index already in rcx.  */
 280         bsf     %VRCX, %VRCX
 281 L(first_vec_x2_ret):
 282         leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 283         ret
 284
 285         /* Fits at the end of the cache line here for VEC_SIZE == 64.
 286          */
 287 # if VEC_SIZE == 64
 288 L(zero_4):
 289         xorl    %eax, %eax
 290         ret
 291 # endif
 292
 293         .p2align 4,, 4
 294 L(first_vec_x1):
             /* edx = non-zero match mask of the VEC at VEC_SIZE * 1; no
                length check is done here.  */
 295         bsf     %VRDX, %VRDX
 296         leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 297         ret
 298
 299
 300         .p2align 4,, 5
 301 L(more_2x_vec):
 302         /* Remaining length > CHAR_PER_VEC * 2 so check first 2x VEC
 303            before rechecking length.  */
 304
 305
 306         /* Already computed matches for the VEC at VEC_SIZE * 1 in rdx.  */
 307         test    %VRDX, %VRDX
 308         jnz     L(first_vec_x1)
 309
 310
 311         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
 312         KMOV    %k0, %VRCX
 313         test    %VRCX, %VRCX
 314         jnz     L(first_vec_x2)
 315
 316         /* Needed regardless of next length check.  */
 317         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
 318         KMOV    %k0, %VRCX
 319
 320         /* Check if we are near the end.  */
 321         cmpq    $(CHAR_PER_VEC * 4 - 1), %rax
 322         ja      L(more_4x_vec)
 323
 324         test    %VRCX, %VRCX
 325         jnz     L(first_vec_x3_check)
 326
 327         /* Use 8-bit instructions to save code size.  We won't use full-
 328            width eax again and will perform a write-only operation to
 329            eax so no worries about partial-register stalls.  A borrow
                means nothing remains for the VEC at VEC_SIZE * 4.  */
 330         subb    $(CHAR_PER_VEC * 3), %al
 331         jb      L(zero_2)
 332 L(last_vec_check):
             /* al = length remaining for the VEC at VEC_SIZE * 4, minus 1.  */
 333         VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
 334         KMOV    %k0, %VRCX
 335 # ifdef USE_AS_WMEMCHR
 336         /* For wmemchr again we can't take advantage of tzcnt(0) ==
 337            VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
 338         test    %VRCX, %VRCX
 339         jz      L(zero_2)
 340 # endif
 341         tzcnt   %VRCX, %VRCX
 342         cmp     %cl, %al
 343         jae     L(first_vec_x4_ret)
 344 L(zero_2):
 345         xorl    %eax, %eax
 346         ret
 347
 348         /* Fits at the end of the cache line here for VEC_SIZE == 64.
 349            For VEC_SIZE == 32 we put the return label at the end of
 350            L(first_vec_x4).  */
 351 # if VEC_SIZE == 64
 352 L(first_vec_x4_ret):
 353         leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 354         ret
 355 # endif
 356
 357         .p2align 4,, 6
 358 L(first_vec_x4):
 359         bsf     %VRCX, %VRCX
 360 # if VEC_SIZE == 32
 361         /* Place L(first_vec_x4_ret) here as we can't fit it in the same
 362            cache line as where it is called from so we might as well
 363            save code size by reusing return of L(first_vec_x4).  */
 364 L(first_vec_x4_ret):
 365 # endif
 366         leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 367         ret
 368
 369         .p2align 4,, 6
 370 L(first_vec_x3_check):
 371         /* Need to adjust remaining length before checking.  ecx is the
                non-zero match mask of the VEC at VEC_SIZE * 3.  */
 372         addb    $-(CHAR_PER_VEC * 2), %al
 373         bsf     %VRCX, %VRCX
 374         cmpb    %cl, %al
 375         jb      L(zero_2)
 376         leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
 377         ret
 378
 379         .p2align 4,, 6
 380 L(first_vec_x3):
 381         bsf     %VRCX, %VRCX
 382         leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
 383         ret
 384
 385         .p2align 4,, 3
 386 # if !USE_TERN_IN_LOOP
 387         .p2align 4,, 10
 388 # endif
 389 L(more_4x_vec):
             /* ecx = match mask of the VEC at VEC_SIZE * 3 (computed in
                L(more_2x_vec)); rax = remaining length - 1, counted from
                the VEC at VEC_SIZE * 1.  */
 390         test    %VRCX, %VRCX
 391         jnz     L(first_vec_x3)
 392
 393         VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
 394         KMOV    %k0, %VRCX
 395         test    %VRCX, %VRCX
 396         jnz     L(first_vec_x4)
 397
             /* Advance rdi to the first VEC not yet checked.  rax is
                reduced by the 4x VEC just checked plus the 4x VEC of one
                loop iteration; a borrow means at most 4x VEC remain.  */
 398         subq    $-(VEC_SIZE * 5), %rdi
 399         subq    $(CHAR_PER_VEC * 8), %rax
 400         jb      L(last_4x_vec)
 401
             /* Align rdi down to VEC_SIZE * 4 and add the CHARs that the
                alignment moved rdi back by to the count in rax.  */
 402 # ifdef USE_AS_WMEMCHR
 403         movl    %edi, %ecx
 404 # else
 405         addq    %rdi, %rax
 406 # endif
 407
 408
 409 # if VEC_SIZE == 64
 410         /* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No evex
 411            processor has partial register stalls (all have merging
 412            uop).  If that changes this can be removed.  */
 413         xorb    %dil, %dil
 414 # else
 415         andq    $-(VEC_SIZE * 4), %rdi
 416 # endif
 417
 418 # ifdef USE_AS_WMEMCHR
 419         subl    %edi, %ecx
 420         sarl    $2, %ecx
 421         addq    %rcx, %rax
 422 # else
 423         subq    %rdi, %rax
 424 # endif
 425
 426
 427
 428 # if USE_TERN_IN_LOOP
 429         /* Copy VMATCH to low ymm so we can use vpcmpeq which is not
 430            encodable with EVEX registers.  NB: this is VEC_SIZE == 32
 431            only as there is no way to encode vpcmpeq with zmm0-15.  */
 432         vmovdqa64 %VMATCH, %VMATCH_LO
 433 # endif
 434
 435         .p2align 4,, 11
 436 L(loop_4x_vec):
 437         /* Two versions of the loop.  One that does not require
 438            vzeroupper by not using ymm0-15 and another that does
 439            require vzeroupper because it uses ymm0-15.  The reason why
 440            ymm0-15 is used at all is because there is no EVEX encoding
 441            vpcmpeq and with vpcmpeq this loop can be performed more
 442            efficiently.  The non-vzeroupper version is safe for RTM
 443            while the vzeroupper version should be preferred if RTM is
 444            not supported.   Which loop version we use is determined by
 445            USE_TERN_IN_LOOP.  */
 446
 447 # if USE_TERN_IN_LOOP
 448         /* Since vptern can only take 3x vectors fastest to do 1 vec
 449            separately with EVEX vpcmp.  */
 450 #  ifdef USE_AS_WMEMCHR
 451         /* vptern can only accept masks for epi32/epi64 so can only save
 452            instruction using not equals mask on vptern with wmemchr.
 453          */
 454         VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 455 #  else
 456         VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 457 #  endif
 458         /* Compare 3x with vpcmpeq and or them all together with vptern.
 459          */
 460         VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
 461         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
 462         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 463 #  ifdef USE_AS_WMEMCHR
 464         /* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3),
 465            VEC_lo(4) as well as combines result from VEC(0) with zero
 466            mask.  */
 467         vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
 468         vpmovmskb %VMM_lo(4), %VRCX
 469 #  else
 470         /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
 471            VEC_lo(4).  */
 472         vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
 473         vpmovmskb %VMM_lo(4), %VRCX
 474         KMOV    %k1, %edx
 475 #  endif
 476
 477 # else
 478         /* Loop version that uses EVEX encoding.  k1 is the not-equal
                mask of the first VEC and is used to zero VPMINU lanes
                where it matched, so k2 has a bit set for every match in
                the first 3x VEC; k3 is the match mask of the last VEC.  */
 479         VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 480         vpxorq  (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
 481         vpxorq  (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
 482         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
 483         VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
 484         VPTESTN %VMM(3), %VMM(3), %k2
 485 # endif
 486
 487
             /* Leave the loop if any of the 4x VEC had a match.  */
 488         TEST_END ()
 489         jnz     L(loop_vec_ret)
 490
 491         subq    $-(VEC_SIZE * 4), %rdi
 492
             /* Keep looping while at least 4x VEC of length remain.  */
 493         subq    $(CHAR_PER_VEC * 4), %rax
 494         jae     L(loop_4x_vec)
 495
 496         /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
 497          */
 498         COND_VZEROUPPER
 499
 500         .p2align 4,, 10
 501 L(last_4x_vec):
 502         /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
 503            instructions on eax from here on out.  The mask recovers the
                remaining length - 1 from the biased count in rax.  */
 504 # if CHAR_PER_VEC != 64
 505         andl    $(CHAR_PER_VEC * 4 - 1), %eax
 506 # endif
             /* rdi is moved back by one VEC so that the VEC compared here
                sits at VEC_SIZE * 1 and the return paths shared with the
                code before the loop can be reused.  */
 507         VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
 508         subq    $(VEC_SIZE * 1), %rdi
 509         KMOV    %k0, %VRDX
 510         cmpb    $(CHAR_PER_VEC * 2 - 1), %al
 511         jbe     L(last_2x_vec)
 512         test    %VRDX, %VRDX
 513         jnz     L(last_vec_x1_novzero)
 514
 515         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
 516         KMOV    %k0, %VRDX
 517         test    %VRDX, %VRDX
 518         jnz     L(last_vec_x2_novzero)
 519
 520         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
 521         KMOV    %k0, %VRCX
 522         test    %VRCX, %VRCX
 523         jnz     L(first_vec_x3_check)
 524
 525         subb    $(CHAR_PER_VEC * 3), %al
 526         jae     L(last_vec_check)
 527
 528         xorl    %eax, %eax
 529         ret
 530
             /* With wmemchr and the vptern loop the masks above have one
                bit per CHAR while the L(last_vec_x1) path scales by
                TERN_CHAR_MULT (1), so separate returns that scale by
                CHAR_SIZE are needed.  */
 531 # if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
 532 L(last_vec_x2_novzero):
 533         addq    $VEC_SIZE, %rdi
 534 L(last_vec_x1_novzero):
 535         bsf     %VRDX, %VRDX
 536         leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 537         ret
 538 # endif
 539
 540 # if CHAR_PER_VEC == 64
 541         /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
 542            64 it needs a separate return label.  */
 543         .p2align 4,, 4
 544 L(last_vec_x2):
 545 L(last_vec_x2_novzero):
 546         bsf     %VRDX, %VRDX
 547         leaq    (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 548         ret
 549 # endif
 550
 551         .p2align 4,, 4
 552 L(loop_vec_ret):
             /* A match exists in the 4x VEC at rdi; find the first VEC
                that has one.  Where k1 is a not-equal mask, `inc` wraps an
                all-ones mask (no match) to zero and otherwise leaves the
                lowest set bit at the index of the first match.  */
 553 # if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
 554         KMOV    %k1, %VRAX
 555         inc     %MASK_GPR(rax)
 556 # else
 557         test    %VRDX, %VRDX
 558 # endif
 559         jnz     L(last_vec_x0)
 560
 561
 562 # if USE_TERN_IN_LOOP
 563         vpmovmskb %VMM_lo(2), %VRDX
 564 # else
 565         VPTESTN %VMM(2), %VMM(2), %k1
 566         KMOV    %k1, %VRDX
 567 # endif
 568         test    %VRDX, %VRDX
 569         jnz     L(last_vec_x1)
 570
 571
 572 # if USE_TERN_IN_LOOP
 573         vpmovmskb %VMM_lo(3), %VRDX
 574 # else
 575         KMOV    %k2, %VRDX
 576 # endif
 577
 578         /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
 579            (only if used VEX encoded loop).  */
 580         COND_VZEROUPPER
 581
 582         /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
 583            CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
 584            CHAR_PER_VEC <= 32 we can combine the results from the 2x
 585            VEC in a single GPR.  */
 586 # if CHAR_PER_VEC == 64
 587 #  if USE_TERN_IN_LOOP
 588 #   error "Unsupported"
 589 #  endif
 590
 591
 592         /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
 593         test    %VRDX, %VRDX
 594         jnz     L(last_vec_x2)
 595         KMOV    %k3, %VRDX
 596 # else
 597         /* CHAR_PER_VEC <= 32 so we can combine the results from the
 598            last 2x VEC.  The mask for the last VEC is placed above
                the one for the VEC at VEC_SIZE * 2 so a single bsf finds
                the first match.  In the vptern loop rcx still holds the
                value left by the loop test.  */
 599
 600 #  if !USE_TERN_IN_LOOP
 601         KMOV    %k3, %VRCX
 602 #  endif
 603         salq    $(VEC_SIZE / TERN_CHAR_MULT), %rcx
 604         addq    %rcx, %rdx
 605 #  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
 606 L(last_vec_x2_novzero):
 607 #  endif
 608 # endif
 609         bsf     %rdx, %rdx
 610         leaq    (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 611         ret
 612
 613         .p2align 4,, 8
 614 L(last_vec_x1):
 615         COND_VZEROUPPER
             /* L(last_vec_x1_novzero) is the entry used by L(last_4x_vec),
                which does not go through COND_VZEROUPPER here.  */
 616 # if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
 617 L(last_vec_x1_novzero):
 618 # endif
 619         bsf     %VRDX, %VRDX
 620         leaq    (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 621         ret
 622
 623
 624         .p2align 4,, 4
 625 L(last_vec_x0):
             /* The match is in the first VEC of the iteration; GPR_X0
                holds its mask.  */
 626         COND_VZEROUPPER
 627         bsf     %VGPR(GPR_X0), %VGPR(GPR_X0)
 628 # if GPR_X0_IS_RET
 629         addq    %rdi, %rax
 630 # else
 631         leaq    (%rdi, %GPR_X0, CHAR_SIZE), %rax
 632 # endif
 633         ret
 634
 635         .p2align 4,, 6
 636 L(page_cross):
 637         /* On entry eax = rdi & (PAGE_SIZE - 1).  Need to preserve eax
 638            to compute the in-bound bytes we are checking.  */
 639 # ifdef USE_AS_WMEMCHR
 640         movl    %eax, %ecx
 641 # else
 642         xorl    %ecx, %ecx
 643         subl    %eax, %ecx
 644 # endif
 645
             /* rax = rdi with the page offset cleared; compare the last
                VEC of that page.  */
 646         xorq    %rdi, %rax
 647         VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
 648         KMOV    %k0, %VRAX
 649
 650 # ifdef USE_AS_WMEMCHR
 651         /* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes.  */
 652         shrl    $2, %ecx
 653         andl    $(CHAR_PER_VEC - 1), %ecx
 654 # endif
 655
 656
             /* Drop the mask bits of the CHARs that precede rdi.  */
 657         shrx    %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
 658
 659 # ifdef USE_AS_WMEMCHR
 660         negl    %ecx
 661 # endif
 662
 663         /* Mask lower bits from ecx (negative eax) to get CHARs till
 664            next VEC.  */
 665         andl    $(CHAR_PER_VEC - 1), %ecx
 666
 667         /* Check if the length (rdx) is entirely contained in the
 668            remainder of the page (rcx).  */
 669         cmpq    %rcx, %rdx
 670         jbe     L(page_cross_ret)
 671
 672         /* Length crosses the page so if rax is zero (no matches)
 673            continue.  */
 674         test    %VRAX, %VRAX
 675         jz      L(page_cross_continue)
 676
 677         /* If rdx > rcx then any match here must be in [buf:buf + len].
 678          */
 679         tzcnt   %VRAX, %VRAX
 680 # ifdef USE_AS_WMEMCHR
 681         leaq    (%rdi, %rax, CHAR_SIZE), %rax
 682 # else
 683         addq    %rdi, %rax
 684 # endif
 685         ret
 686
 687         .p2align 4,, 2
 688 L(page_cross_zero):
 689         xorl    %eax, %eax
 690         ret
 691
 692         .p2align 4,, 4
 693 L(page_cross_ret):
 694         /* Search is entirely contained in page cross case.  A match
                counts only if its index is below the length in edx.  */
 695 # ifdef USE_AS_WMEMCHR
 696         test    %VRAX, %VRAX
 697         jz      L(page_cross_zero)
 698 # endif
 699         tzcnt   %VRAX, %VRAX
 700         cmpl    %eax, %edx
 701         jbe     L(page_cross_zero)
 702 # ifdef USE_AS_WMEMCHR
 703         leaq    (%rdi, %rax, CHAR_SIZE), %rax
 704 # else
 705         addq    %rdi, %rax
 706 # endif
 707         ret
 708 END (MEMCHR)
 709 #endif