sysdeps/x86_64/multiarch/rawmemchr-evex.S

   1 /* rawmemchr optimized with 256-bit EVEX instructions.
   2    Copyright (C) 2022-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20 #include <sysdep.h>
  21
  22 #if ISA_SHOULD_BUILD (4)
  23
  24 # ifndef VEC_SIZE
     /* No vector configuration was chosen by an including file, so
        default to the 256-bit EVEX register/vector definitions.  */
  25 #  include "x86-evex256-vecs.h"
  26 # endif
  27
     /* Exported symbol name.  An including file may predefine RAWMEMCHR
        to build this same body under a different name.  */
  28 # ifndef RAWMEMCHR
  29 #  define RAWMEMCHR     __rawmemchr_evex
  30 # endif
  31
  32
     /* Byte-granular instruction selection: every compare, broadcast,
        min and test below operates on 1-byte elements (CHAR_SIZE == 1).
        REG_WIDTH is set before including reg-macros.h; presumably it
        selects the width of the VRAX/VRCX/VRDI register views and of
        KMOV/KORTEST -- confirm against reg-macros.h.
        NOTE(review): PC_SHIFT_GPR is defined here but not referenced in
        this file.  */
  33 # define PC_SHIFT_GPR   rdi
  34 # define REG_WIDTH      VEC_SIZE
  35 # define VPTESTN        vptestnmb
  36 # define VPBROADCAST    vpbroadcastb
  37 # define VPMINU vpminub
  38 # define VPCMP  vpcmpb
  39 # define VPCMPEQ        vpcmpeqb
  40 # define CHAR_SIZE      1
  41
  42 # include "reg-macros.h"
  43
  44 /* Unless building for RTM (USE_IN_RTM) or with VEC_SIZE == 64
  45    (which has no VEX encoding), use VEX encoding in the loop so
  46    we can use vpcmpeqb + vptern which is more efficient than the
  47    EVEX alternative.  */
  48 # if defined USE_IN_RTM || VEC_SIZE == 64
     /* EVEX-only loop.  All three vzeroupper hooks are replaced here:
        COND_VZEROUPPER and VZEROUPPER expand to nothing and
        VZEROUPPER_RETURN is a plain ret.  */
  49 #  undef COND_VZEROUPPER
  50 #  undef VZEROUPPER_RETURN
  51 #  undef VZEROUPPER
  52
  53
  54 #  define COND_VZEROUPPER
  55 #  define VZEROUPPER_RETURN     ret
  56 #  define VZEROUPPER
  57
  58 #  define USE_TERN_IN_LOOP      0
  59 # else
     /* VEX loop.  Only VZEROUPPER is redefined (to a real vzeroupper);
        COND_VZEROUPPER and VZEROUPPER_RETURN keep whatever definition
        the included headers provide, which is not visible in this
        file.  */
  60 #  define USE_TERN_IN_LOOP      1
  61 #  undef VZEROUPPER
  62 #  define VZEROUPPER    vzeroupper
  63 # endif
  64
  65 # define CHAR_PER_VEC   VEC_SIZE
     /* CHAR_SIZE is 1, so chars per vector and bytes per vector are the
        same number; the *_RETURN_OFFSET values below are therefore
        usable directly as byte displacements.

        Return-label layout.  The code after the main loop falls through
        into the return sequence named FALLTHROUGH_RETURN_LBL, and the
        other one of first_vec_x2 / first_vec_x3 is emitted as a separate
        stub named TAIL_RETURN_LBL.  Which is which depends on whether
        the masks of the last two vectors can be merged into one 64-bit
        register:
          - 64 chars per vector: they cannot, so the loop exit tests the
            third vector on its own (jumping to first_vec_x2) and falls
            through with the fourth vector mask into first_vec_x3.
          - otherwise: both masks are merged and the result is relative
            to the third vector, so the fallthrough is first_vec_x2.  */
  66
  67 # if CHAR_PER_VEC == 64
  68
  69 #  define TAIL_RETURN_LBL       first_vec_x2
  70 #  define TAIL_RETURN_OFFSET    (CHAR_PER_VEC * 2)
  71
  72 #  define FALLTHROUGH_RETURN_LBL        first_vec_x3
  73 #  define FALLTHROUGH_RETURN_OFFSET     (CHAR_PER_VEC * 3)
  74
  75 # else  /* !(CHAR_PER_VEC == 64) */
  76
  77 #  define TAIL_RETURN_LBL       first_vec_x3
  78 #  define TAIL_RETURN_OFFSET    (CHAR_PER_VEC * 3)
  79
  80 #  define FALLTHROUGH_RETURN_LBL        first_vec_x2
  81 #  define FALLTHROUGH_RETURN_OFFSET     (CHAR_PER_VEC * 2)
  82 # endif /* !(CHAR_PER_VEC == 64) */
  83
  84
     /* VMATCH holds the search byte broadcast to every lane.  VMATCH_LO
        names register 0 of the VMM_lo set; the USE_TERN_IN_LOOP build
        copies VMATCH into it for the VEX encoded compares.  The exact
        registers behind VMM()/VMM_lo() come from the vecs header.  */
  85 # define VMATCH VMM(0)
  86 # define VMATCH_LO      VMM_lo(0)
  87
     /* Page size assumed by the entry check that keeps the first,
        unaligned load from crossing into the next page.  */
  88 # define PAGE_SIZE      4096
  89
  90         .section SECTION(.text), "ax", @progbits
     /* RAWMEMCHR: return the address of the first byte at or after rdi
        that equals the low byte of esi.

        In:    rdi = start address
               esi = byte to search for (broadcast with VPBROADCAST)
        Out:   rax = address of the first matching byte
        Uses:  rax, rdi, flags, k0-k3, VMATCH, VMM(2), VMM(3);
               rcx when CHAR_PER_VEC != 64;
               VMATCH_LO and VMM_lo(2)-VMM_lo(4) when USE_TERN_IN_LOOP.

        There is no length argument and the main loop exits only when a
        match is found, so a matching byte must exist.  Apart from the
        very first compare, every load is VEC_SIZE aligned.  */
  91 ENTRY_P2ALIGN (RAWMEMCHR, 6)
  92         VPBROADCAST %esi, %VMATCH
  93         /* Check if we may cross page boundary with one vector load.  */
  94         movl    %edi, %eax
  95         andl    $(PAGE_SIZE - 1), %eax
  96         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
  97         ja      L(page_cross)
  98
             /* Unaligned compare of the first VEC_SIZE bytes.  Bit i of the
                mask is set when byte i equals the search byte.  */
  99         VPCMPEQ (%rdi), %VMATCH, %k0
 100         KMOV    %k0, %VRAX
 101
 102         test    %VRAX, %VRAX
 103         jz      L(aligned_more)
             /* Match in the vector at rdi: the index of the lowest set mask
                bit is the byte offset of the first match.  */
 104 L(first_vec_x0):
 105         bsf     %VRAX, %VRAX
 106         addq    %rdi, %rax
 107         ret
 108
             /* Match in the fourth vector checked by L(aligned_more).  */
 109         .p2align 4,, 4
 110 L(first_vec_x4):
 111         bsf     %VRAX, %VRAX
 112         leaq    (VEC_SIZE * 4)(%rdi, %rax), %rax
 113         ret
 114
 115         /* For VEC_SIZE == 32 we can fit this in aligning bytes so might
 116            as well place it more locally.  For VEC_SIZE == 64 we reuse
 117            return code at the end of loop's return.  */
 118 # if VEC_SIZE == 32
 119         .p2align 4,, 4
 120 L(FALLTHROUGH_RETURN_LBL):
 121         bsf     %VRAX, %VRAX
 122         leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 123         ret
 124 # endif
 125
 126         .p2align 4,, 6
 127 L(page_cross):
 128         /* eax has lower page-offset bits of rdi so xor will zero them
 129            out.  */
 130         xorq    %rdi, %rax
             /* rax is now the base of the page holding rdi, so this reads
                the last VEC_SIZE bytes of that page and cannot touch the
                next one.  */
 131         VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
 132         KMOV    %k0, %VRAX
 133
 134         /* Shift out out-of-bounds matches.  */
             /* The mask also covers bytes before rdi.  shrx only uses the
                low bits of the count; this presumes VRDI/VRAX are
                VEC_SIZE-bit wide views (REG_WIDTH) so the effective count is
                the offset of rdi inside the loaded vector -- confirm against
                reg-macros.h.  After the shift bit 0 corresponds to rdi.  */
 135         shrx    %VRDI, %VRAX, %VRAX
 136         test    %VRAX, %VRAX
 137         jnz     L(first_vec_x0)
             /* No match up to the end of the page: fall through.  */
 138
 139         .p2align 4,, 10
 140 L(aligned_more):
 141 L(page_cross_continue):
 142         /* Align pointer.  */
             /* Round rdi down to VEC_SIZE.  Every byte from the original
                rdi up to the new rdi + VEC_SIZE was already covered by the
                first compare, so checking resumes at offset VEC_SIZE.  */
 143         andq    $(VEC_SIZE * -1), %rdi
 144
 145         VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
 146         KMOV    %k0, %VRAX
 147         test    %VRAX, %VRAX
 148         jnz     L(first_vec_x1)
 149
 150         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
 151         KMOV    %k0, %VRAX
 152         test    %VRAX, %VRAX
 153         jnz     L(first_vec_x2)
 154
 155         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
 156         KMOV    %k0, %VRAX
 157         test    %VRAX, %VRAX
 158         jnz     L(first_vec_x3)
 159
 160         VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
 161         KMOV    %k0, %VRAX
 162         test    %VRAX, %VRAX
 163         jnz     L(first_vec_x4)
 164
             /* Prepare rdi for the loop: add VEC_SIZE, then round down to
                4 * VEC_SIZE.  The loop reads starting at rdi + 4 * VEC_SIZE,
                which lands at or before the end of the four vectors checked
                above, so no byte is skipped (some may be checked twice).  */
 165         subq    $-(VEC_SIZE * 1), %rdi
 166 # if VEC_SIZE == 64
 167         /* Saves code size.  No evex512 processor has partial register
 168            stalls.  If that changes this can be replaced with `andq
 169            $-(VEC_SIZE * 4), %rdi`.  */
             /* Clearing the low byte aligns to 256 == 4 * VEC_SIZE.  */
 170         xorb    %dil, %dil
 171 # else
 172         andq    $-(VEC_SIZE * 4), %rdi
 173 # endif
 174
 175 # if USE_TERN_IN_LOOP
 176         /* copy VMATCH to low ymm so we can use vpcmpeq which is not
 177            encodable with EVEX registers.  NB: this is VEC_SIZE == 32
 178            only as there is no way to encode vpcmpeq with zmm0-15.  */
 179         vmovdqa64 %VMATCH, %VMATCH_LO
 180 # endif
 181
 182         .p2align 4
 183 L(loop_4x_vec):
 184         /* Two versions of the loop.  One that does not require
 185            vzeroupper by not using ymm0-15 and another that does
 186            require vzeroupper because it uses ymm0-15.  The reason why
 187            ymm0-15 is used at all is because there is no EVEX encoding
 188            vpcmpeq and with vpcmpeq this loop can be performed more
 189            efficiently.  The non-vzeroupper version is safe for RTM
 190            while the vzeroupper version should be preferred if RTM is
 191            not supported.   Which loop version we use is determined by
 192            USE_TERN_IN_LOOP.  */
 193
 194 # if USE_TERN_IN_LOOP
 195         /* Since vptern can only take 3x vectors fastest to do 1 vec
 196            separately with EVEX vpcmp.  */
 197         VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
 198         /* Compare 3x with vpcmpeq and or them all together with vptern.
 199          */
 200
             /* rdi is advanced between the second and third compare, so
                offsets 5, then 2 and 3 address the second, third and fourth
                vector of this group.  Subtracting a negative constant keeps
                the immediate at -128 for VEC_SIZE == 32.  */
 201         VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
 202         subq    $(VEC_SIZE * -4), %rdi
 203         VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
 204         VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 205
 206         /* 254 is mask for oring VMM_lo(2), VMM_lo(3), VMM_lo(4) into
 207            VMM_lo(4).  */
 208         vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
 209         vpmovmskb %VMM_lo(4), %VRCX
 210
 211         KMOV    %k1, %eax
 212
 213         /* NB:  rax has match from first VEC and rcx has matches from
 214            VEC 2-4.  If rax is non-zero we will return that match.  If
 215            rax is zero adding won't disturb the bits in rcx.  */
             /* The sum is zero, and the loop continues, only when neither
                mask has a bit set.  */
 216         add     %rax, %rcx
 217 # else
 218         /* Loop version that uses EVEX encoding.  */
             /* k1 = bytes of the first vector that do NOT match (predicate
                4 is not-equal).  The xors leave a zero byte wherever the
                second / third vector matches.  The zero-masked min then has
                a zero byte wherever any of the first three vectors matches,
                and VPTESTN turns those zero bytes into mask k2.  k3 is the
                plain match mask of the fourth vector.  KORTEST sets ZF only
                when k2 | k3 is empty, i.e. no match in the whole group.  */
 219         VPCMP   $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
 220         vpxorq  (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
 221         vpxorq  (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
 222         VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
 223         VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
 224         VPTESTN %VMM(3), %VMM(3), %k2
 225         subq    $(VEC_SIZE * -4), %rdi
 226         KORTEST %k2, %k3
 227 # endif
 228         jz      L(loop_4x_vec)
 229
             /* A match exists in the group.  rdi was already advanced, so
                the four vectors are at offsets 0 .. 3 * VEC_SIZE.  Check
                them in order.  */
 230 # if USE_TERN_IN_LOOP
 231         test    %VRAX, %VRAX
 232 # else
             /* k1 is the inverted match mask.  Adding one wraps an all-ones
                value to zero (no match) and otherwise leaves the lowest set
                bit at the position of the first match, which is all that
                bsf at L(last_vec_x0) needs.  Presumes VRAX is as wide as
                the mask.  */
 233         KMOV    %k1, %VRAX
 234         inc     %VRAX
 235 # endif
 236         jnz     L(last_vec_x0)
 237
 238
             /* Second vector of the group.  */
 239 # if USE_TERN_IN_LOOP
 240         vpmovmskb %VMM_lo(2), %VRAX
 241 # else
 242         VPTESTN %VMM(2), %VMM(2), %k1
 243         KMOV    %k1, %VRAX
 244 # endif
 245         test    %VRAX, %VRAX
 246         jnz     L(last_vec_x1)
 247
 248
             /* Third vector of the group.  In the EVEX version k2 covers the
                first three vectors, but the first two were just found empty
                so only third-vector matches can remain in it.  */
 249 # if USE_TERN_IN_LOOP
 250         vpmovmskb %VMM_lo(3), %VRAX
 251 # else
 252         KMOV    %k2, %VRAX
 253 # endif
 254
 255         /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
 256            (only if used VEX encoded loop).  */
 257         COND_VZEROUPPER
 258
 259         /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
 260            returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
 261            individually, for VEC_SIZE == 32 we combine them in a single
 262            64-bit GPR.  */
 263 # if CHAR_PER_VEC == 64
 264 #  if USE_TERN_IN_LOOP
 265 #   error "Unsupported"
 266 #  endif
 267
 268
 269         /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
 270         test    %VRAX, %VRAX
 271         jnz     L(first_vec_x2)
             /* Otherwise the match is in the fourth vector; fall into the
                return sequence shared with L(aligned_more).  */
 272         KMOV    %k3, %VRAX
 273 L(FALLTHROUGH_RETURN_LBL):
 274 # else
 275         /* CHAR_PER_VEC <= 32 so we can combine the results from the
 276            last 2x VEC.  */
 277 #  if !USE_TERN_IN_LOOP
 278         KMOV    %k3, %VRCX
 279 #  endif
             /* rcx holds fourth-vector matches (in the vptern version it is
                the or of vectors 2-4, of which vector 2 is known empty and
                vector 3 duplicates rax).  Placing it above the third-vector
                mask lets a single bsf find the first match across both.  */
 280         salq    $CHAR_PER_VEC, %rcx
 281         addq    %rcx, %rax
 282 # endif
 283         bsf     %rax, %rax
 284         leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 285         ret
 286
             /* The x2/x3 return that is not the fallthrough one above.  */
 287         .p2align 4,, 8
 288 L(TAIL_RETURN_LBL):
 289         bsf     %rax, %rax
 290         leaq    (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
 291         ret
 292
             /* L(last_vec_x1) is the loop exit and runs COND_VZEROUPPER
                first; L(first_vec_x1) is the entry from L(aligned_more),
                which is reached before the loop.  */
 293         .p2align 4,, 8
 294 L(last_vec_x1):
 295         COND_VZEROUPPER
 296 L(first_vec_x1):
 297         bsf     %VRAX, %VRAX
 298         leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
 299         ret
 300
             /* Loop exit with the match in the first vector of the group.  */
 301         .p2align 4,, 8
 302 L(last_vec_x0):
 303         COND_VZEROUPPER
 304         bsf     %VRAX, %VRAX
 305         addq    %rdi, %rax
 306         ret
 307 END (RAWMEMCHR)
 308 #endif