sysdeps/x86_64/multiarch/memrchr-avx2.S

   1 /* memrchr optimized with AVX2.
   2    Copyright (C) 2017-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (3)
  22
  23 # include <sysdep.h>
  24
  25 # ifndef MEMRCHR
  26 #  define MEMRCHR                               __memrchr_avx2
  27 # endif
  28
  29 # ifndef VZEROUPPER
  30 #  define VZEROUPPER                    vzeroupper
  31 # endif
  32
  33 # ifndef SECTION
  34 #  define SECTION(p)    p##.avx
  35 # endif
  36
  37 # define VEC_SIZE                       32
  38 # define PAGE_SIZE                      4096
  39         .section SECTION(.text), "ax", @progbits
  40 ENTRY_P2ALIGN(MEMRCHR, 6)
  41 # ifdef __ILP32__
  42         /* Clear upper bits.  */
  43         and     %RDX_LP, %RDX_LP
  44 # else
  45         test    %RDX_LP, %RDX_LP
  46 # endif
  47         jz      L(zero_0)
  48
  49         vmovd   %esi, %xmm0
  50         /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
  51            correct page cross check and 2) it correctly sets up end ptr to be
  52            subtract by lzcnt aligned.  */
  53         leaq    -1(%rdx, %rdi), %rax
  54
  55         vpbroadcastb %xmm0, %ymm0
  56
  57         /* Check if we can load 1x VEC without cross a page.  */
  58         testl   $(PAGE_SIZE - VEC_SIZE), %eax
  59         jz      L(page_cross)
  60
  61         vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
  62         vpmovmskb %ymm1, %ecx
  63         cmpq    $VEC_SIZE, %rdx
  64         ja      L(more_1x_vec)
  65
  66 L(ret_vec_x0_test):
  67         /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
  68            will guarantee edx (len) is less than it.  */
  69         lzcntl  %ecx, %ecx
  70
  71         /* Hoist vzeroupper (not great for RTM) to save code size. This allows
  72            all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
  73         COND_VZEROUPPER
  74         cmpl    %ecx, %edx
  75         jle     L(zero_0)
  76         subq    %rcx, %rax
  77         ret
  78
  79         /* Fits in aligning bytes of first cache line.  */
  80 L(zero_0):
  81         xorl    %eax, %eax
  82         ret
  83
  84         .p2align 4,, 9
  85 L(ret_vec_x0):
  86         lzcntl  %ecx, %ecx
  87         subq    %rcx, %rax
  88 L(return_vzeroupper):
  89         ZERO_UPPER_VEC_REGISTERS_RETURN
  90
  91         .p2align 4,, 10
  92 L(more_1x_vec):
  93         testl   %ecx, %ecx
  94         jnz     L(ret_vec_x0)
  95
  96         /* Align rax (string pointer).  */
  97         andq    $-VEC_SIZE, %rax
  98
  99         /* Recompute remaining length after aligning.  */
 100         movq    %rax, %rdx
 101         /* Need this comparison next no matter what.  */
 102         vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
 103         subq    %rdi, %rdx
 104         decq    %rax
 105         vpmovmskb %ymm1, %ecx
 106         /* Fall through for short (hotter than length).  */
 107         cmpq    $(VEC_SIZE * 2), %rdx
 108         ja      L(more_2x_vec)
 109 L(last_2x_vec):
 110         cmpl    $VEC_SIZE, %edx
 111         jbe     L(ret_vec_x0_test)
 112
 113         testl   %ecx, %ecx
 114         jnz     L(ret_vec_x0)
 115
 116         vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
 117         vpmovmskb %ymm1, %ecx
 118         /* 64-bit lzcnt. This will naturally add 32 to position.  */
 119         lzcntq  %rcx, %rcx
 120         COND_VZEROUPPER
 121         cmpl    %ecx, %edx
 122         jle     L(zero_0)
 123         subq    %rcx, %rax
 124         ret
 125
 126
 127         /* Inexpensive place to put this regarding code size / target alignments
 128            / ICache NLP. Necessary for 2-byte encoding of jump to page cross
 129            case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
 130            in first cache line.  */
 131 L(page_cross):
 132         movq    %rax, %rsi
 133         andq    $-VEC_SIZE, %rsi
 134         vpcmpeqb (%rsi), %ymm0, %ymm1
 135         vpmovmskb %ymm1, %ecx
 136         /* Shift out negative alignment (because we are starting from endptr and
 137            working backwards).  */
 138         movl    %eax, %r8d
 139         /* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
 140         notl    %r8d
 141         shlxl   %r8d, %ecx, %ecx
 142         cmpq    %rdi, %rsi
 143         ja      L(more_1x_vec)
 144         lzcntl  %ecx, %ecx
 145         COND_VZEROUPPER
 146         cmpl    %ecx, %edx
 147         jle     L(zero_0)
 148         subq    %rcx, %rax
 149         ret
 150         .p2align 4,, 11
 151 L(ret_vec_x1):
 152         /* This will naturally add 32 to position.  */
 153         lzcntq  %rcx, %rcx
 154         subq    %rcx, %rax
 155         VZEROUPPER_RETURN
 156         .p2align 4,, 10
 157 L(more_2x_vec):
 158         testl   %ecx, %ecx
 159         jnz     L(ret_vec_x0)
 160
 161         vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
 162         vpmovmskb %ymm1, %ecx
 163         testl   %ecx, %ecx
 164         jnz     L(ret_vec_x1)
 165
 166
 167         /* Needed no matter what.  */
 168         vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
 169         vpmovmskb %ymm1, %ecx
 170
 171         subq    $(VEC_SIZE * 4), %rdx
 172         ja      L(more_4x_vec)
 173
 174         cmpl    $(VEC_SIZE * -1), %edx
 175         jle     L(ret_vec_x2_test)
 176
 177 L(last_vec):
 178         testl   %ecx, %ecx
 179         jnz     L(ret_vec_x2)
 180
 181         /* Needed no matter what.  */
 182         vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
 183         vpmovmskb %ymm1, %ecx
 184         lzcntl  %ecx, %ecx
 185         subq    $(VEC_SIZE * 3), %rax
 186         COND_VZEROUPPER
 187         subq    %rcx, %rax
 188         cmpq    %rax, %rdi
 189         ja      L(zero_2)
 190         ret
 191
 192         /* First in aligning bytes.  */
 193 L(zero_2):
 194         xorl    %eax, %eax
 195         ret
 196
 197         .p2align 4,, 4
 198 L(ret_vec_x2_test):
 199         lzcntl  %ecx, %ecx
 200         subq    $(VEC_SIZE * 2), %rax
 201         COND_VZEROUPPER
 202         subq    %rcx, %rax
 203         cmpq    %rax, %rdi
 204         ja      L(zero_2)
 205         ret
 206
 207
 208         .p2align 4,, 11
 209 L(ret_vec_x2):
 210         /* ecx must be non-zero.  */
 211         bsrl    %ecx, %ecx
 212         leaq    (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
 213         VZEROUPPER_RETURN
 214
 215         .p2align 4,, 14
 216 L(ret_vec_x3):
 217         /* ecx must be non-zero.  */
 218         bsrl    %ecx, %ecx
 219         leaq    (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
 220         VZEROUPPER_RETURN
 221
 222
 223
 224         .p2align 4
 225 L(more_4x_vec):
 226         testl   %ecx, %ecx
 227         jnz     L(ret_vec_x2)
 228
 229         vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
 230         vpmovmskb %ymm1, %ecx
 231
 232         testl   %ecx, %ecx
 233         jnz     L(ret_vec_x3)
 234
 235         /* Check if near end before re-aligning (otherwise might do an
 236            unnecessary loop iteration).  */
 237         addq    $-(VEC_SIZE * 4), %rax
 238         cmpq    $(VEC_SIZE * 4), %rdx
 239         jbe     L(last_4x_vec)
 240
 241         /* Align rax to (VEC_SIZE - 1).  */
 242         orq     $(VEC_SIZE * 4 - 1), %rax
 243         movq    %rdi, %rdx
 244         /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
 245            lengths that overflow can be valid and break the comparison.  */
 246         orq     $(VEC_SIZE * 4 - 1), %rdx
 247
 248         .p2align 4
 249 L(loop_4x_vec):
 250         /* Need this comparison next no matter what.  */
 251         vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
 252         vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
 253         vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
 254         vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 255
 256         vpor    %ymm1, %ymm2, %ymm2
 257         vpor    %ymm3, %ymm4, %ymm4
 258         vpor    %ymm2, %ymm4, %ymm4
 259         vpmovmskb %ymm4, %esi
 260
 261         testl   %esi, %esi
 262         jnz     L(loop_end)
 263
 264         addq    $(VEC_SIZE * -4), %rax
 265         cmpq    %rdx, %rax
 266         jne     L(loop_4x_vec)
 267
 268         subl    %edi, %edx
 269         incl    %edx
 270
 271 L(last_4x_vec):
 272         /* Used no matter what.  */
 273         vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
 274         vpmovmskb %ymm1, %ecx
 275
 276         cmpl    $(VEC_SIZE * 2), %edx
 277         jbe     L(last_2x_vec)
 278
 279         testl   %ecx, %ecx
 280         jnz     L(ret_vec_x0_end)
 281
 282         vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
 283         vpmovmskb %ymm1, %ecx
 284         testl   %ecx, %ecx
 285         jnz     L(ret_vec_x1_end)
 286
 287         /* Used no matter what.  */
 288         vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
 289         vpmovmskb %ymm1, %ecx
 290
 291         cmpl    $(VEC_SIZE * 3), %edx
 292         ja      L(last_vec)
 293
 294         lzcntl  %ecx, %ecx
 295         subq    $(VEC_SIZE * 2), %rax
 296         COND_VZEROUPPER
 297         subq    %rcx, %rax
 298         cmpq    %rax, %rdi
 299         jbe     L(ret0)
 300         xorl    %eax, %eax
 301 L(ret0):
 302         ret
 303
 304
 305         .p2align 4
 306 L(loop_end):
 307         vpmovmskb %ymm1, %ecx
 308         testl   %ecx, %ecx
 309         jnz     L(ret_vec_x0_end)
 310
 311         vpmovmskb %ymm2, %ecx
 312         testl   %ecx, %ecx
 313         jnz     L(ret_vec_x1_end)
 314
 315         vpmovmskb %ymm3, %ecx
 316         /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
 317            then it won't affect the result in esi (VEC4). If ecx is non-zero
 318            then CHAR in VEC3 and bsrq will use that position.  */
 319         salq    $32, %rcx
 320         orq     %rsi, %rcx
 321         bsrq    %rcx, %rcx
 322         leaq    (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
 323         VZEROUPPER_RETURN
 324
 325         .p2align 4,, 4
 326 L(ret_vec_x1_end):
 327         /* 64-bit version will automatically add 32 (VEC_SIZE).  */
 328         lzcntq  %rcx, %rcx
 329         subq    %rcx, %rax
 330         VZEROUPPER_RETURN
 331
 332         .p2align 4,, 4
 333 L(ret_vec_x0_end):
 334         lzcntl  %ecx, %ecx
 335         subq    %rcx, %rax
 336         VZEROUPPER_RETURN
 337
 338         /* 2 bytes until next cache line.  */
 339 END(MEMRCHR)
 340 #endif