sysdeps/x86_64/multiarch/strnlen-evex512.S

   1 /* Placeholder function, not used by any processor at the moment.
   2    Copyright (C) 2022-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #ifndef STRNLEN
     /* Default symbol name for this implementation.  A file that
        includes this one can define STRNLEN first to build the same
        code under a different name.  */
  20 #define STRNLEN __strnlen_evex512
  21 #endif
  22
  23 #include "x86-evex512-vecs.h"
  24 #include "reg-macros.h"
  25
  26 #include <isa-level.h>
  27
  28 #if ISA_SHOULD_BUILD (4)
  29
  30 # include <sysdep.h>
  31
  32 # ifdef USE_AS_WCSLEN
     /* Wide-character build: use the dword forms of the vector
        instructions; one character is 4 bytes.  */
  33 #  define VPCMPEQ       vpcmpeqd
  34 #  define VPTESTN       vptestnmd
  35 #  define VPMINU        vpminud
  36 #  define CHAR_SIZE     4
  37 # else
     /* Default build: use the byte forms of the vector instructions;
        one character is 1 byte.  */
  38 #  define VPCMPEQ       vpcmpeqb
  39 #  define VPTESTN       vptestnmb
  40 #  define VPMINU        vpminub
  41 #  define CHAR_SIZE     1
  42 # endif
  43
     /* Page size used by the page-cross check at the entry point.  */
  44 # define PAGE_SIZE      4096
     /* Number of [w]chars in one vector.  VEC_SIZE is not defined in
        this file; presumably it comes from x86-evex512-vecs.h.  */
  45 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  46
  47         .section SECTION(.text),"ax",@progbits
     /* STRNLEN: length of a [w]char string, limited to a maximum.
        In:   rdi = start of the string
              rsi = maximum length, in [w]chars
        Out:  rax = index of the first null [w]char, or the maximum
              length when no null is found within it
        Scratch: rcx, rdx, VMM(0)-VMM(4), k0-k3, flags.  */
  48 /* The entry point is aligned to 64 bytes, which gives better
  49    performance for strings that fit in one vector.  */
  50 ENTRY_P2ALIGN (STRNLEN, 6)
  51         /* Zero max length: L(ret_max) returns rsi, which is 0.  */
  52         test    %RSI_LP, %RSI_LP
  53         jz      L(ret_max)
  54 #  ifdef __ILP32__
  55         /* Clear the upper 32 bits of the max length in rsi.  */
  56         movl    %esi, %esi
  57 #  endif
  58
             /* Page-cross check.  Shifting the address left by 20 leaves
                its low 12 bits (the offset within a PAGE_SIZE page) at
                the top of eax.  If that offset is above
                PAGE_SIZE - VEC_SIZE, a VEC_SIZE load from rdi would run
                into the next page, so L(page_cross) is used instead.
                In between, vpxorq zeroes VMM_128(0) (presumably the
                128-bit alias of VMM(0), defined outside this file) to
                get the all-zero comparand used by VPCMPEQ.  */
  59         movl    %edi, %eax
  60         vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
  61         sall    $20, %eax
  62         cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
  63         ja      L(page_cross)
  64
  65         /* Compare each [w]char with null; mask bits are set on matches.  */
  66         VPCMPEQ (%rdi), %VMM(0), %k0
  67         KMOV    %k0, %VRCX
  68         /* Store max length in rax.  */
  69         mov     %rsi, %rax
  70         /* If rcx is 0, rax will keep the max length.  We cannot use
  71            VRCX and VRAX here for evex256 because the upper 32 bits
  72            may be undefined for ecx and eax.
                NOTE(review): this relies on bsf leaving its destination
                unchanged when the source is zero; confirm this holds
                on every targeted CPU.  */
  73         bsfq    %rcx, %rax
             /* rax > CHAR_PER_VEC is only possible when no null was
                found (a found index is below CHAR_PER_VEC) and the max
                length exceeds one vector: keep searching.  */
  74         cmp     $CHAR_PER_VEC, %rax
  75         ja      L(align_more)
             /* Return min (rax, rsi).  rax <= CHAR_PER_VEC here, so the
                32-bit cmov is sufficient.  */
  76         cmpq    %rax, %rsi
  77         cmovb   %esi, %eax
  78         ret
  79
  80         /* The max length was reached without finding a null
                [w]char, or the max length was zero on entry: return
                the max length from rsi.  */
  81         .p2align 4,,3
  82 L(ret_max):
  83         movq    %rsi, %rax
  84         ret
  85
  86 L(align_more):
             /* No null was found in the [w]chars checked so far and the
                max length is not exhausted.  Continue with
                VEC_SIZE-aligned compares, starting at the aligned vector
                after the one that holds rdi.  */
  87         mov     %rdi, %rax
  88         /* Align rax down to VEC_SIZE.  */
  89         andq    $-VEC_SIZE, %rax
  90         movq    %rdi, %rdx
  91         subq    %rax, %rdx
  92 #  ifdef USE_AS_WCSLEN
  93         shr     $2, %VRDX
  94 #  endif
  95         /* rdx = offset of the string start within its aligned
                vector, in [w]chars.  */
  96         leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
  97         /* rdx = max length minus the (CHAR_PER_VEC - offset) [w]chars
  98            of the string in that first aligned vector, i.e. the
                number of [w]chars still allowed from the next aligned
                vector on.  From here rdx is decremented after each
                compare.  */
  99
 100         /* Check four vectors one at a time before the 4-vector loop.
                After each compare that finds no null, rdx drops by
                CHAR_PER_VEC and the max length is returned once
                nothing is left.  */
 101         VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
             /* rax += VEC_SIZE: rax now addresses the vector just
                compared; the offsets below are relative to it.  */
 102         subq    $-VEC_SIZE, %rax
 103         KMOV    %k0, %VRCX
 104         test    %VRCX, %VRCX
 105         jnz     L(ret_vec_x1)
 106
 107         subq    $CHAR_PER_VEC, %rdx
 108         jbe     L(ret_max)
 109
 110         VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
 111         KMOV    %k0, %VRCX
 112         test    %VRCX, %VRCX
 113         jnz     L(ret_vec_x2)
 114
 115         subq    $CHAR_PER_VEC, %rdx
 116         jbe     L(ret_max)
 117
 118         VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
 119         KMOV    %k0, %VRCX
 120         test    %VRCX, %VRCX
 121         jnz     L(ret_vec_x3)
 122
 123         subq    $CHAR_PER_VEC, %rdx
 124         jbe     L(ret_max)
 125
 126         VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
 127         KMOV    %k0, %VRCX
 128         test    %VRCX, %VRCX
 129         jnz     L(ret_vec_x4)
 130
 131         subq    $CHAR_PER_VEC, %rdx
 132         jbe     L(ret_max)
 133         /* Save the pointer before the 4 x VEC_SIZE alignment.  */
 134         movq    %rax, %rcx
 135
 136         /* Align rax down to VEC_SIZE * 4 for the loop.  */
 137         andq    $-(VEC_SIZE * 4), %rax
 138
 139         subq    %rax, %rcx
 140 #  ifdef USE_AS_WCSLEN
 141         shr     $2, %VRCX
 142 #  endif
 143         /* rcx = number of [w]chars that will be compared again
 144            because rax moved back.  rdx is increased by rcx to
 145            compensate for that.  */
 146         addq    %rcx, %rdx
 147         /* Fall through into the loop.  rdx is first decremented
 148            there only after a block of four vectors has been tested.  */
 149
 150         .p2align 4,,11
 151 L(loop):
 152         /* Test the four vectors at rax + 4..7 * VEC_SIZE per
 153            iteration.  The VPMINU + VPTESTN combination is used
                because, per the original note, it performs better than
                the alternatives.  VMM(2) is the element-wise unsigned
                minimum of the first two vectors and VMM(4) of the last
                two; a minimum is zero exactly when one of its inputs
                is zero.  */
 154         VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
 155         VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
 156         VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
 157         VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 158
             /* k0 / k1 get a bit set for every zero element of
                VMM(2) / VMM(4).  */
 159         VPTESTN %VMM(2), %VMM(2), %k0
 160         VPTESTN %VMM(4), %VMM(4), %k1
 161
             /* Advance rax to the block just tested, then branch on
                the KORTEST flags: ZF is clear when either mask has a
                bit set, i.e. a null was found.  */
 162         subq    $-(VEC_SIZE * 4), %rax
 163         KORTEST %k0, %k1
 164
 165         jnz     L(loopend)
             /* No null in this block: use up four vectors of the
                remaining length and return the max length once
                nothing is left.  */
 166         subq    $(CHAR_PER_VEC * 4), %rdx
 167         ja      L(loop)
 168         mov     %rsi, %rax
 169         ret
 170
 171 L(loopend):
             /* One of the four vectors of the block at rax holds a null
                [w]char.  Find the first such vector and leave its null
                mask in rcx for the matching L(ret_vec_x*) exit.  */
 172
             /* First vector: VMM(1) still holds it.  */
 173         VPTESTN %VMM(1), %VMM(1), %k2
 174         KMOV    %k2, %VRCX
 175         test    %VRCX, %VRCX
 176         jnz     L(ret_vec_x1)
 177
 178         KMOV    %k0, %VRCX
 179         /* k0 marks nulls in the first or second vector; the first
 180            has none, so a set bit means a null in the second.  */
 181         test    %VRCX, %VRCX
 182         jnz     L(ret_vec_x2)
 183
             /* Third vector: VMM(3) still holds it.  */
 184         VPTESTN %VMM(3), %VMM(3), %k3
 185         KMOV    %k3, %VRCX
 186         test    %VRCX, %VRCX
 187         jnz     L(ret_vec_x3)
 188         /* The null [w]char must be in the fourth vector, so there is
 189            no need to test.  k1 marks nulls in the third or fourth
                vector and the third has none.  Execution continues at
                L(ret_vec_x4).  */
 190         KMOV    %k1, %VRCX
 191
 192         /* The exits for a null in the fourth, third and second/first
 193            vector are nearly the same.  They are written out
 194            separately to avoid branching and are shared with the
                pre-loop checks.
                In: rcx = null mask of the vector that holds the null,
                rax = address the vector offset is relative to (here
                the null is in the vector at rax + 3 * VEC_SIZE).  */
 195 L(ret_vec_x4):
             /* rcx = index of the first null within its vector.  */
 196         bsf     %VRCX, %VRCX
 197         subq    %rdi, %rax
             /* rax = (rax - rdi + 3 * VEC_SIZE) in [w]chars, plus rcx.  */
 198 # ifdef USE_AS_WCSLEN
 199         subq    $-(VEC_SIZE * 3), %rax
 200         shrq    $2, %rax
 201         addq    %rcx, %rax
 202 # else
 203         leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
 204 # endif
 205
             /* The null may lie beyond the max length: return
                min (rax, rsi).  */
 206         cmpq    %rsi, %rax
 207         cmovnb  %rsi, %rax
 208         ret
 209
 210 L(ret_vec_x3):
             /* Exit for a null in the vector at rax + 2 * VEC_SIZE.
                In: rcx = null mask of that vector.
                rcx = index of the first null within the vector.  */
 211         bsf     %VRCX, %VRCX
 212         subq    %rdi, %rax
             /* rax = (rax - rdi + 2 * VEC_SIZE) in [w]chars, plus rcx.  */
 213 # ifdef USE_AS_WCSLEN
 214         subq    $-(VEC_SIZE * 2), %rax
 215         shrq    $2, %rax
 216         addq    %rcx, %rax
 217 # else
 218         leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
 219 # endif
             /* The null may lie beyond the max length: return
                min (rax, rsi).  */
 220         cmpq    %rsi, %rax
 221         cmovnb  %rsi, %rax
 222         ret
 223
 224 L(ret_vec_x2):
             /* Exit for a null in the vector at rax + VEC_SIZE: advance
                rax by VEC_SIZE and share the code of L(ret_vec_x1).  */
 225         subq    $-VEC_SIZE, %rax
 226 L(ret_vec_x1):
             /* Exit for a null in the vector at rax.
                In: rcx = null mask of that vector.
                rcx = index of the first null within the vector.  */
 227         bsf     %VRCX, %VRCX
             /* rax = (rax - rdi) in [w]chars, plus rcx.  */
 228         subq    %rdi, %rax
 229 # ifdef USE_AS_WCSLEN
 230         shrq    $2, %rax
 231 # endif
 232         addq    %rcx, %rax
             /* The null may lie beyond the max length: return
                min (rax, rsi).  */
 233         cmpq    %rsi, %rax
 234         cmovnb  %rsi, %rax
 235         ret
 236
 237 L(page_cross):
             /* Reached from the entry point when a VEC_SIZE load from
                rdi would cross into the next page.  Load the aligned
                vector that contains rdi instead and drop the mask bits
                of the [w]chars in front of the string start.  */
 238         mov     %rdi, %rax
 239         movl    %edi, %ecx
 240         andl    $(VEC_SIZE - 1), %ecx
 241 # ifdef USE_AS_WCSLEN
 242         sarl    $2, %ecx
 243 # endif
 244         /* ecx = number of [w]chars between the aligned address and
 245            the string start; these are skipped.  */
 246         andq    $-VEC_SIZE, %rax
 247         VPCMPEQ (%rax), %VMM(0), %k0
 248         KMOV    %k0, %VRDX
 249         /* Shift out the mask bits of the skipped [w]chars.  The jnz
                below uses the flags of this shift.
                NOTE(review): shr leaves the flags unchanged when the
                count is zero.  The count should be nonzero here for a
                [w]char-aligned rdi, because this path is only taken
                when rdi is not VEC_SIZE-aligned; confirm.  */
 250         shr     %cl, %VRDX
 251         jnz     L(page_cross_end)
             /* No null from the string start to the end of the vector.
                eax = CHAR_PER_VEC - ecx = [w]chars checked.  If the max
                length is larger, continue at L(align_more).  Otherwise
                continue below with a zero mask in rdx: bsf is then
                expected to leave rax unchanged (same reliance as at the
                entry point; confirm), and since rax >= rsi the max
                length is returned.  */
 252         movl    $CHAR_PER_VEC, %eax
 253         sub     %ecx, %eax
 254         cmp     %rax, %rsi
 255         ja      L(align_more)
 256
 257 L(page_cross_end):
             /* rax = index of the first null counted from the string
                start.  Return min (rax, rsi); the 32-bit cmov is
                sufficient because rsi is only copied when it is not
                above rax, which is at most CHAR_PER_VEC here.  */
 258         bsf     %VRDX, %VRAX
 259         cmpq    %rsi, %rax
 260         cmovnb  %esi, %eax
 261         ret
 262
 263 END (STRNLEN)
 264 #endif