sysdeps/x86_64/multiarch/strlen-evex-base.S

   1 /* Placeholder function, not used by any processor at the moment.
   2    Copyright (C) 2022 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 /* UNUSED. Exists purely as reference implementation.  */
  20
  21 #include <isa-level.h>
  22
  23 #if ISA_SHOULD_BUILD (4)
  24
  25 # include <sysdep.h>
  26
  27 # ifdef USE_AS_WCSLEN
     /* Wide-character build: use the dword forms of the compare, test
        and unsigned-minimum instructions; one [w]char is 4 bytes.  */
  28 #  define VPCMPEQ       vpcmpeqd
  29 #  define VPTESTN       vptestnmd
  30 #  define VPMINU        vpminud
  31 #  define CHAR_SIZE     4
  32 # else
     /* Byte build: use the byte forms of the same instructions; one
        char is 1 byte.  */
  33 #  define VPCMPEQ       vpcmpeqb
  34 #  define VPTESTN       vptestnmb
  35 #  define VPMINU        vpminub
  36 #  define CHAR_SIZE     1
  37 # endif
  38
     /* PAGE_SIZE is the page size, in bytes, used by the page-cross check
        at function entry.  CHAR_PER_VEC is the number of [w]chars held by
        one vector; VEC_SIZE is not defined in the lines visible here and
        is presumably supplied by the including file.  */
  39 # define PAGE_SIZE      4096
  40 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  41
  42         .section SECTION(.text),"ax",@progbits
     /* STRLEN returns in rax the index, counted in [w]chars, of the first
        null [w]char of the string whose address is in rdi.  When
        USE_AS_STRNLEN is defined, rsi holds the maximum length in
        [w]chars and the result is capped at that value.

        Registers that may be written: rax, rcx, rdx, VMM(0)-VMM(4),
        k0-k3 and the flags.  rdi is only read; rsi is only read apart
        from the __ILP32__ zero-extension below.

        STRLEN, VEC_SIZE, SECTION, VMM, VMM_128, VMOVA, KMOV, KORTEST,
        RSI_LP and the VRAX/VRCX/VRDX aliases are not defined in the
        lines visible here; presumably the including file or headers
        provide them.  */
  43 /* Aligning the entry point to 64 bytes provides better performance
  44    for strings that fit in one vector.  */
  45 ENTRY_P2ALIGN (STRLEN, 6)
  46 # ifdef USE_AS_STRNLEN
  47         /* Check zero length; L(ret_max) returns max length.  */
  48         test    %RSI_LP, %RSI_LP
  49         jz      L(ret_max)
  50 #  ifdef __ILP32__
  51         /* Clear the upper 32 bits of rsi.  */
  52         movl    %esi, %esi
  53 #  endif
  54 # endif
  55
             /* Move the low 12 address bits, the offset within a PAGE_SIZE
                page, to the top of eax.  An offset above PAGE_SIZE -
                VEC_SIZE means a VEC_SIZE load from rdi would cross into
                the next page, so the aligned L(page_cross) path is used
                instead.  The vpxorq in between zeroes VMM(0), through
                what is presumably its 128-bit alias, for use as the null
                comparand.  */
  56         movl    %edi, %eax
  57         vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
  58         sall    $20, %eax
  59         cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
  60         ja      L(page_cross)
  61
  62         /* Compare [w]chars with null; a mask bit is set for each match.  */
  63         VPCMPEQ (%rdi), %VMM(0), %k0
  64 # ifdef USE_AS_STRNLEN
  65         KMOV    %k0, %VRCX
  66         /* Store max length in rax.  */
  67         mov     %rsi, %rax
  68         /* If rcx is 0, bsfq is relied upon to leave rax untouched, so
  69            rax keeps max length.  VRCX and VRAX can not be used here
  70            because, for evex256, the upper 32 bits of ecx and eax may
                be undefined.  NOTE(review): Intel documents the bsf
                destination as undefined for a zero source - confirm the
                supported processors keep it unchanged.  */
  71         bsfq    %rcx, %rax
             /* rax is above CHAR_PER_VEC only when no null was found and
                max length is longer than one vector: keep searching.  */
  72         cmp     $CHAR_PER_VEC, %rax
  73         ja      L(align_more)
             /* Otherwise return the smaller of rax and max length.  */
  74         cmpq    %rax, %rsi
  75         cmovb   %esi, %eax
  76 # else
  77         KMOV    %k0, %VRAX
             /* No null in the first vector: continue with the following
                aligned vectors.  Otherwise the index of the lowest set
                mask bit is the length.  */
  78         test    %VRAX, %VRAX
  79         jz      L(align_more)
  80         bsf     %VRAX, %VRAX
  81 # endif
  82         ret
  83
  84         /* Return max length.  Reached for a zero max length and when
                the remaining-length counter runs out before a null [w]char
                is found.  */
  85 # ifdef USE_AS_STRNLEN
  86         .p2align 4,,3
  87 L(ret_max):
  88         movq    %rsi, %rax
  89         ret
  90 # endif
  91
  92 L(align_more):
  93         mov     %rdi, %rax
  94         /* Align rax down to VEC_SIZE.  */
  95         andq    $-VEC_SIZE, %rax
  96 # ifdef USE_AS_STRNLEN
  97         movq    %rdi, %rdx
  98         subq    %rax, %rdx
  99 #  ifdef USE_AS_WCSLEN
 100         shr     $2, %VRDX
 101 #  endif
 102         /* rdx = offset of rdi from the aligned address, in [w]chars.  */
 103         leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
 104         /* rdx = number of [w]chars, counted from the next aligned vector
 105            at VEC_SIZE(%rax), that max length still allows.  From now on
                rdx is decremented with each vector compared.  */
 106 # endif
 107
 108         /* Check the next four vectors one at a time before entering the
                4 x VEC_SIZE loop.  After the subq below (rax += VEC_SIZE),
                they are at rax, VEC_SIZE(%rax), (VEC_SIZE * 2)(%rax) and
                (VEC_SIZE * 3)(%rax).  */
 109         VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
 110         subq    $-VEC_SIZE, %rax
 111         KMOV    %k0, %VRCX
 112         test    %VRCX, %VRCX
 113         jnz     L(ret_vec_x1)
 114
 115 # ifdef USE_AS_STRNLEN
             /* One more vector consumed; return max length when nothing
                is left of it.  */
 116         subq    $CHAR_PER_VEC, %rdx
 117         jbe     L(ret_max)
 118 # endif
 119
 120         VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
 121         KMOV    %k0, %VRCX
 122         test    %VRCX, %VRCX
 123         jnz     L(ret_vec_x2)
 124
 125 # ifdef USE_AS_STRNLEN
 126         subq    $CHAR_PER_VEC, %rdx
 127         jbe     L(ret_max)
 128 # endif
 129
 130         VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
 131         KMOV    %k0, %VRCX
 132         test    %VRCX, %VRCX
 133         jnz     L(ret_vec_x3)
 134
 135 # ifdef USE_AS_STRNLEN
 136         subq    $CHAR_PER_VEC, %rdx
 137         jbe     L(ret_max)
 138 # endif
 139
 140         VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
 141         KMOV    %k0, %VRCX
 142         test    %VRCX, %VRCX
 143         jnz     L(ret_vec_x4)
 144
 145 # ifdef USE_AS_STRNLEN
 146         subq    $CHAR_PER_VEC, %rdx
 147         jbe     L(ret_max)
 148         /* Save pointer before 4 x VEC_SIZE alignment.  */
 149         movq    %rax, %rcx
 150 # endif
 151
 152         /* Align address down to VEC_SIZE * 4 for the loop.  */
 153         andq    $-(VEC_SIZE * 4), %rax
 154
 155 # ifdef USE_AS_STRNLEN
 156         subq    %rax, %rcx
 157 #  ifdef USE_AS_WCSLEN
 158         shr     $2, %VRCX
 159 #  endif
 160         /* rcx contains the number of [w]chars that will be compared
 161            again because of the alignment.  rdx must be incremented by
 162            rcx to offset the alignment adjustment.  */
 163         addq    %rcx, %rdx
 164         /* Execution falls through into L(loop); rdx is only decremented
 165            at the bottom of an iteration that found no null [w]char.  */
 166 # endif
 167
 168         .p2align 4,,11
 169 L(loop):
 170         /* VPMINU and VPCMP combination provide better performance as
 171            compared to alternative combinations.  VMM(2) is the
                element-wise unsigned minimum of the first two vectors and
                VMM(4) that of the last two, so k0 and k1 get a bit set
                wherever either vector of the pair holds a null [w]char.  */
 172         VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
 173         VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
 174         VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
 175         VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 176
 177         VPTESTN %VMM(2), %VMM(2), %k0
 178         VPTESTN %VMM(4), %VMM(4), %k1
 179
             /* rax += VEC_SIZE * 4, so the four vectors just examined are
                at rax .. (VEC_SIZE * 3)(%rax).  KORTEST presumably sets ZF
                when both masks are empty.  */
 180         subq    $-(VEC_SIZE * 4), %rax
 181         KORTEST %k0, %k1
 182
 183 # ifndef USE_AS_STRNLEN
 184         jz      L(loop)
 185 # else
             /* No null found: four more vectors consumed.  Loop while some
                of max length remains, otherwise return max length.  */
 186         jnz     L(loopend)
 187         subq    $(CHAR_PER_VEC * 4), %rdx
 188         ja      L(loop)
 189         mov     %rsi, %rax
 190         ret
 191 # endif
 192
 193 L(loopend):
 194
             /* A null [w]char is in one of the four vectors at rax ..
                (VEC_SIZE * 3)(%rax); find the first vector holding one.  */
 195         VPTESTN %VMM(1), %VMM(1), %k2
 196         KMOV    %k2, %VRCX
 197         test    %VRCX, %VRCX
 198         jnz     L(ret_vec_x1)
 199
 200         KMOV    %k0, %VRCX
 201         /* At this point, if k0 is non zero, null char must be in the
 202            second vector.  */
 203         test    %VRCX, %VRCX
 204         jnz     L(ret_vec_x2)
 205
 206         VPTESTN %VMM(3), %VMM(3), %k3
 207         KMOV    %k3, %VRCX
 208         test    %VRCX, %VRCX
 209         jnz     L(ret_vec_x3)
 210         /* At this point null [w]char must be in the fourth vector so no
 211            need to check.  */
 212         KMOV    %k1, %VRCX
 213
 214         /* Fourth, third, second vector terminating are pretty much
 215            same, implemented this way to avoid branching and reuse code
 216            from pre loop exit condition.  On entry to L(ret_vec_xN),
                VRCX is the null mask of the matching vector and that vector
                is at rax + (N - 1) * VEC_SIZE.  Each exit computes
                (vector address - rdi) / CHAR_SIZE + index of the lowest set
                mask bit and, for strnlen, caps it at max length.  */
 217 L(ret_vec_x4):
 218         bsf     %VRCX, %VRCX
 219         subq    %rdi, %rax
 220 # ifdef USE_AS_WCSLEN
             /* Add VEC_SIZE * 3 bytes, then convert bytes to wchars.  */
 221         subq    $-(VEC_SIZE * 3), %rax
 222         shrq    $2, %rax
 223         addq    %rcx, %rax
 224 # else
 225         leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
 226 # endif
 227 # ifdef USE_AS_STRNLEN
             /* Return max length if the null is at or beyond it.  */
 228         cmpq    %rsi, %rax
 229         cmovnb  %rsi, %rax
 230 # endif
 231         ret
 232
 233 L(ret_vec_x3):
 234         bsf     %VRCX, %VRCX
 235         subq    %rdi, %rax
 236 # ifdef USE_AS_WCSLEN
             /* Add VEC_SIZE * 2 bytes, then convert bytes to wchars.  */
 237         subq    $-(VEC_SIZE * 2), %rax
 238         shrq    $2, %rax
 239         addq    %rcx, %rax
 240 # else
 241         leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
 242 # endif
 243 # ifdef USE_AS_STRNLEN
 244         cmpq    %rsi, %rax
 245         cmovnb  %rsi, %rax
 246 # endif
 247         ret
 248
 249 L(ret_vec_x2):
             /* rax += VEC_SIZE, then share the code of L(ret_vec_x1).  */
 250         subq    $-VEC_SIZE, %rax
 251 L(ret_vec_x1):
 252         bsf     %VRCX, %VRCX
 253         subq    %rdi, %rax
 254 # ifdef USE_AS_WCSLEN
 255         shrq    $2, %rax
 256 # endif
 257         addq    %rcx, %rax
 258 # ifdef USE_AS_STRNLEN
 259         cmpq    %rsi, %rax
 260         cmovnb  %rsi, %rax
 261 # endif
 262         ret
 263
 264 L(page_cross):
             /* The first vector is loaded from rdi rounded down to
                VEC_SIZE, so the load stays inside the current page.  */
 265         mov     %rdi, %rax
 266         movl    %edi, %ecx
 267         andl    $(VEC_SIZE - 1), %ecx
 268 # ifdef USE_AS_WCSLEN
 269         sarl    $2, %ecx
 270 # endif
 271         /* ecx contains the number of [w]chars to be skipped as a result
 272            of address alignment.  */
 273         andq    $-VEC_SIZE, %rax
 274         VPCMPEQ (%rax), %VMM(0), %k0
 275         KMOV    %k0, %VRDX
 276         /* Shift out the mask bits of the [w]chars that precede rdi.  The
                jz/jnz below uses ZF from this shift.  NOTE(review): shr
                leaves the flags unchanged when cl is 0; presumably cl is
                never 0 here because this path is only taken for an rdi that
                is not VEC_SIZE aligned (and, for wide chars, assuming rdi
                is aligned to CHAR_SIZE) - confirm.  */
 277         shr     %cl, %VRDX
 278 # ifdef USE_AS_STRNLEN
 279         jnz     L(page_cross_end)
             /* No null found.  eax = CHAR_PER_VEC - ecx is the number of
                [w]chars examined from rdi; continue with the next vector
                only if max length is larger than that.  */
 280         movl    $CHAR_PER_VEC, %eax
 281         sub     %ecx, %eax
 282         cmp     %rax, %rsi
 283         ja      L(align_more)
 284 # else
 285         jz      L(align_more)
 286 # endif
 287
 288 L(page_cross_end):
             /* rax = index of the first null [w]char counted from rdi,
                capped at max length for strnlen.  NOTE(review): the
                strnlen fall-through arrives with VRDX equal to 0 and
                presumably relies on bsf leaving rax, which is then not
                below rsi, unchanged so that max length is returned -
                confirm.  */
 289         bsf     %VRDX, %VRAX
 290 # ifdef USE_AS_STRNLEN
 291         cmpq    %rsi, %rax
 292         cmovnb  %esi, %eax
 293 # endif
 294         ret
 295
 296 END (STRLEN)
 297 #endif