sysdeps/powerpc/powerpc64/power8/strlen.S

   1 /* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   2    loop.
   3    Copyright (C) 2016-2023 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* size_t [r3] strlen (const char *s [r3])  */
  23
  24 #ifndef STRLEN
  25 # define STRLEN strlen
  26 #endif
  27         .machine  power8
  28 ENTRY_TOCLESS (STRLEN, 4)
             /* Entry: r3 = address of the string.
                Exit:  r3 = number of bytes that precede the first null byte.

                Register roles in the scalar (GPR) part:
                  r0   constant 0, the comparand for cmpb; reused as a bit
                       count once a null byte has been located
                  r4   address of the doubleword most recently loaded
                  r9   scratch for the first doubleword, afterwards the
                       address of the next unexamined doubleword (r4 + 8)
                  r10  cmpb result: 0xff in every byte position that was null
                  r11  cmpb result for the second doubleword of a pair
                In the vector part r4 is the address of the current 64-byte
                block, r0/r7/r8/r9 are the offsets 0/16/32/48 and v0 is zero.

                Registers written by the instructions visible here (the
                CALL_MCOUNT macro aside): r0, r4-r12, cr0, cr6, cr7, v0-v7,
                v10, v11.  */
  29         CALL_MCOUNT 1
  30         dcbt    0,r3          /* Cache-touch hint for the start of the string.  */
  31         clrrdi  r4,r3,3       /* Align the address to doubleword boundary.  */
  32         rlwinm  r6,r3,3,26,28 /* Calculate padding: r6 = (r3 & 7) * 8 bits.  */
  33         li      r0,0          /* Doubleword with null chars to use
  34                                  with cmpb.  */
  35         li      r5,-1         /* MASK = 0xffffffffffffffff.  */
  36         ld      r12,0(r4)     /* Load doubleword from memory.  */
  37 #ifdef __LITTLE_ENDIAN__
  38         sld     r5,r5,r6      /* MASK = MASK << padding.  */
  39 #else
  40         srd     r5,r5,r6      /* MASK = MASK >> padding.  */
  41 #endif
  42         orc     r9,r12,r5     /* r9 = r12 | ~MASK: bytes before s become 0xff.  */
  43         cmpb    r10,r9,r0     /* r10 = 0xff in each byte of DWORD1 that is null.  */
  44         cmpdi   cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
  45         bne     cr7,L(done)   /* Null in DWORD1: r4 and r10 are already set.  */
  46
  47         /* For shorter strings (< 64 bytes), we will not use vector registers,
  48            as the overhead isn't worth it.  So, let's use GPRs instead.  This
  49            will be done the same way as we do in the POWER7 implementation.
  50            Let's see if we are aligned to a quadword boundary.  If so, we can
  51            jump to the first (non-vectorized) loop.  Otherwise, we have to
  52            handle the next DWORD first.  */
  53         mtcrf   0x01,r4       /* cr7 = low 4 bits of the aligned address.  */
  54         mr      r9,r4
  55         addi    r9,r9,8       /* r9 = address of the next doubleword.  */
  56         bt      28,L(align64) /* Address bit 8 set: r4 + 8 is quadword aligned.  */
  57
  58         /* Handle the next 8 bytes so we are aligned to a quadword
  59            boundary.  */
  60         ldu     r5,8(r4)      /* Load from r4 + 8 and advance r4 to it.  */
  61         cmpb    r10,r5,r0
  62         cmpdi   cr7,r10,0
  63         addi    r9,r9,8       /* Keep r9 = r4 + 8.  */
  64         bne     cr7,L(done)
  65
  66 L(align64):
  67         /* Proceed to the old (POWER7) implementation, checking two doublewords
  68            per iteration.  For the first 56 bytes, we will just check for null
  69            characters.  After that, we will also check if we are 64-byte aligned
  70            so we can jump to the vectorized implementation.  We will unroll
  71            these loops to avoid excessive branching.  */
  72         ld      r6,8(r4)      /* First doubleword of the pair.  */
  73         ldu     r5,16(r4)     /* Second doubleword; r4 now points at it.  */
  74         cmpb    r10,r6,r0     /* r10 = null mask of the first doubleword.  */
  75         cmpb    r11,r5,r0     /* r11 = null mask of the second doubleword.  */
  76         or      r5,r10,r11    /* Nonzero if either doubleword holds a null.  */
  77         cmpdi   cr7,r5,0
  78         addi    r9,r9,16      /* Keep r9 = r4 + 8.  */
  79         bne     cr7,L(dword_zero)
  80
  81         ld      r6,8(r4)      /* Second unrolled copy of the pair check.  */
  82         ldu     r5,16(r4)
  83         cmpb    r10,r6,r0
  84         cmpb    r11,r5,r0
  85         or      r5,r10,r11
  86         cmpdi   cr7,r5,0
  87         addi    r9,r9,16
  88         bne     cr7,L(dword_zero)
  89
  90         ld      r6,8(r4)      /* Third unrolled copy of the pair check.  */
  91         ldu     r5,16(r4)
  92         cmpb    r10,r6,r0
  93         cmpb    r11,r5,r0
  94         or      r5,r10,r11
  95         cmpdi   cr7,r5,0
  96         addi    r9,r9,16
  97         bne     cr7,L(dword_zero)
  98
  99         /* Are we 64-byte aligned? If so, jump to the vectorized loop.
 100            Note: aligning to 64-byte will necessarily slow down performance for
 101            strings around 64 bytes in length due to the extra comparisons
 102            required to check alignment for the vectorized loop.  This is a
 103            necessary tradeoff we are willing to take in order to speed up the
 104            calculation for larger strings.  */
 105         andi.   r10,r9,63     /* cr0 = (r9 & 63) vs 0; r10 is only scratch here.  */
 106         beq     cr0,L(preloop)
 107         ld      r6,8(r4)      /* Not aligned yet: check another pair.  */
 108         ldu     r5,16(r4)
 109         cmpb    r10,r6,r0     /* r10 is rewritten with a real null mask.  */
 110         cmpb    r11,r5,r0
 111         or      r5,r10,r11
 112         cmpdi   cr7,r5,0
 113         addi    r9,r9,16
 114         bne     cr7,L(dword_zero)
 115
 116         andi.   r10,r9,63     /* Same alignment test, second attempt.  */
 117         beq     cr0,L(preloop)
 118         ld      r6,8(r4)
 119         ldu     r5,16(r4)
 120         cmpb    r10,r6,r0
 121         cmpb    r11,r5,r0
 122         or      r5,r10,r11
 123         cmpdi   cr7,r5,0
 124         addi    r9,r9,16
 125         bne     cr7,L(dword_zero)
 126
 127         andi.   r10,r9,63     /* Same alignment test, third attempt.  */
 128         beq     cr0,L(preloop)
 129         ld      r6,8(r4)
 130         ldu     r5,16(r4)
 131         cmpb    r10,r6,r0
 132         cmpb    r11,r5,r0
 133         or      r5,r10,r11
 134         cmpdi   cr7,r5,0
 135         addi    r9,r9,16
 136
 137         /* At this point, we are necessarily 64-byte aligned.  If no zeroes were
 138            found, jump to the vectorized loop.  */
 139         beq     cr7,L(preloop)
 140
 141 L(dword_zero):
 142         /* OK, one (or both) of the doublewords contains a null byte.  Check
 143            the first doubleword and decrement the address in case the first
 144            doubleword really contains a null byte.  */
 145
 146         cmpdi   cr6,r10,0     /* Is the null in the first doubleword?  */
 147         addi    r4,r4,-8      /* Point r4 at the first doubleword.  */
 148         bne     cr6,L(done)
 149
 150         /* The null byte must be in the second doubleword.  Adjust the address
 151            again and move the result of cmpb to r10 so we can calculate the
 152            length.  */
 153
 154         mr      r10,r11
 155         addi    r4,r4,8       /* Back to the second doubleword.  */
 156
 157         /* If the null byte was found in the non-vectorized code, compute the
 158            final length.  r10 has the output of the cmpb instruction, that is,
 159            it contains 0xff in the same position as the null byte in the
 160            original doubleword from the string.  Use that to calculate the
 161            length.  */
 162 L(done):
 163 #ifdef __LITTLE_ENDIAN__
 164         addi    r9, r10,-1    /* Form a mask from trailing zeros.  */
 165         andc    r9, r9,r10    /* r9 = (r10 - 1) & ~r10.  */
 166         popcntd r0, r9        /* Count the bits in the mask.  */
 167 #else
 168         cntlzd  r0,r10        /* Count leading zeros before the match.  */
 169 #endif
 170         subf    r5,r3,r4      /* r5 = r4 - r3 (not positive for DWORD1).  */
 171         srdi    r0,r0,3       /* Convert leading/trailing zeros to bytes.  */
 172         add     r3,r5,r0      /* Compute final length.  */
 173         blr
 174
 175         /* Vectorized implementation starts here.  */
 176         .p2align  4
 177 L(preloop):
 178         /* Set up for the loop.  */
 179         mr      r4,r9         /* r4 = first unexamined, 64-byte aligned address.  */
 180         li      r7, 16        /* Load required offsets.  */
 181         li      r8, 32
 182         li      r9, 48
 183         li      r12, 8        /* NOTE(review): r12 is not read again below.  */
 184         vxor    v0,v0,v0      /* VR with null chars to use with
 185                                  vcmpequb.  */
 186
 187         /* Main loop to look for the end of the string.  We will read in
 188            64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
 189            leverage the icache performance.  */
 190         .p2align  5
 191 L(loop):
 192         lvx       v1,r4,r0  /* Load 4 quadwords (r0 still holds 0).  */
 193         lvx       v2,r4,r7
 194         lvx       v3,r4,r8
 195         lvx       v4,r4,r9
 196         vminub    v5,v1,v2  /* Bytewise min: a null in either input survives.  */
 197         vminub    v6,v3,v4
 198         vminub    v7,v5,v6  /* v7 has a null byte iff the block has one.  */
 199         vcmpequb. v7,v7,v0  /* Check for NULLs; the result goes to cr6.  */
 200         addi      r4,r4,64  /* Adjust address for the next iteration.  */
 201         bne       cr6,L(vmx_zero) /* Taken when some byte compared equal.  */
 202
 203         lvx       v1,r4,r0  /* Load 4 quadwords (second unrolled copy).  */
 204         lvx       v2,r4,r7
 205         lvx       v3,r4,r8
 206         lvx       v4,r4,r9
 207         vminub    v5,v1,v2  /* Compare and merge into one VR for speed.  */
 208         vminub    v6,v3,v4
 209         vminub    v7,v5,v6
 210         vcmpequb. v7,v7,v0  /* Check for NULLs.  */
 211         addi      r4,r4,64  /* Adjust address for the next iteration.  */
 212         bne       cr6,L(vmx_zero)
 213
 214         lvx       v1,r4,r0  /* Load 4 quadwords (third unrolled copy).  */
 215         lvx       v2,r4,r7
 216         lvx       v3,r4,r8
 217         lvx       v4,r4,r9
 218         vminub    v5,v1,v2  /* Compare and merge into one VR for speed.  */
 219         vminub    v6,v3,v4
 220         vminub    v7,v5,v6
 221         vcmpequb. v7,v7,v0  /* Check for NULLs.  */
 222         addi      r4,r4,64  /* Adjust address for the next iteration.  */
 223         beq       cr6,L(loop) /* No byte matched: keep scanning.  */
 224
 225 L(vmx_zero):
 226         /* OK, we found a null byte.  Let's look for it in the current 64-byte
 227            block and mark it in its corresponding VR.  Note that r4 has
 228            already been advanced past this block.  */
 228         vcmpequb  v1,v1,v0  /* 0xff in each null byte position, per quadword.  */
 229         vcmpequb  v2,v2,v0
 230         vcmpequb  v3,v3,v0
 231         vcmpequb  v4,v4,v0
 232
 233         /* We will now 'compress' the result into a single doubleword, so it
 234            can be moved to a GPR for the final calculation.  First, we
 235            generate an appropriate mask for vbpermq, so we can permute bits into
 236            the first halfword.  */
 237         vspltisb  v10,3     /* Every byte of v10 = 3 (the shift count).  */
 238         lvsl      v11,r0,r0 /* Effective address 0: v11 = bytes 0,1,...,15.  */
 239         vslb      v10,v11,v10 /* v10 = 0,8,...,120: first bit of each byte.  */
 240
 241         /* Permute the first bit of each byte into bits 48-63.  */
 242         vbpermq v1,v1,v10
 243         vbpermq v2,v2,v10
 244         vbpermq v3,v3,v10
 245         vbpermq v4,v4,v10
 246
 247         /* Shift each component into its correct position for merging.  */
 248 #ifdef __LITTLE_ENDIAN__
 249         vsldoi  v2,v2,v2,2
 250         vsldoi  v3,v3,v3,4
 251         vsldoi  v4,v4,v4,6
 252 #else
 253         vsldoi  v1,v1,v1,6
 254         vsldoi  v2,v2,v2,4
 255         vsldoi  v3,v3,v3,2
 256 #endif
 257
 258         /* Merge the results and move to a GPR.  */
 259         vor     v1,v2,v1
 260         vor     v2,v3,v4
 261         vor     v4,v1,v2
 262         mfvrd   r10,v4        /* r10 = one mask bit per byte of the block.  */
 263
 264          /* Adjust address to the beginning of the current 64-byte block.  */
 265         addi    r4,r4,-64
 266
 267 #ifdef __LITTLE_ENDIAN__
 268         addi    r9, r10,-1    /* Form a mask from trailing zeros.  */
 269         andc    r9, r9,r10
 270         popcntd r0, r9        /* Count the bits in the mask.  */
 271 #else
 272         cntlzd  r0,r10        /* Count leading zeros before the match.  */
 273 #endif
 274         subf    r5,r3,r4      /* r5 = bytes from s to the start of the block.  */
 275         add     r3,r5,r0      /* Compute final length.  Each mask bit stands
                                     for one byte, so r0 needs no shift here.  */
 276         blr
 277
 278 END (STRLEN)
 279 libc_hidden_builtin_def (strlen)