1 /* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
3 Copyright (C) 2016-2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 /* int [r3] strlen (char *s [r3]) */
25 # define STRLEN strlen
28 ENTRY_TOCLESS (STRLEN, 4)
/* NOTE(review): this listing appears to be a partial extract — several
   original lines (labels, branch targets, and the big-endian #else/#endif
   arms of the __LITTLE_ENDIAN__ blocks) are not visible here.  Confirm
   against the complete source before making code changes.

   Contract (per the header comment above): on entry r3 = pointer to a
   NUL-terminated string; on return r3 = string length (see the
   "Compute final length" adds below).

   Strategy: strings shorter than 64 bytes are handled entirely in GPRs
   using the POWER7-style doubleword cmpb loop; longer strings are first
   brought to 64-byte alignment and then scanned 64 bytes per iteration
   with VMX vector compares.  */
31 clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
32 rlwinm r6,r3,3,26,28 /* Calculate padding. */
33 li r0,0 /* Doubleword with null chars to use
35 li r5,-1 /* MASK = 0xffffffffffffffff. */
36 ld r12,0(r4) /* Load doubleword from memory. */
37 #ifdef __LITTLE_ENDIAN__
40 srd r5,r5,r6 /* MASK = MASK >> padding. */
42 orc r9,r12,r5 /* Mask bits that are not part of the string. */
43 cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
44 cmpdi cr7,r10,0 /* If r10 == 0, no nulls have been found. */
47 /* For shorter strings (< 64 bytes), we will not use vector registers,
48 as the overhead isn't worth it. So, let's use GPRs instead. This
49 will be done the same way as we do in the POWER7 implementation.
50 Let's see if we are aligned to a quadword boundary. If so, we can
51 jump to the first (non-vectorized) loop. Otherwise, we have to
52 handle the next DWORD first. */
58 /* Handle the next 8 bytes so we are aligned to a quadword
67 /* Proceed to the old (POWER7) implementation, checking two doublewords
68 per iteration. For the first 56 bytes, we will just check for null
69 characters. After that, we will also check if we are 64-byte aligned
70 so we can jump to the vectorized implementation. We will unroll
71 these loops to avoid excessive branching. */
99 /* Are we 64-byte aligned? If so, jump to the vectorized loop.
100 Note: aligning to 64-byte will necessarily slow down performance for
101 strings around 64 bytes in length due to the extra comparisons
102 required to check alignment for the vectorized loop. This is a
103 necessary tradeoff we are willing to take in order to speed up the
104 calculation for larger strings. */
114 bne cr7,L(dword_zero)
125 bne cr7,L(dword_zero)
137 /* At this point, we are necessarily 64-byte aligned. If no zeroes were
138 found, jump to the vectorized loop. */
142 /* OK, one (or both) of the doublewords contains a null byte. Check
143 the first doubleword and decrement the address in case the first
144 doubleword really contains a null byte. */
150 /* The null byte must be in the second doubleword. Adjust the address
151 again and move the result of cmpb to r10 so we can calculate the
157 /* If the null byte was found in the non-vectorized code, compute the
158 final length. r10 has the output of the cmpb instruction, that is,
159 it contains 0xff in the same position as the null byte in the
160 original doubleword from the string. Use that to calculate the
163 #ifdef __LITTLE_ENDIAN__
164 addi r9, r10,-1 /* Form a mask from trailing zeros. */
166 popcntd r0, r9 /* Count the bits in the mask. */
168 cntlzd r0,r10 /* Count leading zeros before the match. */
171 srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
172 add r3,r5,r0 /* Compute final length. */
175 /* Vectorized implementation starts here. */
178 /* Set up for the loop. */
180 li r7, 16 /* Load required offsets. */
184 vxor v0,v0,v0 /* VR with null chars to use with
187 /* Main loop to look for the end of the string. We will read in
188 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
189 leverage the icache performance. */
192 lvx v1,r4,r0 /* Load 4 quadwords. */
196 vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
199 vcmpequb. v7,v7,v0 /* Check for NULLs. */
200 addi r4,r4,64 /* Adjust address for the next iteration. */
203 lvx v1,r4,r0 /* Load 4 quadwords. */
207 vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
210 vcmpequb. v7,v7,v0 /* Check for NULLs. */
211 addi r4,r4,64 /* Adjust address for the next iteration. */
214 lvx v1,r4,r0 /* Load 4 quadwords. */
218 vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
221 vcmpequb. v7,v7,v0 /* Check for NULLs. */
222 addi r4,r4,64 /* Adjust address for the next iteration. */
226 /* OK, we found a null byte. Let's look for it in the current 64-byte
227 block and mark it in its corresponding VR. */
233 /* We will now 'compress' the result into a single doubleword, so it
234 can be moved to a GPR for the final calculation. First, we
235 generate an appropriate mask for vbpermq, so we can permute bits into
236 the first halfword. */
241 /* Permute the first bit of each byte into bits 48-63. */
247 /* Shift each component into its correct position for merging. */
248 #ifdef __LITTLE_ENDIAN__
258 /* Merge the results and move to a GPR. */
264 /* Adjust address to the beginning of the current 64-byte block. */
267 #ifdef __LITTLE_ENDIAN__
268 addi r9, r10,-1 /* Form a mask from trailing zeros. */
270 popcntd r0, r9 /* Count the bits in the mask. */
272 cntlzd r0,r10 /* Count leading zeros before the match. */
275 add r3,r5,r0 /* Compute final length. */
279 libc_hidden_builtin_def (strlen)