1 /* Optimized strnlen implementation for POWER8 using a vmx loop.
3 Copyright (C) 2017 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with the GNU C Library; if not, see
15 <http://www.gnu.org/licenses/>. */
17 /* The following heuristic is implemented:
18 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
19 reading doublewords. Uses the POWER7 algorithm.
20 2. Case maxlen > 32: check for null bytes in the first 16 bytes using
21 unaligned accesses. Return length if found. Otherwise:
22 2.1 Case maxlen < 64: deduct the bytes previously read, align
23 the pointer to 16 bytes and loop through reading quadwords
24 until find null bytes or reach maxlen.
25 2.2 Case maxlen >= 64: deduct the bytes previously read, align
26 the pointer to 64 bytes and set up a counter to loop through
27 reading in strides of 64 bytes. In case it finished the loop
28 with null bytes not found, process the remainder bytes by
29 switching to the loop to heuristic in 2.1. */
33 /* Define default page size to 4KB. */
34 #define PAGE_SIZE 4096
36 /* The following macros implement Power ISA v2.07 opcodes
37 that cannot be used directly in this code, to keep
38 compatibility with older binutils versions. */
40 /* Move from vector register doubleword. */
41 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
43 /* Move to vector register doubleword. */
44 #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
46 /* Vector Bit Permute Quadword. */
/* NOTE(review): the backslash-continued body of VBPERMQ below appears
   truncated in this listing (the operand-encoding terms are missing) —
   confirm against the original source before assembling. */
47 #define VBPERMQ(t,a,b) .long (0x1000054c \
52 /* Vector Population Count Halfword. */
53 #define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
55 /* Vector Count Leading Zeros Halfword. */
56 #define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
59 /* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
60 /* TODO: change to power8 when minimum required binutils allows it. */
66 cmpldi r4,32 /* Check if maxlen <= 32. */
67 ble L(small_range) /* If maxlen <= 32. */
69 /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary
70 otherwise the processor throws a memory access error.
71 Use following code to check there is room for such accesses:
72 (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
73 If it is disallowed then switch to the code that handles
74 the string when maxlen <= 32. */
76 cmpldi cr7,r10,PAGE_SIZE-16 /* NOTE(review): r10 presumably holds s % PAGE_SIZE, computed just above (not visible in this excerpt). */
77 bgt cr7,L(small_range) /* If less than 16B of page end. */
79 /* Compute our permute constant r8. */
81 /* Compute a bpermd constant to move bit 0 of each word into
82 a halfword value, and count trailing zeros. */
83 #ifdef __LITTLE_ENDIAN__
97 /* maxlen > 32. Optimistically check for null bytes in the first
98 16 bytes of the string using unaligned accesses. */
101 cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
102 cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
104 bne cr0, L(early_find) /* If found null bytes. */
106 /* At this point maxlen > 32 and null bytes were not found at first
107 16 bytes. Prepare for loop using VMX. */
109 /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
111 addi r5,r3,16 /* Align up, or just add the 16B we already checked. */
114 and r7,r5,r0 /* Find offset into 16B alignment. */
115 andc r5,r5,r0 /* Quadword align up s to the next quadword. */
118 subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
121 /* Compute offsets for vmx loads, and precompute the vbpermq
122 constants for both the 64B and 16B loops. */
129 cmpldi r4,64 /* Check maxlen < 64. */
130 blt L(smaller) /* If maxlen < 64 */
132 /* In order to begin the 64B loop, it needs to be 64
133 bytes aligned. So read quadwords until it is aligned or found null
134 bytes. At worst case it will be aligned after the fourth iteration,
135 so unroll the loop to avoid counter checking. */
136 andi. r7,r5,63 /* Check if is 64 bytes aligned. */
137 beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
141 addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
142 bne cr6,L(found_aligning64B) /* If found null bytes. */
144 /* Unroll 3x above code block until aligned or find null bytes. */
146 beq cr0,L(preloop_64B)
151 bne cr6,L(found_aligning64B)
154 beq cr0,L(preloop_64B)
159 bne cr6,L(found_aligning64B)
162 beq cr0,L(preloop_64B)
167 bne cr6,L(found_aligning64B)
169 /* At this point it should be 16 bytes aligned.
170 Prepare for the 64B loop. */
173 /* Check if maxlen became less than 64, therefore disallowing the
174 64B loop. If it happened switch to the 16B loop code. */
175 cmpldi r4,64 /* Check if maxlen < 64. */
176 blt L(smaller) /* If maxlen < 64. */
177 /* Set some constant values. */
182 /* Compute the number of 64 bytes iterations needed. */
183 srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
184 andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
185 mtctr r11 /* Move loop count to counter register. */
187 /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */
190 lvx v1,r5,r6 /* r5 is the pointer to s. */
194 /* Compare the four 16B vectors to obtain the least 16 values.
195 Null bytes should emerge into v7, then check for null bytes. */
199 vcmpequb. v7,v7,v0 /* Check for null bytes. */
200 addi r5,r5,64 /* Add pointer to next iteration. */
201 bne cr6,L(found_64B) /* If found null bytes. */
202 bdnz L(loop_64B) /* Continue the loop if count > 0. */
204 /* Hit loop end without null match. So branch to handle the remainder. */
206 /* Prepare a 16B loop to handle two cases:
207 1. If 32 < maxlen < 64.
208 2. If maxlen >= 64, and reached end of the 64B loop with null
209 bytes not found. Thus handle the remainder bytes here. */
212 cmpldi r4,0 /* Check maxlen is zero. */
213 beq L(done) /* If maxlen is zero. */
215 /* Place rounded up number of qw's to check into a vmx
216 register, and use some vector tricks to minimize branching. */
218 MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
224 #ifdef __LITTLE_ENDIAN__
225 vspltish v5,1 /* Compute 16 in each byte. */
228 /* Loop in 16B aligned increments now. */
231 lvx v1,r5,r6 /* Load quadword into vector register. */
232 addi r5,r5,16 /* Increment address to next 16B block. */
233 vor v7,v2,v2 /* Save loop count (v2) into v7. */
234 vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
236 vcmpequb. v4,v4,v0 /* Checking for null bytes. */
237 beq cr6,L(loop_16B) /* If null bytes not found. */
241 #ifdef __LITTLE_ENDIAN__
242 vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
244 VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
246 VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
248 /* Truncate to maximum allowable offset. */
249 vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond maxlen. */
251 vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
254 addi r5,r5,-16 /* Undo speculative bump. */
255 extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
256 add r5,r5,r0 /* Add the offset of whatever was found. */
258 subf r3,r3,r5 /* Length is equal to the offset of null byte
259 matched minus the pointer to s. */
262 /* Handle case of maxlen > 64 and found null bytes in last block of 64 bytes read. */
266 /* A zero was found. Reduce the result. */
272 /* Permute the first bit of each byte into bits 48-63. */
278 /* Shift each component into its correct position for merging. */
279 #ifdef __LITTLE_ENDIAN__
289 /* Merge the results and move to a GPR. */
294 /* Adjust address to the start of the current 64B block. */
298 #ifdef __LITTLE_ENDIAN__
299 addi r9,r10,-1 /* Form a mask from trailing zeros. */
301 popcntd r0,r9 /* Count the bits in the mask. */
303 cntlzd r0,r10 /* Count leading zeros before the match. */
306 add r3,r5,r0 /* Compute final length. */
309 /* Handle case where null bytes were found while aligning
310 as a preparation for the 64B loop. */
312 L(found_aligning64B):
314 #ifdef __LITTLE_ENDIAN__
316 addi r9,r10,-1 /* Form a mask from trailing zeros. */
318 popcntd r0,r9 /* Count the bits in the mask. */
322 cntlzd r0,r10 /* Count leading zeros before the match. */
324 addi r5,r5,-16 /* Adjust address to offset of last 16 bytes read. */
326 /* Length = (offset of the last 16 bytes read minus the pointer to s)
327 plus the bytes before the match. */
332 /* Handle case of maxlen > 32 and found null bytes within the first 16 bytes of s. */
336 bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
339 or r5,r5,r6 /* r5 should hold a 16B mask of a potential 0. */
341 cntlzd r5,r5 /* Count leading zeros. */
342 addi r3,r5,-48 /* Deduct the 48 leading zeros always present. */
346 /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
349 clrrdi r8,r3,3 /* Align the pointer to 8B. */
351 /* Register's content at this point:
352 r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
353 r7 == last acceptable address. */
354 cmpldi r4,0 /* Check if maxlen is zero. */
355 beq L(end_max) /* If maxlen is zero. */
357 /* Calculate the last acceptable address and check for possible
358 addition overflow by using saturated math. */
368 clrrdi r7,r7,3 /* Align to 8B address of last
369 acceptable address. */
371 rlwinm r6,r3,3,26,28 /* Calculate padding. */
372 ld r12,0(r8) /* Load aligned doubleword. */
373 cmpb r10,r12,r0 /* Check for null bytes. */
374 #ifdef __LITTLE_ENDIAN__
380 #endif /* __LITTLE_ENDIAN__ */
382 bne cr7,L(done_small) /* If found null byte. */
384 cmpld r8,r7 /* Check if reached maxlen. */
385 beq L(end_max) /* If reached maxlen. */
387 /* Still handling case of maxlen <= 32. Read doubleword aligned until
388 find null bytes or reach maxlen. */
391 ldu r12,8(r8) /* Load next doubleword and update r8. */
392 cmpb r10,r12,r0 /* Check for null bytes. */
394 bne cr6,L(done_small) /* If found null bytes. */
395 cmpld r8,r7 /* Check if reached maxlen. */
396 bne L(loop_small) /* If it has more bytes to read. */
397 mr r3,r4 /* Reached maxlen with null bytes not found.
398 Length is equal to maxlen. */
401 /* Still handling case of maxlen <= 32. Found null bytes.
402 Registers: r10 == match bits within doubleword, r8 == address of
403 last doubleword read, r3 == pointer to s, r4 == maxlen. */
406 #ifdef __LITTLE_ENDIAN__
407 /* Count trailing zeros. */
412 cntlzd r0,r10 /* Count leading zeros before the match. */
414 sub r3,r8,r3 /* Calculate total of bytes before the match. */
415 srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
416 add r3,r3,r0 /* Length until the match. */
417 cmpld r3,r4 /* Check length is greater than maxlen. */
419 mr r3,r4 /* If length is greater than maxlen, return maxlen. */
423 /* Handle case of reached maxlen with null bytes not found. */
426 mr r3,r4 /* Length is equal to maxlen. */
/* Export the internal __strnlen symbol and provide the public strnlen
   weak alias, hiding both from PLT-indirection within libc itself. */
431 libc_hidden_def (__strnlen)
432 weak_alias (__strnlen, strnlen)
433 libc_hidden_def (strnlen)