1 /* Optimized strlen implementation for PowerPC64.
2 Copyright (C) 1997, 1999, 2000, 2002, 2003, 2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
23 /* The algorithm here uses the following techniques:
25 1) Given a word 'x', we can test to see if it contains any 0 bytes
26 by subtracting 0x01010101, and seeing if any of the high bits of each
27 byte changed from 0 to 1. This works because the least significant
28 0 byte must have had no incoming carry (otherwise it's not the least
29 significant), so it is 0x00 - 0x01 == 0xff. For all other
30 byte values, either they have the high bit set initially, or when
31 1 is subtracted you get a value in the range 0x00-0x7f, none of which
32 have their high bit set. The expression here is
33 (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
34 there were no 0x00 bytes in the word.
36 2) Given a word 'x', we can test to see _which_ byte was zero by
37 calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
38 This produces 0x80 in each byte that was zero, and 0x00 in all
39 the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
40 byte, and the '| x' part ensures that bytes with the high bit set
41 produce 0x00. The addition will carry into the high bit of each byte
42 iff that byte had one of its low 7 bits set. We can then just see
43 which was the most significant bit set and divide by 8 to find how
44 many to add to the index.
45 This is from the book 'The PowerPC Compiler Writer's Guide',
46 by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
48 We deal with strings not aligned to a word boundary by taking the
49 first word and ensuring that bytes not part of the string
50 are treated as nonzero. To allow for memory latency, we unroll the
51 loop a few times, being careful to ensure that we do not read ahead
52 across cache line boundaries.
55 1) How long are strings passed to strlen? If they're often really long,
56 we should probably use cache management instructions and/or unroll the
57 loop more. If they're often quite short, it might be better to use
58 fact (2) in the inner loop than have to recalculate it.
59 2) How popular are bytes with the high bit set? If they are very rare,
60 on some processors it might be useful to use the simpler expression
61 ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
62 ALU), but this fails when any character has its high bit set.
65 1) Added a Data Cache Block Touch early to prefetch the first 128
66 byte cache line. Adding dcbt instructions to the loop would not be
67 effective since most strings will be shorter than the cache line.*/
69 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
70 0 and 3 through 12 (so long as we don't call any procedures) without
71 saving them. We can also use registers 14 through 31 if we save them.
72 We can't use r1 (it's the stack pointer), r2 nor r13 because the user
73 program may expect them to hold their usual value if we get sent
74 a signal. Integer parameters are passed in r3 through r10.
75 We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
76 them, the others we must save. */
78 /* int [r3] strlen (char *s [r3]) */
80 ENTRY (BP_SYM (strlen))
84 #define rRTN r3 /* incoming STR arg, outgoing result */
85 #define rSTR r4 /* current string position */
86 #define rPADN r5 /* number of padding bits we prepend to the
87 string to make it start at a word boundary */
88 #define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
89 #define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */
90 #define rWORD1 r8 /* current string doubleword */
91 #define rWORD2 r9 /* next string doubleword */
92 #define rMASK r9 /* mask for first string doubleword */
97 /* Note: The Bounded pointer support in this code is broken. This code
98 was inherited from PPC32 and that support was never completed.
99 Current PPC gcc does not support -fbounds-check or -fbounded-pointers.
100 These artifacts are left in the code as a reminder in case we need
101 bounded pointer support in the future. */
102 CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2)
107 rlwinm rPADN, rRTN, 3, 26, 28
109 addi r7F7F, r7F7F, 0x7f7f
111 insrdi r7F7F, r7F7F, 32, 0
112 /* That's the setup done, now do the first pair of doublewords.
113 We make an exception and use method (2) on the first two doublewords,
114 to reduce overhead. */
115 srd rMASK, rMASK, rPADN
116 and rTMP1, r7F7F, rWORD1
117 or rTMP2, r7F7F, rWORD1
119 add rTMP1, rTMP1, r7F7F
120 addi rFEFE, rFEFE, -0x101
121 nor rTMP1, rTMP2, rTMP1
122 and. rWORD1, rTMP1, rMASK
125 sldi rTMP1, rFEFE, 32
126 add rFEFE, rFEFE, rTMP1
127 /* Are we now aligned to a doubleword boundary? */
130 /* Handle second doubleword of pair. */
132 and rTMP1, r7F7F, rWORD1
133 or rTMP2, r7F7F, rWORD1
134 add rTMP1, rTMP1, r7F7F
135 nor. rWORD1, rTMP2, rTMP1
143 add rTMP1, rFEFE, rWORD1
144 nor rTMP2, r7F7F, rWORD1
145 and. rTMP1, rTMP1, rTMP2
146 add rTMP3, rFEFE, rWORD2
147 nor rTMP4, r7F7F, rWORD2
149 and. rTMP1, rTMP3, rTMP4
152 and rTMP1, r7F7F, rWORD2
153 add rTMP1, rTMP1, r7F7F
154 andc rWORD1, rTMP4, rTMP1
158 and rTMP1, r7F7F, rWORD1
160 add rTMP1, rTMP1, r7F7F
161 andc rWORD1, rTMP2, rTMP1
163 /* When we get to here, rSTR points to the first doubleword in the string that
164 contains a zero byte, and the most significant set bit in rWORD1 is in that
168 subf rTMP1, rRTN, rSTR
170 add rRTN, rTMP1, rTMP3
171 /* GKM FIXME: check high bound. */
173 END (BP_SYM (strlen))
174 libc_hidden_builtin_def (strlen)