1 /* strcpy/stpcpy - copy a string returning pointer to start/end.
2 Copyright (C) 2013-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.
21 To test the page crossing code path more thoroughly, compile with
22 -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
23 the slower entry path. This option is not intended for production use. */
29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
32 /* Arguments and results. */
36 /* Locals and temporaries. */
/* Entry-point name: built as __stpcpy (returns a pointer to the copied
   string's terminating NUL) when BUILD_STPCPY is defined.
   NOTE(review): the enclosing #ifdef BUILD_STPCPY / #else (strcpy arm)
   / #endif is not visible in this extract — confirm against the full
   file before editing.  */
57 #define STRCPY __stpcpy
62 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
63 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
64 can be done in parallel across the entire word. */
/* Byte-replicated 64-bit constants used by that idiom:
   REP8_01 is subtracted (X - 1 per byte), REP8_7f is OR-ed in so that
   ~(X | 0x7f) isolates each byte's top bit.  REP8_80 is defined here
   but not referenced by any instruction visible in this extract.  */
66 #define REP8_01 0x0101010101010101
67 #define REP8_7f 0x7f7f7f7f7f7f7f7f
68 #define REP8_80 0x8080808080808080
70 /* AArch64 systems have a minimum page size of 4k. We can do a quick
71 page size check for crossing this boundary on entry and if we
72 do not, then we can short-circuit much of the entry code. We
73 expect early page-crossing strings to be rare (probability of
74 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
75 predictable, even with random strings.
77 We don't bother checking for larger page sizes, the cost of setting
78 up the correct page size is just not worth the extra gain from
79 a small reduction in the cases taking the slow path. Note that
80 we only care about whether the first fetch, which may be
81 misaligned, crosses a page boundary - after that we move to aligned
82 fetches for the remainder of the string. */
84 #ifdef STRCPY_TEST_PAGE_CROSS
85 /* Make everything that isn't Qword aligned look like a page cross. */
/* NOTE(review): the test-mode definition of MIN_PAGE_P2 (and the #else
   / #endif pairing with the production value below) is missing from
   this extract — the original numbering jumps from 85 to 88.  As
   written here, MIN_PAGE_P2 = 12 gives the production 4 KiB minimum
   page size; restore the conditional structure from the full file.  */
88 #define MIN_PAGE_P2 12
91 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
/* Function entry.  NOTE(review): the #define list naming the argument
   and temporary registers (srcin, dstin, data1/2, tmp1-4, zeroones,
   to_align, pos, has_nul1/2, ...) referenced below is missing from this
   extract (original lines ~33-55); register assignments cannot be
   verified here.  */
93 ENTRY_ALIGN (STRCPY, 6)
96 /* For moderately short strings, the fastest way to do the copy is to
97 calculate the length of the string in the same way as strlen, then
98 essentially do a memcpy of the result. This avoids the need for
99 multiple byte copies and further means that by the time we
100 reach the bulk copy loop we know we can always use DWord
101 accesses. We expect strcpy to rarely be called repeatedly
102 with the same source string, so branch prediction is likely to
103 always be difficult - we mitigate against this by preferring
104 conditional select operations over branches whenever this is
/* NOTE(review): the comment above is unterminated — its closing line
   (original line 105, ending with the comment terminator) is missing
   from this extract.  */
106 and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
107 mov zeroones, #REP8_01
108 and to_align, srcin, #15
/* tmp2 = srcin's offset within its (minimum-size) page; if it exceeds
   MIN_PAGE_SIZE - 16 the first 16-byte fetch could cross a page.  */
109 cmp tmp2, #(MIN_PAGE_SIZE - 16)
/* NOTE(review): the conditional branch consuming this compare (to the
   page-cross slow path, original line 110) is missing here.  */
111 /* The first fetch will straddle a (possible) page boundary iff
112 srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
113 aligned string will never fail the page align check, so will
114 always take the fast path. */
/* Fast path: load the first 16 bytes unaligned in one pair load.  */
118 ldp data1, data2, [srcin]
120 /* Because we expect the end to be found within 16 characters
121 (profiling shows this is the most common case), it's worth
122 swapping the bytes now to save having to recalculate the
123 termination syndrome later. We preserve data1 and data2
124 so that we can re-use the values later on. */
/* NOTE(review): two alternative NUL-syndrome sequences follow
   (lines 126-132 operating on tmp2/tmp4, lines 134-141 operating on
   data1/data2 directly).  These are almost certainly the big-endian
   and little-endian arms of an #ifdef __AARCH64EB__ whose directives
   — and the rev instructions feeding tmp2/tmp4 — are missing from
   this extract.  Confirm against the full file.  */
126 sub tmp1, tmp2, zeroones
127 orr tmp2, tmp2, #REP8_7f
128 bics has_nul1, tmp1, tmp2
131 sub tmp3, tmp4, zeroones
132 orr tmp4, tmp4, #REP8_7f
134 sub tmp1, data1, zeroones
135 orr tmp2, data1, #REP8_7f
/* bics sets flags so a (missing) conditional branch can test whether
   the first 8 bytes contain the terminating NUL.  */
136 bics has_nul1, tmp1, tmp2
138 sub tmp3, data2, zeroones
139 orr tmp4, data2, #REP8_7f
141 bics has_nul2, tmp3, tmp4
144 /* The string is short (<=16 bytes). We don't know exactly how
145 short though, yet. Work out the exact length so that we can
146 quickly select the optimal copy strategy. */
/* Byte-reverse the syndrome so a (missing) clz/rbit sequence can turn
   it into "pos", the NUL position in bits.  NOTE(review): several
   instructions and the copy-strategy branch labels between the lines
   below are absent from this extract (numbering gaps 148->151,
   156->166, etc.); the shifts/stores below are fragments of the
   per-length copy strategies.  */
148 rev has_nul2, has_nul2
151 add dst, dstin, pos, lsr #3 /* Bits to bytes. */
154 lsr data2, data2, pos
156 lsl data2, data2, pos
166 rev has_nul1, has_nul1
168 add dst, dstin, pos, lsr #3 /* Bits to bytes. */
169 subs tmp2, pos, #24 /* Pos in bits. */
174 lsr data2, data1, pos
175 lsr data1, data1, #32
177 lsr data2, data1, tmp2
179 /* 4->7 bytes to copy. */
/* Overlapping word store: last 4 bytes placed so the string tail
   (including NUL) lands correctly.  */
180 str data2w, [dst, #-3]
188 /* 2->3 bytes to copy. */
190 lsr data1, data1, #48
193 /* Fall-through, one byte (max) to go. */
195 /* Null-terminated string. Last character must be zero! */
203 /* Aligning here ensures that the entry code and main loop all lies
204 within one 64-byte cache line. */
/* Bulk-copy setup: store the first 16 (unaligned) bytes, then bias
   src/dst so that subsequent fetches are 16-byte aligned.
   to_align was srcin & 15; after the sub it is (srcin & 15) - 16, so
   src/dst are advanced past the already-copied prefix.  */
206 sub to_align, to_align, #16
207 stp data1, data2, [dstin]
208 sub src, srcin, to_align
209 sub dst, dstin, to_align
210 b L(entry_no_page_cross)
212 /* The inner loop deals with two Dwords at a time. This has a
213 slightly higher start-up cost, but we should win quite quickly,
214 especially on cores with a high number of issue slots per
215 cycle, as we get much better parallelism out of the operations. */
/* Main loop: store previous 16 bytes, load next 16, and compute the
   NUL syndrome for both Dwords.  NOTE(review): the loop-head label
   (original line 216) and the loop back-branch after the ccmp
   (original line ~227) are missing from this extract.  */
217 stp data1, data2, [dst], #16
218 L(entry_no_page_cross):
219 ldp data1, data2, [src], #16
220 sub tmp1, data1, zeroones
221 orr tmp2, data1, #REP8_7f
222 sub tmp3, data2, zeroones
223 orr tmp4, data2, #REP8_7f
224 bic has_nul1, tmp1, tmp2
225 bics has_nul2, tmp3, tmp4
/* ccmp folds both syndromes into one flag test: Z set iff neither
   Dword contains a NUL byte.  */
226 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
229 /* Since we know we are copying at least 16 bytes, the fastest way
230 to deal with the tail is to determine the location of the
231 trailing NUL, then (re)copy the 16 bytes leading up to that. */
234 /* For big-endian, carry propagation (if the final byte in the
235 string is 0x01) means we cannot use has_nul directly. The
236 easiest way to get the correct byte is to byte-swap the data
237 and calculate the syndrome a second time. */
/* NOTE(review): the big-endian (#ifdef __AARCH64EB__) and
   little-endian arms are interleaved here with their directives
   missing — lines 238-242 recompute the syndrome from byte-swapped
   data (big-endian), lines 244-254 select the syndrome and locate the
   NUL.  Several instructions (rev of data1/data2, clz/rbit producing
   pos, the tmp1 = pos+64 adjustment) are absent from this extract.  */
/* Select whichever Dword holds the NUL (flags still set by the loop's
   bics/ccmp).  */
238 csel data1, data1, data2, ne
240 sub tmp1, data1, zeroones
241 orr tmp2, data1, #REP8_7f
242 bic has_nul1, tmp1, tmp2
244 csel has_nul1, has_nul1, has_nul2, ne
246 rev has_nul1, has_nul1
250 csel pos, pos, tmp1, ne
/* Advance src/dst to just past the terminating NUL (pos is in bits),
   then recopy the final 16 bytes so the string tail — NUL included —
   is written exactly once more, overlapping the loop's last store.  */
251 add src, src, pos, lsr #3
252 add dst, dst, pos, lsr #3
253 ldp data1, data2, [src, #-32]
254 stp data1, data2, [dst, #-16]
/* NOTE(review): the stpcpy return-value computation and RET (and the
   END marker) that follow in the full file are missing here.  */
/* Page-cross slow path: the first unaligned 16-byte fetch might fault,
   so read from the aligned address below srcin instead and mask off
   the bytes that precede the string.  NOTE(review): the label for this
   path and the instructions computing src = srcin & ~15 and the
   initial tmp1/tmp2 values (original lines ~256-261, 267-271) are
   missing from this extract.  */
262 /* Start by loading two words at [srcin & ~15], then forcing the
263 bytes that precede srcin to 0xff. This means they never look
264 like termination bytes. */
265 ldp data1, data2, [src]
266 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
270 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
272 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
274 orr data1, data1, tmp2
275 orr data2a, data2, tmp2
/* If the whole first Dword precedes srcin, force it to all-ones
   (csinv with xzr); otherwise keep the masked values.  */
277 csinv data1, data1, xzr, lt
278 csel data2, data2, data2a, lt
/* Standard NUL-syndrome computation on the masked pair.  */
279 sub tmp1, data1, zeroones
280 orr tmp2, data1, #REP8_7f
281 sub tmp3, data2, zeroones
282 orr tmp4, data2, #REP8_7f
283 bic has_nul1, tmp1, tmp2
284 bics has_nul2, tmp3, tmp4
285 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
/* No NUL in the first 16 bytes: rejoin the fast path.
   NOTE(review): L(page_cross_ok) is not defined anywhere in this
   extract.  */
286 b.eq L(page_cross_ok)
287 /* We now need to make data1 and data2 look like they've been
288 loaded directly from srcin. Do a rotate on the 128-bit value. */
289 lsl tmp1, to_align, #3 /* Bytes->bits. */
290 neg tmp2, to_align, lsl #3
/* NOTE(review): two rotate variants follow (lsl/lsr vs lsr/lsl on
   data1/data2) — these are the big- and little-endian arms of a
   missing #ifdef __AARCH64EB__; their #else/#endif and several
   interleaved syndrome instructions are absent (numbering gaps
   295->297, 297->300, 303->305, 308->310).  */
292 lsl data1a, data1, tmp1
293 lsr tmp4, data2, tmp2
294 lsl data2, data2, tmp1
295 orr tmp4, tmp4, data1a
297 csel data1, tmp4, data2, lt
300 sub tmp1, tmp2, zeroones
301 orr tmp2, tmp2, #REP8_7f
302 sub tmp3, tmp4, zeroones
303 orr tmp4, tmp4, #REP8_7f
305 lsr data1a, data1, tmp1
306 lsl tmp4, data2, tmp2
307 lsr data2, data2, tmp1
308 orr tmp4, tmp4, data1a
310 csel data1, tmp4, data2, lt
311 sub tmp1, data1, zeroones
312 orr tmp2, data1, #REP8_7f
313 sub tmp3, data2, zeroones
314 orr tmp4, data2, #REP8_7f
316 bic has_nul1, tmp1, tmp2
/* NUL in first Dword: dispatch to the <=8-byte copy.
   NOTE(review): L(fp_le8) and the remaining dispatch (has_nul2 test)
   are not visible in this extract.  */
317 cbnz has_nul1, L(fp_le8)
318 bic has_nul2, tmp3, tmp4
/* Symbol plumbing: export stpcpy as a weak alias of __stpcpy and emit
   the glibc-internal hidden definitions.
   NOTE(review): the stpcpy aliases (lines 323-325) and the strcpy
   hidden definition (line 327) belong to the two arms of the
   BUILD_STPCPY conditional; the #ifdef/#else/#endif directives are
   missing from this extract — as written, emitting both sets
   unconditionally would be wrong.  */
323 weak_alias (__stpcpy, stpcpy)
324 libc_hidden_def (__stpcpy)
325 libc_hidden_builtin_def (stpcpy)
327 libc_hidden_builtin_def (strcpy)