1 /* strcpy/stpcpy - copy a string returning pointer to start/end.
2 Copyright (C) 2013-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.
21 To test the page crossing code path more thoroughly, compile with
22 -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
23 the slower entry path. This option is not intended for production use. */
29 /* Assumptions: ARMv8-a, AArch64, unaligned accesses, min page size 4k. */
32 /* Arguments and results. */
36 /* Locals and temporaries. */
/* Symbol name used for the entry point declared with ENTRY_ALIGN below.
   NOTE(review): only the __stpcpy definition is visible here; the file
   header says BUILD_STPCPY selects the stpcpy build, so a strcpy
   alternative is presumably defined under a conditional that is not
   shown - confirm. */
57 #define STRCPY __stpcpy
62 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
63 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
64 can be done in parallel across the entire word. */
/* Byte-replicated 64-bit constants for the test described above.
   REP8_01 is the per-byte 1 that gets subtracted (the code loads it
   into zeroones) and REP8_7f is the per-byte 0x7f that is ORed into X
   before the bit-clear. REP8_80 is not referenced in the visible part
   of this file. */
66 #define REP8_01 0x0101010101010101
67 #define REP8_7f 0x7f7f7f7f7f7f7f7f
68 #define REP8_80 0x8080808080808080
70 /* AArch64 systems have a minimum page size of 4k. We can do a quick
71 page size check for crossing this boundary on entry and if we
72 do not, then we can short-circuit much of the entry code. We
73 expect early page-crossing strings to be rare (probability of
74 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
75 predictable, even with random strings.
77 We don't bother checking for larger page sizes, the cost of setting
78 up the correct page size is just not worth the extra gain from
79 a small reduction in the cases taking the slow path. Note that
80 we only care about whether the first fetch, which may be
81 misaligned, crosses a page boundary - after that we move to aligned
82 fetches for the remainder of the string. */
84 #ifdef STRCPY_TEST_PAGE_CROSS
85 /* Make everything that isn't Qword aligned look like a page cross. */
/* log2 of the assumed minimum page size: 1 << 12 = 4096 bytes.
   NOTE(review): as shown, this definition directly follows the
   test-mode comment; the test-mode value and the rest of the
   conditional (#else / #endif) are not visible here - confirm against
   the full file. */
88 #define MIN_PAGE_P2 12
/* Page size in bytes. The entry code masks srcin with
   MIN_PAGE_SIZE - 1 and compares the result with MIN_PAGE_SIZE - 16. */
91 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
/* Function entry point. STRCPY expands to the symbol name chosen by
   the #define earlier in this file; ENTRY_ALIGN is a macro defined
   outside this file. NOTE(review): the second argument is presumably
   a log2 alignment (2^6 = 64 bytes, which would fit the 64-byte
   cache-line remark further down) - confirm against the macro
   definition. The code below reads the source through srcin and
   writes the destination through dstin; the register assignments
   behind those names are not visible in this part of the file. */
93 ENTRY_ALIGN (STRCPY, 6)
94 /* For moderately short strings, the fastest way to do the copy is to
95 calculate the length of the string in the same way as strlen, then
96 essentially do a memcpy of the result. This avoids the need for
97 multiple byte copies and further means that by the time we
98 reach the bulk copy loop we know we can always use DWord
99 accesses. We expect strcpy to rarely be called repeatedly
100 with the same source string, so branch prediction is likely to
101 always be difficult - we mitigate against this by preferring
102 conditional select operations over branches whenever this is practical. */
104 and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
105 mov zeroones, #REP8_01
106 and to_align, srcin, #15
107 cmp tmp2, #(MIN_PAGE_SIZE - 16)
109 /* The first fetch will straddle a (possible) page boundary iff
110 srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
111 aligned string will never fail the page align check, so will
112 always take the fast path. */
/* Fetch the first 16 bytes straight from srcin as a register pair.
   This is the (possibly misaligned) first fetch that the page-boundary
   check at entry is concerned with. */
116 ldp data1, data2, [srcin]
118 /* Because we expect the end to be found within 16 characters
119 (profiling shows this is the most common case), it's worth
120 swapping the bytes now to save having to recalculate the
121 termination syndrome later. We preserve data1 and data2
122 so that we can re-use the values later on. */
/* Syndrome of the first word: has_nul1 = (tmp2 - REP8_01) & ~(tmp2 | REP8_7f).
   NOTE(review): the instruction that produces tmp2 is not visible
   here; going by the comment above it is presumably the byte-swapped
   copy of data1 - confirm. bics also sets the flags, so Z is clear
   when a zero byte was detected. */
124 sub tmp1, tmp2, zeroones
125 orr tmp2, tmp2, #REP8_7f
126 bics has_nul1, tmp1, tmp2
/* Same preparation for the second word, working from tmp4 (whose
   origin is likewise not visible here). */
129 sub tmp3, tmp4, zeroones
130 orr tmp4, tmp4, #REP8_7f
/* The same syndrome computation taken directly from data1 and data2.
   NOTE(review): this repeats the sequence above with unswapped inputs,
   so the two are presumably alternatives selected by a conditional
   that is not shown - confirm. */
132 sub tmp1, data1, zeroones
133 orr tmp2, data1, #REP8_7f
134 bics has_nul1, tmp1, tmp2
136 sub tmp3, data2, zeroones
137 orr tmp4, data2, #REP8_7f
/* Syndrome of the second word; the flags now reflect whether it
   contains a NUL. The branch that consumes them is not visible here. */
139 bics has_nul2, tmp3, tmp4
142 /* The string is short (<=16 bytes). We don't know exactly how
143 short though, yet. Work out the exact length so that we can
144 quickly select the optimal copy strategy. */
/* This path works from has_nul2, the syndrome of the second word.
   NOTE(review): the instruction that derives pos from the reversed
   syndrome is not visible here (presumably a leading-zero count) -
   confirm. */
146 rev has_nul2, has_nul2
149 add dst, dstin, pos, lsr #3 /* Bits to bytes. */
/* NOTE(review): a right shift and a left shift of data2 by pos appear
   back to back; they are presumably alternatives under a conditional
   that is not shown, and the stores that follow are also not visible -
   confirm. */
152 lsr data2, data2, pos
154 lsl data2, data2, pos
/* Same idea starting from has_nul1, the syndrome of the first word
   (this is where the code branches when has_nul1 is non-zero). */
164 rev has_nul1, has_nul1
166 add dst, dstin, pos, lsr #3 /* Bits to bytes. */
/* tmp2 = pos - 24, with flags set for a conditional branch that is not
   visible here; the section comments below suggest it separates the
   4->7 byte case from the shorter ones. */
167 subs tmp2, pos, #24 /* Pos in bits. */
/* NOTE(review): the shift-by-pos / shift-by-32 pair and the
   shift-by-tmp2 that follows presumably belong to different
   conditional variants; the selecting directives are not visible -
   confirm. */
172 lsr data2, data1, pos
173 lsr data1, data1, #32
175 lsr data2, data1, tmp2
177 /* 4->7 bytes to copy. */
/* Store data2w at dst - 3. NOTE(review): data2w is presumably the
   32-bit view of data2, and a second store covering the start of the
   string is not visible here - confirm. */
178 str data2w, [dst, #-3]
186 /* 2->3 bytes to copy. */
/* Shift data1 right by 48 bits; the store that consumes the result is
   not visible here. */
188 lsr data1, data1, #48
191 /* Fall-through, one byte (max) to go. */
193 /* Null-terminated string. Last character must be zero! */
201 /* Aligning here ensures that the entry code and main loop all lie
202 within one 64-byte cache line. */
/* Set-up for the bulk copy. to_align was set to srcin & 15 at entry,
   so after the subtraction it lies in [-16, -1]. Subtracting that
   negative value from srcin moves src up to the next 16-byte boundary
   above srcin, and dst is advanced by the same distance so the two
   pointers stay in step. */
204 sub to_align, to_align, #16
/* Write the first 16 bytes to the destination now. The loop restarts
   from the aligned src, so bytes between src and srcin + 16 are simply
   stored a second time. NOTE(review): this assumes data1/data2 still
   hold the unmodified first fetch on this path - confirm. */
205 stp data1, data2, [dstin]
206 sub src, srcin, to_align
207 sub dst, dstin, to_align
/* Enter the loop at its load, skipping the store at the loop top. */
208 b L(entry_no_page_cross)
210 /* The inner loop deals with two Dwords at a time. This has a
211 slightly higher start-up cost, but we should win quite quickly,
212 especially on cores with a high number of issue slots per
213 cycle, as we get much better parallelism out of the operations. */
/* Store the pair loaded on the previous pass and post-increment dst by
   16. NOTE(review): the loop label and any alignment directive that
   precede this instruction are not visible here. */
215 stp data1, data2, [dst], #16
216 L(entry_no_page_cross):
/* First-pass entry point: nothing has been loaded from src yet, so the
   store above is skipped. Load the next pair and post-increment src
   by 16. */
217 ldp data1, data2, [src], #16
/* NUL syndromes for both words: (X - REP8_01) & ~(X | REP8_7f). */
218 sub tmp1, data1, zeroones
219 orr tmp2, data1, #REP8_7f
220 sub tmp3, data2, zeroones
221 orr tmp4, data2, #REP8_7f
222 bic has_nul1, tmp1, tmp2
223 bics has_nul2, tmp3, tmp4
/* If has_nul2 is zero (eq from the bics) compare has_nul1 with 0,
   otherwise force NZCV to 0000. Z therefore ends up set only when
   neither word contains a NUL. NOTE(review): the conditional branch
   back to the top of the loop is not visible here. */
224 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
227 /* Since we know we are copying at least 16 bytes, the fastest way
228 to deal with the tail is to determine the location of the
229 trailing NUL, then (re)copy the 16 bytes leading up to that. */
232 /* For big-endian, carry propagation (if the final byte in the
233 string is 0x01) means we cannot use has_nul directly. The
234 easiest way to get the correct byte is to byte-swap the data
235 and calculate the syndrome a second time. */
/* Select the word to examine: data1 when the condition is ne, data2
   otherwise. NOTE(review): the instruction that sets the flags tested
   by the csel instructions in this section is not visible here
   (presumably a compare of has_nul1 with zero), and neither is the
   byte-swap mentioned in the comment above - confirm. */
236 csel data1, data1, data2, ne
/* Recompute the syndrome from the selected word. */
238 sub tmp1, data1, zeroones
239 orr tmp2, data1, #REP8_7f
240 bic has_nul1, tmp1, tmp2
/* Select between the two syndromes already computed by the loop.
   NOTE(review): this presumably belongs to the alternative variant of
   the recomputation above; the conditional directives are not shown -
   confirm. */
242 csel has_nul1, has_nul1, has_nul2, ne
244 rev has_nul1, has_nul1
/* pos and tmp1 are candidate bit offsets computed by instructions that
   are not visible here; keep pos on ne, otherwise take tmp1. */
248 csel pos, pos, tmp1, ne
/* Advance both pointers by pos / 8 bytes, then copy the final 16 bytes
   with a single load/store pair. The source offset is -32 but the
   destination offset is -16 because the loop has already
   post-incremented src for the block just loaded, while dst has not
   yet been advanced for it. */
249 add src, src, pos, lsr #3
250 add dst, dst, pos, lsr #3
251 ldp data1, data2, [src, #-32]
252 stp data1, data2, [dst, #-16]
260 /* Start by loading two words at [srcin & ~15], then forcing the
261 bytes that precede srcin to 0xff. This means they never look
262 like termination bytes. */
/* NOTE(review): the instruction that sets src to srcin & ~15 and the
   one that gives tmp1 its starting value are not visible here -
   confirm. */
263 ldp data1, data2, [src]
264 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
/* Build the mask in tmp2 by shifting it by tmp1 bits. NOTE(review):
   the initial value of tmp2 is not visible (presumably all ones), and
   the left and right shifts shown back to back are presumably
   alternatives under a conditional that is not shown - confirm. */
268 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
270 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
/* Apply the mask to both words; the masked second word is kept
   separately in data2a so the unmasked data2 stays available. */
272 orr data1, data1, tmp2
273 orr data2a, data2, tmp2
/* On lt: keep the masked data1 and the unmasked data2. Otherwise:
   data1 becomes all ones (inverse of xzr) and data2 takes the masked
   value. NOTE(review): the compare that sets these flags is not
   visible here - confirm. */
275 csinv data1, data1, xzr, lt
276 csel data2, data2, data2a, lt
/* NUL syndromes of the masked pair, as in the main loop. */
277 sub tmp1, data1, zeroones
278 orr tmp2, data1, #REP8_7f
279 sub tmp3, data2, zeroones
280 orr tmp4, data2, #REP8_7f
281 bic has_nul1, tmp1, tmp2
282 bics has_nul2, tmp3, tmp4
/* Z is set only when neither word shows a NUL; in that case branch to
   L(page_cross_ok). */
283 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
284 b.eq L(page_cross_ok)
285 /* We now need to make data1 and data2 look like they've been
286 loaded directly from srcin. Do a rotate on the 128-bit value. */
/* tmp1 = to_align * 8 (shift distance in bits); tmp2 = -(to_align * 8),
   which serves as the complementary distance because register shifts
   use only the low bits of the count. */
287 lsl tmp1, to_align, #3 /* Bytes->bits. */
288 neg tmp2, to_align, lsl #3
/* First variant: shift data1 and data2 left by tmp1 and merge in the
   bits of data2 shifted right by tmp2. On lt, data1 takes the merged
   value; otherwise it takes the shifted data2. NOTE(review): the
   compare that sets the flags for the csel is not visible here -
   confirm. */
290 lsl data1a, data1, tmp1
291 lsr tmp4, data2, tmp2
292 lsl data2, data2, tmp1
293 orr tmp4, tmp4, data1a
295 csel data1, tmp4, data2, lt
/* Syndrome preparation from tmp2 and tmp4. NOTE(review): the
   instructions that load tmp2/tmp4 for this computation are not
   visible here (presumably byte-swapped copies of data1/data2) -
   confirm. */
298 sub tmp1, tmp2, zeroones
299 orr tmp2, tmp2, #REP8_7f
300 sub tmp3, tmp4, zeroones
301 orr tmp4, tmp4, #REP8_7f
/* Second variant: the same merge with the shift directions reversed.
   NOTE(review): the two variants appear back to back and are
   presumably alternatives under a conditional that is not shown -
   confirm. */
303 lsr data1a, data1, tmp1
304 lsl tmp4, data2, tmp2
305 lsr data2, data2, tmp1
306 orr tmp4, tmp4, data1a
308 csel data1, tmp4, data2, lt
/* Syndrome preparation directly from the realigned data1 and data2. */
309 sub tmp1, data1, zeroones
310 orr tmp2, data1, #REP8_7f
311 sub tmp3, data2, zeroones
312 orr tmp4, data2, #REP8_7f
/* A NUL in the first word sends control to L(fp_le8). Otherwise the
   second word's syndrome is computed; the branch that follows it is
   not visible here. */
314 bic has_nul1, tmp1, tmp2
315 cbnz has_nul1, L(fp_le8)
316 bic has_nul2, tmp3, tmp4
/* Symbol alias and hidden-definition declarations for the entry point.
   The macros used here are defined outside this file. NOTE(review):
   the stpcpy lines and the strcpy line all appear unconditionally in
   this view; since the file header describes BUILD_STPCPY as selecting
   the stpcpy build, they are presumably alternatives under a
   conditional that is not visible - confirm. */
321 weak_alias (__stpcpy, stpcpy)
322 libc_hidden_def (__stpcpy)
323 libc_hidden_builtin_def (stpcpy)
325 libc_hidden_builtin_def (strcpy)