/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */
/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration (an illustrative C outline of this
   dispatch follows this comment block).
   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals, since the former would unnecessarily break across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   bumping the small copies up to 32 bytes allows us to do that without cost
   and also allows us to reduce the size of the prep code before loop64.

   The copy loop uses only one register, q0.  This is to ensure that all loads
   hit a single hardware prefetcher, which can then be correctly trained to
   prefetch a single stream.

   The non-temporal stores help optimize cache utilization.  */
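/* For orientation, the size dispatch described above corresponds roughly to
   the following C outline.  This is an illustrative sketch only; the function
   name is hypothetical and the code is not part of the implementation below:

     #include <stddef.h>

     void *memcpy_outline (void *dst, const void *src, size_t n)
     {
       if (n <= 32)
         ;   // small: copy from both ends, accesses may overlap
       else if (n <= 128)
         ;   // medium: fully unrolled 16-byte loads and stores
       else
         ;   // large: align src, copy 64 bytes per iteration, tail from the end
       return dst;
     }  */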
ENTRY_ALIGN (__memcpy_falkor, 6)

        add     srcend, src, count
        add     dstend, dstin, count

        /* Medium copies: 33..128 bytes.  */
        ldr     C_q, [srcend, -32]
        ldr     D_q, [srcend, -16]
        ldr     G_q, [srcend, -64]
        ldr     H_q, [srcend, -48]
        str     G_q, [dstend, -64]
        str     H_q, [dstend, -48]
        str     C_q, [dstend, -32]
        str     D_q, [dstend, -16]

        /* Small copies: 0..32 bytes.  */
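        /* Each case below handles one size band (16..32, 8..15, 4..7, 2..3
           and 0..1 bytes) by loading from both ends of the source and storing
           to both ends of the destination; the two accesses may overlap in
           the middle, which is harmless because every byte is still written
           with its correct value.  A C sketch of the idea for the 16..32 byte
           band (illustrative only; the helper name is hypothetical):

             #include <string.h>

             static void copy_16_32 (char *dst, const char *src, size_t n)
             {
               char head[16], tail[16];
               memcpy (head, src, 16);            // first 16 bytes
               memcpy (tail, src + n - 16, 16);   // last 16 bytes, may overlap
               memcpy (dst, head, 16);
               memcpy (dst + n - 16, tail, 16);
             }  */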
        ldr     B_q, [srcend, -16]
        str     B_q, [dstend, -16]
        ldr     B_x, [srcend, -8]
        str     B_x, [dstend, -8]
        ldr     B_w, [srcend, -4]
        str     B_w, [dstend, -4]
        ldrh    B_w, [srcend, -2]
        strh    B_w, [dstend, -2]
        /* Align SRC to 16 bytes and copy; that way at least one of the
           accesses is 16-byte aligned throughout the copy sequence.

           The count is off by 0 to 15 bytes, but this is OK because we trim
           off the last 64 bytes and copy them from the end.  Due to this the
           loop never runs out of bounds.  */
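        /* Worked example (illustrative numbers, assuming tmp1 holds the 0-15
           byte source offset mentioned above): with count = 300 and
           src % 16 = 5, the adjustments below leave
           count = 300 - (64 + 16) + 5 = 225 for the 64-byte loop; the
           straggling tail is then covered by the unconditional 64-byte copy
           from srcend/dstend after the loop.  */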
        nop                     /* Align loop64 below.  */
        sub     count, count, 64 + 16
        add     count, count, tmp1
        subs    count, count, 64

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
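        /* These stores may rewrite bytes that the loop above has already
           written; since the same values are stored again, the overlap is
           harmless.  */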
        ldr     E_q, [srcend, -64]
        str     E_q, [dstend, -64]
        ldr     D_q, [srcend, -48]
        str     D_q, [dstend, -48]
        ldr     C_q, [srcend, -32]
        str     C_q, [dstend, -32]
        ldr     B_q, [srcend, -16]
        str     B_q, [dstend, -16]

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)
/* The move has 4 distinct parts:
   * Small moves of 32 bytes and under.
   * Medium sized moves of 33-128 bytes (fully unrolled).
   * Large moves where the source address is higher than the destination
     (copy forward).
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers, q6 and q22, for the moves and move 32 bytes at
   a time to correctly train the hardware prefetcher for better throughput.

   For small and medium cases memcpy is used; the direction choice for large
   moves is sketched after this comment.  */
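/* The direction choice for large moves can be summarised by the following C
   sketch (illustrative only; the helper name is hypothetical).  The small and
   medium memcpy paths remain safe for overlapping buffers because every load
   is issued before any store that could clobber it:

     #include <stdint.h>
     #include <stddef.h>

     // Forward copying is safe unless dst starts inside [src, src + n).
     static int forward_copy_is_safe (const void *dst, const void *src, size_t n)
     {
       return (uintptr_t) dst - (uintptr_t) src >= n;
     }  */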
ENTRY_ALIGN (__memmove_falkor, 6)

        add     srcend, src, count
        add     dstend, dstin, count
        ccmp    tmp1, count, 2, hi

        /* CASE: Copy Forwards

           Align src to 16 bytes so that loads and stores do not both cross
           cache-line boundaries.  There are at least 128 bytes to copy, so
           copy 16 bytes unaligned and then align.  The loop copies 32 bytes
           per iteration and prefetches one iteration ahead.  */
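        /* Q_q and S_q are kept one iteration ahead: each pass of the loop
           stores the 32 bytes loaded on the previous pass while loading the
           next 32; the last 32 bytes held in the registers are drained after
           the loop, together with the tail copied from srcend.  */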
        add     count, count, tmp1      /* Count is now 16 too large.  */
        sub     count, count, 32 + 32 + 16      /* Test and readjust count.  */
        subs    count, count, 32

        /* Copy 32 bytes from the end before writing the data prefetched in
           the last loop iteration.  */
        ldr     B_q, [srcend, -32]
        ldr     C_q, [srcend, -16]
        str     B_q, [dstend, -32]
        str     C_q, [dstend, -16]
        /* CASE: Copy Backwards

           Align srcend to 16 bytes so that loads and stores do not both cross
           cache-line boundaries.  There are at least 128 bytes to copy, so
           copy 16 bytes unaligned and then align.  The loop copies 32 bytes
           per iteration and prefetches one iteration ahead.  */
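        /* This mirrors the forward loop: srcend/dstend walk downwards with
           pre-decrementing [reg, -16]! addressing while Q_q and S_q stay one
           iteration ahead; the first 32 bytes of the buffer are handled
           separately after the loop.  */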
        cbz     tmp1, 3f        /* Return early if src == dstin.  */
        ldr     S_q, [srcend, -16]
        sub     srcend, srcend, tmp1
        ldr     Q_q, [srcend, -16]!
        str     S_q, [dstend, -16]
        sub     count, count, tmp1
        ldr     S_q, [srcend, -16]!
        sub     dstend, dstend, tmp1
        sub     count, count, 32 + 32
        subs    count, count, 32
        str     Q_q, [dstend, -16]
        ldr     Q_q, [srcend, -16]!
        str     S_q, [dstend, -32]!
        ldr     S_q, [srcend, -16]!

        /* Copy 32 bytes from the start before writing the data prefetched in
           the last loop iteration.  */
        str     Q_q, [dstend, -16]
        str     S_q, [dstend, -32]

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)