/* Copyright (C) 2012-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
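/* As a rough C model of the strategy above (a sketch only, assuming
   memcpy semantics, i.e. non-overlapping buffers; memcpy_model and
   copy_chunk are hypothetical names, and the small/medium paths are
   simplified rather than instruction-accurate):

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   // Thin wrapper for one unaligned chunk copy.
   static inline void
   copy_chunk (unsigned char *d, const unsigned char *s, size_t n)
   {
     memcpy (d, s, n);
   }

   void *
   memcpy_model (void *dstin, const void *src, size_t count)
   {
     unsigned char *d = dstin;
     unsigned char *dend = d + count;
     const unsigned char *s = src;
     const unsigned char *send = s + count;

     if (count <= 32)                       // small copies
       {
         if (count >= 16)                   // 16..32: head + tail chunks
           {                                // that may overlap in the middle
             copy_chunk (d, s, 16);
             copy_chunk (dend - 16, send - 16, 16);
           }
         else                               // 0..15: byte loop stands in for
           while (count--)                  // the branchless word/byte paths
             *d++ = *s++;
       }
     else if (count <= 128)                 // medium: overlapping head + tail
       {
         size_t half = count <= 64 ? 32 : 64;
         copy_chunk (d, s, half);
         copy_chunk (dend - half, send - half, half);
       }
     else                                   // large copies
       {
         copy_chunk (d, s, 16);             // unaligned first 16 bytes
         size_t adv = 16 - ((uintptr_t) d & 15);
         d += adv;                          // d is now 16-byte aligned
         s += adv;
         while (dend - d > 64)              // aligned 64-byte blocks
           {
             copy_chunk (d, s, 64);
             d += 64;
             s += 64;
           }
         copy_chunk (dend - 64, send - 64, 64);  // always the last 64 bytes
       }
     return dstin;
   }
*/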
ENTRY_ALIGN (MEMCPY, 6)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)
	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret
	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
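	/* The branchless sequence above is equivalent to the following C
	   for 1 <= count <= 3 (tmp1 = count / 2):

	       dstin[0]         = src[0];
	       dstin[count / 2] = src[count / 2];
	       dstin[count - 1] = src[count - 1];

	   For count == 1 all three stores hit the same byte, and for
	   count == 2 the middle store aliases the last one, so every
	   length is covered without branching on the exact size.  */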
	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret
	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret
	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
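	/* The loop below is software pipelined: the four LDPs above have
	   already fetched the first 64 bytes, and each iteration stores
	   the 64 bytes loaded on the previous iteration while loading the
	   next 64, so loads run one iteration ahead of stores.  */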
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)
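	/* On loop exit, A_l..D_h still hold 64 bytes that have not been
	   stored, and up to 64 further bytes remain in memory.  Rather
	   than running a remainder loop, the code below stores the
	   in-flight registers and then unconditionally copies the final
	   64 bytes relative to srcend/dstend.  These stores may overlap
	   bytes already written, which is harmless because a memcpy
	   source and destination do not overlap.  */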
	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
ENTRY_ALIGN (MEMMOVE, 4)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)
	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret
	.p2align 4
L(move_long):
	/* Only use backward copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.hs	L(copy_long)
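	/* The overlap check is a single unsigned comparison: dstin - src
	   wraps around when dstin < src, so the B.HS branch is taken
	   whenever the destination does not start within the first count
	   bytes of the source and the forward memcpy path can be reused.
	   Only a destination starting inside the source region needs the
	   backward copy below.  */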
	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)
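	/* This is the mirror image of L(copy_long): dstend is aligned down
	   to 16 bytes, the loop walks backwards in 64-byte steps, and the
	   tail is handled by always copying the first 64 bytes from the
	   start of the buffer.  */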
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)
	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)