/* Copyright (C) 2012-2020 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
/* Copies are split into 3 main cases: small copies of up to 32 bytes,
   medium copies of 33..128 bytes which are fully unrolled, and large
   copies of more than 128 bytes which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
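
/* Register naming used below: dstin, src and count are the x0, x1 and x2
   arguments; dst, srcend and dstend are the working destination pointer and
   the one-past-the-end source/destination pointers; tmp1 is a scratch
   register; A_l/A_h through H_l/H_h hold the data being copied, with
   A_lw, A_hw and B_lw their 32-bit views.  */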
ENTRY_ALIGN (MEMMOVE, 6)
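
	/* Dispatch: only a large copy (count > 128) whose destination
	   overlaps the source ahead of it needs to copy backwards; all
	   other cases, including every backward overlap, can safely fall
	   through into memcpy because small and medium copies read all of
	   the source before writing (see the comment above).  */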
	ccmp	tmp1, count, 2, hi
	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)

ENTRY (MEMCPY)
	add	srcend, src, count
	add	dstend, dstin, count

	/* Medium copies: 33..128 bytes.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
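
	/* As with all small and medium copies, every load above is issued
	   before the first store, so this sequence remains correct for
	   overlapping memmove calls (see the comment at the top).  */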
	/* Small copies: 0..32 bytes.  */
	ldp	B_l, B_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstend, -16]

	ldr	A_h, [srcend, -8]
	str	A_h, [dstend, -8]

	ldr	A_hw, [srcend, -4]
	str	A_hw, [dstend, -4]
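
	/* Each small size class (16..32, 8..15 and 4..7 bytes) loads one
	   item from the start and one from the end of the source and stores
	   them at the start and end of the destination; the two accesses
	   overlap as needed, so no byte-by-byte loop is required.  */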
	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
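
	/* Worked example, assuming tmp1 holds count/2 at this point: for
	   count==3 bytes 0, 1 and 2 are stored; for count==2 byte 0 is
	   stored and byte 1 is stored twice (tmp1==1); for count==1 the
	   same byte is written three times (tmp1==0, dstend-1==dstin).  */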
	/* Copy 65..128 bytes.  Copy 64 bytes from the start and
	   64 bytes from the end.  */
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
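
	/* A..D still hold the first and last 32 bytes loaded by the medium
	   copy path above; E..H add the bytes at offsets 32..63 from each
	   end, so the two 64-byte halves overlap as needed to cover any
	   count in 65..128.  */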
	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
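
	/* Main forward loop: each iteration stores the four pairs loaded on
	   the previous iteration and loads the next four, with the D pair's
	   pre-index writeback advancing dst and src by 64 bytes.  */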
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
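
	/* Backward-copy path: used by memmove for large copies where the
	   destination overlaps the source ahead of it, so the data is
	   copied from the end of the buffer towards the start.  */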
	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
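
	/* Main backward loop: the mirror image of the forward loop above,
	   storing the pairs loaded on the previous iteration and loading the
	   next 64 bytes below, with the pre-index writeback moving dstend
	   and srcend down by 64 bytes per iteration.  */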
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)