/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* ARMv8-a, AArch64, unaligned accesses.  */
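/* "Unaligned accesses" means the code below freely issues 16-byte ldp/stp
   to addresses of any alignment, so it assumes the usual AArch64 setup in
   which unaligned accesses to normal memory are permitted and fast.  */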
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large copies
   of more than 96 bytes which align the destination and use an unrolled
   loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
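/* For example, memmove (p + 8, p, 100) is a large overlapping forward move
   and takes the backwards-copying loop, while memmove (p, p + 8, 100) and
   any move of 96 bytes or less fall through into memcpy: small and medium
   copies read everything before writing, and a forward copy is always safe
   when the destination starts below the source.  */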
ENTRY_ALIGN (memmove, 6)
	ccmp	tmp1, count, 2, hi
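	/* The ccmp above performs the tmp1/count comparison only when the
	   earlier compare against 96 found count > 96; otherwise it forces the
	   "lo" condition false.  Assuming tmp1 holds dstin - src (computed
	   earlier), the conditional branch to the backwards path is therefore
	   taken only for a move of more than 96 bytes whose destination starts
	   within count bytes above the source, i.e. a genuinely overlapping
	   forward memmove.  */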
	/* Common case falls through into memcpy.  */
libc_hidden_builtin_def (memmove)
	add	srcend, src, count
	add	dstend, dstin, count
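	/* srcend and dstend point one byte past the end of the source and
	   destination buffers; the code below uses them for end-relative loads
	   and stores when copying the tail.  */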
	/* Medium copies: 17..96 bytes.  */
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
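	/* The medium path always copies the first and last 16 bytes and, for
	   larger counts, an additional 16 bytes from each end.  The blocks may
	   overlap in the middle; this is harmless (and memmove-safe) because
	   every load is issued before any of the stores.  */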
	/* Small copies: 0..16 bytes.  */
	ldr	A_h, [srcend, -8]
	str	A_h, [dstend, -8]
	ldr	A_hw, [srcend, -4]
	str	A_hw, [dstend, -4]
	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
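	/* Worked example of the branchless sequence, assuming tmp1 holds
	   count / 2 (set up earlier): count == 3 copies bytes 0, 1 and 2;
	   count == 2 copies byte 0 and then byte 1 twice (once through tmp1,
	   once as the last byte); count == 1 writes byte 0 three times.  In
	   every case exactly bytes 0..count-1 end up correct.  */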
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
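	/* For counts below 96 the 64-byte block from the start and the 32-byte
	   block from the end overlap in the middle.  This is harmless because
	   all loads above are issued before the first store, so the later
	   end-relative stores leave the correct bytes in place however the two
	   blocks overlap.  */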
	/* Align DST to a 16-byte boundary so that loads and stores do not both
	   cross cache line boundaries.  There are at least 96 bytes to copy, so
	   copy 16 bytes unaligned and then align.  The loop copies 64 bytes per
	   iteration and prefetches one iteration ahead.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
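	/* The extra 16 in the subtraction cancels the overshoot noted above;
	   the remaining 128 accounts for the 64 bytes already loaded into
	   A..D plus the final tail of up to 64 bytes that the code after the
	   loop always copies from srcend.  If the result is not positive, the
	   loop body is skipped and only the tail code runs.  */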
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
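	/* The path below copies backwards, from high addresses to low, and is
	   used for large memmoves whose destination overlaps the source from
	   above.  It is entered directly from the memmove entry point, so it
	   sets up srcend and dstend itself.  */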
	add	srcend, src, count
	add	dstend, dstin, count
	/* Align dstend to a 16-byte boundary so that loads and stores do not
	   both cross cache line boundaries.  There are at least 96 bytes to
	   copy, so copy 16 bytes unaligned and then align.  The loop copies
	   64 bytes per iteration and prefetches one iteration ahead.  */
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
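	/* Mirror image of the forward prologue: the last 16 bytes are copied
	   unaligned from D, then srcend and dstend are pulled back (assuming
	   tmp1 holds the misaligned low bits of dstend) so that dstend becomes
	   16-byte aligned.  The subtraction of 128 makes the loop stop while at
	   most 64 bytes remain, which the tail code below handles by always
	   copying 64 bytes from the very start of the buffers.  */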
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
libc_hidden_builtin_def (memcpy)