/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     src
#define E_h     count
#define F_l     srcend
#define F_h     dst
#define G_l     count
#define G_h     dst
#define tmp1    x14
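/* Note on the aliases above: E_l/E_h, F_l/F_h and G_l/G_h reuse the
   argument registers (src, count, srcend, dst).  They appear only in
   tail sequences, where those values are already dead, so the copy
   needs no callee-saved registers.  */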
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
ENTRY_ALIGN (memmove, 6)

        sub     tmp1, dstin, src
        cmp     count, 96
        ccmp    tmp1, count, 2, hi
        b.lo    L(move_long)

        /* Common case falls through into memcpy.  */
END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)

        prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 16
        b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
        /* Medium copies: 17..96 bytes.  */
        sub     tmp1, count, 1
        ldp     A_l, A_h, [src]
        tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        tbz     tmp1, 5, 1f
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret
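        /* How the bit tests above work: tmp1 = count - 1, so bit 6 is
           set iff count >= 65, which branches to L(copy96).  Otherwise
           count is 17..64 and bit 5 is set iff count >= 33, in which
           case the two middle pairs are also needed.  The pairs stored
           at [dstin] and backwards from [dstend - 16] may overlap,
           covering every length in 17..64.  */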
        .p2align 4
        /* Small copies: 0..16 bytes.  */
L(copy16):
        cmp     count, 8
        b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 4
1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
        ldr     A_hw, [srcend, -4]
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
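        /* 8..16 bytes are covered by two possibly-overlapping 8-byte
           accesses from the start and from the end; 4..7 bytes use the
           same trick with 4-byte accesses (once count < 8, bit 2 of
           count is set iff count >= 4).  Both loads are issued before
           either store, so overlapping src/dst remain safe.  */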
        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
        cbz     count, 2f
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    A_hw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
2:      ret
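        /* Worked example: count == 3 gives tmp1 = 1, so the three byte
           stores land on offsets 0, 1 and 2; count == 2 gives tmp1 = 1
           and writes offsets 0, 1, 1; count == 1 gives tmp1 = 0 and
           writes offset 0 three times.  */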
        .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
L(copy96):
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
        ret
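        /* A_l/A_h were already loaded from [src] at the medium-copy
           entry before the branch here.  E and F alias src/count and
           srcend/dst, which are dead once the final two loads have
           been issued.  The 64-byte head and 32-byte tail overlap for
           any count up to 96, so all lengths in 65..96 are covered.  */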
        /* Align DST to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

        .p2align 4
L(copy_long):
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(last64)
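        /* Alignment example: dstin = 0x1007 gives tmp1 = 7 and
           dst = 0x1000.  The first 16 bytes are stored unaligned at
           dstin; the loop then stores at dst + 16 = 0x1010, which is
           16-byte aligned.  src is biased down by the same tmp1 so
           loop loads and stores stay at matching offsets.  */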
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)
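        /* The loop is software pipelined: each iteration stores the
           four pairs loaded on the previous iteration while loading
           the next four, so the loads run 64 bytes ahead of the
           stores.  This is the "prefetches one iteration ahead"
           mentioned above.  */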
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
L(last64):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret
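        /* The in-flight A..D pairs from the loop are drained to
           [dst, 16..64] while the final 64 source bytes are reloaded
           relative to srcend and stored relative to dstend.  The two
           store ranges may overlap, which is what makes any remainder
           of 1..64 bytes safe.  */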
        .p2align 4
L(move_long):
        cbz     tmp1, 3f

        add     srcend, src, count
        add     dstend, dstin, count
        /* Align dstend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

        and     tmp1, dstend, 15
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    2f
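        /* This mirrors the forward large copy: the last 16 bytes are
           stored unaligned at dstend - 16, dstend is rounded down to a
           16-byte boundary, and the loop below walks srcend/dstend
           downwards 64 bytes at a time, loads again running one
           iteration ahead of stores.  */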
1:
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    1b
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
3:      ret
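        /* G aliases count and dst, both dead at this point.  The first
           64 source bytes are reloaded relative to src and stored
           relative to dstin, overlapping the stores made downwards
           from dstend - 64 when fewer than 64 bytes remain.  */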
END (memcpy)
libc_hidden_builtin_def (memcpy)