/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  Small, medium and
   large backwards memmoves are therefore handled by falling through into
   memcpy.  Overlapping large forward memmoves use a loop that copies
   backwards.  */
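/* Illustrative C sketch of the read-before-write idea (not part of this
   file; the chunk sizes and the helper name are chosen for illustration
   only):

     #include <stddef.h>
     #include <string.h>

     // Overlap-safe copy of 17..32 bytes: the two 16-byte temporaries
     // play the role of the A_l/A_h and D_l/D_h register pairs.
     static void
     copy_17_32 (char *dstin, const char *src, size_t count)
     {
       char a[16], d[16];
       memcpy (a, src, 16);                 // first 16 bytes
       memcpy (d, src + count - 16, 16);    // last 16 bytes, may overlap a
       memcpy (dstin, a, 16);
       memcpy (dstin + count - 16, d, 16);
     }

   Both loads complete before either store, so the result is correct for
   any overlap of src and dstin; this is why small and medium memmoves can
   fall through into memcpy.  */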
ENTRY_ALIGN (memmove, 6)

        ccmp    tmp1, count, 2, hi

        /* Common case falls through into memcpy.  */
libc_hidden_builtin_def (memmove)
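/* Illustrative C sketch of the dispatch above (not part of this file); it
   assumes tmp1 holds dstin - src, set up before the ccmp:

     if (count > 96 && (uint64_t) (dstin - src) < count)
       goto move_long;          // dstin lies inside [src, src + count):
                                // large forward overlap, copy backwards
     // otherwise fall through into memcpy

   The unsigned compare treats dstin < src as a huge positive difference,
   so only genuine forward overlaps take the backwards path.  */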
        add     srcend, src, count
        add     dstend, dstin, count

        /* Small copies: 0..16 bytes.  */
        ldr     A_hw, [srcend, -4]
        str     A_hw, [dstend, -4]
        ldrh    A_hw, [srcend, -2]
        strh    A_hw, [dstend, -2]
1:      strb    A_lw, [dstin]
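        /* Illustrative C sketch of the overlapping-halves trick used for the
           small sizes above (not part of this file).  For 4 <= count <= 7,
           two 4-byte accesses cover every byte, overlapping in the middle:

             uint32_t lo, hi;
             memcpy (&lo, src, 4);               // bytes 0..3
             memcpy (&hi, src + count - 4, 4);   // bytes count-4..count-1
             memcpy (dstin, &lo, 4);
             memcpy (dstin + count - 4, &hi, 4);

           The halfword (ldrh/strh) and byte (strb) accesses handle the
           remaining counts below 4 in the same way.  */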
        /* Medium copies: 17..96 bytes.  */
        tbnz    count, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
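        /* Illustrative C sketch of the medium path above (not part of this
           file).  A/D are the first and last 16 bytes; B/C cover the middle
           and are only needed when, roughly, count > 32:

             char a[16], b[16], c[16], d[16];
             memcpy (a, src, 16);
             memcpy (d, src + count - 16, 16);
             if (count > 32)
               {
                 memcpy (b, src + 16, 16);
                 memcpy (c, src + count - 32, 16);
                 memcpy (dstin + 16, b, 16);
                 memcpy (dstin + count - 32, c, 16);
               }
             memcpy (dstin, a, 16);
             memcpy (dstin + count - 16, d, 16);

           All four loads happen before any store, so overlapping moves
           remain correct.  */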
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
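        /* Illustrative C sketch of L(copy96) above (not part of this file):
           for 64 <= count <= 96, 64 bytes from the start plus 32 bytes from
           the end cover the whole range, overlapping when count < 96:

             char head[64], tail[32];
             memcpy (head, src, 64);
             memcpy (tail, src + count - 32, 32);
             memcpy (dstin, head, 64);
             memcpy (dstin + count - 32, tail, 32);

           As elsewhere, every chunk is loaded before any chunk is stored,
           so overlap is tolerated.  */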
        /* Align DST to 16-byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */
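        /* Illustrative C sketch of the alignment step below (not part of
           this file):

             size_t tmp1 = (uintptr_t) dstin & 15;             // dst misalignment
             char *dst = (char *) ((uintptr_t) dstin - tmp1);  // 16-byte aligned
             memcpy (dstin, src, 16);                           // unaligned first 16 bytes
             src -= tmp1;
             count += tmp1;                                     // now 16 too large
             // loop: copy [src + 16, ...) to [dst + 16, ...), 64 bytes/iter

           For a plain memcpy, re-storing the few bytes below dstin + 16 is
           harmless; the assembly issues its loads ahead of the corresponding
           stores so the memmove fall-through cases remain correct too.  */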
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */

        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
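        /* Illustrative C sketch (not part of this file): with a remainder of
           1..64 bytes, the tail below is, for a non-overlapping copy,
           equivalent to

             memcpy (dstend - 64, srcend - 64, 64);

           i.e. up to 63 bytes that the loop already wrote are simply stored
           again with the same values.  The assembly interleaves these loads
           and stores so that the memmove cases that fall through into memcpy
           remain correct as well.  */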
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        add     srcend, src, count
        add     dstend, dstin, count
        /* Align dstend to 16-byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */
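        /* Illustrative C sketch of the backwards alignment step below (not
           part of this file).  The copy runs from the high end, so the last
           16 bytes are copied unaligned and dstend is then aligned down to a
           16-byte boundary:

             size_t tmp1 = (uintptr_t) dstend & 15;   // dstend misalignment
             char last[16];
             memcpy (last, srcend - 16, 16);          // read before any store
             memcpy (dstend - 16, last, 16);
             srcend -= tmp1;
             dstend -= tmp1;
             count  -= tmp1;
             // loop: copy 64 bytes per iteration, moving srcend/dstend down
           */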
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128

        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
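        /* Illustrative C sketch (not part of this file): once at most 64
           bytes remain, the tail below copies the first 64 bytes of the
           buffer, which for a non-overlapping copy is just

             memcpy (dstin, src, 64);

           re-storing any bytes the loop already wrote.  The G/A/B/C loads
           are interleaved with the stores so that the forward-overlapping
           memmove case stays correct too.  */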
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]

libc_hidden_builtin_def (memcpy)