/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */
#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, unaligned accesses.  */
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled.  Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
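
/* In the code below, copies of 0..16 bytes are handled at L(copy16),
   copies of 17..96 bytes fall through the memcpy entry into the
   medium-copy code, larger copies go to L(copy_long), and overlapping
   large forward memmoves go to L(move_long).  */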
# define MEMMOVE memmove
# define MEMCPY memcpy

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx
ENTRY_ALIGN (MEMMOVE, 6)
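
	/* Overlap check: for moves of more than 96 bytes where the
	   destination start lies within [src, src + count) the copy must be
	   done backwards, so branch to L(move_long).  All other cases fall
	   through into memcpy.  */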
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)
	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)

ENTRY (MEMCPY)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
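	/* tmp1 is set to count - 1 below.  Bit 6 of tmp1 is set for counts
	   of 65..96, which are handled at L(copy96); bit 5 is set for counts
	   of 33..64, which also need the B/C pairs copied.  */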
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret
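
	/* For example, with count == 3 the stores above write
	   dst[0] = src[0], dst[1] = src[1] (tmp1 == 1) and dst[2] = src[2];
	   with count == 1 all three stores write the same single byte.  */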
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
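	/* A_l/A_h already hold the first 16 source bytes.  When count < 96
	   the leading 64 bytes and the trailing 32 bytes overlap in the
	   destination; the trailing stores are issued last, so the final
	   bytes end up correct.  */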
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret
	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
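	/* Below, dst is dstin with its low 4 bits cleared and src is moved
	   back by the same 0..15 bytes, so both pointers advance together
	   and every store in the main loop is 16-byte aligned.  */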
L(copy_long):

	/* On thunderx, large memcpys are helped by software prefetching.
	   This loop is identical to the one below it except that it also
	   issues prefetch instructions.  For copies of less than 32768 bytes
	   the prefetching does not help and actually slows the code down, so
	   we only use the prefetching loop for the largest copies.  */
	cmp	count, 32768
	b.lo	L(copy_long_without_prefetch)
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */

L(prefetch_loop64):
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)
L(copy_long_without_prefetch):

	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
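	/* The loop is software pipelined: each iteration stores the four
	   pairs loaded on the previous iteration, then loads the next four
	   while post-incrementing src and dst by 64.  */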
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
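	/* For example, if a single byte remains, the loads from
	   [srcend - 64] cover the last 64 source bytes and the stores to
	   [dstend - 64] rewrite 63 already-copied bytes along with the
	   final one.  */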
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count
	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration.  */
	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b
	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)