/* A ThunderX2-optimized memcpy implementation for AArch64.
   Copyright (C) 2018-2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */
# define MEMMOVE memmove
# define MEMCPY memcpy

#undef MEMCPY
#define MEMCPY __memcpy_thunderx2
#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2
/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.  */

ENTRY_ALIGN (MEMMOVE, 6)
        add     srcend, src, count
        ccmp    tmp1, count, 2, hi
libc_hidden_builtin_def (MEMMOVE)
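/* For illustration only (not assembled): a hedged C sketch of the
   dispatch above.  The function name is hypothetical, and the real
   entry also shortcuts small moves to memcopy16, which this sketch
   omits.

   #include <stddef.h>
   #include <stdint.h>

   void *sketch_memmove (void *dstin, const void *src, size_t count)
   {
     // dst - src computed as an unsigned value: if dst is above src
     // and closer than count bytes, a forward copy would clobber
     // not-yet-read source bytes, so copy backwards.
     if ((uintptr_t) dstin - (uintptr_t) src < count)
       {
         unsigned char *d = (unsigned char *) dstin + count;
         const unsigned char *s = (const unsigned char *) src + count;
         while (count--)
           *--d = *--s;
       }
     else
       {
         unsigned char *d = dstin;
         const unsigned char *s = src;
         while (count--)
           *d++ = *s++;
       }
     return dstin;
   }  */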
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and use a load-and-merge approach when the src and dst addresses
   are not equally aligned, so that the actual loads and stores are
   always aligned.  Large copies use loops processing 64 bytes per
   iteration for the unaligned case and 128 bytes per iteration for
   the aligned one.  */
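/* In hedged C terms (names hypothetical), the size dispatch just
   described is:

   #include <stddef.h>

   enum copy_class { COPY_SMALL, COPY_MEDIUM, COPY_LARGE };

   // Classify count exactly as the comment above describes.
   static enum copy_class classify (size_t count)
   {
     if (count <= 16)
       return COPY_SMALL;    // 0..16: a few overlapping accesses
     if (count <= 96)
       return COPY_MEDIUM;   // 17..96: fully unrolled
     return COPY_LARGE;      // > 96: aligned 64/128-byte loops
   }  */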
#define MEMCPY_PREFETCH_LDR 640
        add     srcend, src, count
        add     dstend, dstin, count

        /* Medium copies: 17..96 bytes.  */
        ldr     E_q, [srcend, -16]
        b.gt    L(memcpy_copy96)
        b.le    L(bytes_17_to_48)
        str     E_q, [dstend, -16]
        stp     A_q, B_q, [dstin]
        b.gt    L(bytes_32_to_48)
        str     E_q, [dstend, -16]
        str     E_q, [dstend, -16]
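/* The medium path relies on the "overlapping ends" trick: the first
   and last 16 bytes are loaded up front, so stores may overlap in
   the middle at no correctness cost.  A hedged C model for the
   17..32-byte slice (helper name hypothetical):

   #include <string.h>
   #include <stddef.h>

   static void copy_17_to_32 (unsigned char *dst,
                              const unsigned char *src, size_t n)
   {
     unsigned char head[16], tail[16];
     memcpy (head, src, 16);           // like loading A_q from src
     memcpy (tail, src + n - 16, 16);  // like ldr E_q, [srcend, -16]
     memcpy (dst, head, 16);
     memcpy (dst + n - 16, tail, 16);  // like str E_q, [dstend, -16]
   }  */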
        /* Small copies: 0..16 bytes.  */
        ldr     A_h, [srcend, -8]
        add     dstend, dstin, count
        str     A_h, [dstend, -8]
        tbz     count, 2, L(bytes_0_to_3)
        ldr     A_hw, [srcend, -4]
        add     dstend, dstin, count
        str     A_hw, [dstend, -4]

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the
           same byte 3 times if count==1, or the 2nd byte twice if
           count==2.  */
        ldrb    A_hw, [srcend, -1]
        add     dstend, dstin, count
        ldrb    B_lw, [src, tmp1]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
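/* A hedged C model of the branchless 0..3-byte sequence (the
   first-byte load is elided in the excerpt above): with
   tmp1 = count/2, the indices 0, count/2 and count-1 cover every
   count in [1..3], duplicating writes instead of branching.

   #include <stddef.h>

   static void copy_0_to_3 (unsigned char *dst,
                            const unsigned char *src, size_t count)
   {
     if (count == 0)
       return;
     size_t mid = count >> 1;          // lsr tmp1, count, 1
     unsigned char a = src[0];
     unsigned char b = src[mid];       // ldrb B_lw, [src, tmp1]
     unsigned char c = src[count - 1]; // ldrb A_hw, [srcend, -1]
     dst[0] = a;
     dst[mid] = b;                     // strb B_lw, [dstin, tmp1]
     dst[count - 1] = c;               // strb A_hw, [dstend, -1]
   }  */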
        /* Copying 65..96 bytes.  A_q (first 16 bytes) and
           E_q (last 16 bytes) are already loaded.  The size
           is large enough to benefit from aligned loads.  */
        /* 64 bytes have been loaded; the second 16-byte chunk
           may overlap the first chunk by tmp1 bytes.  */
        add     count, count, tmp1
        /* The range of count, [65..96], becomes [65..111]
           after tmp1 [0..15] is added to it; count is now
           <bytes-left-to-load> + 48.  */
        b.gt    L(copy96_medium)
        stp     B_q, C_q, [dst, 16]
        str     E_q, [dstend, -16]
        ldp     D_q, G_q, [src, 32]
        stp     B_q, C_q, [dst, 16]
        stp     D_q, G_q, [dst, 48]
        str     E_q, [dstend, -16]
        stp     C_q, D_q, [dst, 32]
        stp     G_q, F_q, [dst, 64]
        str     E_q, [dstend, -16]
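/* The bias can be checked mechanically; a throwaway C check of the
   range claimed above:

   #include <assert.h>
   #include <stddef.h>

   int main (void)
   {
     // count in [65..96], tmp1 = src % 16 in [0..15]:
     // the biased count stays within [65..111].
     for (size_t count = 65; count <= 96; count++)
       for (size_t tmp1 = 0; tmp1 < 16; tmp1++)
         assert (count + tmp1 >= 65 && count + tmp1 <= 111);
     return 0;
   }  */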
        ldp     B_q, C_q, [src], #32
        add     count, count, tmp1
        ldp     D_q, E_q, [src], #32
        /* 64+16 bytes are already loaded.  Check whether at
           least 64 more bytes are left.  */
        subs    count, count, 64+64+16
        b.lt    L(loop128_exit0)
        cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
        cbnz    tmp1, L(dst_unaligned)
        sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp     F_q, G_q, [src], #32
        stp     B_q, C_q, [dst], #32
        ldp     H_q, I_q, [src], #32
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp     B_q, C_q, [src], #32
        stp     D_q, E_q, [dst], #32
        ldp     D_q, E_q, [src], #32
        stp     F_q, G_q, [dst], #32
        stp     H_q, I_q, [dst], #32
        subs    count, count, 128
        b.ge    L(loop128_prefetch)
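/* A hedged C analogue of the pipelined loop above, with
   __builtin_prefetch standing in for prfm pldl1strm; the asm
   additionally overlaps this iteration's stores with the next
   iteration's loads, which plain C can only approximate.

   #include <stddef.h>

   static void copy_loop128 (unsigned char *dst,
                             const unsigned char *src, size_t n)
   {
     // n is assumed to be a positive multiple of 128 here.
     while (n >= 128)
       {
         // Prefetch MEMCPY_PREFETCH_LDR = 640 bytes ahead, for
         // reading, with streaming (non-temporal) locality.
         __builtin_prefetch (src + 640, 0, 0);
         __builtin_memcpy (dst, src, 128);  // 8 q-register pairs in asm
         src += 128;
         dst += 128;
         n -= 128;
       }
   }  */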
        add     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
        ldp     F_q, G_q, [src], #32
        ldp     H_q, I_q, [src], #32
        stp     B_q, C_q, [dst], #32
        stp     D_q, E_q, [dst], #32
        subs    count, count, 64
        b.lt    L(loop128_exit1)
        ldp     B_q, C_q, [src], #32
        ldp     D_q, E_q, [src], #32
        stp     F_q, G_q, [dst], #32
        stp     H_q, I_q, [dst], #32
        subs    count, count, 64
        ldp     F_q, G_q, [srcend, -64]
        ldp     H_q, I_q, [srcend, -32]
        stp     B_q, C_q, [dst], #32
        stp     F_q, G_q, [dstend, -64]
        stp     H_q, I_q, [dstend, -32]
        ldp     B_q, C_q, [srcend, -64]
        ldp     D_q, E_q, [srcend, -32]
        stp     F_q, G_q, [dst], #32
        stp     B_q, C_q, [dstend, -64]
        stp     D_q, E_q, [dstend, -32]

L(dst_unaligned_tail):
        ldp     C_q, D_q, [srcend, -64]
        ldp     E_q, F_q, [srcend, -32]
        stp     A_q, B_q, [dst], #32
        stp     H_q, I_q, [dst], #16
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstend, -32]
L(dst_unaligned):
        /* For the unaligned store case the code loads two
           aligned chunks and then merges them using the ext
           instruction.  This can be up to 30% faster than
           the simple unaligned store access.

           Current state: tmp1 = dst % 16; C_q, D_q, E_q
           contain data yet to be stored; src and dst point
           to the next data to be processed; A_q, B_q contain
           data already stored earlier; count = bytes left to
           be loaded, decremented by 64.

           Control is passed here if at least 64 bytes are left
           to be loaded.  The code does two aligned loads and then
           extracts (16-tmp1) bytes from the first register and
           tmp1 bytes from the next register, forming the value
           for the aligned store.

           Since the ext instruction can only have its index encoded
           as an immediate, 15 code chunks process each possible
           index value.  A computed goto is used to reach the
           required chunk.  */
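/* A byte-wise C model of the merge primitive: "ext vd.16b, vn.16b,
   vm.16b, #idx" produces bytes idx..15 of the first operand followed
   by bytes 0..idx-1 of the second, so each aligned store below is
   stitched from the tail of one aligned load and the head of the
   next.  Sketch only; helper name hypothetical:

   #include <stdint.h>

   static void ext16 (uint8_t out[16], const uint8_t lo[16],
                      const uint8_t hi[16], unsigned idx)
   {
     for (unsigned i = 0; i < 16; i++)
       out[i] = (idx + i < 16) ? lo[idx + i] : hi[idx + i - 16];
   }  */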
        /* Store 16 bytes to dst and align dst for further
           operations; several bytes will be stored at this
           address twice.  */
        ldp     F_q, G_q, [src], #32
        stp     B_q, C_q, [dst], #32
        adrp    tmp2, L(ext_table)
        add     tmp2, tmp2, :lo12:L(ext_table)
        add     tmp2, tmp2, tmp1, LSL #2
        ldr     tmp3w, [tmp2]
        add     tmp2, tmp2, tmp3w, SXTW
        br      tmp2
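/* The same dispatch in hedged C, via GCC's computed-goto extension;
   the C table holds absolute label addresses where the asm table
   holds PC-relative .word offsets, but the indexing idea is the
   same (chunk bodies reduced to placeholders):

   static int dispatch (unsigned align)
   {
     static void *const chunk[16] = {
       &&c0, &&c1, &&c2, &&c3, &&c4, &&c5, &&c6, &&c7,
       &&c8, &&c9, &&c10, &&c11, &&c12, &&c13, &&c14, &&c15,
     };
     goto *chunk[align & 15];      // like "br tmp2" above
     // Each label stands in for one EXT_CHUNK body; entry 0 is
     // unused in the real table.
   c0:  return 0;   c1:  return 1;   c2:  return 2;   c3:  return 3;
   c4:  return 4;   c5:  return 5;   c6:  return 6;   c7:  return 7;
   c8:  return 8;   c9:  return 9;   c10: return 10;  c11: return 11;
   c12: return 12;  c13: return 13;  c14: return 14;  c15: return 15;
   }  */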
        /* To make the loop in each chunk 16 bytes aligned.  */

#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
        ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
        ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
        stp     A_q, B_q, [dst], #32;\
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
        ldp     C_q, D_q, [src], #32;\
        ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        stp     H_q, I_q, [dst], #32;\
        ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
        ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ldp     F_q, G_q, [src], #32;\
        ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
        subs    count, count, 64;\
        ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        b       L(dst_unaligned_tail);
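/* Reusing the ext16 model sketched earlier, one hedged C iteration
   of EXT_CHUNK: every load and store hits a 16-byte-aligned address,
   and each stored vector is built from two adjacent aligned source
   vectors.

   #include <string.h>
   #include <stdint.h>

   // prev carries the last aligned vector across iterations;
   // shift models the misalignment handled by this chunk.
   static void merge_store32 (uint8_t *dst, const uint8_t *src_aligned,
                              uint8_t prev[16], unsigned shift)
   {
     uint8_t c[16], d[16], a[16], b[16];
     memcpy (c, src_aligned, 16);       // aligned ldp C_q, D_q
     memcpy (d, src_aligned + 16, 16);
     ext16 (a, prev, c, 16 - shift);    // ext A_v, ..., 16-shft
     ext16 (b, c, d, 16 - shift);
     memcpy (dst, a, 16);               // aligned stp A_q, B_q
     memcpy (dst + 16, b, 16);
     memcpy (prev, d, 16);              // carry the tail forward
   }  */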
        add     srcend, src, count
        add     dstend, dstin, count
        ldr     D_q, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128

        subs    count, count, 64
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -64]!
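/* The backward loop in hedged C: srcend/dstend walk down in 64-byte
   blocks, staging each block in registers (a temporary array here)
   so the per-block copy stays safe even when the regions overlap by
   less than 64 bytes.

   #include <string.h>
   #include <stddef.h>

   static void copy_backward64 (unsigned char *dstend,
                                const unsigned char *srcend, size_t n)
   {
     unsigned char block[64];           // models the A_q..D_q staging
     while (n >= 64)
       {
         srcend -= 64;
         dstend -= 64;
         memcpy (block, srcend, 64);    // two ldp pairs
         memcpy (dstend, block, 64);    // two stp pairs
         n -= 64;
       }
     // The remainder is handled by the caller's head stores.
   }  */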
        /* Write the last full set of 64 bytes.  The remainder is at
           most 64 bytes, so it is safe to always copy 64 bytes from
           the start even if there is just 1 byte left.  */
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     G_q, H_q, [dstin]
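/* The remainder trick in hedged C: with at most 64 bytes left, one
   full 64-byte store at dstin always covers them, and any bytes it
   rewrites already held their final values, so the overlap is
   harmless.

   #include <string.h>

   // head holds the first 64 source bytes (E_q..H_q above).
   static void store_head64 (unsigned char *dstin,
                             const unsigned char head[64])
   {
     memcpy (dstin, head, 64);  // may rewrite up to 63 final bytes
   }  */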
L(ext_table):
        /* The first entry is for the alignment of 0 and is never
           actually used (could be any value).  */
        .word   0
        .word   L(ext_size_1) -.
        .word   L(ext_size_2) -.
        .word   L(ext_size_3) -.
        .word   L(ext_size_4) -.
        .word   L(ext_size_5) -.
        .word   L(ext_size_6) -.
        .word   L(ext_size_7) -.
        .word   L(ext_size_8) -.
        .word   L(ext_size_9) -.
        .word   L(ext_size_10) -.
        .word   L(ext_size_11) -.
        .word   L(ext_size_12) -.
        .word   L(ext_size_13) -.
        .word   L(ext_size_14) -.
        .word   L(ext_size_15) -.
libc_hidden_builtin_def (MEMCPY)