/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */
#if HAVE_AARCH64_SVE_ASM

# define MEMCPY __memcpy_a64fx
# define MEMMOVE __memmove_a64fx
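
/* The macros below copy data a full SVE vector of bytes at a time under
   predicate p0.  "[src, N, mul vl]" addresses src + N * VL, where VL is
   the runtime vector length (64 bytes for the A64FX's 512-bit SVE), so
   ld1b_unroll8 loads 8 consecutive vectors (8 * VL bytes) into z0-z7.  */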
	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm
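
/* stld1b_unroll4a and stld1b_unroll4b interleave the stores of the
   previously loaded vectors with the loads of the next 8 * VL block
   (4a covers z0-z3, 4b covers z4-z7), which keeps the load and store
   pipes busy in the software pipelined loops below.  */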
	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm
	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm
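
/* st1b_unroll8 drains the pipeline: it stores z0-z7 without issuing any
   further loads.  */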
	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm
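
	/* Copies of up to 2 vectors are handled purely with predication:
	   p0 covers the bytes of the first vector that lie within n, and
	   "whilelo p1.b, vlen, n" makes p1 cover the bytes of the second.
	   A rough C sketch, assuming p0 has been set up beforehand as
	   "whilelo p0.b, xzr, n":

	     for (i = 0; i < vlen; i++)          // first vector, predicate p0
	       if (i < n) dstin[i] = src[i];
	     for (i = 0; i < vlen; i++)          // second vector, predicate p1
	       if (vlen + i < n) dstin[vlen + i] = src[vlen + i];  */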
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
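
	/* For 2-4 and 4-8 vectors, the leading vectors are loaded from src
	   and the trailing ones from srcend, then stored to dstin/dstend.
	   When the length is not an exact multiple, the middle bytes are
	   simply written twice, which avoids any further branching.  */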
	/* Copy 2-4 vectors.  */
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	/* Copy 4-8 vectors.  */
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
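
	/* Larger copies first store one partial vector under a whilelo
	   predicate so that the destination becomes vector aligned for the
	   loop; tmp holds the number of bytes needed to reach the next
	   vector-length boundary (computed just before this point).  */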
	/* At least 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	/* 8x unrolled and software pipelined loop.  */
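	/* The loop primes z0-z7 with ld1b_unroll8, then each iteration
	   stores the previous 8 * VL block while loading the next one via
	   stld1b_unroll4a/4b, and st1b_unroll8 drains the last block;
	   vlen8 is the per-iteration step of 8 * VL bytes.  */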
	/* Move last 0-8 vectors.  */
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
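
	/* Remaining 2-4 vectors: load the first and last two vectors of the
	   tail and store them with overlap, as in the small-copy paths.  */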
1:	add	srcend, src, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
libc_hidden_builtin_def (MEMCPY)
ENTRY_ALIGN (MEMMOVE, 4)
	/* Fast case for up to 2 vectors.  */
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
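
	/* Both vectors are loaded before either store is issued, so this
	   path is safe for any overlap and needs no direction check.  */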
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
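	/* A rough, non-authoritative C sketch of this dispatch (the mask
	   matches the "ands" below; label names are illustrative only):

	     uint64_t diff = (dstin - src) & 0x00ffffffffffffff;  // drop tag bits
	     if (diff == 0)      return dstin;         // full overlap, nothing to do
	     if (n <= vlen * 8)  goto copy_small;      // overlap-safe small copy
	     if (diff >= n)      goto forward_memcpy;  // no overlap, or dst < src
	     goto backward_copy;                       // dst > src with overlap
	*/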
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	/* Align to vector length.  */
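	/* The tail at the high end is copied first under predicate p1 so
	   that the backward loop below runs on vector-aligned addresses;
	   the csel substitutes a whole vector when no partial tail is
	   needed, avoiding an extra branch.  */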
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	/* 8x unrolled and software pipelined backward copy loop.  */
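	/* Mirrors the forward loop: src is stepped down by vlen8 at label 1
	   before dst is at label 2, so the loads run ahead of the
	   corresponding stores.  */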
1:	sub	src, src, vlen8
2:	sub	dst, dst, vlen8
	/* Adjust src/dst for last 0-8 vectors.  */
libc_hidden_builtin_def (MEMMOVE)
# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */