1 /* Optimized memset for Fujitsu A64FX processor.
2 Copyright (C) 2021 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <https://www.gnu.org/licenses/>. */
// NOTE(review): This chunk is a partial, line-sampled extraction of an SVE
// memset for the Fujitsu A64FX (__memset_a64fx).  Every line carries a stray
// leading number from the extraction tool, and the gaps in those numbers show
// that many original lines are missing (ENTRY/END, labels, branches, .endm,
// the DC ZVA instruction, etc.).  The code below is therefore NOT assemblable
// as-is; the comments describe only what the visible lines establish and
// hedge everything that depends on the missing lines.
21 #include <sysdeps/aarch64/memset-reg.h>
// Target features (from the file's original header comment).
25 * ARMv8.2-a, AArch64, unaligned accesses, sve
// Cache geometry of the A64FX core: 64 KB L1D, 8 MB L2, 256-byte lines.
// These drive the size thresholds and prefetch distance used below.
29 #define L1_SIZE (64*1024)	// L1 64KB
30 #define L2_SIZE (8*1024*1024)	// L2 8MB
31 #define CACHE_LINE_SIZE	256
32 #define PF_DIST_L1 (CACHE_LINE_SIZE * 16)	// Prefetch distance L1
// x9 holds the SVE vector length in bytes (presumably set by a CNTB/rdvl
// on a missing line — TODO confirm).
34 #define vector_length	x9
36 #if HAVE_AARCH64_SVE_ASM
38 # define MEMSET __memset_a64fx
// Recursive macro emitting an unrolled run of ST1B stores at consecutive
// "mul vl" offsets from dst.  The recursion step is visible below; the
// terminating condition and the closing .endm are on missing lines.
42 	.macro st1b_unroll first=0, last=7
43 	st1b	z0.b, p0, [dst, \first, mul vl]
45 	st1b_unroll "(\first+1)", \last
// --- Small-size path (fragment). ---
// WHILELO builds byte predicates: p1 covers bytes [0, count) of the first
// vector, p0 covers [vector_length, count) for the second, so the two
// predicated stores together write exactly `count` bytes when
// count <= 2 * vector_length.  The branch structure around this is missing.
59 	whilelo	p0.b, vector_length, count
61 	whilelo	p1.b, xzr, count
62 	st1b	z0.b, p1, [dstin, 0, mul vl]
63 	st1b	z0.b, p0, [dstin, 1, mul vl]
// --- count >= vector_length * 2 (fragment). ---
// Stores the first two and the last two whole vectors; the two ranges may
// overlap, which is harmless because every byte written is the fill value.
// The cmp against count << 2 presumably guards a fall-through to the 4x
// path below — the conditional branch itself is on a missing line.
66 	// count >= vector_length * 2
67 1:	cmp	count, vector_length, lsl 2
68 	add	dstend, dstin, count
70 	st1b	z0.b, p0, [dstin, 0, mul vl]
71 	st1b	z0.b, p0, [dstin, 1, mul vl]
72 	st1b	z0.b, p0, [dstend, -2, mul vl]
73 	st1b	z0.b, p0, [dstend, -1, mul vl]
// --- count > vector_length * 4 (fragment). ---
// Same overlapping-ends trick with four vectors from each end, covering up
// to 8 * vector_length bytes (tmp1 = vector_length * 8 is computed here;
// the compare/branch that uses it is missing).
76 	// count > vector_length * 4
77 1:	lsl	tmp1, vector_length, 3
80 	st1b	z0.b, p0, [dstin, 0, mul vl]
81 	st1b	z0.b, p0, [dstin, 1, mul vl]
82 	st1b	z0.b, p0, [dstin, 2, mul vl]
83 	st1b	z0.b, p0, [dstin, 3, mul vl]
84 	st1b	z0.b, p0, [dstend, -4, mul vl]
85 	st1b	z0.b, p0, [dstend, -3, mul vl]
86 	st1b	z0.b, p0, [dstend, -2, mul vl]
87 	st1b	z0.b, p0, [dstend, -1, mul vl]
// --- Vector-length-agnostic large path (fragment). ---
// The CCMPs fuse two conditions: a size threshold (the cmp feeding each ccmp
// is on a missing line) AND vector_length == 64 (tmp1 presumably holds 64 —
// TODO confirm), steering 512-bit-VL hardware to the L2 / L1-prefetch
// specializations noted in the comments.
91 L(vl_agnostic): // VL Agnostic
94 	add	dstend, dstin, count
95 	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
98 	ccmp	vector_length, tmp1, 0, cs
100 	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
102 	ccmp	vector_length, tmp1, 0, cs
// Strides for the unrolled main loop: 8 and 32 vectors per iteration
// (the loop bodies using them are on missing lines).
106 	lsl	tmp1, vector_length, 3	// vector_length * 8
107 	lsl	tmp2, vector_length, 5	// vector_length * 32
123 	lsl	tmp1, vector_length, 3
// --- Tail handling (fragment). ---
// Successive WHILELO predicates p0..p7 each cover one vector-sized slice of
// the remaining `rest` bytes; a slice past the end gets an all-false
// predicate, making its store a no-op.  The incw/branches that advance tmp1
// between the whilelo pairs are on missing lines.
133 	whilelo	p0.b, xzr, rest
134 	whilelo	p1.b, vector_length, rest
136 	st1b	z0.b, p0, [dst, #0, mul vl]
137 	st1b	z0.b, p1, [dst, #1, mul vl]
139 1:	lsl	tmp1, vector_length, 1	// vector_length * 2
140 	whilelo	p2.b, tmp1, rest
142 	whilelo	p3.b, tmp1, rest
144 	st1b	z0.b, p0, [dst, #0, mul vl]
145 	st1b	z0.b, p1, [dst, #1, mul vl]
146 	st1b	z0.b, p2, [dst, #2, mul vl]
147 	st1b	z0.b, p3, [dst, #3, mul vl]
149 1:	lsl	tmp1, vector_length, 2	// vector_length * 4
150 	whilelo	p4.b, tmp1, rest
152 	whilelo	p5.b, tmp1, rest
154 	whilelo	p6.b, tmp1, rest
156 	whilelo	p7.b, tmp1, rest
157 	st1b	z0.b, p0, [dst, #0, mul vl]
158 	st1b	z0.b, p1, [dst, #1, mul vl]
159 	st1b	z0.b, p2, [dst, #2, mul vl]
160 	st1b	z0.b, p3, [dst, #3, mul vl]
161 	st1b	z0.b, p4, [dst, #4, mul vl]
162 	st1b	z0.b, p5, [dst, #5, mul vl]
163 	st1b	z0.b, p6, [dst, #6, mul vl]
164 	st1b	z0.b, p7, [dst, #7, mul vl]
// --- L1-prefetch store loop (fragment). ---
// Streams two 256-byte cache lines per iteration (the st1b bodies are on
// missing lines), software-prefetching PF_DIST_L1 = 16 lines ahead with
// "prefetch for store, keep in L1".  Loops while rest != 0.
167 L(L1_prefetch): // if rest >= L1_SIZE
170 	prfm	pstl1keep, [dst, PF_DIST_L1]
172 	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
173 	add	dst, dst, CACHE_LINE_SIZE * 2
174 	sub	rest, rest, CACHE_LINE_SIZE * 2
177 	cbnz	rest, L(unroll32)
// --- L2-sized path (fragment). ---
// First aligns dst up to a cache-line boundary with ordinary predicated
// stores (tmp2 = misalignment; the arithmetic that folds it into dst is on
// missing lines), then zeroes whole lines in a loop.  The loop body at "1:"
// presumably contains a DC ZVA on a missing line — TODO confirm; that would
// also imply this path is taken only when the fill value is zero and ZVA's
// block size matches (neither check is visible here).
185 	// align dst to CACHE_LINE_SIZE byte boundary
186 	and	tmp2, dst, CACHE_LINE_SIZE - 1
187 	st1b	z0.b, p0, [dst, 0, mul vl]
188 	st1b	z0.b, p0, [dst, 1, mul vl]
189 	st1b	z0.b, p0, [dst, 2, mul vl]
190 	st1b	z0.b, p0, [dst, 3, mul vl]
192 	add	count, count, tmp2
194 	// clear cachelines using DC ZVA
// Reserve two lines (sub ... * 2) so the loop never runs past dstend; the
// final partial region is finished by the tail code after the loop, which
// restores one line of slack (add ... below) before falling through.
195 	sub	count, count, CACHE_LINE_SIZE * 2
197 1:	add	dst, dst, CACHE_LINE_SIZE
199 	subs	count, count, CACHE_LINE_SIZE
201 	add	count, count, CACHE_LINE_SIZE
202 	add	dst, dst, CACHE_LINE_SIZE
// Export the internal symbol as a hidden builtin alias inside libc.
206 libc_hidden_builtin_def (MEMSET)
208 #endif /* IS_IN (libc) */
209 #endif /* HAVE_AARCH64_SVE_ASM */