1 /* Optimized memset for Fujitsu A64FX processor.
2 Copyright (C) 2021 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <https://www.gnu.org/licenses/>. */
21 #include <sysdeps/aarch64/memset-reg.h>
/* Assumptions: ARMv8.2-a, AArch64, unaligned accesses, SVE. */
// Tuning parameters for the Fujitsu A64FX: on-chip cache sizes and the
// prefetch distance used by the L1 streaming loop further down.
29 #define L1_SIZE (64*1024) // L1 64KB
30 #define L2_SIZE (8*1024*1024) // L2 8MB
31 #define CACHE_LINE_SIZE 256
32 #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
// Register alias: x9 carries the SVE vector length, used in byte units
// by the whilelo/st1b sequences below (the instruction that loads it,
// presumably cntb, is not visible in this chunk).
34 #define vector_length x9
// Everything below is assembled only when the toolchain supports SVE.
36 #if HAVE_AARCH64_SVE_ASM
// Internal (ifunc-selected) symbol name for this variant.
38 # define MEMSET __memset_a64fx
// Recursively expanding macro: emits one predicated st1b of z0 for each
// vector-register offset \first .. \last, i.e. [dst, N, mul vl] stores.
// NOTE(review): the recursion terminator (.if \last-\first ... .endif)
// and the closing .endm are elided from this chunk -- confirm against
// the full file before editing.
42 .macro st1b_unroll first=0, last=7
43 st1b z0.b, p0, [dst, \first, mul vl]
45 st1b_unroll "(\first+1)", \last
// Small-size path: p1 predicates bytes [0, count), p0 predicates bytes
// [vector_length, count).  The two predicated stores together write
// exactly `count` bytes when count <= vector_length * 2, with no branch
// per byte.  (Surrounding branch/label lines are elided in this chunk.)
59 whilelo p0.b, vector_length, count
61 whilelo p1.b, xzr, count
62 st1b z0.b, p1, [dstin, 0, mul vl]
63 st1b z0.b, p0, [dstin, 1, mul vl]
66 // count >= vector_length * 2
// Compare count against 4 * VL (the conditional branch consuming these
// flags is elided here), then store the first two and last two whole
// vectors.  Overlapping stores cover any count in (2*VL, 4*VL].
67 1: cmp count, vector_length, lsl 2
68 add dstend, dstin, count // dstend = one past the last byte to set
70 st1b z0.b, p0, [dstin, 0, mul vl]
71 st1b z0.b, p0, [dstin, 1, mul vl]
72 st1b z0.b, p0, [dstend, -2, mul vl]
73 st1b z0.b, p0, [dstend, -1, mul vl]
76 // count > vector_length * 4
// tmp1 = 8 * VL; the compare/branch that uses it is elided from this
// chunk.  Store the first four and last four whole vectors -- with
// overlap this covers counts up to 8*VL branch-free.
77 1: lsl tmp1, vector_length, 3
80 st1b z0.b, p0, [dstin, 0, mul vl]
81 st1b z0.b, p0, [dstin, 1, mul vl]
82 st1b z0.b, p0, [dstin, 2, mul vl]
83 st1b z0.b, p0, [dstin, 3, mul vl]
84 st1b z0.b, p0, [dstend, -4, mul vl]
85 st1b z0.b, p0, [dstend, -3, mul vl]
86 st1b z0.b, p0, [dstend, -2, mul vl]
87 st1b z0.b, p0, [dstend, -1, mul vl]
// Dispatch for large sizes, independent of the exact vector length.
91 L(vl_agnostic): // VL Agnostic
94 add dstend, dstin, count
95 // if rest >= L2_SIZE && vector_length == 64 then L(L2)
// ccmp folds the "VL == 64" test into the size check: it compares
// vector_length with tmp1 only when the preceding (elided) cmp set CS.
// NOTE(review): tmp1 presumably holds 64 here -- the cmp/mov lines that
// establish the flags and tmp1, and the b.eq targets, are elided.
98 ccmp vector_length, tmp1, 0, cs
100 // if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
102 ccmp vector_length, tmp1, 0, cs
// Main unrolled-loop setup: tmp1 = 8 * VL, the number of bytes written
// per iteration (the loop body, built from st1b_unroll, is elided here).
107 lsl tmp1, vector_length, 3
// Tail selection: compare the remainder against 2 * VL ...
117 cmp count, vector_length, lsl 1
// ... and compute tmp2 = 5 * VL (VL + 4*VL) for the mid-size tail test.
119 add tmp2, vector_length, vector_length, lsl 2
// Run of up-to-8 trailing vector stores addressed back from dstend.
// The numeric labels (5:, 2:) are entry points selected by the elided
// compare/branch lines above, so only the needed suffix executes.
122 st1b z0.b, p0, [dstend, -8, mul vl]
123 st1b z0.b, p0, [dstend, -7, mul vl]
124 st1b z0.b, p0, [dstend, -6, mul vl]
125 5: st1b z0.b, p0, [dstend, -5, mul vl]
126 st1b z0.b, p0, [dstend, -4, mul vl]
127 st1b z0.b, p0, [dstend, -3, mul vl]
128 2: st1b z0.b, p0, [dstend, -2, mul vl]
129 st1b z0.b, p0, [dstend, -1, mul vl]
// Streaming loop for working sets that fit in L2 but not L1: process
// two 256-byte cache lines per iteration while prefetching PF_DIST_L1
// bytes ahead into L1.  The store bodies (st1b_unroll invocations) and
// the loop-back compare are elided from this chunk.
132 L(L1_prefetch): // if rest >= L1_SIZE
135 prfm pstl1keep, [dst, PF_DIST_L1]
137 prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
138 add dst, dst, CACHE_LINE_SIZE * 2
139 sub rest, rest, CACHE_LINE_SIZE * 2
// Any remainder smaller than the streaming granule is finished by the
// generic 8-vector unrolled path.
142 cbnz rest, L(unroll8)
// Zero-fill path for very large sets (per the elided dispatch above,
// presumably taken only when the fill value is zero -- confirm against
// the full file, since DC ZVA can only write zeroes).
150 // align dst to CACHE_LINE_SIZE byte boundary
151 and tmp2, dst, CACHE_LINE_SIZE - 1 // tmp2 = dst misalignment in bytes
// Store four vectors across the unaligned head before rounding dst up
// (the arithmetic that advances dst past the head is elided here).
152 st1b z0.b, p0, [dst, 0, mul vl]
153 st1b z0.b, p0, [dst, 1, mul vl]
154 st1b z0.b, p0, [dst, 2, mul vl]
155 st1b z0.b, p0, [dst, 3, mul vl]
// Fold the head bytes back into the running count.
157 add count, count, tmp2
159 // clear cachelines using DC ZVA
// Reserve the final two cache lines for ordinary stores, then loop one
// cache line at a time.  NOTE(review): the dc zva instruction itself is
// elided from this chunk -- only the loop bookkeeping is visible.
160 sub count, count, CACHE_LINE_SIZE * 2
162 1: add dst, dst, CACHE_LINE_SIZE
164 subs count, count, CACHE_LINE_SIZE
// Restore the reserved tail bytes into count for the final stores.
166 add count, count, CACHE_LINE_SIZE
// Standard glibc export: define the hidden internal alias for MEMSET so
// intra-libc callers bind directly rather than through the PLT.
170 libc_hidden_builtin_def (MEMSET)
172 #endif /* IS_IN (libc) */
173 #endif /* HAVE_AARCH64_SVE_ASM */