1 /* Optimized memset for Fujitsu A64FX processor.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <https://www.gnu.org/licenses/>. */
21 #include <sysdeps/aarch64/memset-reg.h>
25  * ARMv8.2-a, AArch64, unaligned accesses, sve
// Tuning constants for the Fujitsu A64FX core: 64 KB L1D, 8 MB L2,
// and an unusually large 256-byte cacheline.
29 #define L1_SIZE		(64*1024)	// L1 64KB
30 #define L2_SIZE         (8*1024*1024)	// L2 8MB
31 #define CACHE_LINE_SIZE	256
// Software-prefetch distance for the L1 streaming loop: 16 cachelines
// (4 KB) ahead of the current store pointer.
32 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
// x9 holds the runtime SVE vector length in bytes (read via cntb
// elsewhere in the file); aliased for readability.
33 #define vector_length	x9
35 #if HAVE_AARCH64_SVE_ASM
// Internal (ifunc-selected) symbol name for this implementation.
37 # define MEMSET __memset_a64fx
// Emit a run of predicated SVE byte stores covering vector slots
// \first .. \last of the 8-vector unroll: each st1b writes one full
// vector (VL bytes) of z0 at [dst + slot*VL].  The macro recurses with
// \first+1; the terminating .if/.endm guard is outside this view —
// NOTE(review): confirm the recursion bound in the full file.
41 	.macro st1b_unroll first=0, last=7
42 	st1b	z0.b, p0, [dst, \first, mul vl]
44 	st1b_unroll "(\first+1)", \last
// Small sizes, count < 2*VL: build two governing predicates and issue
// two predicated stores — no byte loop and no alignment fixup needed.
// p1 is true for lanes [0, min(count, VL)); p0 is true for lanes
// [VL, count), i.e. the second vector's live bytes (empty if count<=VL).
58 	whilelo	p0.b, vector_length, count
60 	whilelo	p1.b, xzr, count
61 	st1b	z0.b, p1, [dstin, 0, mul vl]
62 	st1b	z0.b, p0, [dstin, 1, mul vl]
65 	// count >= vector_length * 2
// 2*VL <= count <= 4*VL: store the first two vectors from dstin and the
// last two relative to dstend; the ranges may overlap in the middle,
// which is harmless since every byte gets the same fill value.
// The cmp against VL*4 steers larger sizes onward — branch target is
// outside this view.
66 1:	cmp	count, vector_length, lsl 2
67 	add	dstend, dstin, count
69 	st1b	z0.b, p0, [dstin, 0, mul vl]
70 	st1b	z0.b, p0, [dstin, 1, mul vl]
71 	st1b	z0.b, p0, [dstend, -2, mul vl]
72 	st1b	z0.b, p0, [dstend, -1, mul vl]
75 	// count > vector_length * 4
// 4*VL < count <= 8*VL: tmp1 = VL*8 (used by the size test / later
// paths).  Store the first four vectors from dstin and the last four
// relative to dstend; overlapping stores in the middle are fine for
// memset.  p0 here is presumably an all-true predicate set up earlier —
// NOTE(review): confirm the ptrue in the elided prologue.
76 1:	lsl	tmp1, vector_length, 3
79 	st1b	z0.b, p0, [dstin, 0, mul vl]
80 	st1b	z0.b, p0, [dstin, 1, mul vl]
81 	st1b	z0.b, p0, [dstin, 2, mul vl]
82 	st1b	z0.b, p0, [dstin, 3, mul vl]
83 	st1b	z0.b, p0, [dstend, -4, mul vl]
84 	st1b	z0.b, p0, [dstend, -3, mul vl]
85 	st1b	z0.b, p0, [dstend, -2, mul vl]
86 	st1b	z0.b, p0, [dstend, -1, mul vl]
// Vector-length-agnostic bulk path: works for any SVE VL by iterating
// in units of 8 vectors (tmp1 = VL*8).  The unrolled store body
// (st1b_unroll) between these fragments is elided from this view.
90 L(vl_agnostic): // VL Agnostic
95 	// count >= 8 * vector_length
// Bias count down by one 8-vector chunk so the loop's subs can be used
// as both decrement and exit test; the add below restores the residue.
97 	sub	count, count, tmp1
99 	// The 2 instructions at the beginning of the following loop,
100 	// cmp and branch, are a workaround so as not to degrade at
101 	// the peak performance 16KB.
102 	// It is found heuristically and the branch condition, b.ne,
103 	// is chosen intentionally never to jump.
108 	subs	count, count, tmp1
110 	add	count, count, tmp1
// Tail handling: tmp2 = VL*5 (VL + VL*4), used to dispatch into the
// store ladder below.  The ladder writes up to 8 trailing vectors
// relative to dstend; entry labels (5:, 2:) let smaller remainders
// skip the leading stores.  Dispatch branches are outside this view.
113 	cmp	count, vector_length, lsl 1
115 	add	tmp2, vector_length, vector_length, lsl 2
118 	st1b	z0.b, p0, [dstend, -8, mul vl]
119 	st1b	z0.b, p0, [dstend, -7, mul vl]
120 	st1b	z0.b, p0, [dstend, -6, mul vl]
121 5:	st1b	z0.b, p0, [dstend, -5, mul vl]
122 	st1b	z0.b, p0, [dstend, -4, mul vl]
123 	st1b	z0.b, p0, [dstend, -3, mul vl]
124 2:	st1b	z0.b, p0, [dstend, -2, mul vl]
125 	st1b	z0.b, p0, [dstend, -1, mul vl]
// L1-resident streaming loop: only taken when VL == 64 (cmp below), so
// two cachelines == 8 vectors == one st1b_unroll pass.  Each iteration
// writes CACHE_LINE_SIZE*2 (512) bytes and software-prefetches
// PF_DIST_L1 (4 KB) ahead; the store body between the prfm lines is
// elided from this view.  Loops while count > PF_DIST_L1 — backward
// branch target not visible here.
133 	cmp	vector_length, 64
136 	prfm	pstl1keep, [dst, PF_DIST_L1]
138 	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
139 	add	dst, dst, CACHE_LINE_SIZE * 2
140 	sub	count, count, CACHE_LINE_SIZE * 2
141 	cmp	count, PF_DIST_L1
// Huge sizes: zero whole cachelines with DC ZVA (per the comment at
// "159" below).  NOTE(review): DC ZVA writes zeros, so this path must
// only be reachable when the fill value is 0 — the guard is outside
// this view; confirm in the full file.
150 	// align dst to CACHE_LINE_SIZE byte boundary
// tmp2 = dst mod 256 (bytes of misalignment).  Write one full unroll
// of vectors at the unaligned head so dst can be rounded up to the
// next cacheline; count is re-biased by tmp2 to compensate.
151 	and	tmp2, dst, CACHE_LINE_SIZE - 1
152 	st1b	z0.b, p0, [dst, 0, mul vl]
153 	st1b	z0.b, p0, [dst, 1, mul vl]
154 	st1b	z0.b, p0, [dst, 2, mul vl]
155 	st1b	z0.b, p0, [dst, 3, mul vl]
157 	add	count, count, tmp2
159 	// clear cachelines using DC ZVA
// Reserve the last two cachelines (handled by ordinary stores after the
// loop) so the dc zva loop never runs past dstend; the dc zva itself
// and the exit branch are elided from this view.  The final add
// restores count for the tail stores.
160 	sub	count, count, CACHE_LINE_SIZE * 2
162 1:	add	dst, dst, CACHE_LINE_SIZE
164 	subs	count, count, CACHE_LINE_SIZE
166 	add	count, count, CACHE_LINE_SIZE
// Export a libc-internal (hidden) alias so in-tree callers bind
// directly to __memset_a64fx without going through the PLT.
170 libc_hidden_builtin_def (MEMSET)
172 #endif /* IS_IN (libc) */
173 #endif /* HAVE_AARCH64_SVE_ASM */