1 /* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-4.
2 Copyright (C) 2012-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
21 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
25 /* On T4 it is very expensive to access ASRs like %fprs and
26 * %asi, avoiding a read or a write can save ~50 cycles.
30 andcc %o5, FPRS_FEF, %g0; \
32 wr %g0, FPRS_FEF, %fprs; \
35 #define VISEntryHalf FPU_ENTER
36 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
38 #define GLOBAL_SPARE %g5
40 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
43 #define EX_RETVAL(x) x
44 #define LOAD(type,addr,dest) type [addr], dest
45 #define STORE(type,src,addr) type src, [addr]
46 #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
50 .register %g2,#scratch
51 .register %g3,#scratch
52 .register %g6,#scratch
/* void *__mempcpy_niagara4 (void *dst, const void *src, size_t n)
   mempcpy variant: copies like memcpy but returns dst + n instead of dst.
   NOTE(review): this listing is sampled -- the instructions between
   ENTRY and END (presumably the dst+n return-value setup and the
   branch/fall-through into the shared copy code at label 100 below)
   are not visible here; confirm against the full source.  */
56 ENTRY(__mempcpy_niagara4)
59 END(__mempcpy_niagara4)
/* void *__memcpy_niagara4 (void *dst, const void *src, size_t n)
   memcpy tuned for SUN4V Niagara-4 (SPARC T4).
   In:  %o0 = dst, %o1 = src, %o2 = len.  Returns original dst in %o0.
   Uses GLOBAL_SPARE (%g5) and %g1-%g3/%o4/%o5 as scratch; the unaligned
   large path uses the VIS FPU (%f0-%f30) via alignaddr/faligndata.
   NOTE(review): this listing is a sampled excerpt -- branch delay
   slots, pointer/length increments and several labels are missing
   between the visible lines.  Comments below describe only what the
   visible instructions establish; everything else is hedged.  */
62 ENTRY(__memcpy_niagara4)
63 100:	/* %o0=dst, %o1=src, %o2=len */
79 .Llarge:/* len >= 0x80 */
80 	/* First get dest 8 byte aligned.  */
/* Byte-at-a-time loop: load one byte, store it at %o0 - 1 (the %o0/%o1
   advance instructions are in lines not shown here).  */
86 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
91 	EX_ST(STORE(stb, %g2, %o0 - 0x01))
/* Prime the L2 with the next 512 bytes of source using strong read
   prefetches (cheap on T4, hides memory latency for the main loop).  */
93 51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
94 	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
95 	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
96 	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
97 	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
98 	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
99 	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
100 	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
102 	/* Check if we can use the straight fully aligned
103 	 * loop, or we require the alignaddr/faligndata variant.
/* If src is not 8-byte congruent with dst (the andcc setting %icc is in
   a line not shown), take the VIS realignment path below.  */
106 	bne,pn	%icc, .Llarge_src_unaligned
109 	/* Legitimize the use of initializing stores by getting dest
110 	 * to be 64-byte aligned.
113 	brz,pt	%g1, .Llarge_aligned
/* 8-byte copy loop advancing dst to a 64-byte boundary so that the
   block-init stores below are legal/efficient.  */
116 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
121 	EX_ST(STORE(stx, %g2, %o0 - 0x08))
124 	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
/* Main aligned 64-byte loop.  Eight ldx loads are software-pipelined
   with eight STORE_INIT stores (stxa ... ASI_BLK_INIT_QUAD_LDD_P),
   which initialize the destination cache lines without a
   read-for-ownership.  The negative source offsets imply %o1 was
   already advanced past the chunk in a line not shown.  */
128 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
130 	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
132 	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
133 	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
134 	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
135 	EX_ST(STORE_INIT(%g1, %o0))
137 	EX_ST(STORE_INIT(%g2, %o0))
139 	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
140 	EX_ST(STORE_INIT(%g3, %o0))
142 	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
143 	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
145 	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
146 	EX_ST(STORE_INIT(%o5, %o0))
148 	EX_ST(STORE_INIT(%g2, %o0))
150 	EX_ST(STORE_INIT(%g3, %o0))
152 	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
/* Keep prefetching 512 bytes ahead of the load pointer.  */
155 	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
/* Order the relaxed block-init stores against subsequent normal
   loads/stores before leaving the STORE_INIT loop.  */
157 	membar		#StoreLoad | #StoreStore
/* Tail dispatch: remaining length <= small threshold goes to the small
   copier, otherwise fall into the medium path (the cmp setting %icc is
   in a line not shown).  */
161 	ble,pn		%icc, .Lsmall_unaligned
163 	ba,a,pt		%icc, .Lmedium_noprefetch
/* Restore the saved return value (original dst) into %o0.  */
166 	mov		EX_RETVAL(%o3), %o0
168 .Llarge_src_unaligned:
/* VIS path for misaligned src: alignaddr computes the aligned load
   pointer %g1 and latches the byte offset for faligndata.  */
172 	alignaddr	%o1, %g0, %g1
174 	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
/* Per-iteration: load eight aligned doubles (%f2..%f14 plus the
   next-chunk %f0), shift-merge adjacent pairs with faligndata into
   %f16..%f30, then store 64 realigned bytes to dst.  */
175 1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
177 	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
178 	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
179 	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
180 	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
181 	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
182 	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
183 	faligndata	%f0, %f2, %f16
/* %f0 reloaded early with the first double of the next chunk so the
   final faligndata (%f14,%f0) can use it this iteration.  */
184 	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
185 	faligndata	%f2, %f4, %f18
187 	faligndata	%f4, %f6, %f20
188 	faligndata	%f6, %f8, %f22
189 	faligndata	%f8, %f10, %f24
190 	faligndata	%f10, %f12, %f26
191 	faligndata	%f12, %f14, %f28
192 	faligndata	%f14, %f0, %f30
193 	EX_ST(STORE(std, %f16, %o0 + 0x00))
194 	EX_ST(STORE(std, %f18, %o0 + 0x08))
195 	EX_ST(STORE(std, %f20, %o0 + 0x10))
196 	EX_ST(STORE(std, %f22, %o0 + 0x18))
197 	EX_ST(STORE(std, %f24, %o0 + 0x20))
198 	EX_ST(STORE(std, %f26, %o0 + 0x28))
199 	EX_ST(STORE(std, %f28, %o0 + 0x30))
200 	EX_ST(STORE(std, %f30, %o0 + 0x38))
/* Prefetch ahead of the aligned load pointer for the next iterations.  */
203 	LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
/* Tail dispatch after the VIS loop (the FPU exit / VISExitHalf and the
   length compare are in lines not shown here -- verify in full source).  */
208 	ble,pn		%icc, .Lsmall_unaligned
210 	ba,a,pt		%icc, .Lmedium_unaligned
/* Medium path (moderate lengths): one prefetch, then check src/dst
   relative alignment; misaligned goes to .Lmedium_unaligned.  */
213 	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
215 	bne,pn		%icc, .Lmedium_unaligned
/* %o5 = len rounded down to a multiple of 32; also sets %icc.  */
218 	andncc		%o2, 0x20 - 1, %o5
/* 32-bytes-per-iteration aligned copy loop.  */
221 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
222 	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
223 	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
224 	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
227 	EX_ST(STORE(stx, %g1, %o0 + 0x00))
228 	EX_ST(STORE(stx, %g2, %o0 + 0x08))
229 	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
230 	EX_ST(STORE(stx, %o4, %o0 + 0x18))
/* Remainder: %o5 = leftover 8-byte-sized part (len & 0x18).  */
233 2:	andcc		%o2, 0x18, %o5
/* 8-byte remainder loop (stores at %o0 - 8 after the pointer advance,
   which is in a line not shown).  */
236 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
241 	EX_ST(STORE(stx, %g1, %o0 - 0x08))
/* Done if nothing left; otherwise copy a trailing 4-byte word.  */
242 3:	brz,pt		%o2, .Lexit
246 	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
251 	EX_ST(STORE(stw, %g1, %o0 - 0x04))
254 	/* First get dest 8 byte aligned.  */
/* Medium-unaligned: byte loop to 8-byte-align dst first.  */
260 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
265 	EX_ST(STORE(stb, %g2, %o0 - 0x01))
/* If src turned out aligned too (%g1 == 0), use the plain medium path.  */
268 	brz,pn		%g1, .Lmedium_noprefetch
/* Shift-merge copy for misaligned src: keep the previous aligned
   8-byte word in %o4, combine (presumably sllx of %o4 -- that line is
   not shown) with srlx of the next word to synthesize each output.  */
273 	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
275 	andn		%o2, 0x08 - 1, %o5
277 1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
280 	srlx		%g3, %g2, GLOBAL_SPARE
281 	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
282 	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
/* Hand any sub-8-byte tail to the small unaligned copier.  */
290 	ba,pt		%icc, .Lsmall_unaligned
/* Small fixed-size path: straight-line byte copies (up to 3 visible).  */
293 	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
296 	EX_ST(STORE(stb, %g1, %o0 + 0x00))
297 	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
300 	EX_ST(STORE(stb, %g1, %o0 + 0x01))
301 	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
303 	EX_ST(STORE(stb, %g1, %o0 + 0x02))
/* Small path: if not 4-byte mutually aligned, fall back to bytes;
   otherwise copy 4 bytes at a time (%o5 = len rounded down to 4).  */
307 	bne,pn		%icc, .Lsmall_unaligned
308 	andn		%o2, 0x4 - 1, %o5
311 	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
316 	EX_ST(STORE(stw, %g1, %o0 - 0x04))
/* Final byte-at-a-time loop for whatever remains.  */
322 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
327 	EX_ST(STORE(stb, %g1, %o0 - 0x01))
329 END(__memcpy_niagara4)