/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-4.
   Copyright (C) 2012-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@davemloft.net)

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* Preprocessor setup for the Niagara-4 copy routines below: the
   block-init quad-load ASI used for initializing stores, the FPU
   entry/exit helpers, the scratch-register alias, and the LOAD /
   STORE wrapper macros the code body is written in terms of.

   NOTE(review): every line in this chunk carries a leading decimal
   number that looks like a line number left behind by an extraction
   step, and the gaps in that numbering (22 -> 26, 27 -> 31, 33 -> 36,
   41 -> 44, ...) show that source lines are MISSING here — among them
   the "#define FPU_ENTER" macro head for the two continuation lines
   below, the "*/" closing the comment that starts at "26", and the
   EX_LD/EX_ST wrapper definitions used throughout the code.  Confirm
   everything against the original file before assembling.  */
22 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
26 /* On T4 it is very expensive to access ASRs like %fprs and
27  * %asi, avoiding a read or a write can save ~50 cycles.
/* Fragment of the FPU-enter sequence: tests whether FPRS_FEF is
   already set in %o5 and writes FPRS_FEF into %fprs to enable the
   FPU — the macro head and the branch around the write are among the
   missing lines (TODO confirm against the original FPU_ENTER).  */
31 andcc %o5, FPRS_FEF, %g0; \
33 wr %g0, FPRS_FEF, %fprs; \
/* Only the "half" VIS entry/exit is used: exit restores the saved
   FPRS_FEF state from %o5 back into %fprs.  */
36 #define VISEntryHalf FPU_ENTER
37 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
/* Extra scratch register used by the copy loops below.  */
39 #define GLOBAL_SPARE %g5
/* Initializing-store ASI used by STORE_INIT: stores that do not read
   the destination cache line before overwriting it.  */
41 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
/* EX_RETVAL is the identity here (non-user-copy configuration); the
   EX_LD/EX_ST wrappers used below are defined on lines missing from
   this chunk — presumably also pass-through in this configuration,
   TODO confirm.  */
44 #define EX_RETVAL(x) x
45 #define LOAD(type,addr,dest) type [addr], dest
46 #define STORE(type,src,addr) type src, [addr]
47 #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
/* Declare the application-reserved globals as scratch so the
   assembler accepts their use (%g2/%g3/%g6 below).
   NOTE(review): the matching #endif for this #if is not visible in
   this chunk.  */
49 #if !defined NOT_IN_libc
51 .register %g2,#scratch
52 .register %g3,#scratch
53 .register %g6,#scratch
/* void *__mempcpy_niagara4 (void *dst, const void *src, size_t n)

   NOTE(review): the body (original lines 58-59 per the leftover line
   numbering) is missing from this chunk; presumably it sets up the
   mempcpy return value (dst + n) and falls into the shared copy code
   at label "100" in __memcpy_niagara4 below — confirm against the
   original file.  */
57 ENTRY(__mempcpy_niagara4)
60 END(__mempcpy_niagara4)
/* void *__memcpy_niagara4 (void *dst, const void *src, size_t n)

   In:  %o0 = dst, %o1 = src, %o2 = len.  Out: %o0 = original dst.
   Large copies (len >= 0x80) use initializing stores (STORE_INIT)
   when both pointers can be brought to mutual 8-byte alignment, and
   a VIS alignaddr/faligndata loop otherwise; medium and small copies
   use plain ldx/stx, lduw/stw and ldub/stb sequences.

   NOTE(review): the leftover per-line numbering (63, 64, 80, 81, ...)
   jumps repeatedly, so many instructions are MISSING from this chunk:
   loop-control subcc/branch pairs and their delay slots, several label
   definitions referenced below (.Lexit, .Lmedium_noprefetch,
   .Lmedium_unaligned, .Lsmall_unaligned, .Llarge_aligned), the VIS
   entry/exit calls, and the dispatch code between "100:" and
   ".Llarge".  The comments below annotate only what is visible —
   confirm everything against the original file.  */
63 ENTRY(__memcpy_niagara4)
64 100: /* %o0=dst, %o1=src, %o2=len */
80 .Llarge:/* len >= 0x80 */
81 /* First get dest 8 byte aligned. */
/* Byte-at-a-time head loop: copies single bytes until dst is 8-byte
   aligned (the surrounding loop control is among the missing lines).  */
87 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
92 EX_ST(STORE(stb, %g2, %o0 - 0x01))
/* Prime the read stream: prefetch the next 0x200 bytes of src in
   64-byte steps before entering the main loops.  */
94 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
95 LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
96 LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
97 LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
98 LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
99 LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
100 LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
101 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
103 /* Check if we can use the straight fully aligned
104  * loop, or we require the alignaddr/faligndata variant.
/* Branch taken when src is not 8-byte aligned relative to dst (the
   alignment test feeding %icc is among the missing lines).  */
107 bne,pn %icc, .Llarge_src_unaligned
110 /* Legitimize the use of initializing stores by getting dest
111  * to be 64-byte aligned.
114 brz,pt %g1, .Llarge_aligned
/* 8-byte head loop: copy xwords until dst reaches 64-byte alignment
   (loop control lines missing).  */
117 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
122 EX_ST(STORE(stx, %g2, %o0 - 0x08))
125 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
/* Main aligned loop: per iteration, eight 8-byte loads interleaved
   with eight STORE_INIT initializing stores covering one 64-byte
   destination line.  Loads run ahead of stores so each store's datum
   is ready; the %o0/%o1 pointer updates between these lines are
   among the missing lines (note the negative load offsets imply %o1
   was already advanced).  */
129 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
131 EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
133 EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
134 EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
135 EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
136 EX_ST(STORE_INIT(%g1, %o0))
138 EX_ST(STORE_INIT(%g2, %o0))
140 EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
141 EX_ST(STORE_INIT(%g3, %o0))
143 EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
144 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
146 EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
147 EX_ST(STORE_INIT(%o5, %o0))
149 EX_ST(STORE_INIT(%g2, %o0))
151 EX_ST(STORE_INIT(%g3, %o0))
153 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
156 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
/* Order the initializing stores against subsequent loads/stores
   before leaving the STORE_INIT loop.  */
158 membar #StoreLoad | #StoreStore
/* Tail dispatch: small remainder goes to the unaligned byte copier,
   otherwise finish via the medium path (the length compare feeding
   %icc is among the missing lines).  */
162 ble,pn %icc, .Lsmall_unaligned
164 ba,a,pt %icc, .Lmedium_noprefetch
/* Return path: %o3 presumably holds the saved original dst —
   TODO confirm, the save site is not visible in this chunk.  */
167 mov EX_RETVAL(%o3), %o0
169 .Llarge_src_unaligned:
/* src not mutually aligned with dst: use VIS.  alignaddr computes the
   8-byte-aligned load base in %g1 and latches the byte offset in
   %gsr for the faligndata merges below.  */
173 alignaddr %o1, %g0, %g1
175 EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
/* Per iteration: load the next 64 bytes into %f2-%f14 plus the
   lead-in doubleword for the following iteration into %f0, merge
   adjacent pairs with faligndata into %f16-%f30, and store 64 bytes
   to the (8-byte aligned) destination.  */
176 1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
178 EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
179 EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
180 EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
181 EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
182 EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
183 EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
184 faligndata %f0, %f2, %f16
/* Reload %f0 with the next iteration's lead-in before %f0 is needed
   by the last faligndata below.  */
185 EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
186 faligndata %f2, %f4, %f18
188 faligndata %f4, %f6, %f20
189 faligndata %f6, %f8, %f22
190 faligndata %f8, %f10, %f24
191 faligndata %f10, %f12, %f26
192 faligndata %f12, %f14, %f28
193 faligndata %f14, %f0, %f30
194 EX_ST(STORE(std, %f16, %o0 + 0x00))
195 EX_ST(STORE(std, %f18, %o0 + 0x08))
196 EX_ST(STORE(std, %f20, %o0 + 0x10))
197 EX_ST(STORE(std, %f22, %o0 + 0x18))
198 EX_ST(STORE(std, %f24, %o0 + 0x20))
199 EX_ST(STORE(std, %f26, %o0 + 0x28))
200 EX_ST(STORE(std, %f28, %o0 + 0x30))
201 EX_ST(STORE(std, %f30, %o0 + 0x38))
204 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
/* Tail dispatch after the FP loop (VISExitHalf and the remainder
   compare are among the missing lines).  */
209 ble,pn %icc, .Lsmall_unaligned
211 ba,a,pt %icc, .Lmedium_unaligned
/* Medium path (label definition missing): one prefetch, then branch
   to the unaligned variant when src/dst are not mutually 8-byte
   aligned; otherwise round len down to a 32-byte loop count.  */
214 LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
216 bne,pn %icc, .Lmedium_unaligned
219 andncc %o2, 0x20 - 1, %o5
/* 32-bytes-per-iteration ldx/stx loop (loop control missing).  */
222 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
223 EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
224 EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
225 EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
228 EX_ST(STORE(stx, %g1, %o0 + 0x00))
229 EX_ST(STORE(stx, %g2, %o0 + 0x08))
230 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
231 EX_ST(STORE(stx, %o4, %o0 + 0x18))
/* Remainder: up to three more xwords (len & 0x18), then a word,
   then bytes.  */
234 2: andcc %o2, 0x18, %o5
237 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
242 EX_ST(STORE(stx, %g1, %o0 - 0x08))
243 3: brz,pt %o2, .Lexit
247 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
252 EX_ST(STORE(stw, %g1, %o0 - 0x04))
/* Medium-unaligned path: align dst to 8 bytes a byte at a time, then
   copy via shifted xword pairs.  */
255 /* First get dest 8 byte aligned. */
261 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
266 EX_ST(STORE(stb, %g2, %o0 - 0x01))
269 brz,pn %g1, .Lmedium_noprefetch
/* Shifted-merge loop: combine the tail of the previous xword in %o4
   with the head of the next (%g3) to form one aligned output xword.
   The sllx producing the "%o4 << shift" half and the shift-count
   setup in %g2 are among the missing lines — TODO confirm.  */
274 EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
276 andn %o2, 0x08 - 1, %o5
278 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
281 srlx %g3, %g2, GLOBAL_SPARE
282 or GLOBAL_SPARE, %o4, GLOBAL_SPARE
283 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
291 ba,pt %icc, .Lsmall_unaligned
/* Small copy: up to three single bytes (the length tests between
   these pairs are among the missing lines).  */
294 EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
297 EX_ST(STORE(stb, %g1, %o0 + 0x00))
298 EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
301 EX_ST(STORE(stb, %g1, %o0 + 0x01))
302 EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
304 EX_ST(STORE(stb, %g1, %o0 + 0x02))
/* Small path: word-at-a-time when mutually 4-byte aligned, else
   fall through to the byte loop.  */
308 bne,pn %icc, .Lsmall_unaligned
309 andn %o2, 0x4 - 1, %o5
312 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
317 EX_ST(STORE(stw, %g1, %o0 - 0x04))
/* Final byte-at-a-time loop (loop control missing).  */
323 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
328 EX_ST(STORE(stb, %g1, %o0 - 0x01))
330 END(__memcpy_niagara4)