1 /* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara-2.
2 Copyright (C) 2007-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by David S. Miller (davem@davemloft.net)
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
/* NOTE(review): this excerpt carries the original file's line numbers
   fused at the start of each line, and several lines are elided; the
   macros below are documented only as far as the visible code shows.  */
/* Alternate Space Identifiers for the 64-byte block load/store forms.  */
22 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
23 #define ASI_BLK_P 0xf0
/* VISEntryHalf enables the FP unit (writes FPRS_FEF to %fprs) around the
   VIS copy loops; %o5 appears to hold the saved FPRS state (see the
   "preserved between VISEntryHalf and VISExitHalf" note in the copy
   routine).  The definition is truncated in this excerpt -- TODO recover
   the missing continuation lines before relying on it.  */
29 #define VISEntryHalf \
31 wr %g0, FPRS_FEF, %fprs
34 and %o5, FPRS_FEF, %o5; \
/* Initializing stores go through the block-init quad ASI.  */
37 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
/* Thin wrappers over loads/stores so the memory-access flavor is chosen
   in one place; *_BLK operate on full 64-byte blocks via block ASIs,
   STORE_INIT is a single stxa through STORE_ASI.  */
39 #define LOAD(type,addr,dest) type [addr], dest
40 #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
41 #define STORE(type,src,addr) type src, [addr]
42 #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
43 #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
/* FREG_FROB(a0..a8): funnel-shift nine consecutive source doubles into
   one aligned 64-byte output block held in %f0-%f14.  Each faligndata
   merges a pair of adjacent doubles using the byte offset previously
   established with alignaddr, so the result is the source data shifted
   to destination alignment.  */
#define FREG_FROB(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
	faligndata %a0, %a1, %f0; \
	faligndata %a1, %a2, %f2; \
	faligndata %a2, %a3, %f4; \
	faligndata %a3, %a4, %f6; \
	faligndata %a4, %a5, %f8; \
	faligndata %a5, %a6, %f10; \
	faligndata %a6, %a7, %f12; \
	faligndata %a7, %a8, %f14;
/* FREG_MOVE_n(x0..x{n-1}): carry the n leftover source doubles forward
   into %f0,%f2,... for the next loop iteration (presumably a sequence of
   fmovd instructions -- the macro bodies are elided in this excerpt and
   only the definition headers survive; TODO recover them from the full
   source before assembling).  */
60 #define FREG_MOVE_1(x0) \
62 #define FREG_MOVE_2(x0, x1) \
65 #define FREG_MOVE_3(x0, x1, x2) \
69 #define FREG_MOVE_4(x0, x1, x2, x3) \
74 #define FREG_MOVE_5(x0, x1, x2, x3, x4) \
80 #define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
87 #define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
95 #define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
/* FREG_LOAD_n(src, d0..d{n-1}): fetch n consecutive 8-byte doublewords
   starting at address src into the named floating-point register pairs.
   Used by the block-copy loops to pick up the partial leading data for
   each source-alignment class.  */
#define FREG_LOAD_1(src, d0) \
	LOAD(ldd, src + 0x00, %d0)
#define FREG_LOAD_2(src, d0, d1) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1);
#define FREG_LOAD_3(src, d0, d1, d2) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1); \
	LOAD(ldd, src + 0x10, %d2);
#define FREG_LOAD_4(src, d0, d1, d2, d3) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1); \
	LOAD(ldd, src + 0x10, %d2); \
	LOAD(ldd, src + 0x18, %d3);
#define FREG_LOAD_5(src, d0, d1, d2, d3, d4) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1); \
	LOAD(ldd, src + 0x10, %d2); \
	LOAD(ldd, src + 0x18, %d3); \
	LOAD(ldd, src + 0x20, %d4);
#define FREG_LOAD_6(src, d0, d1, d2, d3, d4, d5) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1); \
	LOAD(ldd, src + 0x10, %d2); \
	LOAD(ldd, src + 0x18, %d3); \
	LOAD(ldd, src + 0x20, %d4); \
	LOAD(ldd, src + 0x28, %d5);
#define FREG_LOAD_7(src, d0, d1, d2, d3, d4, d5, d6) \
	LOAD(ldd, src + 0x00, %d0); \
	LOAD(ldd, src + 0x08, %d1); \
	LOAD(ldd, src + 0x10, %d2); \
	LOAD(ldd, src + 0x18, %d3); \
	LOAD(ldd, src + 0x20, %d4); \
	LOAD(ldd, src + 0x28, %d5); \
	LOAD(ldd, src + 0x30, %d6);
/* Declare to the assembler that the SPARC V9 application registers
   %g2/%g3/%g6 are used here as scratch, so it does not warn about
   touching them.  */
142 .register %g2,#scratch
143 .register %g3,#scratch
144 .register %g6,#scratch
/* __mempcpy_niagara2(dst, src, len): mempcpy variant (per the usual
   mempcpy contract it returns dst + len rather than dst).  Its body is
   elided in this excerpt -- presumably it computes the return value and
   falls into the shared copy path at label 100 below; TODO confirm
   against the full source.  */
148 ENTRY(__mempcpy_niagara2)
151 END(__mempcpy_niagara2)
/* void *__memcpy_niagara2 (void *dst /*%o0*/, const void *src /*%o1*/,
   size_t len /*%o2*/) -- Niagara-2 tuned memcpy using VIS block moves.
   NOTE(review): large portions of this function are elided in this
   excerpt (branches, delay slots, loads and several labels are missing),
   and each surviving line is prefixed with its original line number.
   Comments below describe only what the visible code establishes.  */
154 ENTRY(__memcpy_niagara2)
155 100:	/* %o0=dst, %o1=src, %o2=len */
/* Merge dst|src so the low bits of both can be tested together
   (the consumers of %o3 are elided in this excerpt).  */
163 218:	or	%o0, %o1, %o3
168 	/* 2 blocks (128 bytes) is the minimum we can do the block
169 	 * copy with.  We need to ensure that we'll iterate at least
170 	 * once in the block copy loop.  At worst we'll need to align
171 	 * the destination to a 64-byte boundary which can chew up
172 	 * to (64 - 1) bytes from the length before we perform the
175 	 * However, the cut-off point, performance wise, is around
184 	 * %o2: len  (known to be >= 128)
186 	 * The block copy loops can use %o4, %g2, %g3 as
187 	 * temporaries while copying the data.  %o5 must
188 	 * be preserved between VISEntryHalf and VISExitHalf
/* Warm the cache with the first three 64-byte source lines.  */
191 	LOAD(prefetch, %o1 + 0x000, #one_read)
192 	LOAD(prefetch, %o1 + 0x040, #one_read)
193 	LOAD(prefetch, %o1 + 0x080, #one_read)
195 	/* Align destination on 64-byte boundary.  */
196 	andcc	%o0, (64 - 1), %o4
199 	 sub	%g0, %o4, %o4	! bytes to align dst
209 	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
210 	 * o5 from here until we hit VISExitHalf.
/* alignaddr latches the byte offset of %o1 for the faligndata
   sequences in FREG_FROB.  */
215 	alignaddr	%o1, %g0, %g0
/* %o4 = src rounded up to the next 64-byte boundary;
   %g1 = len rounded down to a whole multiple of 64 bytes.  */
217 	add	%o1, (64 - 1), %o4
218 	andn	%o4, (64 - 1), %o4
219 	andn	%o2, (64 - 1), %g1
/* %g2 = src offset within a 64-byte block; it selects one of the
   unrolled alignment-class loops below (the dispatch branches are
   elided in this excerpt).  */
222 	and	%o1, (64 - 1), %g2
/* Each labelled loop (110..170) handles one source-alignment class.
   Per 64-byte block: STORE_INIT(%g0, ...) touches the destination line
   through the block-init ASI (presumably allocating it without a memory
   read -- confirm against the UA-2007 ASI documentation), FREG_FROB
   funnel-shifts the data into %f0-%f14, STORE_BLK writes the full
   block, and FREG_MOVE_n carries the leftover doubles forward.  The
   next-block loads and the loop-control branches are elided here.
   NOTE(review): %g3 looks like the dst-src displacement so that
   %o4 + %g3 addresses the destination -- its setup is elided; verify.  */
237 4:	/* 32 <= low bits < 48 */
241 5:	/* 0 < low bits < 32 */
248 6:	/* 0 < low bits < 16 */
251 	/* fall through for 0 < low bits < 8 */
252 110:	sub	%o4, 64, %g2
254 1:	STORE_INIT(%g0, %o4 + %g3)
256 	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
257 	STORE_BLK(%f0, %o4 + %g3)
258 	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
262 	LOAD(prefetch, %o4 + 64, #one_read)
266 120:	sub	%o4, 56, %g2
267 	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
268 1:	STORE_INIT(%g0, %o4 + %g3)
270 	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
271 	STORE_BLK(%f0, %o4 + %g3)
272 	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
276 	LOAD(prefetch, %o4 + 64, #one_read)
280 130:	sub	%o4, 48, %g2
281 	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
282 1:	STORE_INIT(%g0, %o4 + %g3)
284 	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
285 	STORE_BLK(%f0, %o4 + %g3)
286 	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
290 	LOAD(prefetch, %o4 + 64, #one_read)
294 140:	sub	%o4, 40, %g2
295 	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
296 1:	STORE_INIT(%g0, %o4 + %g3)
298 	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
299 	STORE_BLK(%f0, %o4 + %g3)
300 	FREG_MOVE_5(f22, f24, f26, f28, f30)
304 	LOAD(prefetch, %o4 + 64, #one_read)
308 150:	sub	%o4, 32, %g2
309 	FREG_LOAD_4(%g2, f0, f2, f4, f6)
310 1:	STORE_INIT(%g0, %o4 + %g3)
312 	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
313 	STORE_BLK(%f0, %o4 + %g3)
314 	FREG_MOVE_4(f24, f26, f28, f30)
318 	LOAD(prefetch, %o4 + 64, #one_read)
322 160:	sub	%o4, 24, %g2
323 	FREG_LOAD_3(%g2, f0, f2, f4)
324 1:	STORE_INIT(%g0, %o4 + %g3)
326 	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
327 	STORE_BLK(%f0, %o4 + %g3)
328 	FREG_MOVE_3(f26, f28, f30)
332 	LOAD(prefetch, %o4 + 64, #one_read)
336 170:	sub	%o4, 16, %g2
337 	FREG_LOAD_2(%g2, f0, f2)
338 1:	STORE_INIT(%g0, %o4 + %g3)
340 	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
341 	STORE_BLK(%f0, %o4 + %g3)
342 	FREG_MOVE_2(f28, f30)
346 	LOAD(prefetch, %o4 + 64, #one_read)
/* Nearly-aligned source: only one carried double (%f0) per block;
   surrounding setup/loop control elided.  */
352 1:	STORE_INIT(%g0, %o4 + %g3)
354 	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
355 	STORE_BLK(%f0, %o4 + %g3)
360 	LOAD(prefetch, %o4 + 64, #one_read)
/* Fully 64-byte-aligned source: straight block store of %f0-%f30 (the
   matching LOAD_BLK that fills the registers is elided here).  */
365 1:	STORE_INIT(%g0, %o4 + %g3)
368 	STORE_BLK(%f0, %o4 + %g3)
371 	LOAD(prefetch, %o4 + 64, #one_read)
379 	/* %o2 contains any final bytes still needed to be copied
380 	 * over. If anything is left, we copy it one byte at a time.
/* Medium copy: 16 bytes per iteration via two stx (the paired ldx
   loads and branches are elided); then 8/4/1-byte tails keyed off the
   low bits of %o2.  NOTE(review): %o3 here appears to be the dst-src
   displacement so %o1 + %o3 addresses dst -- setup elided; verify.  */
387 75: /* 16 < len <= 64 */
394 1:	subcc	%o4, 0x10, %o4
399 	STORE(stx, %o5, %o1 + %o3)
401 	STORE(stx, %g1, %o1 + %o3)
404 73:	andcc	%o2, 0x8, %g0
409 	STORE(stx, %o5, %o1 + %o3)
411 1:	andcc	%o2, 0x4, %g0
416 	STORE(stw, %o5, %o1 + %o3)
433 	STORE(stb, %o5, %o1 + %o3)
/* Short copy: word then byte stores (loads/branches elided).  */
471 80: /* 0 < len <= 16 */
479 	STORE(stw, %g1, %o1 + %o3)
490 	STORE(stb, %g1, %o1 + %o3)
496 END(__memcpy_niagara2)