/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#define PREFETCH_AHEAD 6	/* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 4		/* number of cache lines to zero ahead of DST  */
/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 *
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The code below is loop unrolled for the CELL cache line of 128 bytes
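 *
 * For illustration only (not part of the build): the cache-line copy
 * strategy described above behaves roughly like the hypothetical C
 * sketch below (the name copy_lines is illustrative, not part of this
 * file), assuming DST and SRC are already 128-byte aligned and the size
 * is a multiple of 128.  __builtin_prefetch stands in for the dcbt used
 * in the real loop, and the dcbz destination-zeroing trick has no
 * portable C equivalent.
 *
 *	void
 *	copy_lines (double *dst, const double *src, unsigned long lines)
 *	{
 *	  while (lines--)
 *	    {
 *	      // Touch the source PREFETCH_AHEAD cache lines (128 bytes
 *	      // each) ahead so the loads below hit cache instead of
 *	      // waiting hundreds of cycles on memory.
 *	      __builtin_prefetch ((const char *) src + 128 * PREFETCH_AHEAD);
 *	      // Copy one 128-byte line as 16 doubleword moves; the
 *	      // assembly unrolls this fully with a 4-register stride.
 *	      for (int j = 0; j < 16; j++)
 *		dst[j] = src[j];
 *	      src += 16;
 *	      dst += 16;
 *	    }
 *	}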
EALIGN (BP_SYM (memcpy), 5, 0)
	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmplwi	cr1,r5,16	/* is size < 16 ?  */
	neg	r8,r3		/* LS 3 bits = # bytes to 8-byte dest bdry  */
	clrlwi	r8,r8,32-4	/* align to 16-byte boundary  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	lbzx	r0,r7,r6	/* copy 1 byte  */
	lhzx	r0,r7,r6	/* copy 2 bytes  */
	lwzx	r0,r7,r6	/* copy 4 bytes  */
	lfdx	fp9,r7,r6	/* copy 8 bytes  */
	addi	r6,r6,-8	/* prepare for stfdu  */
	addi	r4,r4,-8	/* prepare for lfdu  */
	clrlwi	r7,r7,32-7	/* align to cacheline boundary  */
	ble+	cr5,.Llessthancacheline
	srwi	r7,r7,4		/* divide size by 16  */
	srwi	r10,r5,7	/* number of cache lines to copy  */
	li	r11,0		/* number of cache lines to copy with prefetch  */
	beq	.Lnocacheprefetch
	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	.Llessthanmaxprefetch
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	beq	cr6,.Lcachelinealigned
	bdnz	.Laligntocacheline
.Lcachelinealigned:		/* copy whole cache lines  */
	blt-	cr1,.Llessthancacheline	/* size < 128  */
	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */
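	/* r11 is the dcbz distance for the destination (ZERO_AHEAD cache
	   lines ahead of the stores): establishing each DST line as zeros
	   in the cache means it does not have to be fetched from memory
	   before it is completely overwritten.  */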
	/* Copy whole cache lines, optimized by prefetching SRC cache line.  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp10, 0x10(r4)	/* 4 register stride copy is optimal  */
	lfd	fp11, 0x18(r4)	/* to hide 1st level cache latency.  */

	slwi	r10,r10,2	/* adjust from 128 to 32 byte stride  */

.Lloop2:			/* Copy aligned body  */
.Llessthancacheline:		/* less than a cache line left to do ?  */
	srwi	r7,r5,4		/* divide size by 16  */
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* nothing left to copy  */
.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes  */
	lfdx	fp9,r7,r6	/* copy 8 bytes  */
	lwzx	r0,r7,r6	/* copy 4 bytes  */
	lhzx	r0,r7,r6	/* copy 2 bytes  */
	lbzx	r0,r7,r6	/* copy 1 byte  */
END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)