1 /* Optimized memcpy implementation for CELL BE PowerPC.
2 Copyright (C) 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 #define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
25 #define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead */
27 /* memcpy routine optimized for CELL-BE-PPC v2.0
29 * The CELL PPC core has 1 integer unit and 1 load/store unit
31 * 1st level data cache = 32K
32 * 2nd level data cache = 512K
33 * 3rd level data cache = 0K
34 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
35 * latency to memory is >400 clocks
36 * To improve copy performance we need to prefetch source data
37 * far ahead to hide this latency
38 * For best performance, instruction forms ending in "." like "andi."
39 * should be avoided as they are implemented in microcode on CELL.
40 * The below code is loop unrolled for the CELL cache line of 128 bytes
/* memcpy entry point.
   Register roles as used by the visible instructions: r4 is the source
   pointer (prefetched with dcbt), r3 is the destination (its alignment
   is derived with neg/clrldi), r5 is the byte count (compared against
   16 and 0, and shifted to obtain 128-byte and 16-byte step counts).
   NOTE(review): this view of the routine omits many instructions (for
   example the ones that set cr0, cr5, cr6 and CTR); the comments below
   describe only what the visible lines establish.  */
45 EALIGN (BP_SYM (memcpy), 5, 0)
48 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
49 cmpldi cr1,r5,16 /* is size < 16 ? (result kept in cr1) */
54 neg r8,r3 /* r8 = -dst; low bits = # bytes up to the next dest boundary */
55 clrldi r8,r8,64-4 /* keep low 4 bits: # bytes to align dest to a 16-byte boundary */
61 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
65 lbzx r0,r7,r6 /* copy 1 byte */
69 lhzx r0,r7,r6 /* copy 2 bytes */
73 lwzx r0,r7,r6 /* copy 4 bytes */
77 ldx r0,r7,r6 /* copy 8 bytes */
88 addi r6,r6,-8 /* prepare for stdu (bias pointer by -8) */
89 addi r4,r4,-8 /* prepare for ldu (bias pointer by -8) */
91 clrldi r7,r7,64-7 /* keep low 7 bits: offset relative to a 128-byte cacheline boundary */
92 ble+ cr5,.Llessthancacheline /* NOTE(review): cr5 is set by a compare not visible here */
96 srdi r7,r7,4 /* r7 /= 16: count of 16-byte steps */
97 srdi r10,r5,7 /* number of cache lines to copy (r5 / 128) */
100 li r11,0 /* number of cachelines to copy with prefetch */
101 beq .Lnocacheprefetch /* NOTE(review): tests cr0, set by an instruction not visible here */
103 cmpldi r10,PREFETCH_AHEAD
104 li r12,128+8 /* prefetch distance; the +8 presumably offsets the -8 bias on r4 -- confirm */
105 ble .Llessthanmaxprefetch /* r10 <= PREFETCH_AHEAD: keep r11 = 0 */
107 subi r11,r10,PREFETCH_AHEAD /* r11 = cache lines beyond the first PREFETCH_AHEAD */
108 li r10,PREFETCH_AHEAD /* cap r10 at PREFETCH_AHEAD */
110 .Llessthanmaxprefetch: /* here r10 <= PREFETCH_AHEAD as far as the visible code shows */
122 beq cr6,.Lcachelinealigned /* NOTE(review): cr6 is set by a compare not visible here */
129 bdnz .Laligntocacheline /* decrement CTR, loop while CTR != 0 */
132 .Lcachelinealigned: /* copy whole cache lines */
134 blt- cr1,.Llessthancacheline /* size < 128 (cr1 presumably re-set by a compare not shown here) */
141 li r11,128*ZERO_AHEAD +8 /* DCBZ distance: ZERO_AHEAD cache lines of 128 bytes, plus 8 */
144 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
145 .Lloop: /* Copy aligned body */
146 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead (at r4 + r12) */
149 ld r7, 0x10(r4) /* 4 register stride copy is optimal */
150 ld r8, 0x18(r4) /* to hide 1st level cache latency. */
185 sldi r10,r10,2 /* r10 *= 4: adjust from 128 to 32 byte stride */
189 .Lloop2: /* Copy aligned body */
202 .Llessthancacheline: /* less than a cache line left to copy ? */
204 srdi r7,r5,4 /* r7 = size / 16 */
213 bdnz .Lcopy_remaining /* decrement CTR, loop while CTR != 0 */
215 .Ldo_lt16: /* less than 16 ? */
216 cmpldi cr0,r5,0 /* any remaining bytes (0-15) to copy ? */
217 beqlr+ /* r5 == 0: nothing left to copy, return */
221 .Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
225 ldx r0,r7,r6 /* copy 8 bytes */
230 lwzx r0,r7,r6 /* copy 4 bytes */
235 lhzx r0,r7,r6 /* copy 2 bytes */
240 lbzx r0,r7,r6 /* copy 1 byte */
244 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
245 libc_hidden_builtin_def (memcpy)