1 /* Optimized memcpy implementation for CELL BE PowerPC.
2 Copyright (C) 2010-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 #define PREFETCH_AHEAD 6 /* number of cache lines to prefetch ahead on SRC */
22 #define ZERO_AHEAD 4 /* number of cache lines to zero (dcbz) ahead on DST */
24 /* memcpy routine optimized for CELL-BE-PPC v2.0
26 * The CELL PPC core has 1 integer unit and 1 load/store unit
28 * 1st level data cache = 32K
29 * 2nd level data cache = 512K
30 * 3rd level data cache = 0K
31 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
32 * latency to memory is >400 clocks
33 * To improve copy performance we need to prefetch source data
34 * far ahead to hide this latency
35 * For best performance instruction forms ending in "." like "andi."
36 * should be avoided as they are implemented in microcode on CELL.
37 * The below code is loop unrolled for the CELL cache line of 128 bytes
/* NOTE(review): fragmentary listing -- the routine's entry point and many
   interior lines (stores paired with the loads below, cr-field setup,
   conditional branches, and the bodies of several labels such as
   .Laligntocacheline, .Lloop, .Lloop2 and .Lcopy_remaining) are elided.
   Register roles, as established by the surviving comments:
     r3 = DST, r4 = SRC, r5 = byte count (per the "dest bdry" / "is size"
     comments below); r6/r7 appear to be biased copy pointers/offsets --
     TODO confirm against the full source.  */
45 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
46 cmplwi cr1,r5,16 /* is size < 16 ? */
/* Head alignment: compute how many bytes are needed to bring DST up to a
   16-byte boundary, and expose those low bits as cr7 flags so the 1/2/4/8
   byte head copies below can be conditionally executed (branches elided).  */
51 neg r8,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */
52 clrlwi r8,r8,32-4 /* align to 16byte boundary */
58 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
/* Head copy, one load per power-of-two size; the matching stores are on
   elided lines.  */
62 lbzx r0,r7,r6 /* copy 1 byte */
66 lhzx r0,r7,r6 /* copy 2 byte */
70 lwzx r0,r7,r6 /* copy 4 byte */
74 lfdx fp9,r7,r6 /* copy 8 byte */
/* Bias both pointers by -8 so the main loops can use update-form
   lfdu/stfdu (pre-increment load/store).  */
85 addi r6,r6,-8 /* prepare for stfdu */
86 addi r4,r4,-8 /* prepare for lfdu */
88 clrlwi r7,r7,32-7 /* align to cacheline boundary */
89 ble+ cr5,.Llessthancacheline
/* Partition the remaining length: r7 = 16-byte chunks to reach a 128-byte
   cacheline boundary, r10 = whole cachelines to copy.  */
93 srwi r7,r7,4 /* divide size by 16 */
94 srwi r10,r5,7 /* number of cache lines to copy */
97 li r11,0 /* number cachelines to copy with prefetch */
/* Cap the prefetched portion: at most PREFETCH_AHEAD lines are copied in
   the non-prefetching tail (r10); the rest (r11) run with prefetch.  */
100 cmplwi r10,PREFETCH_AHEAD
101 li r12,128+8 /* prefetch distance */
102 ble .Llessthanmaxprefetch
104 subi r11,r10,PREFETCH_AHEAD
105 li r10,PREFETCH_AHEAD
107 .Llessthanmaxprefetch:
/* Align DST to the 128-byte cacheline (loop body elided).  */
119 beq cr6,.Lcachelinealigned
126 bdnz .Laligntocacheline
129 .Lcachelinealigned: /* copy while cache lines */
131 blt- cr1,.Llessthancacheline /* size <128 */
/* dcbz the DST line ZERO_AHEAD lines ahead to avoid reading it from
   memory before it is fully overwritten (dcbz use itself is on elided
   lines -- the "DCBZ dist" comment establishes the intent).  */
138 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
141 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
142 .Lloop: /* Copy aligned body */
143 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
146 lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */
147 lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */
/* Second loop copies the remaining (non-prefetched) lines in 32-byte
   steps; hence the count conversion from 128- to 32-byte units.  */
182 slwi r10,r10,2 /* adjust from 128 to 32 byte stride */
186 .Lloop2: /* Copy aligned body */
/* Tail: fewer than 128 bytes left.  Copy 16-byte chunks, then 8/4/2/1
   byte remainders (stores and branch setup elided).  */
199 .Llessthancacheline: /* less than cache to do ? */
201 srwi r7,r5,4 /* divide size by 16 */
210 bdnz .Lcopy_remaining
212 .Ldo_lt16: /* less than 16 ? */
213 cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */
214 beqlr+ /* no rest to copy */
218 .Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */
222 lfdx fp9,r7,r6 /* copy 8 byte */
227 lwzx r0,r7,r6 /* copy 4 byte */
232 lhzx r0,r7,r6 /* copy 2 byte */
237 lbzx r0,r7,r6 /* copy 1 byte */
242 libc_hidden_builtin_def (memcpy)