1 /* Optimized memcpy implementation for PowerPC A2.
2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Michael Brutman <brutman@us.ibm.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
/* NOTE(review): this is a sampled excerpt of the full file — the numeric
   prefix on each line is the original file's line number and is part of
   the content here; gaps in that numbering mean instructions are not
   visible in this view.  Comments below state only what the visible
   instructions establish.  */
25 #define PREFETCH_AHEAD 4 /* no cache lines SRC prefetching ahead */
26 #define ZERO_AHEAD 2 /* no cache lines DST zeroing ahead */
/* TOC entry for the runtime cache-line size variable; loaded in two
   steps below (ld of the address via .LC0@toc, then lwz of the value).  */
30 .tc __cache_line_size[TC],__cache_line_size
/* void *memcpy (void *dst, const void *src, size_t len)
   PowerPC64 ABI: r3 = dst (also the return value), r4 = src, r5 = len.
   Visible strategy: copies < 16 bytes take the short tail path; larger
   copies align dst to 16 bytes, then to a cache-line boundary, prefetch
   the source PREFETCH_AHEAD lines ahead, and copy whole cache lines,
   with separate code paths for 64-byte and 128-byte cache lines.  */
36 EALIGN (BP_SYM (memcpy), 5, 0)
39 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
40 cmpldi cr1,r5,16 /* is size < 16 ? */
41 mr r6,r3 /* Copy dest reg to r6; */
45 /* Big copy (16 bytes or more)
47 Figure out how far to the nearest quadword boundary, or if we are
48 on one already. Also get the cache line size.
50 r3 - return value (always)
51 r4 - current source addr
53 r6 - current dest addr
/* r8 below is the byte count to the next 16-byte dst boundary (negate
   then mask); r9/r10 become the cache-line size and its mask.  */
56 neg r8,r3 /* LS 4 bits = # bytes to 8-byte dest bdry */
57 ld r9,.LC0@toc(r2) /* Get cache line size (part 1) */
58 clrldi r8,r8,64-4 /* align to 16byte boundary */
59 sub r7,r4,r3 /* compute offset to src from dest */
60 lwz r9,0(r9) /* Get cache line size (part 2) */
61 cmpldi cr0,r8,0 /* Were we aligned on a 16 byte bdy? */
62 addi r10,r9,-1 /* Cache line mask */
67 /* Destination is not aligned on quadword boundary. Get us to one.
69 r3 - return value (always)
70 r4 - current source addr
72 r6 - current dest addr
73 r7 - offset to src from dest
74 r8 - number of bytes to quadword boundary
/* cr7 gets the low bits of r8 so the 1/2/4/8-byte head copies below can
   be taken conditionally per bit (classic mtcrf alignment idiom).  */
77 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
78 subf r5,r8,r5 /* adjust remaining len */
/* Head copies address src as r7+r6 (offset + current dst); dst pointer
   r6 advances, so src never needs separate bookkeeping until L100.  */
81 lbzx r0,r7,r6 /* copy 1 byte addr */
86 lhzx r0,r7,r6 /* copy 2 byte addr */
91 lwzx r0,r7,r6 /* copy 4 byte addr */
96 ldx r0,r7,r6 /* copy 8 byte addr */
100 add r4,r7,r6 /* update src addr */
104 /* Dest is quadword aligned now.
106 Lots of decisions to make. If we are copying less than a cache
107 line we won't be here long. If we are not on a cache line
108 boundary we need to get there. And then we need to figure out
109 how many cache lines ahead to pre-touch.
111 r3 - return value (always)
112 r4 - current source addr
114 r6 - current dest addr
122 cmpd cr5,r5,r10 /* Less than a cacheline to go? */
124 neg r7,r6 /* How far to next cacheline bdy? */
/* Pre-bias both pointers by -8 so the main loops can use update-form
   stdu/ldu (store/load with pre-increment of 8).  */
126 addi r6,r6,-8 /* prepare for stdu */
128 addi r4,r4,-8 /* prepare for ldu */
131 ble+ cr5,L(lessthancacheline)
/* cr0 still holds the line-size comparison: dispatch 128- vs 64-byte
   cache-line code paths.  */
133 beq- cr0,L(big_lines) /* 128 byte line code */
137 /* More than a cacheline left to go, and using 64 byte cachelines */
139 clrldi r7,r7,64-6 /* How far to next cacheline bdy? */
141 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
143 /* Reduce total len by what it takes to get to the next cache line */
145 srdi r7,r7,4 /* How many qws to get to the line bdy? */
147 /* How many full cache lines to copy after getting to a line bdy? */
150 cmpldi r10,0 /* If no full cache lines to copy ... */
151 li r11,0 /* number cachelines to copy with prefetch */
152 beq L(nocacheprefetch)
155 /* We are here because we have at least one full cache line to copy,
156 and therefore some pre-touching to do. */
158 cmpldi r10,PREFETCH_AHEAD
/* Prefetch distance = one 64-byte line plus the 8-byte ldu bias.  */
159 li r12,64+8 /* prefetch distance */
160 ble L(lessthanmaxprefetch)
162 /* We can only do so much pre-fetching. R11 will have the count of
163 lines left to prefetch after the initial batch of prefetches
166 subi r11,r10,PREFETCH_AHEAD
167 li r10,PREFETCH_AHEAD
169 L(lessthanmaxprefetch):
172 /* At this point r10/ctr hold the number of lines to prefetch in this
173 initial batch, and r11 holds any remainder. */
181 /* Prefetching is done, or was not needed.
183 cr6 - are we on a cacheline boundary already?
184 r7 - number of quadwords to the next cacheline boundary
190 cmpldi cr1,r5,64 /* Less than a cache line to copy? */
192 /* How many bytes are left after we copy whatever full
193 cache lines we can get? */
196 beq cr6,L(cachelinealigned)
199 /* Copy quadwords up to the next cacheline boundary */
/* NOTE(review): the loop body of L(aligntocacheline) is not visible in
   this excerpt; only its closing bdnz is.  */
207 bdnz L(aligntocacheline)
211 L(cachelinealigned): /* copy whole cache lines */
213 blt- cr1,L(lessthancacheline) /* size <64 */
/* dcbz distance: zero the destination ZERO_AHEAD lines ahead of the
   stores (plus the 8-byte stdu bias) to avoid reading dst lines in.  */
220 li r11,64*ZERO_AHEAD +8 /* DCBZ dist */
223 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
224 L(loop): /* Copy aligned body */
225 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
/* L(loop2): same full-line copy without the dcbt once the prefetch
   budget (r11) is exhausted — body not visible in this excerpt.  */
253 L(loop2): /* Copy aligned body */
277 L(lessthancacheline): /* Was there less than cache to do ? */
279 srdi r7,r5,4 /* divide size by 16 */
289 bdnz L(copy_remaining)
291 L(do_lt16): /* less than 16 ? */
292 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
293 beqlr+ /* no rest to copy */
297 L(shortcopy): /* SIMPLE COPY to handle size =< 15 bytes */
/* Tail mirrors the head: descending power-of-two copies addressed as
   offset(r7) + dst(r6), selected by the remaining-length bits.  */
301 ldx r0,r7,r6 /* copy 8 byte */
306 lwzx r0,r7,r6 /* copy 4 byte */
311 lhzx r0,r7,r6 /* copy 2 byte */
316 lbzx r0,r7,r6 /* copy 1 byte */
325 /* Similar to above, but for use with 128 byte lines. */
330 clrldi r7,r7,64-7 /* How far to next cacheline bdy? */
332 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
334 /* Reduce total len by what it takes to get to the next cache line */
336 srdi r7,r7,4 /* How many qws to get to the line bdy? */
338 /* How many full cache lines to copy after getting to a line bdy? */
341 cmpldi r10,0 /* If no full cache lines to copy ... */
342 li r11,0 /* number cachelines to copy with prefetch */
343 beq L(nocacheprefetch_128)
346 /* We are here because we have at least one full cache line to copy,
347 and therefore some pre-touching to do. */
349 cmpldi r10,PREFETCH_AHEAD
/* 128-byte-line variant of the prefetch distance at line 159.  */
350 li r12,128+8 /* prefetch distance */
351 ble L(lessthanmaxprefetch_128)
353 /* We can only do so much pre-fetching. R11 will have the count of
354 lines left to prefetch after the initial batch of prefetches
357 subi r11,r10,PREFETCH_AHEAD
358 li r10,PREFETCH_AHEAD
360 L(lessthanmaxprefetch_128):
363 /* At this point r10/ctr hold the number of lines to prefetch in this
364 initial batch, and r11 holds any remainder. */
369 bdnz L(prefetchSRC_128)
372 /* Prefetching is done, or was not needed.
374 cr6 - are we on a cacheline boundary already?
375 r7 - number of quadwords to the next cacheline boundary
378 L(nocacheprefetch_128):
381 cmpldi cr1,r5,128 /* Less than a cache line to copy? */
383 /* How many bytes are left after we copy whatever full
384 cache lines we can get? */
387 beq cr6,L(cachelinealigned_128)
390 /* Copy quadwords up to the next cacheline boundary */
392 L(aligntocacheline_128):
398 bdnz L(aligntocacheline_128)
401 L(cachelinealigned_128): /* copy whole cache lines */
403 blt- cr1,L(lessthancacheline) /* size <128 */
410 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
413 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
414 L(loop_128): /* Copy aligned body */
415 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
459 L(loop2_128): /* Copy aligned body */
/* 128-byte path rejoins the shared sub-cache-line tail code.  */
497 b L(lessthancacheline)
500 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
501 libc_hidden_builtin_def (memcpy)