1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
27 Memcpy handles short copies (< 32-bytes) using binary move blocks
28 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
29 with the appropriate combination of byte and halfword load/stores.
30 There is minimal effort to optimize the alignment of short moves.
32 Longer moves (>= 32-bytes) justify the effort to get at least the
33 destination word (4-byte) aligned. Further optimization is
34 possible when both source and destination are word aligned.
35 Each case has an optimized unrolled loop. */
/* memcpy entry.  Register roles set up by the code below:
   r0  = bytes until the 1st word of dst; r31 = length after alignment;
   r12 = src address after alignment; r9 = full words remaining;
   r11 = tail byte count; r8 = loop count for the unrolled copy loops.
   Lengths of 31 bytes or less branch to L(word_unaligned_short); a
   non-word-aligned source branches to L(wdu).  */
38 EALIGN (BP_SYM (memcpy), 5, 0)
42 cfi_adjust_cfa_offset(32)
43 cmplwi cr1,5,31 /* Compare len (r5) with 31 to detect a short move. */
46 clrlwi 10,4,30 /* r10 = low 2 bits of src (r4): alignment of src. */
47 andi. 11,3,3 /* r11 = low 2 bits of dst (r3): alignment of dst. */
48 clrlwi 0,0,30 /* r0 = number of bytes until the 1st word of dst. */
49 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */
52 cfi_offset(31,(24-32))
54 cfi_offset(30,(20-32))
58 subf 31,0,5 /* r31 = r5 - r0: length after alignment. */
59 add 12,4,0 /* r12 = r4 + r0: src addr after alignment. */
60 /* Move 0-3 bytes as needed to get the destination word aligned. */
78 clrlwi 10,12,30 /* r10 = alignment of the adjusted src (r12). */
79 srwi 9,31,2 /* r9 = r31 / 4: number of full words remaining. */
80 bne- cr6,L(wdu) /* If source is not word aligned, go to L(wdu). */
81 clrlwi 11,31,30 /* r11 = r31 mod 4: the number of tail bytes. */
83 /* Copy words from source to destination, assuming the destination is
84 aligned on a word boundary.
86 At this point we know there are at least 29 bytes left (32-3) to copy.
87 The next step is to determine if the source is also word aligned.
88 If not, branch to the unaligned move code at L(wdu), which uses
89 a load, shift, store strategy.
91 Otherwise source and destination are word aligned, and we can use
92 the optimized word copy loop. */
97 bne- cr6,L(wdu) /* If source is not word aligned, go to L(wdu). */
98 srwi 9,5,2 /* r9 = r5 / 4: number of full words remaining. */
99 clrlwi 11,5,30 /* r11 = r5 mod 4: the number of tail bytes. */
101 /* Move words where destination and source are word aligned.
102 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
103 If the copy is not an exact multiple of 16 bytes, 1-3
104 words are copied as needed to set up the main loop. After
105 the main loop exits there may be a tail of 1-3 bytes. These bytes are
106 copied a halfword/byte at a time as needed to preserve alignment. */
109 srwi 8,31,4 /* r8 = r31 / 16: the 16 byte loop count. */
160 /* At this point we have a tail of 0-3 bytes and we know that the
161 destination is word aligned. */
171 /* Return original dst pointer. */
178 /* Copy up to 31 bytes. This is divided into two cases: 0-8 bytes and 9-31
179 bytes. Each case is handled without loops, using binary (1,2,4,8)
182 In the short (0-8 byte) case no attempt is made to force alignment
183 of either source or destination. The hardware will handle the
184 unaligned load/stores with small delays for crossing 32-, 128-byte,
185 and 4096-byte boundaries. Since these short moves are unlikely to be
186 unaligned or cross these boundaries, the overhead to force
187 alignment is not justified.
189 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
190 boundaries. Since only loads are sensitive to the 32-/128-byte
191 boundaries it is more important to align the source than the
192 destination. If the source is not already word aligned, we first
193 move 1-3 bytes as needed. Since we are only word aligned we don't
194 use double word load/stores to ensure that all loads are aligned.
195 While the destination and stores may still be unaligned, this
196 is only an issue for page (4096 byte boundary) crossing, which
197 should be rare for these short moves. The hardware handles this
198 case automatically with a small (~20 cycle) delay. */
203 L(word_unaligned_short):
209 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
210 /* At least 9 bytes left. Get the source word aligned. */
213 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
217 beq L(wus_tail) /* If the source is already word aligned skip this. */
218 /* Copy 1-3 bytes to get source address word aligned. */
241 /* At least 6 bytes left and the source is word aligned. This allows
242 some speculative loads up front. */
243 /* We need to special case the fall-through because the biggest delays
244 are due to address computation not being ready in time for the
250 L(wus_tail16): /* Move 16 bytes. */
257 /* Move 8 bytes more. */
258 bf 28,L(wus_tail16p8) /* CR bit 28 clear: no 8-byte chunk to move. */
264 /* Move 4 bytes more. */
265 bf 29,L(wus_tail16p4) /* CR bit 29 clear: no 4-byte chunk to move. */
271 /* exactly 28 bytes. Return original dst pointer and exit. */
275 L(wus_tail16p8): /* less than 8 bytes left. */
276 beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */
278 bf 29,L(wus_tail16p2) /* CR bit 29 clear: no 4-byte chunk to move. */
279 /* Move 4 bytes more. */
285 /* exactly 20 bytes. Return original dst pointer and exit. */
289 L(wus_tail16p4): /* less than 4 bytes left. */
293 /* exactly 24 bytes. Return original dst pointer and exit. */
297 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
303 L(wus_tail8): /* Move 8 bytes. */
304 /* r6, r7 already loaded speculatively. */
310 /* Move 4 bytes more. */
317 /* exactly 12 bytes. Return original dst pointer and exit. */
321 L(wus_tail8p4): /* less than 4 bytes left. */
325 /* exactly 8 bytes. Return original dst pointer and exit. */
330 L(wus_tail4): /* Move 4 bytes. */
331 /* r6 already loaded speculatively. If we are here we know there is
332 more than 4 bytes left. So there is no need to test. */
336 L(wus_tail2): /* Move 2-3 bytes. */
345 L(wus_tail1): /* Move 1 byte. */
350 /* Return original dst pointer. */
354 /* Special case to copy 0-8 bytes. */
361 /* Return original dst pointer. */
382 /* Return original dst pointer. */
386 L(wus_2): /* Move 2-3 bytes. */
396 L(wus_1): /* Move 1 byte. */
402 /* Return original dst pointer. */
407 cfi_offset(31,(24-32))
408 cfi_offset(30,(20-32))
411 /* Copy words where the destination is aligned but the source is
412 not. For power4, power5 and power6 machines there is a penalty for
413 unaligned loads (src) that cross 32-byte, cacheline, or page
414 boundaries. So we want to use simple (unaligned) loads where
415 possible but avoid them where we know the load would span a 32-byte
418 At this point we know we have at least 29 (32-3) bytes to copy,
419 the src is unaligned, and we may cross at least one 32-byte
420 boundary. Also we have the following register values:
421 r3 == adjusted dst, word aligned
424 r9 == adjusted Word length
425 r10 == src alignment (1-3)
426 r12 == adjusted src, not aligned
429 First we need to copy words up to but not crossing the next 32-byte
430 boundary. Then perform aligned loads just before and just after
431 the boundary and use shifts and or to generate the next aligned
432 word for dst. If more than 32 bytes remain we copy (unaligned src)
433 the next 7 words and repeat the loop until less than 32-bytes
436 Then if more than 4 bytes remain we again use aligned loads,
437 shifts and or to generate the next dst word. We then process the
438 remaining words using unaligned loads as needed. Finally we check
439 if there are more than 0 bytes (1-3 bytes) remaining and use
440 halfword and/or byte load/stores to complete the copy.
442 mr 4,12 /* restore unaligned adjusted src ptr */
443 clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */
444 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
446 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */
450 subfic 9,10,32 /* number of bits to shift 2nd word right */
451 /* This test is reversed because the timing to compare the bytes to
452 32-byte boundary could not be met. So we compare the bytes from
453 previous 32-byte boundary and invert the test. */
458 addi 12,4,16 /* generate alternate pointers to avoid agen */
459 addi 11,3,16 /* timing issues downstream. */
516 /* set up for 32-byte boundary crossing word move and possibly 32-byte
534 srwi 8,31,5 /* calculate the 32 byte loop count */
536 clrlwi 31,31,27 /* The remaining bytes, < 32. */
537 blt cr5,L(wdu_32tail) /* NOTE(review): presumably skips the 32-byte loop; confirm the cr5 compare. */
542 /* copy 32 bytes at a time */
572 /* calculate and store the final word */
582 srwi 8,31,5 /* calculate the 32 byte loop count */
584 clrlwi 31,31,27 /* The remaining bytes, < 32. */
585 blt cr5,L(wdu1_32tail) /* NOTE(review): presumably skips the 32-byte loop; confirm the cr5 compare. */
591 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
592 rlwimi 6,8,8,(32-8),31
596 /* copy 32 bytes at a time */
601 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
602 rlwimi 6,8,8,(32-8),31
628 /* calculate and store the final word */
630 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
631 rlwimi 6,8,8,(32-8),31
638 srwi 8,31,5 /* calculate the 32 byte loop count */
640 clrlwi 31,31,27 /* The remaining bytes, < 32. */
641 blt cr5,L(wdu2_32tail) /* NOTE(review): presumably skips the 32-byte loop; confirm the cr5 compare. */
647 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
648 rlwimi 6,8,16,(32-16),31
652 /* copy 32 bytes at a time */
657 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
658 rlwimi 6,8,16,(32-16),31
685 /* calculate and store the final word */
687 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
688 rlwimi 6,8,16,(32-16),31
695 srwi 8,31,5 /* calculate the 32 byte loop count */
697 clrlwi 31,31,27 /* The remaining bytes, < 32. */
698 blt cr5,L(wdu3_32tail) /* NOTE(review): presumably skips the 32-byte loop; confirm the cr5 compare. */
704 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
705 rlwimi 6,8,24,(32-24),31
709 /* copy 32 bytes at a time */
714 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
715 rlwimi 6,8,24,(32-24),31
741 /* calculate and store the final word */
743 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
744 rlwimi 6,8,24,(32-24),31
750 addi 12,4,16 /* generate alternate pointers to avoid agen */
751 addi 11,3,16 /* timing issues downstream. */
816 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
835 /* Return original dst pointer. */
841 END (BP_SYM (memcpy))
843 libc_hidden_builtin_def (memcpy)