1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
27 Memcpy handles short copies (< 32-bytes) using binary move blocks
28 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
29 with the appropriate combination of byte and halfword load/stores.
30 There is minimal effort to optimize the alignment of short moves.
32 Longer moves (>= 32-bytes) justify the effort to get at least the
33 destination word (4-byte) aligned. Further optimization is
34 possible when both source and destination are word aligned.
35 Each case has an optimized unrolled loop. */
37 EALIGN (BP_SYM (memcpy), 5, 0) /* Entry: r3 = dst, r4 = src, r5 = len (see the prototype comment above). */
41 cfi_adjust_cfa_offset(32) /* Unwind info: the CFA offset grows by 32. */
42 cmplwi cr1,5,31 /* cr1 = len (r5) compared with 31: check for short move. */
45 clrlwi 10,4,30 /* r10 = src (r4) & 3: check alignment of src. */
46 andi. 11,3,3 /* r11 = dst (r3) & 3, result recorded in cr0: check alignment of dst. */
47 clrlwi 0,0,30 /* r0 &= 3: Number of bytes until the 1st word of dst. */
48 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes (len <= 31). */
51 cfi_offset(31,(24-32)) /* Unwind info: r31 is saved at CFA-8. */
53 cfi_offset(30,(20-32)) /* Unwind info: r30 is saved at CFA-12. */
57 subf 31,0,5 /* r31 = len - r0: Length after alignment. */
58 add 12,4,0 /* r12 = src + r0: Compute src addr after alignment. */
59 /* Move 0-3 bytes as needed to get the destination word aligned. */
77 clrlwi 10,12,30 /* r10 = r12 & 3: check alignment of src again. */
78 srwi 9,31,2 /* r9 = r31 / 4: Number of full words remaining. */
79 bne- cr6,L(wdu) /* If source is not word aligned, go to L(wdu). */
80 clrlwi 11,31,30 /* r11 = r31 & 3: calculate the number of tail bytes */
82 /* Copy words from source to destination, assuming the destination is
83 aligned on a word boundary.
85 At this point we know there are at least 29 bytes left (32-3) to copy.
86 The next step is to determine if the source is also word aligned.
87 If not, branch to the unaligned move code at L(wdu), which uses
88 a load, shift, store strategy.
90 Otherwise source and destination are word aligned, and we can use
91 the optimized word copy loop. */
96 bne- cr6,L(wdu) /* If source is not word aligned, go to L(wdu). */
97 srwi 9,5,2 /* r9 = len / 4: Number of full words remaining. */
98 clrlwi 11,5,30 /* r11 = len & 3: calculate the number of tail bytes */
100 /* Move words where destination and source are word aligned.
101 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
102 If the copy is not an exact multiple of 16 bytes, 1-3
103 words are copied as needed to set up the main loop. After
104 the main loop exits there may be a tail of 1-3 bytes. These bytes are
105 copied a halfword/byte at a time as needed to preserve alignment. */
108 srwi 8,31,4 /* r8 = r31 / 16: calculate the 16 byte loop count */
159 /* At this point we have a tail of 0-3 bytes and we know that the
160 destination is word aligned. */
170 /* Return original dst pointer. */
177 /* Copy up to 31 bytes. This is divided into two cases 0-8 bytes and 9-31
178 bytes. Each case is handled without loops, using binary (1,2,4,8)
181 In the short (0-8 byte) case no attempt is made to force alignment
182 of either source or destination. The hardware will handle the
183 unaligned load/stores with small delays for crossing 32-, 128-byte,
184 and 4096-byte boundaries. Since these short moves are unlikely to be
185 unaligned or cross these boundaries, the overhead to force
186 alignment is not justified.
188 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
189 boundaries. Since only loads are sensitive to the 32-/128-byte
190 boundaries it is more important to align the source than the
191 destination. If the source is not already word aligned, we first
192 move 1-3 bytes as needed. Since we are only word aligned we don't
193 use double word load/stores to ensure that all loads are aligned.
194 While the destination and stores may still be unaligned, this
195 is only an issue for page (4096 byte boundary) crossing, which
196 should be rare for these short moves. The hardware handles this
197 case automatically with a small (~20 cycle) delay. */
202 L(word_unaligned_short):
208 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
209 /* At least 9 bytes left. Get the source word aligned. */
212 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
216 beq L(wus_tail) /* If the source is already word aligned, skip this. */
217 /* Copy 1-3 bytes to get source address word aligned. */
240 /* At least 6 bytes left and the source is word aligned. This allows
241 some speculative loads up front. */
242 /* We need to special case the fall-through because the biggest delays
243 are due to address computation not being ready in time for the
249 L(wus_tail16): /* Move 16 bytes. */
256 /* Move 8 bytes more. */
257 bf 28,L(wus_tail16p8) /* Branch away when CR bit 28 is clear (the setup of that bit is not visible here). */
263 /* Move 4 bytes more. */
264 bf 29,L(wus_tail16p4) /* Branch away when CR bit 29 is clear (the setup of that bit is not visible here). */
270 /* Exactly 28 bytes. Return original dst pointer and exit. */
274 L(wus_tail16p8): /* Less than 8 bytes left. */
275 beq cr1,L(wus_tailX) /* Exactly 16 bytes, early exit. */
277 bf 29,L(wus_tail16p2) /* Branch away when CR bit 29 is clear (the setup of that bit is not visible here). */
278 /* Move 4 bytes more. */
284 /* Exactly 20 bytes. Return original dst pointer and exit. */
288 L(wus_tail16p4): /* Less than 4 bytes left. */
292 /* Exactly 24 bytes. Return original dst pointer and exit. */
296 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
302 L(wus_tail8): /* Move 8 bytes. */
303 /* r6, r7 already loaded speculatively. */
309 /* Move 4 bytes more. */
316 /* Exactly 12 bytes. Return original dst pointer and exit. */
320 L(wus_tail8p4): /* Less than 4 bytes left. */
324 /* Exactly 8 bytes. Return original dst pointer and exit. */
329 L(wus_tail4): /* Move 4 bytes. */
330 /* r6 already loaded speculatively. If we are here we know there are
331 more than 4 bytes left. So there is no need to test. */
335 L(wus_tail2): /* Move 2-3 bytes. */
344 L(wus_tail1): /* Move 1 byte. */
349 /* Return original dst pointer. */
353 /* Special case to copy 0-8 bytes. */
360 /* Return original dst pointer. */
381 /* Return original dst pointer. */
385 L(wus_2): /* Move 2-3 bytes. */
395 L(wus_1): /* Move 1 byte. */
401 /* Return original dst pointer. */
406 cfi_offset(31,(24-32)) /* Unwind info: r31 is saved at CFA-8. */
407 cfi_offset(30,(20-32)) /* Unwind info: r30 is saved at CFA-12. */
410 /* Copy words where the destination is aligned but the source is
411 not. For power4, power5 and power6 machines there is a penalty for
412 unaligned loads (src) that cross 32-byte, cacheline, or page
413 boundaries. So we want to use simple (unaligned) loads where
414 possible but avoid them where we know the load would span a 32-byte
417 At this point we know we have at least 29 (32-3) bytes to copy,
418 the src is unaligned, and we may cross at least one 32-byte
419 boundary. Also we have the following register values:
420 r3 == adjusted dst, word aligned
423 r9 == adjusted Word length
424 r10 == src alignment (1-3)
425 r12 == adjusted src, not aligned
428 First we need to copy words up to but not crossing the next 32-byte
429 boundary. Then perform aligned loads just before and just after
430 the boundary and use shifts and or to generate the next aligned
431 word for dst. If more than 32 bytes remain we copy (unaligned src)
432 the next 7 words and repeat the loop until less than 32-bytes
435 Then if more than 4 bytes remain we again use aligned loads,
436 shifts and or to generate the next dst word. We then process the
437 remaining words using unaligned loads as needed. Finally we check
438 if there are more than 0 bytes (1-3 bytes) remaining and use
439 halfword and/or byte load/stores to complete the copy.
441 mr 4,12 /* restore unaligned adjusted src ptr */
442 clrlwi 0,12,27 /* r0 = r12 & 31: Find dist from previous 32-byte boundary. */
443 slwi 10,10,3 /* r10 *= 8: calculate number of bits to shift 1st word left */
445 subfic 8,0,32 /* r8 = 32 - r0: Number of bytes to next 32-byte boundary. */
449 subfic 9,10,32 /* r9 = 32 - r10: number of bits to shift 2nd word right */
450 /* This test is reversed because the timing to compare the bytes to
451 32-byte boundary could not be met. So we compare the bytes from the
452 previous 32-byte boundary and invert the test. */
457 addi 12,4,16 /* r12 = r4 + 16: generate alternate pointers to avoid agen */
458 addi 11,3,16 /* r11 = r3 + 16: timing issues downstream. */
515 /* set up for 32-byte boundary crossing word move and possibly 32-byte
533 srwi 8,31,5 /* calculate the 32 byte loop count */
535 clrlwi 31,31,27 /* r31 &= 31: The remaining bytes, < 32. */
536 blt cr5,L(wdu_32tail) /* Presumably taken when no full 32-byte block remains; the cr5 compare is not visible here. */
541 /* copy 32 bytes at a time */
571 /* calculate and store the final word */
581 srwi 8,31,5 /* r8 = r31 / 32: calculate the 32 byte loop count */
583 clrlwi 31,31,27 /* r31 &= 31: The remaining bytes, < 32. */
584 blt cr5,L(wdu1_32tail) /* Presumably taken when no full 32-byte block remains; the cr5 compare is not visible here. */
590 /* Insert the top 8 bits of r8 into the low 8 bits of r6.
   Equivalent to: srwi 8,8,32-8; or 6,6,8 when those bits of r6 are zero. */
591 rlwimi 6,8,8,(32-8),31
595 /* copy 32 bytes at a time */
600 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
601 rlwimi 6,8,8,(32-8),31
627 /* calculate and store the final word */
629 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
630 rlwimi 6,8,8,(32-8),31
637 srwi 8,31,5 /* r8 = r31 / 32: calculate the 32 byte loop count */
639 clrlwi 31,31,27 /* r31 &= 31: The remaining bytes, < 32. */
640 blt cr5,L(wdu2_32tail)
646 /* Insert the top 16 bits of r8 into the low 16 bits of r6.
   Equivalent to: srwi 8,8,32-16; or 6,6,8 when those bits of r6 are zero. */
647 rlwimi 6,8,16,(32-16),31
651 /* copy 32 bytes at a time */
656 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
657 rlwimi 6,8,16,(32-16),31
684 /* calculate and store the final word */
686 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
687 rlwimi 6,8,16,(32-16),31
694 srwi 8,31,5 /* r8 = r31 / 32: calculate the 32 byte loop count */
696 clrlwi 31,31,27 /* r31 &= 31: The remaining bytes, < 32. */
697 blt cr5,L(wdu3_32tail)
703 /* Insert the top 24 bits of r8 into the low 24 bits of r6.
   Equivalent to: srwi 8,8,32-24; or 6,6,8 when those bits of r6 are zero. */
704 rlwimi 6,8,24,(32-24),31
708 /* copy 32 bytes at a time */
713 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
714 rlwimi 6,8,24,(32-24),31
740 /* calculate and store the final word */
742 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
743 rlwimi 6,8,24,(32-24),31
749 addi 12,4,16 /* r12 = r4 + 16: generate alternate pointers to avoid agen */
750 addi 11,3,16 /* r11 = r3 + 16: timing issues downstream. */
815 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
834 /* Return original dst pointer. */
840 END (BP_SYM (memcpy))
842 libc_hidden_builtin_def (memcpy)