1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
24 Memcpy handles short copies (< 32-bytes) using a binary move blocks
25 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
29 Longer moves (>= 32-bytes) justify the effort to get at least the
30 destination word (4-byte) aligned. Further optimization is
31 possible when both source and destination are word aligned.
32 Each case has an optimized unrolled loop. */
39 cfi_adjust_cfa_offset(32)
40 cmplwi cr1,5,31 /* check for short move. */
43 clrlwi 10,4,30 /* check alignment of src. */
44 andi. 11,3,3 /* check alignment of dst. */
45 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
46 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */
49 cfi_offset(31,(24-32))
51 cfi_offset(30,(20-32))
55 subf 31,0,5 /* Length after alignment. */
56 add 12,4,0 /* Compute src addr after alignment. */
57 /* Move 0-3 bytes as needed to get the destination word aligned. */
75 clrlwi 10,12,30 /* check alignment of src again. */
76 srwi 9,31,2 /* Number of full words remaining. */
77 bne- cr6,L(wdu) /* If source is not word aligned. .L6 */
78 clrlwi 11,31,30 /* calculate the number of tail bytes */
80 /* Copy words from source to destination, assuming the destination is
81 aligned on a word boundary.
83 At this point we know there are at least 29 bytes left (32-3) to copy.
84 The next step is to determine if the source is also word aligned.
85 If not, branch to the unaligned move code at .L6, which uses
86 a load, shift, store strategy.
88 Otherwise source and destination are word aligned, and we can use
89 the optimized word copy loop. */
94 bne- cr6,L(wdu) /* If source is not word aligned. .L6 */
95 srwi 9,5,2 /* Number of full words remaining. */
96 clrlwi 11,5,30 /* calculate the number of tail bytes */
98 /* Move words where destination and source are word aligned.
99 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
100 If the copy is not an exact multiple of 16 bytes, 1-3
101 words are copied as needed to set up the main loop. After
102 the main loop exits there may be a tail of 1-3 bytes. These bytes are
103 copied a halfword/byte at a time as needed to preserve alignment. */
106 srwi 8,31,4 /* calculate the 16 byte loop count */
157 /* At this point we have a tail of 0-3 bytes and we know that the
158 destination is word aligned. */
168 /* Return original dst pointer. */
175 /* Copy up to 31 bytes. This is divided into two cases: 0-8 bytes and 9-31
176 bytes. Each case is handled without loops, using binary (1,2,4,8)
179 In the short (0-8 byte) case no attempt is made to force alignment
180 of either source or destination. The hardware will handle the
181 unaligned load/stores with small delays for crossing 32-, 128-,
182 and 4096-byte boundaries. Since these short moves are unlikely to be
183 unaligned or cross these boundaries, the overhead to force
184 alignment is not justified.
186 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
187 boundaries. Since only loads are sensitive to the 32-/128-byte
188 boundaries it is more important to align the source than the
189 destination. If the source is not already word aligned, we first
190 move 1-3 bytes as needed. Since we are only word aligned we don't
191 use double word load/stores to ensure that all loads are aligned.
192 While the destination and stores may still be unaligned, this
193 is only an issue for page (4096 byte boundary) crossing, which
194 should be rare for these short moves. The hardware handles this
195 case automatically with a small (~20 cycle) delay. */
200 L(word_unaligned_short):
206 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
207 /* At least 9 bytes left. Get the source word aligned. */
210 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
214 beq L(wus_tail) /* If the source is already word aligned skip this. */
215 /* Copy 1-3 bytes to get source address word aligned. */
222 #ifdef __LITTLE_ENDIAN__
230 #ifdef __LITTLE_ENDIAN__
241 #ifdef __LITTLE_ENDIAN__
251 /* At least 6 bytes left and the source is word aligned. This allows
252 some speculative loads up front. */
253 /* We need to special case the fall-through because the biggest delays
254 are due to address computation not being ready in time for the
260 L(wus_tail16): /* Move 16 bytes. */
267 /* Move 8 bytes more. */
268 bf 28,L(wus_tail16p8)
274 /* Move 4 bytes more. */
275 bf 29,L(wus_tail16p4)
281 /* exactly 28 bytes. Return original dst pointer and exit. */
285 L(wus_tail16p8): /* less than 8 bytes left. */
286 beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */
288 bf 29,L(wus_tail16p2)
289 /* Move 4 bytes more. */
295 /* exactly 20 bytes. Return original dst pointer and exit. */
299 L(wus_tail16p4): /* less than 4 bytes left. */
303 /* exactly 24 bytes. Return original dst pointer and exit. */
307 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
313 L(wus_tail8): /* Move 8 bytes. */
314 /* r6, r7 already loaded speculatively. */
320 /* Move 4 bytes more. */
327 /* exactly 12 bytes. Return original dst pointer and exit. */
331 L(wus_tail8p4): /* less than 4 bytes left. */
335 /* exactly 8 bytes. Return original dst pointer and exit. */
340 L(wus_tail4): /* Move 4 bytes. */
341 /* r6 already loaded speculatively. If we are here we know there is
342 more than 4 bytes left. So there is no need to test. */
346 L(wus_tail2): /* Move 2-3 bytes. */
355 L(wus_tail1): /* Move 1 byte. */
360 /* Return original dst pointer. */
364 /* Special case to copy 0-8 bytes. */
371 /* Return original dst pointer. */
392 /* Return original dst pointer. */
396 L(wus_2): /* Move 2-3 bytes. */
406 L(wus_1): /* Move 1 byte. */
412 /* Return original dst pointer. */
417 cfi_offset(31,(24-32))
418 cfi_offset(30,(20-32))
421 /* Copy words where the destination is aligned but the source is
422 not. For power4, power5 and power6 machines there is penalty for
423 unaligned loads (src) that cross 32-byte, cacheline, or page
424 boundaries. So we want to use simple (unaligned) loads where
425 possible but avoid them where we know the load would span a 32-byte
428 At this point we know we have at least 29 (32-3) bytes to copy
429 the src is unaligned, and we may cross at least one 32-byte
430 boundary. Also we have the following register values:
431 r3 == adjusted dst, word aligned
434 r9 == adjusted Word length
435 r10 == src alignment (1-3)
436 r12 == adjusted src, not aligned
439 First we need to copy word up to but not crossing the next 32-byte
440 boundary. Then perform aligned loads just before and just after
441 the boundary and use shifts and or to generate the next aligned
442 word for dst. If more than 32 bytes remain we copy (unaligned src)
443 the next 7 words and repeat the loop until less than 32-bytes
446 Then if more than 4 bytes remain we again use aligned loads,
447 shifts and or to generate the next dst word. We then process the
448 remaining words using unaligned loads as needed. Finally we check
449 if there are more than 0 (1-3) bytes remaining and use
450 halfword and or byte load/stores to complete the copy.
452 mr 4,12 /* restore unaligned adjusted src ptr */
453 clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */
454 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
456 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */
460 subfic 9,10,32 /* number of bits to shift 2nd word right */
461 /* This test is reversed because the timing to compare the bytes to
462 32-byte boundary could not be met. So we compare the bytes from
463 previous 32-byte boundary and invert the test. */
468 addi 12,4,16 /* generate alternate pointers to avoid agen */
469 addi 11,3,16 /* timing issues downstream. */
526 /* set up for 32-byte boundary crossing word move and possibly 32-byte
544 srwi 8,31,5 /* calculate the 32 byte loop count */
546 clrlwi 31,31,27 /* The remaining bytes, < 32. */
547 blt cr5,L(wdu_32tail)
552 /* copy 32 bytes at a time */
582 /* calculate and store the final word */
592 srwi 8,31,5 /* calculate the 32 byte loop count */
593 #ifdef __LITTLE_ENDIAN__
598 clrlwi 31,31,27 /* The remaining bytes, < 32. */
599 blt cr5,L(wdu1_32tail)
605 #ifdef __LITTLE_ENDIAN__
608 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
609 rlwimi 6,8,8,(32-8),31
614 /* copy 32 bytes at a time */
619 #ifdef __LITTLE_ENDIAN__
622 /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
623 rlwimi 6,8,8,(32-8),31
641 #ifdef __LITTLE_ENDIAN__
654 /* calculate and store the final word */
656 #ifdef __LITTLE_ENDIAN__
659 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
660 rlwimi 6,8,8,(32-8),31
668 srwi 8,31,5 /* calculate the 32 byte loop count */
669 #ifdef __LITTLE_ENDIAN__
674 clrlwi 31,31,27 /* The remaining bytes, < 32. */
675 blt cr5,L(wdu2_32tail)
681 #ifdef __LITTLE_ENDIAN__
684 rlwimi 6,8,16,(32-16),31
689 /* copy 32 bytes at a time */
694 #ifdef __LITTLE_ENDIAN__
697 rlwimi 6,8,16,(32-16),31
716 #ifdef __LITTLE_ENDIAN__
729 /* calculate and store the final word */
731 #ifdef __LITTLE_ENDIAN__
734 rlwimi 6,8,16,(32-16),31
742 srwi 8,31,5 /* calculate the 32 byte loop count */
743 #ifdef __LITTLE_ENDIAN__
748 clrlwi 31,31,27 /* The remaining bytes, < 32. */
749 blt cr5,L(wdu3_32tail)
755 #ifdef __LITTLE_ENDIAN__
758 rlwimi 6,8,24,(32-24),31
763 /* copy 32 bytes at a time */
768 #ifdef __LITTLE_ENDIAN__
771 rlwimi 6,8,24,(32-24),31
789 #ifdef __LITTLE_ENDIAN__
802 /* calculate and store the final word */
804 #ifdef __LITTLE_ENDIAN__
807 rlwimi 6,8,24,(32-24),31
814 addi 12,4,16 /* generate alternate pointers to avoid agen */
815 addi 11,3,16 /* timing issues downstream. */
880 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
899 /* Return original dst pointer. */
907 libc_hidden_builtin_def (memcpy)