1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
24 Memcpy handles short copies (< 32-bytes) using binary move blocks
25 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
29 Longer moves (>= 32-bytes) justify the effort to get at least the
30 destination word (4-byte) aligned. Further optimization is
31 possible when both source and destination are word aligned.
32 Each case has an optimized unrolled loop. */
39 cfi_adjust_cfa_offset(32)
40 cmplwi cr1,5,31 /* check for short move. */
43 clrlwi 10,4,30 /* check alignment of src. */
44 andi. 11,3,3 /* check alignment of dst. */
45 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
46 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */
49 cfi_offset(31,(24-32))
51 cfi_offset(30,(20-32))
55 subf 31,0,5 /* Length after alignment. */
56 add 12,4,0 /* Compute src addr after alignment. */
57 /* Move 0-3 bytes as needed to get the destination word aligned. */
75 clrlwi 10,12,30 /* check alignment of src again. */
76 srwi 9,31,2 /* Number of full words remaining. */
77 bne- cr6,L(wdu) /* If source is not word aligned. .L6 */
78 clrlwi 11,31,30 /* calculate the number of tail bytes */
80 /* Copy words from source to destination, assuming the destination is
81 aligned on a word boundary.
83 At this point we know there are at least 29 bytes left (32-3) to copy.
84 The next step is to determine if the source is also word aligned.
85 If not, branch to the unaligned move code at .L6, which uses
86 a load, shift, store strategy.
88 Otherwise source and destination are word aligned, and we can use
89 the optimized word copy loop. */
94 bne- cr6,L(wdu) /* If source is not word aligned. .L6 */
95 srwi 9,5,2 /* Number of full words remaining. */
96 clrlwi 11,5,30 /* calculate the number of tail bytes */
98 /* Move words where destination and source are word aligned.
99 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
100 If the copy is not an exact multiple of 16 bytes, 1-3
101 words are copied as needed to set up the main loop. After
102 the main loop exits there may be a tail of 1-3 bytes. These bytes are
103 copied a halfword/byte at a time as needed to preserve alignment. */
106 srwi 8,31,4 /* calculate the 16 byte loop count */
157 /* At this point we have a tail of 0-3 bytes and we know that the
158 destination is word aligned. */
168 /* Return original dst pointer. */
175 /* Copy up to 31 bytes. This is divided into two cases: 0-8 bytes and 9-31
176 bytes. Each case is handled without loops, using binary (1,2,4,8)
179 In the short (0-8 byte) case no attempt is made to force alignment
180 of either source or destination. The hardware will handle the
181 unaligned load/stores with small delays for crossing 32-, 128-byte,
182 and 4096-byte boundaries. Since these short moves are unlikely to be
183 unaligned or cross these boundaries, the overhead to force
184 alignment is not justified.
186 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
187 boundaries. Since only loads are sensitive to the 32-/128-byte
188 boundaries it is more important to align the source than the
189 destination. If the source is not already word aligned, we first
190 move 1-3 bytes as needed. Since we are only word aligned we don't
191 use double word load/stores to ensure that all loads are aligned.
192 While the destination and stores may still be unaligned, this
193 is only an issue for page (4096 byte boundary) crossing, which
194 should be rare for these short moves. The hardware handles this
195 case automatically with a small (~20 cycle) delay. */
200 L(word_unaligned_short):
206 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
207 /* At least 9 bytes left. Get the source word aligned. */
210 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
214 beq L(wus_tail) /* If the source is already word aligned skip this. */
215 /* Copy 1-3 bytes to get source address word aligned. */
238 /* At least 6 bytes left and the source is word aligned. This allows
239 some speculative loads up front. */
240 /* We need to special case the fall-through because the biggest delays
241 are due to address computation not being ready in time for the
247 L(wus_tail16): /* Move 16 bytes. */
254 /* Move 8 bytes more. */
255 bf 28,L(wus_tail16p8)
261 /* Move 4 bytes more. */
262 bf 29,L(wus_tail16p4)
268 /* exactly 28 bytes. Return original dst pointer and exit. */
272 L(wus_tail16p8): /* less than 8 bytes left. */
273 beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */
275 bf 29,L(wus_tail16p2)
276 /* Move 4 bytes more. */
282 /* exactly 20 bytes. Return original dst pointer and exit. */
286 L(wus_tail16p4): /* less than 4 bytes left. */
290 /* exactly 24 bytes. Return original dst pointer and exit. */
294 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
300 L(wus_tail8): /* Move 8 bytes. */
301 /* r6, r7 already loaded speculatively. */
307 /* Move 4 bytes more. */
314 /* exactly 12 bytes. Return original dst pointer and exit. */
318 L(wus_tail8p4): /* less than 4 bytes left. */
322 /* exactly 8 bytes. Return original dst pointer and exit. */
327 L(wus_tail4): /* Move 4 bytes. */
328 /* r6 already loaded speculatively. If we are here we know there is
329 more than 4 bytes left. So there is no need to test. */
333 L(wus_tail2): /* Move 2-3 bytes. */
342 L(wus_tail1): /* Move 1 byte. */
347 /* Return original dst pointer. */
351 /* Special case to copy 0-8 bytes. */
358 /* Return original dst pointer. */
379 /* Return original dst pointer. */
383 L(wus_2): /* Move 2-3 bytes. */
393 L(wus_1): /* Move 1 byte. */
399 /* Return original dst pointer. */
404 cfi_offset(31,(24-32))
405 cfi_offset(30,(20-32))
408 /* Copy words where the destination is aligned but the source is
409 not. For power4, power5 and power6 machines there is penalty for
410 unaligned loads (src) that cross 32-byte, cacheline, or page
411 boundaries. So we want to use simple (unaligned) loads where
412 possible but avoid them where we know the load would span a 32-byte
415 At this point we know we have at least 29 (32-3) bytes to copy,
416 the src is unaligned, and we may cross at least one 32-byte
417 boundary. Also we have the following register values:
418 r3 == adjusted dst, word aligned
421 r9 == adjusted Word length
422 r10 == src alignment (1-3)
423 r12 == adjusted src, not aligned
426 First we need to copy word up to but not crossing the next 32-byte
427 boundary. Then perform aligned loads just before and just after
428 the boundary and use shifts and or to generate the next aligned
429 word for dst. If more than 32 bytes remain we copy (unaligned src)
430 the next 7 words and repeat the loop until less than 32-bytes
433 Then if more than 4 bytes remain we again use aligned loads,
434 shifts and or to generate the next dst word. We then process the
435 remaining words using unaligned loads as needed. Finally we check
436 if there are more than 0 (1-3) bytes remaining and use
437 halfword and or byte load/stores to complete the copy.
439 mr 4,12 /* restore unaligned adjusted src ptr */
440 clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */
441 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
443 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */
447 subfic 9,10,32 /* number of bits to shift 2nd word right */
448 /* This test is reversed because the timing to compare the bytes to the
449 32-byte boundary could not be met. So we compare the bytes from the
450 previous 32-byte boundary and invert the test. */
455 addi 12,4,16 /* generate alternate pointers to avoid agen */
456 addi 11,3,16 /* timing issues downstream. */
513 /* set up for 32-byte boundary crossing word move and possibly 32-byte
531 srwi 8,31,5 /* calculate the 32 byte loop count */
533 clrlwi 31,31,27 /* The remaining bytes, < 32. */
534 blt cr5,L(wdu_32tail)
539 /* copy 32 bytes at a time */
569 /* calculate and store the final word */
579 srwi 8,31,5 /* calculate the 32 byte loop count */
581 clrlwi 31,31,27 /* The remaining bytes, < 32. */
582 blt cr5,L(wdu1_32tail)
588 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
589 rlwimi 6,8,8,(32-8),31
593 /* copy 32 bytes at a time */
598 /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
599 rlwimi 6,8,8,(32-8),31
625 /* calculate and store the final word */
627 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
628 rlwimi 6,8,8,(32-8),31
635 srwi 8,31,5 /* calculate the 32 byte loop count */
637 clrlwi 31,31,27 /* The remaining bytes, < 32. */
638 blt cr5,L(wdu2_32tail)
644 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
645 rlwimi 6,8,16,(32-16),31
649 /* copy 32 bytes at a time */
654 /* Equivalent to srwi 8,8,32-16; or 6,6,8 */
655 rlwimi 6,8,16,(32-16),31
682 /* calculate and store the final word */
684 /* Equivalent to: srwi 8,8,32-16; or 6,6,8 */
685 rlwimi 6,8,16,(32-16),31
692 srwi 8,31,5 /* calculate the 32 byte loop count */
694 clrlwi 31,31,27 /* The remaining bytes, < 32. */
695 blt cr5,L(wdu3_32tail)
701 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
702 rlwimi 6,8,24,(32-24),31
706 /* copy 32 bytes at a time */
711 /* Equivalent to srwi 8,8,32-24; or 6,6,8 */
712 rlwimi 6,8,24,(32-24),31
738 /* calculate and store the final word */
740 /* Equivalent to: srwi 8,8,32-24; or 6,6,8 */
741 rlwimi 6,8,24,(32-24),31
747 addi 12,4,16 /* generate alternate pointers to avoid agen */
748 addi 11,3,16 /* timing issues downstream. */
813 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
832 /* Return original dst pointer. */
840 libc_hidden_builtin_def (memcpy)