1 /* Optimized memcpy implementation for PowerPC32 on PowerPC64.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
24 Memcpy handles short copies (< 32 bytes) using binary move blocks
25 (no loops) of lwz/stw. The tail (the remaining 1-3 bytes) is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
29 Longer moves (>= 32-bytes) justify the effort to get at least the
30 destination word (4-byte) aligned. Further optimization is
31 possible when both source and destination are word aligned.
32 Each case has an optimized unrolled loop. */
39 cfi_adjust_cfa_offset(32)
41 cfi_offset(30,(20-32))
45 cfi_offset(31,(24-32))
47 andi. 11,3,3 /* check alignment of dst. */
48 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
49 clrlwi 10,4,30 /* check alignment of src. */
51 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
54 srwi 9,5,2 /* Number of full words remaining. */
60 /* Move 0-3 bytes as needed to get the destination word aligned. */
72 clrlwi 10,12,30 /* check alignment of src again. */
73 srwi 9,31,2 /* Number of full words remaining. */
75 /* Copy words from source to destination, assuming the destination is
76 aligned on a word boundary.
78 At this point we know there are at least 29 bytes left (32-3) to copy.
79 The next step is to determine if the source is also word aligned.
80 If not, branch to the unaligned move code at .L6, which uses
81 a load, shift, store strategy.
83 Otherwise source and destination are word aligned, and we can use
84 the optimized word copy loop. */
86 clrlwi 11,31,30 /* calculate the number of tail bytes */
88 bne- cr6,.L6 /* If source is not word aligned. */
90 /* Move words where destination and source are word aligned.
91 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
92 If the copy is not an exact multiple of 16 bytes, 1-3
93 words are copied as needed to set up the main loop. After
94 the main loop exits there may be a tail of 1-3 bytes. These bytes are
95 copied a halfword/byte at a time as needed to preserve alignment. */
97 srwi 8,31,4 /* calculate the 16 byte loop count */
148 /* At this point we have a tail of 0-3 bytes and we know that the
149 destination is word aligned. */
159 /* Return original dst pointer. */
166 /* Copy up to 31 bytes. This is divided into two cases: 0-8 bytes and
167 9-31 bytes. Each case is handled without loops, using binary (1,2,4,8) tests.
170 In the short (0-8 byte) case no attempt is made to force alignment
171 of either source or destination. The hardware will handle the
172 unaligned load/stores with small delays for crossing 32-, 64-, and
173 4096-byte boundaries. Since these short moves are unlikely to be
174 unaligned or cross these boundaries, the overhead to force
175 alignment is not justified.
177 The longer (9-31 byte) move is more likely to cross 32- or 64-byte
178 boundaries. Since only loads are sensitive to the 32-/64-byte
179 boundaries it is more important to align the source than the
180 destination. If the source is not already word aligned, we first
181 move 1-3 bytes as needed. While the destination and stores may
182 still be unaligned, this is only an issue for page (4096 byte
183 boundary) crossing, which should be rare for these short moves.
184 The hardware handles this case automatically with a small delay. */
192 ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
193 /* At least 9 bytes left. Get the source word aligned. */
198 beq .L3 /* If the source is already word aligned skip this. */
199 /* Copy 1-3 bytes to get source address word aligned. */
206 #ifdef __LITTLE_ENDIAN__
214 #ifdef __LITTLE_ENDIAN__
225 #ifdef __LITTLE_ENDIAN__
235 /* At least 6 bytes left and the source is word aligned. */
237 16: /* Move 16 bytes. */
248 8: /* Move 8 bytes. */
256 4: /* Move 4 bytes. */
262 2: /* Move 2-3 bytes. */
273 1: /* Move 1 byte. */
278 /* Return original dst pointer. */
284 /* Special case to copy 0-8 bytes. */
293 /* Return original dst pointer. */
320 /* Return original dst pointer. */
329 /* Copy words where the destination is aligned but the source is
330 not. Use aligned word loads from the source, shifted to realign
331 the data, to allow aligned destination stores.
332 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
333 A single word is retained for storing at loop exit to avoid walking
334 off the end of a page within the loop.
335 If the copy is not an exact multiple of 16 bytes, 1-3
336 words are copied as needed to set up the main loop. After
337 the main loop exits there may be a tail of 1-3 bytes. These bytes are
338 copied a halfword/byte at a time as needed to preserve alignment. */
341 cmplwi cr6,11,0 /* are there tail bytes left ? */
342 subf 5,10,12 /* back up src pointer to prev word alignment */
343 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
344 addi 11,9,-1 /* we move one word after the loop */
345 srwi 8,11,2 /* calculate the 16 byte loop count */
346 lwz 6,0(5) /* load 1st src word into R6 */
348 lwz 7,4(5) /* load 2nd src word into R7 */
350 subfic 9,10,32 /* number of bits to shift 2nd word right */
354 /* there are at least two words to copy, so copy them */
355 #ifdef __LITTLE_ENDIAN__
359 slw 0,6,10 /* shift 1st src word to left align it in R0 */
360 srw 8,7,9 /* shift 2nd src word to right align it in R8 */
362 or 0,0,8 /* or them to get word to store */
363 lwz 6,8(5) /* load the 3rd src word */
364 stw 0,0(4) /* store the 1st dst word */
365 #ifdef __LITTLE_ENDIAN__
369 slw 0,7,10 /* now left align 2nd src word into R0 */
370 srw 8,6,9 /* shift 3rd src word to right align it in R8 */
372 or 0,0,8 /* or them to get word to store */
374 stw 0,4(4) /* store the 2nd dst word */
378 /* there is a third word to copy, so copy it */
379 #ifdef __LITTLE_ENDIAN__
383 slw 0,6,10 /* shift 3rd src word to left align it in R0 */
384 srw 8,7,9 /* shift 4th src word to right align it in R8 */
386 or 0,0,8 /* or them to get word to store */
387 stw 0,0(4) /* store 3rd dst word */
395 #ifdef __LITTLE_ENDIAN__
399 slw 0,6,10 /* shift 1st src word to left align it in R0 */
400 srw 8,7,9 /* shift 2nd src word to right align it in R8 */
403 or 0,0,8 /* or them to get word to store */
408 stw 0,0(4) /* store the 1st dst word */
413 /* copy 16 bytes at a time */
414 #ifdef __LITTLE_ENDIAN__
424 #ifdef __LITTLE_ENDIAN__
434 #ifdef __LITTLE_ENDIAN__
444 #ifdef __LITTLE_ENDIAN__
458 /* calculate and store the final word */
459 #ifdef __LITTLE_ENDIAN__
471 bne cr6,.L9 /* If the tail is 0 bytes we are done! */
473 /* Return original dst pointer. */
481 libc_hidden_builtin_def (memcpy)