sysdeps/powerpc/powerpc32/power4/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28
  29    Longer moves (>= 32-bytes) justify the effort to get at least the
  30    destination word (4-byte) aligned.  Further optimization is
  31    possible when both source and destination are word aligned.
  32    Each case has an optimized unrolled loop.   */
  33
  34         .machine power4
  35 EALIGN (memcpy, 5, 0)       /* In: r3 = dst, r4 = src, r5 = len.  Out: r3 = dst.  */
  36         CALL_MCOUNT
  37
  38     stwu  1,-32(1)      /* Allocate a 32-byte stack frame.  */
  39     cfi_adjust_cfa_offset(32)
  40     stw   30,20(1)      /* Save r30; it is reused to hold the original dst.  */
  41     cfi_offset(30,(20-32))
  42     mr    30,3          /* r30 = dst, copied back into r3 at every exit.  */
  43     cmplwi cr1,5,31     /* cr1: len compared with 31 (short move test).  */
  44     stw   31,24(1)      /* Save r31; the long path keeps the length in it.  */
  45     cfi_offset(31,(24-32))
  46     neg   0,3           /* r0 = -dst; its low 2 bits are extracted below.  */
  47     andi. 11,3,3        /* check alignment of dst (also sets cr0).  */
  48     clrlwi 0,0,30       /* Number of bytes until the 1st word of dst.  */
  49     clrlwi 10,4,30      /* check alignment of src.  */
  50     cmplwi cr6,5,8      /* cr6: len compared with 8, used by .L2/.LE8.  */
  51     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  52     cmplw cr6,10,11     /* cr6 eq iff src and dst have the same misalignment.  */
  53     mr    12,4          /* r12 = working src pointer.  */
  54     srwi  9,5,2         /* Number of full words remaining.  */
  55     mtcrf 0x01,0        /* cr7 = low 4 bits of r0, tested by bf 30/31 below.  */
  56     mr    31,5          /* r31 = number of bytes remaining.  */
  57     beq   .L0           /* cr0 eq (from andi.): dst is already word aligned.  */
  58
  59     subf  31,0,5        /* r31 = len minus the bytes moved to align dst.  */
  60   /* Move 0-3 bytes as needed to get the destination word aligned.  */
  61 1:  bf    31,2f         /* 1s bit of r0 clear: no single byte to move.  */
  62     lbz   6,0(12)
  63     addi  12,12,1
  64     stb   6,0(3)
  65     addi  3,3,1
  66 2:  bf    30,0f         /* 2s bit of r0 clear: no halfword to move.  */
  67     lhz   6,0(12)
  68     addi  12,12,2
  69     sth   6,0(3)
  70     addi  3,3,2
  71 0:
  72     clrlwi 10,12,30     /* check alignment of src again (r10 is used at .L6).  */
  73     srwi  9,31,2        /* Number of full words remaining.  */
  74
  75   /* Copy words from source to destination, assuming the destination is
  76      aligned on a word boundary.
  77
  78      At this point we know there are at least 29 bytes left (32-3) to copy.
  79      The next step is to determine if the source is also word aligned.
  80      If not, branch to the unaligned move code at .L6, which uses
  81      a load, shift, store strategy.
  82
  83      Otherwise source and destination are word aligned, and we can use
  84      the optimized word copy loop.  */
  85 .L0:
  86     clrlwi      11,31,30  /* calculate the number of tail bytes */
  87     mtcrf 0x01,9    /* cr7 = low 4 bits of the word count in r9.  */
  88     bne-  cr6,.L6   /* If source is not word aligned.  */
  89
  90   /* Move words where destination and source are word aligned.
  91      Use an unrolled loop to copy 4 words (16-bytes) per iteration.
  92      If the copy is not an exact multiple of 16 bytes, 1-3
  93      words are copied as needed to set up the main loop.  After
  94      the main loop exits there may be a tail of 1-3 bytes. These bytes are
  95      copied a halfword/byte at a time as needed to preserve alignment.  */
  96
  97     srwi  8,31,4    /* calculate the 16 byte loop count */
  98     cmplwi      cr1,9,4   /* cr1: word count compared with 4.  */
  99     cmplwi      cr6,11,0  /* cr6 eq iff there are no tail bytes.  */
 100     mr    11,12     /* r11 = src read pointer for the loop.  */
 101
 102     bf    30,1f     /* 2s bit of the word count clear: no word pair to move.  */
 103     lwz   6,0(12)
 104     lwz   7,4(12)
 105     addi  11,12,8   /* Loop src pointer starts after the two words.  */
 106     mtctr 8         /* ctr = 16 byte loop count.  */
 107     stw   6,0(3)
 108     stw   7,4(3)
 109     addi  10,3,8    /* r10 = dst write pointer for the loop.  */
 110     bf    31,4f     /* 1s bit of the word count clear: no third word.  */
 111     lwz   0,8(12)
 112     stw   0,8(3)
 113     blt   cr1,3f    /* Fewer than 4 words in total: skip the loop.  */
 114     addi  11,12,12  /* Loop pointers start after the three words.  */
 115     addi  10,3,12
 116     b     4f
 117     .align  4
 118 1:
 119     mr    10,3      /* r10 = dst write pointer for the loop.  */
 120     mtctr 8         /* ctr = 16 byte loop count.  */
 121     bf    31,4f     /* 1s bit of the word count clear: go straight to the loop.  */
 122     lwz   6,0(12)
 123     addi  11,12,4
 124     stw   6,0(3)
 125     addi  10,3,4
 126
 127     .align  4
 128 4:                  /* Main loop: copy 16 bytes per iteration.  */
 129     lwz   6,0(11)
 130     lwz   7,4(11)
 131     lwz   8,8(11)
 132     lwz   0,12(11)
 133     stw   6,0(10)
 134     stw   7,4(10)
 135     stw   8,8(10)
 136     stw   0,12(10)
 137     addi  11,11,16
 138     addi  10,10,16
 139     bdnz  4b
 140 3:
 141     clrrwi 0,31,2   /* r0 = r31 with the low 2 bits cleared (bytes moved as words).  */
 142     mtcrf 0x01,31   /* cr7 = low 4 bits of r31; bits 30/31 select the tail moves.  */
 143     beq   cr6,0f    /* No tail bytes: return.  */
 144 .L9:
 145     add   3,3,0     /* Advance dst past the bytes copied as words.  */
 146     add   12,12,0   /* Advance src by the same amount.  */
 147
 148 /*  At this point we have a tail of 0-3 bytes and we know that the
 149     destination is word aligned.  */
 150 2:  bf    30,1f     /* 2s bit of the length clear: no halfword in the tail.  */
 151     lhz   6,0(12)
 152     addi  12,12,2
 153     sth   6,0(3)
 154     addi  3,3,2
 155 1:  bf    31,0f     /* 1s bit of the length clear: no final byte.  */
 156     lbz   6,0(12)
 157     stb   6,0(3)
 158 0:
 159   /* Return original dst pointer.  */
 160     mr  3,30
 161     lwz 30,20(1)    /* Restore the saved r30 and r31.  */
 162     lwz 31,24(1)
 163     addi 1,1,32     /* Release the 32-byte stack frame.  */
 164     blr
 165
 166 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and
 167    9-31 bytes.  Each case is handled without loops, using binary
 168    (1,2,4,8) tests.
 169
 170    In the short (0-8 byte) case no attempt is made to force alignment
 171    of either source or destination.  The hardware will handle the
 172    unaligned load/stores with small delays for crossing 32-, 64-byte, and
 173    4096-byte boundaries. Since these short moves are unlikely to be
 174    unaligned or cross these boundaries, the overhead to force
 175    alignment is not justified.
 176
 177    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 178    boundaries.  Since only loads are sensitive to the 32-/64-byte
 179    boundaries it is more important to align the source than the
 180    destination.  If the source is not already word aligned, we first
 181    move 1-3 bytes as needed.  While the destination and stores may
 182    still be unaligned, this is only an issue for page (4096 byte
 183    boundary) crossing, which should be rare for these short moves.
 184    The hardware handles this case automatically with a small delay.  */
 185
 186     .align  4
 187 .L2:
 188     mtcrf 0x01,5        /* cr7 = low 4 bits of len, tested by bf 28..31 below.  */
 189     neg   8,4           /* r8 = -src.  */
 190     clrrwi 11,4,2       /* r11 = src rounded down to a word boundary.  */
 191     andi. 0,8,3         /* r0 = bytes needed to word align src; sets cr0.  */
 192     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 193 /* At least 9 bytes left.  Get the source word aligned.  */
 194     cmplwi      cr1,5,16    /* cr1: len compared with 16.  */
 195     mr    10,5          /* r10 = bytes remaining.  */
 196     mr    12,4          /* r12 = working src pointer.  */
 197     cmplwi      cr6,0,2     /* cr6: r0 vs 2 (lt = 1 byte, eq = 2, gt = 3).  */
 198     beq   .L3   /* If the source is already word aligned skip this.  */
 199 /* Copy 1-3 bytes to get source address word aligned.  */
 200     lwz   6,0(11)       /* Load the aligned word that holds the leading bytes.  */
 201     subf  10,0,5        /* r10 = len minus the alignment bytes.  */
 202     add   12,4,0        /* r12 = src advanced to the next word boundary.  */
 203     blt   cr6,5f        /* One byte to move.  */
 204     srwi  7,6,16        /* r7 = upper halfword of the loaded word.  */
 205     bgt   cr6,3f        /* Three bytes to move.  */
 206 #ifdef __LITTLE_ENDIAN__
 207     sth   7,0(3)        /* Two bytes: they are the upper halfword of r6.  */
 208 #else
 209     sth   6,0(3)        /* Two bytes: they are the low halfword of r6.  */
 210 #endif
 211     b     7f
 212     .align  4
 213 3:
 214 #ifdef __LITTLE_ENDIAN__
 215     rotlwi 6,6,24       /* Rotate the second byte of the word into the low byte.  */
 216     stb   6,0(3)
 217     sth   7,1(3)
 218 #else
 219     stb   7,0(3)
 220     sth   6,1(3)
 221 #endif
 222     b     7f
 223     .align  4
 224 5:
 225 #ifdef __LITTLE_ENDIAN__
 226     rotlwi 6,6,8        /* Rotate the top byte of the word into the low byte.  */
 227 #endif
 228     stb   6,0(3)
 229 7:
 230     cmplwi      cr1,10,16   /* Redo the 16 byte test with the reduced length.  */
 231     add   3,3,0         /* Advance dst past the bytes just stored.  */
 232     mtcrf 0x01,10       /* cr7 = low 4 bits of the remaining length.  */
 233     .align  4
 234 .L3:
 235 /* At least 6 bytes left and the source is word aligned.  */
 236     blt   cr1,8f        /* Fewer than 16 bytes remain: skip the 16 byte move.  */
 237 16: /* Move 16 bytes.  */
 238     lwz   6,0(12)
 239     lwz   7,4(12)
 240     stw   6,0(3)
 241     lwz   6,8(12)
 242     stw   7,4(3)
 243     lwz   7,12(12)
 244     addi  12,12,16
 245     stw   6,8(3)
 246     stw   7,12(3)
 247     addi  3,3,16
 248 8:  /* Move 8 bytes.  */
 249     bf    28,4f         /* 8s bit of the length clear.  */
 250     lwz   6,0(12)
 251     lwz   7,4(12)
 252     addi  12,12,8
 253     stw   6,0(3)
 254     stw   7,4(3)
 255     addi  3,3,8
 256 4:  /* Move 4 bytes.  */
 257     bf    29,2f         /* 4s bit of the length clear.  */
 258     lwz   6,0(12)
 259     addi  12,12,4
 260     stw   6,0(3)
 261     addi  3,3,4
 262 2:  /* Move 2-3 bytes.  */
 263     bf    30,1f         /* 2s bit of the length clear.  */
 264     lhz   6,0(12)
 265     sth   6,0(3)
 266     bf    31,0f         /* 1s bit clear: exactly 2 bytes, done.  */
 267     lbz   7,2(12)
 268     stb   7,2(3)
 269     mr    3,30          /* Return original dst pointer.  */
 270     lwz   30,20(1)      /* Only r30 is reloaded; this path never writes r31.  */
 271     addi  1,1,32
 272     blr
 273 1:  /* Move 1 byte.  */
 274     bf    31,0f
 275     lbz   6,0(12)
 276     stb   6,0(3)
 277 0:
 278   /* Return original dst pointer.  */
 279     mr   3,30
 280     lwz  30,20(1)       /* Only r30 is reloaded; this path never writes r31.  */
 281     addi 1,1,32         /* Release the 32-byte stack frame.  */
 282     blr
 283
 284 /* Special case to copy 0-8 bytes.  */
 285     .align  4
 286 .LE8:
 287     mr    12,4          /* r12 = src, as expected by the code reached via 2b.  */
 288     bne   cr6,4f        /* cr6 (len vs 8, set at entry) not eq: fewer than 8 bytes.  */
 289     lwz   6,0(4)        /* Exactly 8 bytes: move two words.  */
 290     lwz   7,4(4)
 291     stw   6,0(3)
 292     stw   7,4(3)
 293   /* Return original dst pointer.  */
 294     mr    3,30
 295     lwz   30,20(1)      /* Only r30 is reloaded; this path never writes r31.  */
 296     addi  1,1,32
 297     blr
 298     .align  4
 299 4:  bf    29,2b         /* 4s bit of len clear: use the preceding 2-3/1 byte code.  */
 300     lwz   6,0(4)        /* 4-7 bytes: move the first word.  */
 301     stw   6,0(3)
 302 6:
 303     bf    30,5f         /* 2s bit of len clear: at most one more byte.  */
 304     lhz   7,4(4)
 305     sth   7,4(3)
 306     bf    31,0f         /* 1s bit clear: done after the halfword.  */
 307     lbz   8,6(4)
 308     stb   8,6(3)
 309     mr    3,30          /* Return original dst pointer.  */
 310     lwz   30,20(1)
 311     addi  1,1,32
 312     blr
 313     .align  4
 314 5:
 315     bf    31,0f         /* 1s bit clear: nothing left after the word.  */
 316     lbz   6,4(4)
 317     stb   6,4(3)
 318     .align  4
 319 0:
 320   /* Return original dst pointer.  */
 321     mr   3,30
 322     lwz  30,20(1)       /* Only r30 is reloaded; this path never writes r31.  */
 323     addi 1,1,32         /* Release the 32-byte stack frame.  */
 324     blr
 325
 326     .align  4
 327 .L6:
 328
 329   /* Copy words where the destination is aligned but the source is
 330      not.  Use aligned word loads from the source, shifted to realign
 331      the data, to allow aligned destination stores.
 332      Use an unrolled loop to copy 4 words (16-bytes) per iteration.
 333      A single word is retained for storing at loop exit to avoid walking
 334      off the end of a page within the loop.
 335      If the copy is not an exact multiple of 16 bytes, 1-3
 336      words are copied as needed to set up the main loop.  After
 337      the main loop exits there may be a tail of 1-3 bytes. These bytes are
 338      copied a halfword/byte at a time as needed to preserve alignment.  */
 339
 340
 341     cmplwi  cr6,11,0  /* are there tail bytes left ? */
 342     subf    5,10,12   /* back up src pointer to prev word alignment */
 343     slwi    10,10,3   /* calculate number of bits to shift 1st word left */
 344     addi    11,9,-1   /* we move one word after the loop */
 345     srwi    8,11,2    /* calculate the 16 byte loop count */
 346     lwz     6,0(5)    /* load 1st src word into R6 */
 347     mr      4,3       /* r4 = dst write pointer (r3 is kept for the tail at .L9) */
 348     lwz     7,4(5)    /* load 2nd src word into R7 */
 349     mtcrf   0x01,11   /* cr7 = low 4 bits of (word count - 1) */
 350     subfic  9,10,32   /* number of bits to shift 2nd word right */
 351     mtctr   8         /* ctr = 16 byte loop count */
 352     bf      30,1f     /* 2s bit of (word count - 1) clear: no word pair to move */
 353
 354     /* there are at least two words to copy, so copy them */
 355 #ifdef __LITTLE_ENDIAN__
 356     srw   0,6,10
 357     slw   8,7,9
 358 #else
 359     slw   0,6,10  /* shift 1st src word to left align it in R0 */
 360     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
 361 #endif
 362     or    0,0,8   /* or them to get word to store */
 363     lwz   6,8(5)  /* load the 3rd src word */
 364     stw   0,0(4)  /* store the 1st dst word */
 365 #ifdef __LITTLE_ENDIAN__
 366     srw   0,7,10
 367     slw   8,6,9
 368 #else
 369     slw   0,7,10  /* now left align 2nd src word into R0 */
 370     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
 371 #endif
 372     or    0,0,8   /* or them to get word to store */
 373     lwz   7,12(5) /* load the 4th src word */
 374     stw   0,4(4)  /* store the 2nd dst word */
 375     addi  4,4,8   /* advance dst past the two stored words */
 376     addi  5,5,16  /* advance src past the four loaded words */
 377     bf    31,4f   /* 1s bit of (word count - 1) clear: no third word */
 378     /* there is a third word to copy, so copy it */
 379 #ifdef __LITTLE_ENDIAN__
 380     srw   0,6,10
 381     slw   8,7,9
 382 #else
 383     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
 384     srw   8,7,9   /* shift 4th src word to right align it in R8 */
 385 #endif
 386     or    0,0,8   /* or them to get word to store */
 387     stw   0,0(4)  /* store 3rd dst word */
 388     mr    6,7     /* the 4th src word becomes the leading word */
 389     lwz   7,0(5)  /* load the next src word */
 390     addi  5,5,4
 391     addi  4,4,4
 392     b     4f
 393     .align 4
 394 1:
 395 #ifdef __LITTLE_ENDIAN__
 396     srw     0,6,10
 397     slw     8,7,9
 398 #else
 399     slw     0,6,10  /* shift 1st src word to left align it in R0 */
 400     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
 401 #endif
 402     addi  5,5,8   /* advance src past the two loaded words */
 403     or    0,0,8   /* or them to get word to store */
 404     bf    31,4f   /* no single word to move; the loop recomputes R0 from R6/R7 */
 405     mr    6,7     /* the 2nd src word becomes the leading word */
 406     lwz   7,0(5)  /* load the next src word */
 407     addi  5,5,4
 408     stw   0,0(4)  /* store the 1st dst word */
 409     addi  4,4,4
 410
 411     .align  4
 412 4:
 413     /* copy 16 bytes at a time */
 414 #ifdef __LITTLE_ENDIAN__
 415     srw   0,6,10
 416     slw   8,7,9
 417 #else
 418     slw   0,6,10
 419     srw   8,7,9
 420 #endif
 421     or    0,0,8   /* merge R6 and R7 into the 1st dst word of this group */
 422     lwz   6,0(5)
 423     stw   0,0(4)
 424 #ifdef __LITTLE_ENDIAN__
 425     srw   0,7,10
 426     slw   8,6,9
 427 #else
 428     slw   0,7,10
 429     srw   8,6,9
 430 #endif
 431     or    0,0,8   /* merge R7 and R6 into the 2nd dst word */
 432     lwz   7,4(5)
 433     stw   0,4(4)
 434 #ifdef __LITTLE_ENDIAN__
 435     srw   0,6,10
 436     slw   8,7,9
 437 #else
 438     slw   0,6,10
 439     srw   8,7,9
 440 #endif
 441     or    0,0,8   /* merge R6 and R7 into the 3rd dst word */
 442     lwz   6,8(5)
 443     stw   0,8(4)
 444 #ifdef __LITTLE_ENDIAN__
 445     srw   0,7,10
 446     slw   8,6,9
 447 #else
 448     slw   0,7,10
 449     srw   8,6,9
 450 #endif
 451     or    0,0,8   /* merge R7 and R6 into the 4th dst word */
 452     lwz   7,12(5)
 453     stw   0,12(4)
 454     addi  5,5,16
 455     addi  4,4,16
 456     bdnz+ 4b
 457 8:
 458     /* calculate and store the final word */
 459 #ifdef __LITTLE_ENDIAN__
 460     srw   0,6,10
 461     slw   8,7,9
 462 #else
 463     slw   0,6,10
 464     srw   8,7,9
 465 #endif
 466     or    0,0,8
 467     stw   0,0(4)
 468 3:
 469     clrrwi 0,31,2       /* r0 = r31 with the low 2 bits cleared (bytes moved as words).  */
 470     mtcrf 0x01,31       /* cr7 = low 4 bits of r31 for the tail tests at .L9.  */
 471     bne   cr6,.L9       /* If the tail is 0 bytes we are done!  */
 472
 473   /* Return original dst pointer.  */
 474     mr   3,30
 475     lwz  30,20(1)       /* Restore the saved r30 and r31.  */
 476     lwz  31,24(1)
 477     addi 1,1,32         /* Release the 32-byte stack frame.  */
 478     blr
 479 END (memcpy)
 480
 481 libc_hidden_builtin_def (memcpy)