sysdeps/powerpc/powerpc32/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32 bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (the remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28
  29    Longer moves (>= 32-bytes) justify the effort to get at least the
  30    destination word (4-byte) aligned.  Further optimization is
  31    possible when both source and destination are word aligned.
  32    Each case has an optimized unrolled loop.   */
  33
  34         .machine power6
  35 EALIGN (memcpy, 5, 0)
  36         CALL_MCOUNT
  37 /* Entry.  In: r3 = dst, r4 = src, r5 = len.  Allocates a 32-byte frame and
        sends moves of fewer than 32 bytes to L(word_unaligned_short).  Longer
        moves save r30/r31, keep the original dst in r30 for the return value,
        and copy 0-3 leading bytes so that dst (r3) becomes word aligned.
        On leaving this section: r3 = aligned dst, r12 = adjusted src,
        r31 = adjusted len, r9 = r31 / 4, r10 = alignment of r12, and cr6
        tells whether src and dst started with the same alignment.  */
  38     stwu   1,-32(1)
  39     cfi_adjust_cfa_offset(32)
  40     cmplwi cr1,5,31     /* check for short move.  */
  41     neg    0,3
  43     clrlwi 10,4,30      /* check alignment of src.  */
  44     andi.  11,3,3       /* check alignment of dst (also sets cr0).  */
  45     clrlwi 0,0,30       /* Number of bytes until the 1st word of dst.  */
  46     ble-   cr1,L(word_unaligned_short)  /* If move < 32 bytes.  */
  47     cmplw  cr6,10,11    /* cr6: do src and dst share the same alignment?  */
  48     stw    31,24(1)
  49     cfi_offset(31,(24-32))
  50     stw    30,20(1)
  51     cfi_offset(30,(20-32))
  52     mr     30,3         /* Keep the original dst for the return value.  */
  53     beq    .L0          /* cr0 from the andi.: dst is already aligned.  */
  54     mtcrf  0x01,0       /* Low bits of the byte count (r0) into cr7.  */
  55     subf  31,0,5        /* Length after alignment.  */
  56     add   12,4,0        /* Compute src addr after alignment.  */
  57   /* Move 0-3 bytes as needed to get the destination word aligned.  */
  58 1:  bf    31,2f         /* Even count: 0 or 2 bytes.  */
  59     lbz   6,0(4)
  60     bf    30,3f         /* Count is 1: just the byte.  */
  61     lhz   7,1(4)        /* Count is 3: the byte plus a halfword.  */
  62     stb   6,0(3)
  63     sth   7,1(3)
  64     addi  3,3,3
  65     b     0f
  66 3:
  67     stb   6,0(3)
  68     addi  3,3,1
  69     b     0f
  70 2:  bf    30,0f         /* Count is 0: nothing to move.  */
  71     lhz   6,0(4)        /* Count is 2: one halfword.  */
  72     sth   6,0(3)
  73     addi  3,3,2
  74 0:
  75     clrlwi 10,12,30     /* check alignment of src again.  */
  76     srwi   9,31,2       /* Number of full words remaining.  */
  77     bne-   cr6,L(wdu)   /* If source is not word aligned.  */
  78     clrlwi 11,31,30  /* calculate the number of tail bytes */
  79     b      L(word_aligned)
  80   /* Copy words from source to destination, assuming the destination is
  81      aligned on a word boundary.
  82
  83      At this point we know there are at least 29 bytes left (32-3) to copy.
  84      The next step is to determine if the source is also word aligned.
  85      If not, branch to the unaligned move code at L(wdu), which uses
  86      a load, shift, store strategy.
  87
  88      Otherwise source and destination are word aligned, and we can use
  89      the optimized word copy loop.  */
  90     .align  4
  91 .L0:
  92     mr     31,5         /* dst needed no alignment: len is unchanged ...  */
  93     mr     12,4         /* ... and so is src.  */
  94     bne-   cr6,L(wdu)   /* If source is not word aligned.  */
  95     srwi   9,5,2        /* Number of full words remaining.  */
  96     clrlwi 11,5,30      /* calculate the number of tail bytes */
  97
  98   /* Move words where destination and source are word aligned.
  99      Use an unrolled loop to copy 4 words (16-bytes) per iteration.
 100      If the copy is not an exact multiple of 16 bytes, 1-3
 101      words are copied as needed to set up the main loop.  After
 102      the main loop exits there may be a tail of 1-3 bytes. These bytes are
 103      copied a halfword/byte at a time as needed to preserve alignment.  */
 104 L(word_aligned):
 105     mtcrf 0x01,9    /* Low bits of the word count into cr7.  */
 106     srwi  8,31,4    /* calculate the 16 byte loop count */
 107     cmplwi      cr1,9,4     /* cr1: fewer than 4 words in total?  */
 108     cmplwi      cr6,11,0    /* cr6: is the byte tail empty?  */
 109     mr    11,12
 110
 111     bf    30,1f     /* No 2-word remainder.  */
 112     lwz   6,0(12)
 113     lwz   7,4(12)
 114     addi  11,12,8
 115     mtctr 8
 116     stw   6,0(3)
 117     stw   7,4(3)
 118     addi  10,3,8
 119     bf    31,4f     /* No additional single word.  */
 120     lwz   0,8(12)
 121     stw   0,8(3)
 122     blt   cr1,3f    /* Fewer than 4 words: skip the loop.  */
 123     addi  11,12,12
 124     addi  10,3,12
 125     b     4f
 126     .align  4
 127 1:
 128     mr    10,3
 129     mtctr 8
 130     bf    31,4f     /* No single-word remainder either.  */
 131     lwz   6,0(12)
 132     addi  11,12,4
 133     stw   6,0(3)
 134     addi  10,3,4
 135
 136     .align  4
 137 4:              /* Main loop: r11 = src cursor, r10 = dst cursor.  */
 138     lwz   6,0(11)
 139     lwz   7,4(11)
 140     lwz   8,8(11)
 141     lwz   0,12(11)
 142     stw   6,0(10)
 143     stw   7,4(10)
 144     stw   8,8(10)
 145     stw   0,12(10)
 146     addi  11,11,16
 147     addi  10,10,16
 148     bdnz  4b
 149 3:
 150     clrrwi 0,31,2   /* r0 = number of bytes copied as whole words.  */
 151     mtcrf 0x01,31   /* Low bits of the length into cr7 for the tail.  */
 152     beq   cr6,0f    /* No tail bytes: done.  */
 153 .L9:
 154     add   3,3,0     /* r3 and r12 were not advanced by the word copy;  */
 155     add   12,12,0   /* step both past the words already moved.  */
 156
 157 /*  At this point we have a tail of 0-3 bytes and we know that the
 158     destination is word aligned.  */
 159 2:  bf    30,1f
 160     lhz   6,0(12)
 161     addi  12,12,2
 162     sth   6,0(3)
 163     addi  3,3,2
 164 1:  bf    31,0f
 165     lbz   6,0(12)
 166     stb   6,0(3)
 167 0:
 168   /* Return original dst pointer.  */
 169     mr  3,30
 170     lwz 30,20(1)
 171     lwz 31,24(1)
 172     addi 1,1,32
 173     blr
 174
 175 /* Copy up to 31 bytes.  This is divided into two cases, 0-8 bytes and
 176    9-31 bytes.  Each case is handled without loops, using binary (1,2,4,8)
 177    tests.
 178
 179    In the short (0-8 byte) case no attempt is made to force alignment
 180    of either source or destination.  The hardware will handle the
 181    unaligned load/stores with small delays for crossing 32-, 128-byte,
 182    and 4096-byte boundaries. Since these short moves are unlikely to be
 183    unaligned or cross these boundaries, the overhead to force
 184    alignment is not justified.
 185
 186    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 187    boundaries.  Since only loads are sensitive to the 32-/128-byte
 188    boundaries it is more important to align the source than the
 189    destination.  If the source is not already word aligned, we first
 190    move 1-3 bytes as needed.  Since we are only word aligned we don't
 191    use double word load/stores to ensure that all loads are aligned.
 192    While the destination and stores may still be unaligned, this
 193    is only an issue for page (4096 byte boundary) crossing, which
 194    should be rare for these short moves.  The hardware handles this
 195    case automatically with a small (~20 cycle) delay.  */
 196     .align  4
 197
 198     cfi_same_value (31)
 199     cfi_same_value (30)
 200 L(word_unaligned_short):
 201     mtcrf 0x01,5        /* Low bits of len into cr7 for the bit tests.  */
 202     cmplwi cr6,5,8
 203     neg   8,4
 204     clrrwi      9,4,2   /* r9 = src rounded down to a word boundary.  */
 205     andi. 0,8,3         /* r0 = bytes until src is word aligned; sets cr0.  */
 206     beq   cr6,L(wus_8)  /* Handle moves of exactly 8 bytes.  */
 207 /* For 9 or more bytes, get the source word aligned.  */
 208     cmplwi      cr1,5,16
 209     mr    12,4
 210     ble   cr6,L(wus_4)  /* Handle moves of 0-7 bytes.  */
 211     mr    11,3          /* r11 is the dst cursor; r3 itself is never changed.  */
 212     mr    10,5
 213     cmplwi      cr6,0,2
 214     beq   L(wus_tail)   /* If the source is already word aligned skip this.  */
 215 /* Copy 1-3 bytes to get source address word aligned.  */
 216     lwz   6,0(9)        /* Aligned word whose low-order bytes are the 1-3
                               leading bytes (relies on big-endian order).  */
 217     subf  10,0,5        /* r10 = length left after the alignment bytes.  */
 218     add   12,4,0        /* r12 = word-aligned src.  */
 219     blt   cr6,5f        /* 1 byte.  */
 220     srwi  7,6,16
 221     bgt   cr6,3f        /* 3 bytes.  */
 222     sth   6,0(3)        /* 2 bytes.  */
 223     b     7f
 224     .align  4
 225 3:
 226     stb   7,0(3)
 227     sth   6,1(3)
 228     b     7f
 229     .align  4
 230 5:
 231     stb   6,0(3)
 232 7:
 233     cmplwi      cr1,10,16
 234     add   11,3,0        /* r11 = dst advanced past the alignment bytes.  */
 235     mtcrf 0x01,10       /* Refresh cr7 from the adjusted length.  */
 236     .align  4
 237 L(wus_tail):
 238 /* At least 6 bytes left and the source is word aligned.  This allows
 239    some speculative loads up front.  */
 240 /* We need to special case the fall-through because the biggest delays
 241    are due to address computation not being ready in time for the
 242    AGEN.  */
 243     lwz   6,0(12)
 244     lwz   7,4(12)
 245     blt   cr1,L(wus_tail8)      /* Fewer than 16 bytes left.  */
 246     cmplwi      cr0,10,24
 247 L(wus_tail16): /* Move 16 bytes.  */
 248     stw   6,0(11)
 249     stw   7,4(11)
 250     lwz   6,8(12)
 251     lwz   7,12(12)
 252     stw   6,8(11)
 253     stw   7,12(11)
 254 /* Move 8 bytes more.  */
 255     bf    28,L(wus_tail16p8)
 256     cmplwi      cr1,10,28
 257     lwz   6,16(12)
 258     lwz   7,20(12)
 259     stw   6,16(11)
 260     stw   7,20(11)
 261 /* Move 4 bytes more.  */
 262     bf    29,L(wus_tail16p4)
 263     lwz   6,24(12)
 264     stw   6,24(11)
 265     addi  12,12,28
 266     addi  11,11,28
 267     bgt   cr1,L(wus_tail2)
 268  /* exactly 28 bytes.  Return original dst pointer and exit.  */
 269     addi  1,1,32
 270     blr
 271     .align  4
 272 L(wus_tail16p8):  /* less than 8 bytes left.  */
 273     beq   cr1,L(wus_tailX) /* exactly 16 bytes, early exit.  */
 274     cmplwi      cr1,10,20
 275     bf    29,L(wus_tail16p2)
 276 /* Move 4 bytes more.  */
 277     lwz   6,16(12)
 278     stw   6,16(11)
 279     addi  12,12,20
 280     addi  11,11,20
 281     bgt   cr1,L(wus_tail2)
 282  /* exactly 20 bytes.  Return original dst pointer and exit.  */
 283     addi  1,1,32
 284     blr
 285     .align  4
 286 L(wus_tail16p4):  /* less than 4 bytes left.  */
 287     addi  12,12,24
 288     addi  11,11,24
 289     bgt   cr0,L(wus_tail2)
 290  /* exactly 24 bytes.  Return original dst pointer and exit.  */
 291     addi  1,1,32
 292     blr
 293     .align  4
 294 L(wus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 295     addi  12,12,16
 296     addi  11,11,16
 297     b     L(wus_tail2)
 298
 299     .align  4
 300 L(wus_tail8):  /* Move 8 bytes.  */
 301 /*  r6, r7 already loaded speculatively.  */
 302     cmplwi      cr1,10,8
 303     cmplwi      cr0,10,12
 304     bf    28,L(wus_tail4)
 305     stw   6,0(11)
 306     stw   7,4(11)
 307 /* Move 4 bytes more.  */
 308     bf    29,L(wus_tail8p4)
 309     lwz   6,8(12)
 310     stw   6,8(11)
 311     addi  12,12,12
 312     addi  11,11,12
 313     bgt   cr0,L(wus_tail2)
 314  /* exactly 12 bytes.  Return original dst pointer and exit.  */
 315     addi  1,1,32
 316     blr
 317     .align  4
 318 L(wus_tail8p4):  /* less than 4 bytes left.  */
 319     addi  12,12,8
 320     addi  11,11,8
 321     bgt   cr1,L(wus_tail2)
 322  /* exactly 8 bytes.  Return original dst pointer and exit.  */
 323     addi  1,1,32
 324     blr
 325
 326     .align  4
 327 L(wus_tail4):  /* Move 4 bytes.  */
 328 /*  r6 already loaded speculatively.  If we are here we know there are
 329     more than 4 bytes left.  So there is no need to test.  */
 330     addi  12,12,4
 331     stw   6,0(11)
 332     addi  11,11,4
 333 L(wus_tail2):  /* Move 2-3 bytes.  */
 334     bf    30,L(wus_tail1)
 335     lhz   6,0(12)
 336     sth   6,0(11)
 337     bf    31,L(wus_tailX)
 338     lbz   7,2(12)
 339     stb   7,2(11)
 340     addi  1,1,32
 341     blr
 342 L(wus_tail1):  /* Move 1 byte.  */
 343     bf    31,L(wus_tailX)
 344     lbz   6,0(12)
 345     stb   6,0(11)
 346 L(wus_tailX):
 347   /* Return original dst pointer (r3 was never modified on this path).  */
 348     addi  1,1,32
 349     blr
 350
 351 /* Special case to copy exactly 8 bytes.  */
 352     .align  4
 353 L(wus_8):
 354     lwz   6,0(4)
 355     lwz   7,4(4)
 356     stw   6,0(3)
 357     stw   7,4(3)
 358   /* Return original dst pointer.  */
 359     addi  1,1,32
 360     blr
 361     .align  4
 362 L(wus_4):         /* 0-7 bytes: test the 4, 2 and 1 bits of len in cr7.  */
 363     bf    29,L(wus_2)
 364     lwz   6,0(4)
 365     stw   6,0(3)
 366     bf    30,L(wus_5)
 367     lhz   7,4(4)
 368     sth   7,4(3)
 369     bf    31,L(wus_0)
 370     lbz   8,6(4)
 371     stb   8,6(3)
 372     addi  1,1,32
 373     blr
 374     .align  4
 375 L(wus_5):         /* 4 bytes moved; move a 5th byte if len is odd.  */
 376     bf    31,L(wus_0)
 377     lbz   6,4(4)
 378     stb   6,4(3)
 379   /* Return original dst pointer.  */
 380     addi 1,1,32
 381     blr
 382     .align  4
 383 L(wus_2):  /* Move 2-3 bytes.  */
 384     bf    30,L(wus_1)
 385     lhz   6,0(4)
 386     sth   6,0(3)
 387     bf    31,L(wus_0)
 388     lbz   7,2(4)
 389     stb   7,2(3)
 390     addi  1,1,32
 391     blr
 392     .align  4
 393 L(wus_1):  /* Move 1 byte.  */
 394     bf    31,L(wus_0)
 395     lbz   6,0(4)
 396     stb   6,0(3)
 397     .align  3
 398 L(wus_0):
 399   /* Return original dst pointer.  */
 400     addi  1,1,32
 401     blr
 402
 403     .align  4
 404     cfi_offset(31,(24-32))
 405     cfi_offset(30,(20-32))
 406 L(wdu):
 407
 408   /* Copy words where the destination is aligned but the source is
 409      not.  For power4, power5 and power6 machines there is a penalty for
 410      unaligned loads (src) that cross 32-byte, cacheline, or page
 411      boundaries. So we want to use simple (unaligned) loads where
 412      possible but avoid them where we know the load would span a 32-byte
 413      boundary.
 414
 415      At this point we know we have at least 29 (32-3) bytes to copy,
 416      the src is unaligned, and we may cross at least one 32-byte
 417      boundary. Also we have the following register values:
 418      r3 == adjusted dst, word aligned
 419      r4 == unadjusted src
 420      r5 == unadjusted len
 421      r9 == adjusted Word length
 422      r10 == src alignment (1-3)
 423      r12 == adjusted src, not aligned
 424      r31 == adjusted len
 425
 426      First we need to copy words up to but not crossing the next 32-byte
 427      boundary. Then perform aligned loads just before and just after
 428      the boundary and use shifts and or to generate the next aligned
 429      word for dst. If more than 32 bytes remain we copy (unaligned src)
 430      the next 7 words and repeat the loop until less than 32-bytes
 431      remain.
 432
 433      Then if more than 4 bytes remain we again use aligned loads,
 434      shifts and or to generate the next dst word. We then process the
 435      remaining words using unaligned loads as needed. Finally we check
 436      if there are 1-3 bytes remaining and use
 437      halfword and/or byte load/stores to complete the copy.
 438 */
 439     mr      4,12      /* restore unaligned adjusted src ptr */
 440     clrlwi  0,12,27   /* Find dist from previous 32-byte boundary.  */
 441     slwi    10,10,3   /* calculate number of bits to shift 1st word left */
 442     cmplwi  cr5,0,16
 443     subfic  8,0,32   /* Number of bytes to next 32-byte boundary.  */
 444
 445     mtcrf   0x01,8   /* Its 8 and 4 bits go to cr7 bits 28 and 29.  */
 446     cmplwi  cr1,10,16  /* cr1 picks the 8-, 16- or 24-bit shift variant.  */
 447     subfic  9,10,32  /* number of bits to shift 2nd word right */
 448 /*  This test is reversed because the timing to compare the bytes to
 449     32-byte boundary could not be met.  So we compare the bytes from
 450     previous 32-byte boundary and invert the test.  */
 451     bge     cr5,L(wdu_h32_8)
 452     .align  4
 453     lwz   6,0(4)     /* At least 16 bytes before the boundary: move 16.  */
 454     lwz   7,4(4)
 455     addi  12,4,16    /* generate alternate pointers to avoid agen */
 456     addi  11,3,16    /* timing issues downstream.  */
 457     stw   6,0(3)
 458     stw   7,4(3)
 459     subi  31,31,16
 460     lwz   6,8(4)
 461     lwz   7,12(4)
 462     addi  4,4,16
 463     stw   6,8(3)
 464     stw   7,12(3)
 465     addi  3,3,16
 466     bf    28,L(wdu_h32_4)
 467     lwz   6,0(12)
 468     lwz   7,4(12)
 469     subi  31,31,8
 470     addi  4,4,8
 471     stw   6,0(11)
 472     stw   7,4(11)
 473     addi  3,3,8
 474     bf    29,L(wdu_h32_0)
 475     lwz   6,8(12)
 476     addi  4,4,4
 477     subi  31,31,4
 478     stw   6,8(11)
 479     addi  3,3,4
 480     b     L(wdu_h32_0)
 481     .align  4
 482 L(wdu_h32_8):    /* Fewer than 16 bytes before the boundary.  */
 483     bf    28,L(wdu_h32_4)
 484     lwz   6,0(4)
 485     lwz   7,4(4)
 486     subi  31,31,8
 487     bf    29,L(wdu_h32_8x)
 488     stw   6,0(3)
 489     stw   7,4(3)
 490     lwz   6,8(4)
 491     addi  4,4,12
 492     subi  31,31,4
 493     stw   6,8(3)
 494     addi  3,3,12
 495     b     L(wdu_h32_0)
 496     .align  4
 497 L(wdu_h32_8x):   /* Exactly two words before the boundary.  */
 498     addi  4,4,8
 499     stw   6,0(3)
 500     stw   7,4(3)
 501     addi  3,3,8
 502     b     L(wdu_h32_0)
 503     .align  4
 504 L(wdu_h32_4):    /* At most one word before the boundary.  */
 505     bf    29,L(wdu_h32_0)
 506     lwz   6,0(4)
 507     subi  31,31,4
 508     addi  4,4,4
 509     stw   6,0(3)
 510     addi  3,3,4
 511     .align  4
 512 L(wdu_h32_0):
 513 /*  set up for 32-byte boundary crossing word move and possibly 32-byte
 514     move loop.  */
 515     clrrwi  12,4,2     /* r12 = src rounded down to a word boundary.  */
 516     cmplwi  cr5,31,32  /* cr5: fewer than 32 bytes left?  */
 517     bge     cr1,L(wdu2_32)  /* Shift >= 16 bits; else fall into L(wdu1_32).  */
 518 #if 0  /* Disabled: generic variant using the variable shift counts in r10/r9.  */
 519     b       L(wdu1_32)
 520 /*
 521     cmplwi  cr1,10,8
 522     beq     cr1,L(wdu1_32)
 523     cmplwi  cr1,10,16
 524     beq     cr1,L(wdu2_32)
 525     cmplwi  cr1,10,24
 526     beq     cr1,L(wdu3_32)
 527 */
 528 L(wdu_32):
 529     lwz     6,0(12)
 530     cmplwi  cr6,31,4
 531     srwi    8,31,5    /* calculate the 32 byte loop count */
 532     slw     0,6,10    /* Left part of the merged word, shifted by r10 bits.  */
 533     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 534     blt     cr5,L(wdu_32tail)
 535     mtctr   8
 536     cmplwi  cr6,31,4
 537     .align  4
 538 L(wdu_loop32):
 539     /* copy 32 bytes at a time */
 540     lwz   8,4(12)
 541     addi  12,12,32
 542     lwz   7,4(4)
 543     srw   8,8,9       /* Right part of the merged word, shifted by r9 bits.  */
 544     or    0,0,8
 545     stw   0,0(3)
 546     stw   7,4(3)
 547     lwz   6,8(4)
 548     lwz   7,12(4)
 549     stw   6,8(3)
 550     stw   7,12(3)
 551     lwz   6,16(4)
 552     lwz   7,20(4)
 553     stw   6,16(3)
 554     stw   7,20(3)
 555     lwz   6,24(4)
 556     lwz   7,28(4)
 557     lwz   8,0(12)
 558     addi  4,4,32
 559     stw   6,24(3)
 560     stw   7,28(3)
 561     addi  3,3,32
 562     slw   0,8,10
 563     bdnz+ L(wdu_loop32)
 564
 565 L(wdu_32tail):
 566     mtcrf   0x01,31
 567     cmplwi  cr5,31,16
 568     blt     cr6,L(wdu_4tail)
 569     /* calculate and store the final word */
 570     lwz   8,4(12)
 571     srw   8,8,9
 572     or    6,0,8
 573     b     L(wdu_32tailx)
 574 #endif
 575     .align  4
 576 L(wdu1_32):  /* 8-bit shift variant: src is 1 byte past a word boundary.  */
 577     lwz     6,-1(4)   /* Aligned word that ends just before the boundary.  */
 578     cmplwi  cr6,31,4
 579     srwi    8,31,5    /* calculate the 32 byte loop count */
 580     slwi    6,6,8
 581     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 582     blt     cr5,L(wdu1_32tail)
 583     mtctr   8
 584     cmplwi  cr6,31,4
 585
 586     lwz   8,3(4)      /* Aligned word that starts at the boundary.  */
 587     lwz   7,4(4)
 588 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
 589     rlwimi 6,8,8,(32-8),31
 590     b      L(wdu1_loop32x)
 591     .align  4
 592 L(wdu1_loop32):
 593     /* copy 32 bytes at a time */
 594     lwz   8,3(4)
 595     lwz   7,4(4)
 596     stw   10,-8(3)    /* Store the last two words of the previous pass.  */
 597     stw   11,-4(3)
 598 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
 599     rlwimi 6,8,8,(32-8),31
 600 L(wdu1_loop32x):
 601     lwz   10,8(4)
 602     lwz   11,12(4)
 603     stw   6,0(3)
 604     stw   7,4(3)
 605     lwz   6,16(4)
 606     lwz   7,20(4)
 607     stw   10,8(3)
 608     stw   11,12(3)
 609     lwz   10,24(4)
 610     lwz   11,28(4)
 611     lwz   8,32-1(4)   /* Aligned word just before the next boundary.  */
 612     addi  4,4,32
 613     stw   6,16(3)
 614     stw   7,20(3)
 615     addi  3,3,32
 616     slwi  6,8,8
 617     bdnz+ L(wdu1_loop32)
 618     stw   10,-8(3)    /* Flush the two words still pending from the loop.  */
 619     stw   11,-4(3)
 620
 621 L(wdu1_32tail):
 622     mtcrf   0x01,31   /* Low bits of the remaining length into cr7.  */
 623     cmplwi  cr5,31,16
 624     blt     cr6,L(wdu_4tail)   /* Fewer than 4 bytes left.  */
 625     /* calculate and store the final word */
 626     lwz   8,3(4)
 627 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
 628     rlwimi 6,8,8,(32-8),31
 629     b     L(wdu_32tailx)
 630
 631 L(wdu2_32):  /* 16-bit shift variant: src is 2 bytes past a word boundary.  */
 632     bgt     cr1,L(wdu3_32)   /* Shift is 24 bits: use the third variant.  */
 633     lwz     6,-2(4)   /* Aligned word that ends just before the boundary.  */
 634     cmplwi  cr6,31,4
 635     srwi    8,31,5    /* calculate the 32 byte loop count */
 636     slwi    6,6,16
 637     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 638     blt     cr5,L(wdu2_32tail)
 639     mtctr   8
 640     cmplwi  cr6,31,4
 641
 642     lwz   8,2(4)      /* Aligned word that starts at the boundary.  */
 643     lwz   7,4(4)
 644 /*  Equivalent to: srwi   8,8,32-16;  or    6,6,8 */
 645     rlwimi 6,8,16,(32-16),31
 646     b      L(wdu2_loop32x)
 647     .align  4
 648 L(wdu2_loop32):
 649     /* copy 32 bytes at a time */
 650     lwz   8,2(4)
 651     lwz   7,4(4)
 652     stw   10,-8(3)    /* Store the last two words of the previous pass.  */
 653     stw   11,-4(3)
 654 /*  Equivalent to  srwi   8,8,32-16; or    6,6,8 */
 655     rlwimi 6,8,16,(32-16),31
 656 L(wdu2_loop32x):
 657     lwz   10,8(4)
 658     lwz   11,12(4)
 659     stw   6,0(3)
 660     stw   7,4(3)
 661     lwz   6,16(4)
 662     lwz   7,20(4)
 663     stw   10,8(3)
 664     stw   11,12(3)
 665     lwz   10,24(4)
 666     lwz   11,28(4)
 667 /*    lwz   8,0(12) */
 668     lwz   8,32-2(4)   /* Aligned word just before the next boundary.  */
 669     addi  4,4,32
 670     stw   6,16(3)
 671     stw   7,20(3)
 672     addi  3,3,32
 673     slwi  6,8,16
 674     bdnz+ L(wdu2_loop32)
 675     stw   10,-8(3)    /* Flush the two words still pending from the loop.  */
 676     stw   11,-4(3)
 677
 678 L(wdu2_32tail):
 679     mtcrf   0x01,31   /* Low bits of the remaining length into cr7.  */
 680     cmplwi  cr5,31,16
 681     blt     cr6,L(wdu_4tail)   /* Fewer than 4 bytes left.  */
 682     /* calculate and store the final word */
 683     lwz   8,2(4)
 684 /*  Equivalent to: srwi   8,8,32-16;  or    6,6,8  */
 685     rlwimi 6,8,16,(32-16),31
 686     b     L(wdu_32tailx)
 687
 688 L(wdu3_32):  /* 24-bit shift variant: src is 3 bytes past a word boundary.  */
 689 /*    lwz     6,0(12) */
 690     lwz     6,-3(4)   /* Aligned word that ends just before the boundary.  */
 691     cmplwi  cr6,31,4
 692     srwi    8,31,5    /* calculate the 32 byte loop count */
 693     slwi    6,6,24
 694     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 695     blt     cr5,L(wdu3_32tail)
 696     mtctr   8
 697     cmplwi  cr6,31,4
 698
 699     lwz   8,1(4)      /* Aligned word that starts at the boundary.  */
 700     lwz   7,4(4)
 701 /*  Equivalent to: srwi   8,8,32-24;  or    6,6,8 */
 702     rlwimi 6,8,24,(32-24),31
 703     b      L(wdu3_loop32x)
 704     .align  4
 705 L(wdu3_loop32):
 706     /* copy 32 bytes at a time */
 707     lwz   8,1(4)
 708     lwz   7,4(4)
 709     stw   10,-8(3)    /* Store the last two words of the previous pass.  */
 710     stw   11,-4(3)
 711 /*  Equivalent to  srwi   8,8,32-24; or    6,6,8 */
 712     rlwimi 6,8,24,(32-24),31
 713 L(wdu3_loop32x):
 714     lwz   10,8(4)
 715     lwz   11,12(4)
 716     stw   6,0(3)
 717     stw   7,4(3)
 718     lwz   6,16(4)
 719     lwz   7,20(4)
 720     stw   10,8(3)
 721     stw   11,12(3)
 722     lwz   10,24(4)
 723     lwz   11,28(4)
 724     lwz   8,32-3(4)   /* Aligned word just before the next boundary.  */
 725     addi  4,4,32
 726     stw   6,16(3)
 727     stw   7,20(3)
 728     addi  3,3,32
 729     slwi  6,8,24
 730     bdnz+ L(wdu3_loop32)
 731     stw   10,-8(3)    /* Flush the two words still pending from the loop.  */
 732     stw   11,-4(3)
 733
 734 L(wdu3_32tail):
 735     mtcrf   0x01,31   /* Low bits of the remaining length into cr7.  */
 736     cmplwi  cr5,31,16
 737     blt     cr6,L(wdu_4tail)   /* Fewer than 4 bytes left.  */
 738     /* calculate and store the final word */
 739     lwz   8,1(4)
 740 /*  Equivalent to: srwi   8,8,32-24;  or    6,6,8  */
 741     rlwimi 6,8,24,(32-24),31
 742     b     L(wdu_32tailx)
 743     .align  4
 744 L(wdu_32tailx):  /* 4-31 bytes left; r6 already holds the merged first word.  */
 745     blt     cr5,L(wdu_t32_8)   /* Fewer than 16 bytes left.  */
 746     lwz   7,4(4)
 747     addi  12,4,16    /* generate alternate pointers to avoid agen */
 748     addi  11,3,16    /* timing issues downstream.  */
 749     stw   6,0(3)
 750     stw   7,4(3)
 751     subi  31,31,16
 752     lwz   6,8(4)
 753     lwz   7,12(4)
 754     addi  4,4,16
 755     stw   6,8(3)
 756     stw   7,12(3)
 757     addi  3,3,16
 758     bf    28,L(wdu_t32_4x)
 759     lwz   6,0(12)
 760     lwz   7,4(12)
 761     addi  4,4,8
 762     subi  31,31,8
 763     stw   6,0(11)
 764     stw   7,4(11)
 765     addi  3,3,8
 766     bf    29,L(wdu_t32_0)
 767     lwz   6,8(12)
 768     addi  4,4,4
 769     subi  31,31,4
 770     stw   6,8(11)
 771     addi  3,3,4
 772     b     L(wdu_t32_0)
 773     .align  4
 774 L(wdu_t32_4x):   /* 16 bytes moved; move one more word if the 4 bit is set.  */
 775     bf    29,L(wdu_t32_0)
 776     lwz   6,0(4)
 777     addi  4,4,4
 778     subi  31,31,4
 779     stw   6,0(3)
 780     addi  3,3,4
 781     b     L(wdu_t32_0)
 782     .align  4
 783 L(wdu_t32_8):    /* Fewer than 16 bytes left; r6 is still unstored.  */
 784     bf    28,L(wdu_t32_4)
 785     lwz   7,4(4)
 786     subi  31,31,8
 787     bf    29,L(wdu_t32_8x)
 788     stw   6,0(3)
 789     stw   7,4(3)
 790     lwz   6,8(4)
 791     subi  31,31,4
 792     addi  4,4,12
 793     stw   6,8(3)
 794     addi  3,3,12
 795     b     L(wdu_t32_0)
 796     .align  4
 797 L(wdu_t32_8x):   /* Exactly two words: r6 and r7.  */
 798     addi  4,4,8
 799     stw   6,0(3)
 800     stw   7,4(3)
 801     addi  3,3,8
 802     b     L(wdu_t32_0)
 803     .align  4
 804 L(wdu_t32_4):    /* Only the merged word in r6 is left to store.  */
 805     subi  31,31,4
 806     stw   6,0(3)
 807     addi  4,4,4
 808     addi  3,3,4
 809     .align  4
 810 L(wdu_t32_0):
 811 L(wdu_4tail):  /* Copy the final 0-3 bytes of the unaligned-source path.  */
 812     cmplwi  cr6,31,0
 813     beq   cr6,L(wdus_0) /* If the tail is 0 bytes we are done!  */
 814     bf    30,L(wdus_3)  /* cr7 holds the low length bits: no halfword.  */
 815     lhz   7,0(4)
 816     sth   7,0(3)
 817     bf    31,L(wdus_0)
 818     lbz   8,2(4)
 819     stb   8,2(3)
 820     mr    3,30          /* Return original dst pointer.  */
 821     lwz   30,20(1)
 822     lwz   31,24(1)
 823     addi  1,1,32
 824     blr
 825     .align  4
 826 L(wdus_3):
 827     bf    31,L(wdus_0)  /* Must leave through the epilogue that restores
                               r30/r31 and reloads r3; L(wus_0) does neither.  */
 828     lbz   6,0(4)
 829     stb   6,0(3)
 830     .align  4
 831 L(wdus_0):
 832   /* Return original dst pointer.  */
 833     mr   3,30
 834     lwz  30,20(1)
 835     lwz  31,24(1)
 836     addi 1,1,32
 837     blr
 838 END (memcpy)
 839
 840 libc_hidden_builtin_def (memcpy)