sysdeps/powerpc/powerpc32/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
   2    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  25    Returns 'dst'.
  26
  27    Memcpy handles short copies (< 32-bytes) using binary move blocks
  28    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  29    with the appropriate combination of byte and halfword load/stores.
  30    There is minimal effort to optimize the alignment of short moves.
  31
  32    Longer moves (>= 32-bytes) justify the effort to get at least the
  33    destination word (4-byte) aligned.  Further optimization is
  34    possible when both source and destination are word aligned.
  35    Each case has an optimized unrolled loop.   */
  36
  37         .machine power6
  38 EALIGN (BP_SYM (memcpy), 5, 0)   /* r3 = dst, r4 = src, r5 = len.  */
  39         CALL_MCOUNT
  40
  41     stwu   1,-32(1)     /* Allocate a 32-byte stack frame.  */
  42     cfi_adjust_cfa_offset(32)
  43     cmplwi cr1,5,31     /* check for short move.  */
  44     neg    0,3
  45     cmplwi cr1,5,31     /* Repeats the compare above; cr1 is unchanged.  */
  46     clrlwi 10,4,30      /* check alignment of src.  */
  47     andi.  11,3,3       /* check alignment of dst.  */
  48     clrlwi 0,0,30       /* Number of bytes until the 1st word of dst.  */
  49     ble-   cr1,L(word_unaligned_short)  /* If move < 32 bytes.  */
  50     cmplw  cr6,10,11    /* cr6: do src and dst have the same alignment?  */
  51     stw    31,24(1)     /* Save r31; it holds the adjusted length below.  */
  52     cfi_offset(31,(24-32))
  53     stw    30,20(1)     /* Save r30; it holds the original dst below.  */
  54     cfi_offset(30,(20-32))
  55     mr     30,3         /* Keep the original dst for the return value.  */
  56     beq    .L0          /* cr0 from andi.: dst is already word aligned.  */
  57     mtcrf  0x01,0       /* cr7 bits 30/31 = the 2-bit and 1-bit of r0.  */
  58     subf  31,0,5        /* Length after alignment.  */
  59     add   12,4,0        /* Compute src addr after alignment.  */
  60   /* Move 0-3 bytes as needed to get the destination word aligned.  */
  61 1:  bf    31,2f         /* 1-bit clear: 0 or 2 bytes to move.  */
  62     lbz   6,0(4)
  63     bf    30,3f         /* 2-bit clear: exactly 1 byte to move.  */
  64     lhz   7,1(4)        /* 3 bytes: one byte followed by a halfword.  */
  65     stb   6,0(3)
  66     sth   7,1(3)
  67     addi  3,3,3
  68     b     0f
  69 3:
  70     stb   6,0(3)
  71     addi  3,3,1
  72     b     0f
  73 2:  bf    30,0f         /* 2-bit clear too: no bytes to move.  */
  74     lhz   6,0(4)        /* 2 bytes: one halfword.  */
  75     sth   6,0(3)
  76     addi  3,3,2
  77 0:
  78     clrlwi 10,12,30     /* check alignment of src again.  */
  79     srwi   9,31,2       /* Number of full words remaining.  */
  80     bne-   cr6,L(wdu)   /* If source is not word aligned, use L(wdu).  */
  81     clrlwi 11,31,30  /* calculate the number of tail bytes */
  82     b      L(word_aligned)
  83   /* Copy words from source to destination, assuming the destination is
  84      aligned on a word boundary.
  85
  86      At this point we know there are at least 29 bytes left (32-3) to copy.
  87      The next step is to determine if the source is also word aligned.
  88      If not branch to the unaligned move code at L(wdu), which uses
  89      a load, shift, store strategy.
  90
  91      Otherwise source and destination are word aligned, and we can use
  92      the optimized word copy loop.  */
  93     .align  4
  94 .L0:
  95     mr     31,5         /* No alignment bytes were moved: r31 = len.  */
  96     mr     12,4         /* ... and r12 = src.  */
  97     bne-   cr6,L(wdu)   /* If source is not word aligned, use L(wdu).  */
  98     srwi   9,5,2        /* Number of full words remaining.  */
  99     clrlwi 11,5,30      /* calculate the number of tail bytes */
 100
 101   /* Move words where destination and source are word aligned.
 102      Use an unrolled loop to copy 4 words (16-bytes) per iteration.
 103      If the copy is not an exact multiple of 16 bytes, 1-3
 104      words are copied as needed to set up the main loop.  After
 105      the main loop exits there may be a tail of 1-3 bytes. These bytes are
 106      copied a halfword/byte at a time as needed to preserve alignment.  */
 107 L(word_aligned):
 108     mtcrf 0x01,9    /* cr7 = low 4 bits of the word count.  */
 109     srwi  8,31,4    /* calculate the 16 byte loop count */
 110     cmplwi      cr1,9,4
 111     cmplwi      cr6,11,0    /* cr6 eq: there are no tail bytes.  */
 112     mr    11,12
 113
 114     bf    30,1f     /* Word count 2-bit clear: no leading word pair.  */
 115     lwz   6,0(12)
 116     lwz   7,4(12)
 117     addi  11,12,8
 118     mtctr 8
 119     stw   6,0(3)
 120     stw   7,4(3)
 121     addi  10,3,8
 122     bf    31,4f     /* Word count 1-bit clear: no third leading word.  */
 123     lwz   0,8(12)
 124     stw   0,8(3)
 125     blt   cr1,3f    /* Fewer than 4 words: skip the 16-byte loop.  */
 126     addi  11,12,12
 127     addi  10,3,12
 128     b     4f
 129     .align  4
 130 1:
 131     mr    10,3
 132     mtctr 8
 133     bf    31,4f     /* Word count 1-bit clear: no single leading word.  */
 134     lwz   6,0(12)
 135     addi  11,12,4
 136     stw   6,0(3)
 137     addi  10,3,4
 138
 139     .align  4
 140 4:                  /* Main loop: r11 = src cursor, r10 = dst cursor.  */
 141     lwz   6,0(11)
 142     lwz   7,4(11)
 143     lwz   8,8(11)
 144     lwz   0,12(11)
 145     stw   6,0(10)
 146     stw   7,4(10)
 147     stw   8,8(10)
 148     stw   0,12(10)
 149     addi  11,11,16
 150     addi  10,10,16
 151     bdnz  4b
 152 3:
 153     clrrwi 0,31,2   /* r0 = length rounded down to whole words.  */
 154     mtcrf 0x01,31   /* cr7 bits 30/31 select the halfword/byte tail.  */
 155     beq   cr6,0f    /* No tail bytes: return.  */
 156 .L9:
 157     add   3,3,0     /* Step r3 and r12 past the words copied above.  */
 158     add   12,12,0
 159
 160 /*  At this point we have a tail of 0-3 bytes and we know that the
 161     destination is word aligned.  */
 162 2:  bf    30,1f
 163     lhz   6,0(12)
 164     addi  12,12,2
 165     sth   6,0(3)
 166     addi  3,3,2
 167 1:  bf    31,0f
 168     lbz   6,0(12)
 169     stb   6,0(3)
 170 0:
 171   /* Return original dst pointer.  */
 172     mr  3,30
 173     lwz 30,20(1)    /* Restore the saved r30 and r31.  */
 174     lwz 31,24(1)
 175     addi 1,1,32     /* Release the stack frame.  */
 176     blr
 177
 178 /* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
 179    9-31 bytes.  Each case is handled without loops, using binary (1,2,4,8)
 180    tests.
 181
 182    In the short (0-8 byte) case no attempt is made to force alignment
 183    of either source or destination.  The hardware will handle the
 184    unaligned load/stores with small delays for crossing 32- 128-byte,
 185    and 4096-byte boundaries. Since these short moves are unlikely to be
 186    unaligned or cross these boundaries, the overhead to force
 187    alignment is not justified.
 188
 189    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 190    boundaries.  Since only loads are sensitive to the 32-/128-byte
 191    boundaries it is more important to align the source than the
 192    destination.  If the source is not already word aligned, we first
 193    move 1-3 bytes as needed.  Since we are only word aligned we don't
 194    use double word load/stores to ensure that all loads are aligned.
 195    While the destination and stores may still be unaligned, this
 196    is only an issue for page (4096 byte boundary) crossing, which
 197    should be rare for these short moves.  The hardware handles this
 198    case automatically with a small (~20 cycle) delay.  */
 199     .align  4
 200
 201     cfi_same_value (31)
 202     cfi_same_value (30)
 203 L(word_unaligned_short):
 204     mtcrf 0x01,5        /* cr7 = low 4 bits of len for the bit tests.  */
 205     cmplwi cr6,5,8
 206     neg   8,4
 207     clrrwi      9,4,2   /* r9 = src rounded down to a word boundary.  */
 208     andi. 0,8,3         /* r0 = bytes (0-3) up to the next src word.  */
 209     beq   cr6,L(wus_8)  /* Handle moves of 8 bytes.  */
 210 /* Not exactly 8 bytes.  For 9-31 bytes get the source word aligned.  */
 211     cmplwi      cr1,5,16    /* Word compare; this is 32-bit code.  */
 212     mr    12,4
 213     ble   cr6,L(wus_4)  /* Handle moves of 0-7 bytes.  */
 214     mr    11,3
 215     mr    10,5
 216     cmplwi      cr6,0,2
 217     beq   L(wus_tail)   /* If the source is already word aligned skip this.  */
 218 /* Copy 1-3 bytes to get source address word aligned.  */
 219     lwz   6,0(9)        /* Aligned word holding the leading src bytes.  */
 220     subf  10,0,5        /* r10 = length left after the leading bytes.  */
 221     add   12,4,0        /* r12 = src advanced to the word boundary.  */
 222     blt   cr6,5f        /* r0 == 1: store one byte.  */
 223     srwi  7,6,16        /* Word shift; this is 32-bit code.  */
 224     bgt   cr6,3f        /* r0 == 3: store a byte then a halfword.  */
 225     sth   6,0(3)        /* r0 == 2: store one halfword.  */
 226     b     7f
 227     .align  4
 228 3:
 229     stb   7,0(3)
 230     sth   6,1(3)
 231     b     7f
 232     .align  4
 233 5:
 234     stb   6,0(3)
 235 7:
 236     cmplwi      cr1,10,16
 237     add   11,3,0        /* r11 = dst advanced past the leading bytes.  */
 238     mtcrf 0x01,10       /* cr7 = low 4 bits of the remaining length.  */
 239     .align  4
 240 L(wus_tail):
 241 /* At least 6 bytes left and the source is word aligned.  This allows
 242    some speculative loads up front.  */
 243 /* We need to special case the fall-through because the biggest delays
 244    are due to address computation not being ready in time for the
 245    AGEN.  */
 246     lwz   6,0(12)
 247     lwz   7,4(12)
 248     blt   cr1,L(wus_tail8)    /* Fewer than 16 bytes left.  */
 249     cmplwi      cr0,10,24
 250 L(wus_tail16): /* Move 16 bytes.  */
 251     stw   6,0(11)
 252     stw   7,4(11)
 253     lwz   6,8(12)
 254     lwz   7,12(12)
 255     stw   6,8(11)
 256     stw   7,12(11)
 257 /* Move 8 bytes more.  */
 258     bf    28,L(wus_tail16p8)  /* 8-bit of the length is clear.  */
 259     cmplwi      cr1,10,28
 260     lwz   6,16(12)
 261     lwz   7,20(12)
 262     stw   6,16(11)
 263     stw   7,20(11)
 264 /* Move 4 bytes more.  */
 265     bf    29,L(wus_tail16p4)  /* 4-bit of the length is clear.  */
 266     lwz   6,24(12)
 267     stw   6,24(11)
 268     addi  12,12,28
 269     addi  11,11,28
 270     bgt   cr1,L(wus_tail2)
 271  /* exactly 28 bytes.  r3 still holds the original dst; exit.  */
 272     addi  1,1,32
 273     blr
 274     .align  4
 275 L(wus_tail16p8):  /* less than 8 bytes left.  */
 276     beq   cr1,L(wus_tailX) /* exactly 16 bytes, early exit.  */
 277     cmplwi      cr1,10,20
 278     bf    29,L(wus_tail16p2)
 279 /* Move 4 bytes more.  */
 280     lwz   6,16(12)
 281     stw   6,16(11)
 282     addi  12,12,20
 283     addi  11,11,20
 284     bgt   cr1,L(wus_tail2)
 285  /* exactly 20 bytes.  r3 still holds the original dst; exit.  */
 286     addi  1,1,32
 287     blr
 288     .align  4
 289 L(wus_tail16p4):  /* less than 4 bytes left.  */
 290     addi  12,12,24
 291     addi  11,11,24
 292     bgt   cr0,L(wus_tail2)
 293  /* exactly 24 bytes.  r3 still holds the original dst; exit.  */
 294     addi  1,1,32
 295     blr
 296     .align  4
 297 L(wus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 298     addi  12,12,16
 299     addi  11,11,16
 300     b     L(wus_tail2)
 301
 302     .align  4
 303 L(wus_tail8):  /* Move 8 bytes.  */
 304 /*  r6, r7 already loaded speculatively.  */
 305     cmplwi      cr1,10,8
 306     cmplwi      cr0,10,12
 307     bf    28,L(wus_tail4)     /* 8-bit of the length is clear.  */
 308     stw   6,0(11)
 309     stw   7,4(11)
 310 /* Move 4 bytes more.  */
 311     bf    29,L(wus_tail8p4)
 312     lwz   6,8(12)
 313     stw   6,8(11)
 314     addi  12,12,12
 315     addi  11,11,12
 316     bgt   cr0,L(wus_tail2)
 317  /* exactly 12 bytes.  r3 still holds the original dst; exit.  */
 318     addi  1,1,32
 319     blr
 320     .align  4
 321 L(wus_tail8p4):  /* less than 4 bytes left.  */
 322     addi  12,12,8
 323     addi  11,11,8
 324     bgt   cr1,L(wus_tail2)
 325  /* exactly 8 bytes.  r3 still holds the original dst; exit.  */
 326     addi  1,1,32
 327     blr
 328
 329     .align  4
 330 L(wus_tail4):  /* Move 4 bytes.  */
 331 /*  r6 already loaded speculatively.  If we are here we know there is
 332     more than 4 bytes left.  So there is no need to test.  */
 333     addi  12,12,4
 334     stw   6,0(11)
 335     addi  11,11,4
 336 L(wus_tail2):  /* Move 2-3 bytes.  */
 337     bf    30,L(wus_tail1)
 338     lhz   6,0(12)
 339     sth   6,0(11)
 340     bf    31,L(wus_tailX)
 341     lbz   7,2(12)
 342     stb   7,2(11)
 343     addi  1,1,32
 344     blr
 345 L(wus_tail1):  /* Move 1 byte.  */
 346     bf    31,L(wus_tailX)
 347     lbz   6,0(12)
 348     stb   6,0(11)
 349 L(wus_tailX):
 350   /* r3 still holds the original dst; release the frame and return.  */
 351     addi  1,1,32
 352     blr
 353
 354 /* Special case to copy 0-8 bytes.  */
 355     .align  4
 356 L(wus_8):           /* Exactly 8 bytes: two word moves.  */
 357     lwz   6,0(4)
 358     lwz   7,4(4)
 359     stw   6,0(3)
 360     stw   7,4(3)
 361   /* r3 still holds the original dst; release the frame and return.  */
 362     addi  1,1,32
 363     blr
 364     .align  4
 365 L(wus_4):           /* cr7 holds the low 4 bits of len.  */
 366     bf    29,L(wus_2)   /* 4-bit clear: fewer than 4 bytes.  */
 367     lwz   6,0(4)
 368     stw   6,0(3)
 369     bf    30,L(wus_5)   /* 2-bit clear: at most one more byte.  */
 370     lhz   7,4(4)
 371     sth   7,4(3)
 372     bf    31,L(wus_0)
 373     lbz   8,6(4)
 374     stb   8,6(3)
 375     addi  1,1,32
 376     blr
 377     .align  4
 378 L(wus_5):           /* 4 bytes moved; move a 5th if the 1-bit is set.  */
 379     bf    31,L(wus_0)
 380     lbz   6,4(4)
 381     stb   6,4(3)
 382   /* r3 still holds the original dst; release the frame and return.  */
 383     addi 1,1,32
 384     blr
 385     .align  4
 386 L(wus_2):  /* Move 2-3 bytes.  */
 387     bf    30,L(wus_1)
 388     lhz   6,0(4)
 389     sth   6,0(3)
 390     bf    31,L(wus_0)
 391     lbz   7,2(4)
 392     stb   7,2(3)
 393     addi  1,1,32
 394     blr
 395     .align  4
 396 L(wus_1):  /* Move 1 byte.  */
 397     bf    31,L(wus_0)
 398     lbz   6,0(4)
 399     stb   6,0(3)
 400     .align  3
 401 L(wus_0):
 402   /* Short-path exit: r30/r31 were not saved here, so only pop the frame.  */
 403     addi  1,1,32
 404     blr
 405
 406     .align  4
 407     cfi_offset(31,(24-32))
 408     cfi_offset(30,(20-32))
 409 L(wdu):
 410
 411   /* Copy words where the destination is aligned but the source is
 412      not.  For power4, power5 and power6 machines there is penalty for
 413      unaligned loads (src) that cross 32-byte, cacheline, or page
 414      boundaries. So we want to use simple (unaligned) loads where
 415      possible but avoid them where we know the load would span a 32-byte
 416      boundary.
 417
 418      At this point we know we have at least 29 (32-3) bytes to copy
 419      the src is unaligned. and we may cross at least one 32-byte
 420      boundary. Also we have the following register values:
 421      r3 == adjusted dst, word aligned
 422      r4 == unadjusted src
 423      r5 == unadjusted len
 424      r9 == adjusted Word length
 425      r10 == src alignment (1-3)
 426      r12 == adjusted src, not aligned
 427      r31 == adjusted len
 428
 429      First we need to copy words up to but not crossing the next 32-byte
 430      boundary. Then perform aligned loads just before and just after
 431      the boundary and use shifts and or to generate the next aligned
 432      word for dst. If more than 32 bytes remain we copy (unaligned src)
 433      the next 7 words and repeat the loop until less than 32-bytes
 434      remain.
 435
 436      Then if more than 4 bytes remain we again use aligned loads,
 437      shifts and or to generate the next dst word. We then process the
 438      remaining words using unaligned loads as needed. Finally we check
 439      if there are more than 0 bytes (1-3) bytes remaining and use
 440      halfword and or byte load/stores to complete the copy.
 441 */
 442     mr      4,12      /* restore unaligned adjusted src ptr */
 443     clrlwi  0,12,27   /* Find dist from previous 32-byte boundary.  */
 444     slwi    10,10,3   /* calculate number of bits to shift 1st word left */
 445     cmplwi  cr5,0,16
 446     subfic  8,0,32   /* Number of bytes to next 32-byte boundary.  */
 447
 448     mtcrf   0x01,8   /* cr7 = low 4 bits of that byte count.  */
 449     cmplwi  cr1,10,16    /* cr1: shift count versus 16; used at L(wdu_h32_0).  */
 450     subfic  9,10,32  /* number of bits to shift 2nd word right */
 451 /*  This test is reversed because the timing to compare the bytes to
 452     32-byte boundary could not be met.  So we compare the bytes from
 453     previous 32-byte boundary and invert the test.  */
 454     bge     cr5,L(wdu_h32_8)
 455     .align  4
 456     lwz   6,0(4)     /* More than 16 bytes to the boundary: move 16.  */
 457     lwz   7,4(4)
 458     addi  12,4,16    /* generate alternate pointers to avoid agen */
 459     addi  11,3,16    /* timing issues downstream.  */
 460     stw   6,0(3)
 461     stw   7,4(3)
 462     subi  31,31,16
 463     lwz   6,8(4)
 464     lwz   7,12(4)
 465     addi  4,4,16
 466     stw   6,8(3)
 467     stw   7,12(3)
 468     addi  3,3,16
 469     bf    28,L(wdu_h32_4)    /* 8-bit clear: no 8-byte move.  */
 470     lwz   6,0(12)
 471     lwz   7,4(12)
 472     subi  31,31,8
 473     addi  4,4,8
 474     stw   6,0(11)
 475     stw   7,4(11)
 476     addi  3,3,8
 477     bf    29,L(wdu_h32_0)    /* 4-bit clear: no 4-byte move.  */
 478     lwz   6,8(12)
 479     addi  4,4,4
 480     subi  31,31,4
 481     stw   6,8(11)
 482     addi  3,3,4
 483     b     L(wdu_h32_0)
 484     .align  4
 485 L(wdu_h32_8):        /* At most 16 bytes to the boundary.  */
 486     bf    28,L(wdu_h32_4)
 487     lwz   6,0(4)
 488     lwz   7,4(4)
 489     subi  31,31,8
 490     bf    29,L(wdu_h32_8x)
 491     stw   6,0(3)
 492     stw   7,4(3)
 493     lwz   6,8(4)
 494     addi  4,4,12
 495     subi  31,31,4
 496     stw   6,8(3)
 497     addi  3,3,12
 498     b     L(wdu_h32_0)
 499     .align  4
 500 L(wdu_h32_8x):       /* Store the 8 bytes already loaded in r6/r7.  */
 501     addi  4,4,8
 502     stw   6,0(3)
 503     stw   7,4(3)
 504     addi  3,3,8
 505     b     L(wdu_h32_0)
 506     .align  4
 507 L(wdu_h32_4):        /* Move one word if the 4-bit is set.  */
 508     bf    29,L(wdu_h32_0)
 509     lwz   6,0(4)
 510     subi  31,31,4
 511     addi  4,4,4
 512     stw   6,0(3)
 513     addi  3,3,4
 514     .align  4
 515 L(wdu_h32_0):
 516 /*  set up for 32-byte boundary crossing word move and possibly 32-byte
 517     move loop.  */
 518     clrrwi  12,4,2       /* r12 = src rounded down to a word boundary.  */
 519     cmplwi  cr5,31,32    /* cr5 lt: fewer than 32 bytes remain.  */
 520     bge     cr1,L(wdu2_32)   /* Shift count is 16 or more.  */
 521 #if 0
 522     b       L(wdu1_32)
 523 /*
 524     cmplwi  cr1,10,8
 525     beq     cr1,L(wdu1_32)
 526     cmplwi  cr1,10,16
 527     beq     cr1,L(wdu2_32)
 528     cmplwi  cr1,10,24
 529     beq     cr1,L(wdu3_32)
 530 */
 531 L(wdu_32):
 532     lwz     6,0(12)
 533     cmplwi  cr6,31,4
 534     srwi    8,31,5    /* calculate the 32 byte loop count */
 535     slw     0,6,10
 536     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 537     blt     cr5,L(wdu_32tail)
 538     mtctr   8
 539     cmplwi  cr6,31,4
 540     .align  4
 541 L(wdu_loop32):
 542     /* copy 32 bytes at a time */
 543     lwz   8,4(12)
 544     addi  12,12,32
 545     lwz   7,4(4)
 546     srw   8,8,9
 547     or    0,0,8
 548     stw   0,0(3)
 549     stw   7,4(3)
 550     lwz   6,8(4)
 551     lwz   7,12(4)
 552     stw   6,8(3)
 553     stw   7,12(3)
 554     lwz   6,16(4)
 555     lwz   7,20(4)
 556     stw   6,16(3)
 557     stw   7,20(3)
 558     lwz   6,24(4)
 559     lwz   7,28(4)
 560     lwz   8,0(12)
 561     addi  4,4,32
 562     stw   6,24(3)
 563     stw   7,28(3)
 564     addi  3,3,32
 565     slw   0,8,10
 566     bdnz+ L(wdu_loop32)
 567
 568 L(wdu_32tail):
 569     mtcrf   0x01,31
 570     cmplwi  cr5,31,16
 571     blt     cr6,L(wdu_4tail)
 572     /* calculate and store the final word */
 573     lwz   8,4(12)
 574     srw   8,8,9
 575     or    6,0,8
 576     b     L(wdu_32tailx)
 577 #endif
 578     .align  4
 579 L(wdu1_32):          /* Reached by fall-through: shift count below 16.  */
 580     lwz     6,-1(4)      /* Word starting 1 byte before r4.  */
 581     cmplwi  cr6,31,4
 582     srwi    8,31,5    /* calculate the 32 byte loop count */
 583     slwi    6,6,8        /* Discard its top 8 bits.  */
 584     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 585     blt     cr5,L(wdu1_32tail)   /* Fewer than 32 bytes: skip the loop.  */
 586     mtctr   8
 587     cmplwi  cr6,31,4
 588
 589     lwz   8,3(4)         /* Word starting 3 bytes after r4.  */
 590     lwz   7,4(4)
 591 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
 592     rlwimi 6,8,8,(32-8),31
 593     b      L(wdu1_loop32x)
 594     .align  4
 595 L(wdu1_loop32):
 596     /* copy 32 bytes at a time */
 597     lwz   8,3(4)
 598     lwz   7,4(4)
 599     stw   10,-8(3)       /* Store the last two words loaded by the */
 600     stw   11,-4(3)       /* previous pass (r3 has already advanced).  */
 601 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
 602     rlwimi 6,8,8,(32-8),31
 603 L(wdu1_loop32x):
 604     lwz   10,8(4)
 605     lwz   11,12(4)
 606     stw   6,0(3)
 607     stw   7,4(3)
 608     lwz   6,16(4)
 609     lwz   7,20(4)
 610     stw   10,8(3)
 611     stw   11,12(3)
 612     lwz   10,24(4)
 613     lwz   11,28(4)
 614     lwz   8,32-1(4)      /* Next pass's leading word, 1 byte before r4+32.  */
 615     addi  4,4,32
 616     stw   6,16(3)
 617     stw   7,20(3)
 618     addi  3,3,32
 619     slwi  6,8,8
 620     bdnz+ L(wdu1_loop32)
 621     stw   10,-8(3)       /* Flush the two words still held in r10/r11.  */
 622     stw   11,-4(3)
 623
 624 L(wdu1_32tail):
 625     mtcrf   0x01,31      /* cr7 = low 4 bits of the remaining length.  */
 626     cmplwi  cr5,31,16
 627     blt     cr6,L(wdu_4tail)     /* Fewer than 4 bytes remain.  */
 628     /* calculate and store the final word */
 629     lwz   8,3(4)
 630 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
 631     rlwimi 6,8,8,(32-8),31
 632     b     L(wdu_32tailx)
 633
 634 L(wdu2_32):
 635     bgt     cr1,L(wdu3_32)   /* Shift count above 16: use the 24-bit case.  */
 636     lwz     6,-2(4)      /* Word starting 2 bytes before r4.  */
 637     cmplwi  cr6,31,4
 638     srwi    8,31,5    /* calculate the 32 byte loop count */
 639     slwi    6,6,16       /* Discard its top 16 bits.  */
 640     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 641     blt     cr5,L(wdu2_32tail)   /* Fewer than 32 bytes: skip the loop.  */
 642     mtctr   8
 643     cmplwi  cr6,31,4
 644
 645     lwz   8,2(4)         /* Word starting 2 bytes after r4.  */
 646     lwz   7,4(4)
 647 /*  Equivalent to: srwi   8,8,32-16;  or    6,6,8 */
 648     rlwimi 6,8,16,(32-16),31
 649     b      L(wdu2_loop32x)
 650     .align  4
 651 L(wdu2_loop32):
 652     /* copy 32 bytes at a time */
 653     lwz   8,2(4)
 654     lwz   7,4(4)
 655     stw   10,-8(3)       /* Store the last two words loaded by the */
 656     stw   11,-4(3)       /* previous pass (r3 has already advanced).  */
 657 /*  Equivalent to  srwi   8,8,32-16; or    6,6,8 */
 658     rlwimi 6,8,16,(32-16),31
 659 L(wdu2_loop32x):
 660     lwz   10,8(4)
 661     lwz   11,12(4)
 662     stw   6,0(3)
 663     stw   7,4(3)
 664     lwz   6,16(4)
 665     lwz   7,20(4)
 666     stw   10,8(3)
 667     stw   11,12(3)
 668     lwz   10,24(4)
 669     lwz   11,28(4)
 670 /*    lwz   8,0(12) */
 671     lwz   8,32-2(4)      /* Next pass's leading word, 2 bytes before r4+32.  */
 672     addi  4,4,32
 673     stw   6,16(3)
 674     stw   7,20(3)
 675     addi  3,3,32
 676     slwi  6,8,16
 677     bdnz+ L(wdu2_loop32)
 678     stw   10,-8(3)       /* Flush the two words still held in r10/r11.  */
 679     stw   11,-4(3)
 680
 681 L(wdu2_32tail):
 682     mtcrf   0x01,31      /* cr7 = low 4 bits of the remaining length.  */
 683     cmplwi  cr5,31,16
 684     blt     cr6,L(wdu_4tail)     /* Fewer than 4 bytes remain.  */
 685     /* calculate and store the final word */
 686     lwz   8,2(4)
 687 /*  Equivalent to: srwi   8,8,32-16;  or    6,6,8  */
 688     rlwimi 6,8,16,(32-16),31
 689     b     L(wdu_32tailx)
 690
 691 L(wdu3_32):
 692 /*    lwz     6,0(12) */
 693     lwz     6,-3(4)      /* Word starting 3 bytes before r4.  */
 694     cmplwi  cr6,31,4
 695     srwi    8,31,5    /* calculate the 32 byte loop count */
 696     slwi    6,6,24       /* Discard its top 24 bits.  */
 697     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
 698     blt     cr5,L(wdu3_32tail)   /* Fewer than 32 bytes: skip the loop.  */
 699     mtctr   8
 700     cmplwi  cr6,31,4
 701
 702     lwz   8,1(4)         /* Word starting 1 byte after r4.  */
 703     lwz   7,4(4)
 704 /*  Equivalent to: srwi   8,8,32-24;  or    6,6,8 */
 705     rlwimi 6,8,24,(32-24),31
 706     b      L(wdu3_loop32x)
 707     .align  4
 708 L(wdu3_loop32):
 709     /* copy 32 bytes at a time */
 710     lwz   8,1(4)
 711     lwz   7,4(4)
 712     stw   10,-8(3)       /* Store the last two words loaded by the */
 713     stw   11,-4(3)       /* previous pass (r3 has already advanced).  */
 714 /*  Equivalent to  srwi   8,8,32-24; or    6,6,8 */
 715     rlwimi 6,8,24,(32-24),31
 716 L(wdu3_loop32x):
 717     lwz   10,8(4)
 718     lwz   11,12(4)
 719     stw   6,0(3)
 720     stw   7,4(3)
 721     lwz   6,16(4)
 722     lwz   7,20(4)
 723     stw   10,8(3)
 724     stw   11,12(3)
 725     lwz   10,24(4)
 726     lwz   11,28(4)
 727     lwz   8,32-3(4)      /* Next pass's leading word, 3 bytes before r4+32.  */
 728     addi  4,4,32
 729     stw   6,16(3)
 730     stw   7,20(3)
 731     addi  3,3,32
 732     slwi  6,8,24
 733     bdnz+ L(wdu3_loop32)
 734     stw   10,-8(3)       /* Flush the two words still held in r10/r11.  */
 735     stw   11,-4(3)
 736
 737 L(wdu3_32tail):
 738     mtcrf   0x01,31      /* cr7 = low 4 bits of the remaining length.  */
 739     cmplwi  cr5,31,16
 740     blt     cr6,L(wdu_4tail)     /* Fewer than 4 bytes remain.  */
 741     /* calculate and store the final word */
 742     lwz   8,1(4)
 743 /*  Equivalent to: srwi   8,8,32-24;  or    6,6,8  */
 744     rlwimi 6,8,24,(32-24),31
 745     b     L(wdu_32tailx)
 746     .align  4
 747 L(wdu_32tailx):      /* r6 = merged first word; 4-31 bytes remain.  */
 748     blt     cr5,L(wdu_t32_8)     /* Fewer than 16 bytes remain.  */
 749     lwz   7,4(4)
 750     addi  12,4,16    /* generate alternate pointers to avoid agen */
 751     addi  11,3,16    /* timing issues downstream.  */
 752     stw   6,0(3)
 753     stw   7,4(3)
 754     subi  31,31,16
 755     lwz   6,8(4)
 756     lwz   7,12(4)
 757     addi  4,4,16
 758     stw   6,8(3)
 759     stw   7,12(3)
 760     addi  3,3,16
 761     bf    28,L(wdu_t32_4x)   /* 8-bit clear: no 8-byte move.  */
 762     lwz   6,0(12)
 763     lwz   7,4(12)
 764     addi  4,4,8
 765     subi  31,31,8
 766     stw   6,0(11)
 767     stw   7,4(11)
 768     addi  3,3,8
 769     bf    29,L(wdu_t32_0)    /* 4-bit clear: no 4-byte move.  */
 770     lwz   6,8(12)
 771     addi  4,4,4
 772     subi  31,31,4
 773     stw   6,8(11)
 774     addi  3,3,4
 775     b     L(wdu_t32_0)
 776     .align  4
 777 L(wdu_t32_4x):       /* 16 bytes moved; move one more word if needed.  */
 778     bf    29,L(wdu_t32_0)
 779     lwz   6,0(4)
 780     addi  4,4,4
 781     subi  31,31,4
 782     stw   6,0(3)
 783     addi  3,3,4
 784     b     L(wdu_t32_0)
 785     .align  4
 786 L(wdu_t32_8):        /* Fewer than 16 bytes; r6 is still unstored.  */
 787     bf    28,L(wdu_t32_4)
 788     lwz   7,4(4)
 789     subi  31,31,8
 790     bf    29,L(wdu_t32_8x)
 791     stw   6,0(3)
 792     stw   7,4(3)
 793     lwz   6,8(4)
 794     subi  31,31,4
 795     addi  4,4,12
 796     stw   6,8(3)
 797     addi  3,3,12
 798     b     L(wdu_t32_0)
 799     .align  4
 800 L(wdu_t32_8x):       /* Store the 8 bytes held in r6/r7.  */
 801     addi  4,4,8
 802     stw   6,0(3)
 803     stw   7,4(3)
 804     addi  3,3,8
 805     b     L(wdu_t32_0)
 806     .align  4
 807 L(wdu_t32_4):        /* Store only the merged word in r6.  */
 808     subi  31,31,4
 809     stw   6,0(3)
 810     addi  4,4,4
 811     addi  3,3,4
 812     .align  4
 813 L(wdu_t32_0):
 814 L(wdu_4tail):        /* Copy the last 0-3 bytes; cr7 = low bits of len.  */
 815     cmplwi  cr6,31,0
 816     beq   cr6,L(wdus_0) /* If the tail is 0 bytes we are done!  */
 817     bf    30,L(wdus_3)  /* 2-bit clear: no halfword to move.  */
 818     lhz   7,0(4)
 819     sth   7,0(3)
 820     bf    31,L(wdus_0)
 821     lbz   8,2(4)
 822     stb   8,2(3)
 823     mr    3,30          /* Return original dst pointer.  */
 824     lwz   30,20(1)
 825     lwz   31,24(1)
 826     addi  1,1,32
 827     blr
 828     .align  4
 829 L(wdus_3):
 830     bf    31,L(wdus_0)  /* Exit here: r30/r31 and r3 must be restored.  */
 831     lbz   6,0(4)
 832     stb   6,0(3)
 833     .align  4
 834 L(wdus_0):
 835   /* Return original dst pointer.  */
 836     mr   3,30
 837     lwz  30,20(1)       /* Restore the saved r30 and r31.  */
 838     lwz  31,24(1)
 839     addi 1,1,32         /* Release the stack frame.  */
 840     blr
 841 END (BP_SYM (memcpy))
 842
 843 libc_hidden_builtin_def (memcpy)