sysdeps/powerpc/powerpc64/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003, 2006, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  25    Returns 'dst'.
  26
  27    Memcpy handles short copies (< 32-bytes) using binary move blocks
  28    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  29    with the appropriate combination of byte and halfword load/stores.
  30    There is minimal effort to optimize the alignment of short moves.
  31    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  32    of handling unaligned load/stores that do not cross 32-byte boundaries.
  33
  34    Longer moves (>= 32-bytes) justify the effort to get at least the
  35    destination doubleword (8-byte) aligned.  Further optimization is
  36    possible when both source and destination are doubleword aligned.
  37    Each case has an optimized unrolled loop.
  38
  39    For POWER6 unaligned loads will take a 20+ cycle hiccup for any
  40    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
  41    is more forgiving and does not take a hiccup until page or
  42    segment boundaries.  So we require doubleword alignment for
  43    the source but may take a risk and only require word alignment
  44    for the destination.  */
  45
  46         .machine        "power6"
  47 EALIGN (BP_SYM (memcpy), 7, 0)
  48         CALL_MCOUNT 3
  49
  50     cmpldi cr1,5,31     /* cr1: len versus 31, selects the short-move path.  */
  51     neg   0,3
  52     std   3,-16(1)      /* Save dst; every exit reloads it as the return value.  */
  53     std   31,-8(1)      /* Save r31; the unaligned-source path at .L6 uses it.  */
  54     andi. 11,3,7        /* check alignment of dst.  */
  55     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  56     clrldi 10,4,61      /* check alignment of src.  */
  57     cmpldi cr6,5,8      /* cr6: len versus 8, used by the short-move path.  */
  58     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  59     mtcrf 0x01,0        /* cr7 = low 4 bits of r0; bits 29-31 drive the moves below.  */
  60     cmpld cr6,10,11     /* cr6: do src and dst share the same misalignment?  */
  61     srdi  9,5,3         /* Number of full double words remaining.  */
  62     beq   .L0           /* cr0 from the andi. above: dst is already aligned.  */
  63
  64     subf  5,0,5         /* len -= bytes moved to align dst.  */
  65   /* Move 0-7 bytes as needed to get the destination doubleword aligned.
  66      Duplicate some code to maximize fall-through and minimize agen delays.  */
  67 1:  bf    31,2f         /* cr7 bit 31 clear: no single byte to move.  */
  68     lbz   6,0(4)
  69     stb   6,0(3)
  70     bf    30,5f         /* cr7 bit 30 clear: no halfword to move.  */
  71     lhz   6,1(4)
  72     sth   6,1(3)
  73     bf    29,0f         /* cr7 bit 29 clear: no word to move.  */
  74     lwz   6,3(4)
  75     stw   6,3(3)
  76     b     0f
  77 5:
  78     bf    29,0f
  79     lwz   6,1(4)
  80     stw   6,1(3)
  81     b     0f
  82
  83 2:  bf    30,4f
  84     lhz   6,0(4)
  85     sth   6,0(3)
  86     bf    29,0f
  87     lwz   6,2(4)
  88     stw   6,2(3)
  89     b     0f
  90
  91 4:  bf    29,0f
  92     lwz   6,0(4)
  93     stw   6,0(3)
  94 0:
  95 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
  96     add   4,4,0
  97     add   3,3,0
  98
  99     clrldi 10,4,61      /* check alignment of src again.  */
 100     srdi  9,5,3 /* Number of full double words remaining.  */
 101
 102   /* Copy doublewords from source to destination, assuming the
 103      destination is aligned on a doubleword boundary.
 104
 105      At this point we know there are at least 25 bytes left (32-7) to copy.
 106      The next step is to determine if the source is also doubleword aligned.
 107      If not branch to the unaligned move code at .L6. which uses
 108      a load, shift, store strategy.
 109
 110      Otherwise source and destination are doubleword aligned, and we can
 111      use the optimized doubleword copy loop.  */
 112     .align  4
 113 .L0:
 114     clrldi  11,5,61     /* r11 = len & 7, the byte tail.  */
 115     andi.   0,5,0x78    /* cr0.eq when len has no odd doublewords (bits 3-6 clear).  */
 116     srdi    12,5,7      /* Number of 128-byte blocks to move.  */
 117     cmpldi  cr1,11,0    /* If the tail is 0 bytes  */
 118     bne-    cr6,.L6     /* If source is not DW aligned.  */
 119
 120   /* Move doublewords where destination and source are DW aligned.
 121      Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
 122      If the copy is not an exact multiple of 128 bytes, 1-15
 123      doublewords are copied as needed to set up the main loop.  After
 124      the main loop exits there may be a tail of 1-7 bytes. These bytes
 125      are copied a word/halfword/byte at a time as needed to preserve
 126      alignment.
 127
 128      For POWER6 the L1 is store-through and the L2 is store-in.  The
 129      L2 is clocked at half CPU clock so we can store 16 bytes every
 130      other cycle.  POWER6 also has a load/store bypass so we can do
 131      load, load, store, store every 2 cycles.
 132
 133      The following code is sensitive to cache line alignment.  Do not
 134      make any changes without first making sure they don't result in
 135      splitting ld/std pairs across a cache line.  */
 136
 137     mtcrf 0x02,5        /* cr6 = len bits 4-7; bits 25/26/27 flag 64/32/16 bytes.  */
 138     mtcrf 0x01,5        /* cr7 = len bits 0-3; bits 28-31 flag 8/4/2/1 bytes.  */
 139     cmpldi  cr5,12,1    /* cr5: 128-byte block count versus 1.  */
 140     beq   L(das_loop)   /* cr0.eq from the andi. above: no odd doublewords.  */
 141
 142     bf    25,4f         /* No 64-byte piece.  */
 143     .align  3
 144     ld    6,0(4)
 145     ld    7,8(4)
 146     mr    11,4          /* r11/r10 keep the src/dst values from before the  */
 147     mr    10,3          /* addi pair below, for the accesses that follow it.  */
 148     std   6,0(3)
 149     std   7,8(3)
 150     ld    6,16(4)
 151     ld    7,24(4)
 152     std   6,16(3)
 153     std   7,24(3)
 154     ld    6,0+32(4)
 155     ld    7,8+32(4)
 156     addi  4,4,64
 157     addi  3,3,64
 158     std   6,0+32(10)
 159     std   7,8+32(10)
 160     ld    6,16+32(11)
 161     ld    7,24+32(11)
 162     std   6,16+32(10)
 163     std   7,24+32(10)
 164 4:
 165     mr    10,3
 166     bf    26,2f         /* No 32-byte piece.  */
 167     ld    6,0(4)
 168     ld    7,8(4)
 169     mr    11,4
 170     nop
 171     std   6,0(3)
 172     std   7,8(3)
 173     ld    6,16(4)
 174     ld    7,24(4)
 175     addi  4,4,32
 176     std   6,16(3)
 177     std   7,24(3)
 178     addi  3,3,32
 179 6:
 180     nop
 181     bf    27,5f         /* No 16-byte piece.  r10/r11 still point 32 bytes back.  */
 182     ld    6,0+32(11)
 183     ld    7,8+32(11)
 184     addi  4,4,16
 185     addi  3,3,16
 186     std   6,0+32(10)
 187     std   7,8+32(10)
 188     bf    28,L(das_loop_s)  /* No 8-byte piece.  */
 189     ld    0,16+32(11)
 190     addi  4,4,8
 191     addi  3,3,8
 192     std   0,16+32(10)
 193     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the byte tail is left.  */
 194     b     L(das_loop)
 195     .align  3
 196 5:
 197     nop
 198     bf    28,L(das_loop_s)  /* No 8-byte piece.  */
 199     ld    6,32(11)
 200     addi  4,4,8
 201     addi  3,3,8
 202     std   6,32(10)
 203     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the byte tail is left.  */
 204     b     L(das_loop)
 205     .align  3
 206 2:
 207     mr    11,4
 208     bf    27,1f         /* No 16-byte piece.  */
 209     ld    6,0(4)
 210     ld    7,8(4)
 211     addi  4,4,16
 212     addi  3,3,16
 213     std   6,0(10)
 214     std   7,8(10)
 215     bf    28,L(das_loop_s)  /* No 8-byte piece.  */
 216     ld    0,16(11)
 217     addi  4,11,24       /* src/dst are rebuilt from the saved r11/r10 copies.  */
 218     addi  3,10,24
 219     std   0,16(10)
 220     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the byte tail is left.  */
 221     b     L(das_loop)
 222     .align  3
 223 1:
 224     nop
 225     bf    28,L(das_loop_s)  /* No 8-byte piece.  */
 226     ld    6,0(4)
 227     addi  4,4,8
 228     addi  3,3,8
 229     std   6,0(10)
 230 L(das_loop_s):
 231     nop
 232     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the byte tail is left.  */
 233     .align  4
 234 L(das_loop):            /* Copy one 128-byte block, src and dst both DW aligned.  */
 235     ld    6,0(4)
 236     ld    7,8(4)
 237     mr    10,3          /* r10/r11 keep dst/src from before the addi pair  */
 238     mr    11,4          /* below, for the second half of the block.  */
 239     std   6,0(3)
 240     std   7,8(3)
 241     addi  12,12,-1      /* One 128-byte block is accounted for.  */
 242     nop
 243     ld    8,16(4)
 244     ld    0,24(4)
 245     std   8,16(3)
 246     std   0,24(3)
 247
 248     ld    6,0+32(4)
 249     ld    7,8+32(4)
 250     std   6,0+32(3)
 251     std   7,8+32(3)
 252     ld    8,16+32(4)
 253     ld    0,24+32(4)
 254     std   8,16+32(3)
 255     std   0,24+32(3)
 256
 257     ld    6,0+64(11)
 258     ld    7,8+64(11)
 259     std   6,0+64(10)
 260     std   7,8+64(10)
 261     ld    8,16+64(11)
 262     ld    0,24+64(11)
 263     std   8,16+64(10)
 264     std   0,24+64(10)
 265
 266     ld    6,0+96(11)
 267     ld    7,8+96(11)
 268     addi  4,4,128
 269     addi  3,3,128
 270     std   6,0+96(10)
 271     std   7,8+96(10)
 272     ld    8,16+96(11)
 273     ld    0,24+96(11)
 274     std   8,16+96(10)
 275     std   0,24+96(10)
 276     ble   cr5,L(das_loop_e) /* cr5 predates the decrement: block count was not above 1.  */
 277
 278     mtctr   12          /* ctr = 128-byte blocks still to copy.  */
 279     .align  4
 280 L(das_loop2):           /* Same 128-byte copy, repeated ctr times.  */
 281     ld    6,0(4)
 282     ld    7,8(4)
 283     mr    10,3
 284     mr    11,4
 285     std   6,0(3)
 286     std   7,8(3)
 287     ld    8,16(4)
 288     ld    0,24(4)
 289     std   8,16(3)
 290     std   0,24(3)
 291
 292     ld    6,0+32(4)
 293     ld    7,8+32(4)
 294     std   6,0+32(3)
 295     std   7,8+32(3)
 296     ld    8,16+32(4)
 297     ld    0,24+32(4)
 298     std   8,16+32(3)
 299     std   0,24+32(3)
 300
 301     ld    6,0+64(11)
 302     ld    7,8+64(11)
 303     std   6,0+64(10)
 304     std   7,8+64(10)
 305     ld    8,16+64(11)
 306     ld    0,24+64(11)
 307     std   8,16+64(10)
 308     std   0,24+64(10)
 309
 310     ld    6,0+96(11)
 311     ld    7,8+96(11)
 312     addi  4,4,128
 313     addi  3,3,128
 314     std   6,0+96(10)
 315     std   7,8+96(10)
 316     ld    8,16+96(11)
 317     ld    0,24+96(11)
 318     std   8,16+96(10)
 319     std   0,24+96(10)
 320     bdnz  L(das_loop2)
 321 L(das_loop_e):
 322 /* Check for a 1-7 byte tail, return if none.  */
 323     bne   cr1,L(das_tail2)
 324 /* Return original dst pointer.  */
 325     ld 3,-16(1)
 326     blr
 327     .align  4
 328 L(das_tail):
 329     beq   cr1,0f        /* cr1.eq: len & 7 == 0, nothing left to copy.  */
 330
 331 L(das_tail2):
 332 /*  At this point we have a tail of 0-7 bytes and we know that the
 333     destination is double word aligned.  */
 334 4:  bf    29,2f         /* cr7 bit 29 clear: no word in the tail.  */
 335     lwz   6,0(4)
 336     stw   6,0(3)
 337     bf    30,5f         /* cr7 bit 30 clear: no halfword in the tail.  */
 338     lhz   6,4(4)
 339     sth   6,4(3)
 340     bf    31,0f         /* cr7 bit 31 clear: no final byte.  */
 341     lbz   6,6(4)
 342     stb   6,6(3)
 343     b     0f
 344 5:  bf    31,0f
 345     lbz   6,4(4)
 346     stb   6,4(3)
 347     b     0f
 348
 349 2:  bf    30,1f
 350     lhz   6,0(4)
 351     sth   6,0(3)
 352     bf    31,0f
 353     lbz   6,2(4)
 354     stb   6,2(3)
 355     b     0f
 356
 357 1:  bf    31,0f
 358     lbz   6,0(4)
 359     stb   6,0(3)
 360 0:
 361   /* Return original dst pointer.  */
 362     ld 3,-16(1)
 363     blr
 364
 365 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 366    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 367    tests.
 368
 369    In the short (0-8 byte) case no attempt is made to force alignment
 370    of either source or destination.  The hardware will handle the
 371    unaligned load/stores with small delays for crossing 32-, 128-byte,
 372    and 4096-byte boundaries. Since these short moves are unlikely to be
 373    unaligned or cross these boundaries, the overhead to force
 374    alignment is not justified.
 375
 376    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 377    boundaries.  Since only loads are sensitive to the 32-/128-byte
 378    boundaries it is more important to align the source than the
 379    destination.  If the source is not already word aligned, we first
 380    move 1-3 bytes as needed.  Since we are only word aligned we don't
 381    use double word load/stores to ensure that all loads are aligned.
 382    While the destination and stores may still be unaligned, this
 383    is only an issue for page (4096 byte boundary) crossing, which
 384    should be rare for these short moves.  The hardware handles this
 385    case automatically with a small (~20 cycle) delay.  */
 386     .align  4
 387 .L2:
 388     mtcrf 0x01,5        /* cr7 = low 4 bits of len.  */
 389     neg   8,4
 390     clrrdi      11,4,2  /* r11 = src rounded down to a word boundary.  */
 391     andi. 0,8,3         /* r0 = bytes needed to word-align src; cr0.eq if none.  */
 392     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 393 /* At least 9 bytes left.  Get the source word aligned.  */
 394     cmpldi      cr1,5,16
 395     mr    10,5          /* r10 = bytes left to copy.  */
 396     mr    12,4          /* r12 = working source pointer.  */
 397     cmpldi      cr6,0,2 /* cr6: alignment byte count (r0) versus 2.  */
 398     beq   L(dus_tail)   /* If the source is already word aligned skip this.  */
 399 /* Copy 1-3 bytes to get source address word aligned.  */
 400     lwz   6,0(11)       /* Aligned load of the word holding the first src bytes.  */
 401     subf  10,0,5        /* r10 = len minus the alignment bytes.  */
 402     add   12,4,0        /* r12 = src advanced to the word boundary.  */
 403     blt   cr6,5f        /* One byte to move.  */
 404     srdi  7,6,16
 405     bgt   cr6,3f        /* Three bytes to move.  */
 406     sth   6,0(3)        /* Two bytes: the low halfword of the loaded word.  */
 407     b     7f
 408     .align  4
 409 3:
 410     stb   7,0(3)
 411     sth   6,1(3)
 412     b     7f
 413     .align  4
 414 5:
 415     stb   6,0(3)
 416 7:
 417     cmpldi      cr1,10,16
 418     add   3,3,0         /* Advance dst by the alignment bytes just stored.  */
 419     mtcrf 0x01,10       /* cr7 = low 4 bits of the remaining length.  */
 420     .align  4
 421 L(dus_tail):
 422 /* At least 6 bytes left and the source is word aligned.  This allows
 423    some speculative loads up front.  */
 424 /* We need to special case the fall-through because the biggest delays
 425    are due to address computation not being ready in time for the
 426    AGEN.  */
 427     lwz   6,0(12)
 428     lwz   7,4(12)
 429     blt   cr1,L(dus_tail8)  /* cr1: fewer than 16 bytes left.  */
 430     cmpldi      cr0,10,24
 431 L(dus_tail16): /* Move 16 bytes.  */
 432     stw   6,0(3)
 433     stw   7,4(3)
 434     lwz   6,8(12)
 435     lwz   7,12(12)
 436     stw   6,8(3)
 437     stw   7,12(3)
 438 /* Move 8 bytes more.  */
 439     bf    28,L(dus_tail16p8)
 440     cmpldi      cr1,10,28
 441     lwz   6,16(12)
 442     lwz   7,20(12)
 443     stw   6,16(3)
 444     stw   7,20(3)
 445 /* Move 4 bytes more.  */
 446     bf    29,L(dus_tail16p4)
 447     lwz   6,24(12)
 448     stw   6,24(3)
 449     addi  12,12,28
 450     addi  3,3,28
 451     bgt   cr1,L(dus_tail2)
 452  /* exactly 28 bytes.  Return original dst pointer and exit.  */
 453     ld    3,-16(1)
 454     blr
 455     .align  4
 456 L(dus_tail16p8):  /* less than 8 bytes left.  */
 457     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
 458     cmpldi      cr1,10,20
 459     bf    29,L(dus_tail16p2)
 460 /* Move 4 bytes more.  */
 461     lwz   6,16(12)
 462     stw   6,16(3)
 463     addi  12,12,20
 464     addi  3,3,20
 465     bgt   cr1,L(dus_tail2)
 466  /* exactly 20 bytes.  Return original dst pointer and exit.  */
 467     ld    3,-16(1)
 468     blr
 469     .align  4
 470 L(dus_tail16p4):  /* less than 4 bytes left.  */
 471     addi  12,12,24
 472     addi  3,3,24
 473     bgt   cr0,L(dus_tail2)  /* cr0: remaining length versus 24, set before dus_tail16.  */
 474  /* exactly 24 bytes.  Return original dst pointer and exit.  */
 475     ld    3,-16(1)
 476     blr
 477     .align  4
 478 L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 479     addi  12,12,16
 480     addi  3,3,16
 481     b     L(dus_tail2)
 482
 483     .align  4
 484 L(dus_tail8):  /* Move 8 bytes.  */
 485 /*  r6, r7 already loaded speculatively.  */
 486     cmpldi      cr1,10,8
 487     cmpldi      cr0,10,12
 488     bf    28,L(dus_tail4)
 489     .align  2
 490     stw   6,0(3)
 491     stw   7,4(3)
 492 /* Move 4 bytes more.  */
 493     bf    29,L(dus_tail8p4)
 494     lwz   6,8(12)
 495     stw   6,8(3)
 496     addi  12,12,12
 497     addi  3,3,12
 498     bgt   cr0,L(dus_tail2)
 499  /* exactly 12 bytes.  Return original dst pointer and exit.  */
 500     ld    3,-16(1)
 501     blr
 502     .align  4
 503 L(dus_tail8p4):  /* less than 4 bytes left.  */
 504     addi  12,12,8
 505     addi  3,3,8
 506     bgt   cr1,L(dus_tail2)
 507  /* exactly 8 bytes.  Return original dst pointer and exit.  */
 508     ld    3,-16(1)
 509     blr
 510
 511     .align  4
 512 L(dus_tail4):  /* Move 4 bytes.  */
 513 /*  r6 already loaded speculatively.  If we are here we know there are
 514     more than 4 bytes left.  So there is no need to test.  */
 515     addi  12,12,4
 516     stw   6,0(3)
 517     addi  3,3,4
 518 L(dus_tail2):  /* Move 2-3 bytes.  */
 519     bf    30,L(dus_tail1)
 520     lhz   6,0(12)
 521     sth   6,0(3)
 522     bf    31,L(dus_tailX)
 523     lbz   7,2(12)
 524     stb   7,2(3)
 525     ld 3,-16(1)         /* Return original dst pointer.  */
 526     blr
 527 L(dus_tail1):  /* Move 1 byte.  */
 528     bf    31,L(dus_tailX)
 529     lbz   6,0(12)
 530     stb   6,0(3)
 531 L(dus_tailX):
 532   /* Return original dst pointer.  */
 533     ld    3,-16(1)
 534     blr
 535
 536 /* Special case to copy 0-8 bytes.  */
 537     .align  4
 538 .LE8:
 539     mr    12,4          /* L(dus_tail2) reads the source through r12.  */
 540     bne   cr6,L(dus_4)  /* cr6: len versus 8, set at function entry.  */
 541 /* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
 542    cycle delay.  This case should be rare and any attempt to avoid this
 543    would take most of 20 cycles any way.  */
 544     ld   6,0(4)
 545     std   6,0(3)
 546   /* Return original dst pointer.  */
 547     ld    3,-16(1)
 548     blr
 549     .align  4
 550 L(dus_4):
 551     bf    29,L(dus_tail2)   /* cr7 bit 29 clear: fewer than 4 bytes to move.  */
 552     lwz   6,0(4)
 553     stw   6,0(3)
 554     bf    30,L(dus_5)       /* cr7 bit 30 clear: no halfword.  */
 555     lhz   7,4(4)
 556     sth   7,4(3)
 557     bf    31,L(dus_0)       /* cr7 bit 31 clear: no final byte.  */
 558     lbz   8,6(4)
 559     stb   8,6(3)
 560     ld 3,-16(1)         /* Return original dst pointer.  */
 561     blr
 562     .align  4
 563 L(dus_5):
 564     bf    31,L(dus_0)
 565     lbz   6,4(4)
 566     stb   6,4(3)
 567 L(dus_0):
 568   /* Return original dst pointer.  */
 569     ld    3,-16(1)
 570     blr
 571
 572     .align  4
 573 .L6:
 574     cfi_offset(31,-8)
 575     mr    12,4          /* r12 = unaligned source pointer, kept for the byte tail.  */
 576     mr    31,5          /* r31 = remaining length, kept for the byte tail.  */
 577   /* Copy doublewords where the destination is aligned but the source is
 578      not.  Use aligned doubleword loads from the source, shifted to realign
 579      the data, to allow aligned destination stores.  */
 580     addi    11,9,-1  /* loop DW count is one less than total */
 581     subf    5,10,12  /* Move source addr to previous full double word.  */
 582     cmpldi  cr5, 10, 2      /* cr5: source misalignment (r10) versus 2.  */
 583     cmpldi  cr0, 10, 4      /* cr0: source misalignment (r10) versus 4.  */
 584     mr      4,3             /* r4 becomes the working destination pointer.  */
 585     srdi    8,11,2   /* calculate the 32 byte loop count */
 586     ld      6,0(5)   /* pre load 1st full doubleword.  */
 587     mtcrf   0x01,11         /* cr7 bits 30/31 = bits 1/0 of (DW count - 1).  */
 588     cmpldi  cr6,9,4         /* cr6: total DW count versus 4.  */
 589     mtctr   8
 590     ld      7,8(5)   /* pre load 2nd full doubleword.  */
 591     bge     cr0, L(du4_do)  /* Misalignment of 4 or more bytes.  */
 592     blt     cr5, L(du1_do)
 593     beq     cr5, L(du2_do)
 594     b       L(du3_do)
 595
 596     .align 4
 597 L(du1_do):                  /* Source misaligned by 1 byte: merge with 8-bit shifts.  */
 598     bf      30,L(du1_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 599
 600     /* there are at least two DWs to copy */
 601     sldi     0,6, 8
 602     srdi     8,7, 64-8
 603     or      0,0,8
 604     ld      6,16(5)
 605     std     0,0(4)
 606     sldi     0,7, 8
 607     srdi     8,6, 64-8
 608     or      0,0,8
 609     ld      7,24(5)
 610     std     0,8(4)
 611     addi    4,4,16
 612     addi    5,5,32
 613     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
 614     bf      31,L(du1_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
 615     /* there is a third DW to copy */
 616     sldi     0,6, 8
 617     srdi     8,7, 64-8
 618     or      0,0,8
 619     std     0,0(4)
 620     mr      6,7
 621     ld      7,0(5)
 622     addi    5,5,8
 623     addi    4,4,8
 624     beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
 625     b       L(du1_loop)
 626     .align 4
 627 L(du1_1dw):
 628     sldi     0,6, 8
 629     srdi     8,7, 64-8
 630     addi    5,5,16
 631     or      0,0,8
 632     bf      31,L(du1_loop)   /* r0 is not stored here; the loop recomputes it.  */
 633     mr      6,7
 634     ld      7,0(5)
 635     addi    5,5,8
 636     std     0,0(4)
 637     addi    4,4,8
 638     .align 4
 639 /* copy 32 bytes at a time */
 640 L(du1_loop):
 641     sldi   0,6, 8
 642     srdi   8,7, 64-8
 643     or    0,0,8
 644     ld    6,0(5)
 645     std   0,0(4)
 646     sldi   0,7, 8
 647     srdi   8,6, 64-8
 648     or    0,0,8
 649     ld    7,8(5)
 650     std   0,8(4)
 651     sldi   0,6, 8
 652     srdi   8,7, 64-8
 653     or    0,0,8
 654     ld    6,16(5)
 655     std   0,16(4)
 656     sldi   0,7, 8
 657     srdi   8,6, 64-8
 658     or    0,0,8
 659     ld    7,24(5)
 660     std   0,24(4)
 661     addi  5,5,32
 662     addi  4,4,32
 663     bdnz+ L(du1_loop)
 664     .align 4
 665 L(du1_fini):
 666     /* calculate and store the final DW */
 667     sldi   0,6, 8
 668     srdi   8,7, 64-8
 669     or    0,0,8
 670     std   0,0(4)
 671     b     L(du_done)
 672
 673     .align 4
 674 L(du2_do):                  /* Source misaligned by 2 bytes: merge with 16-bit shifts.  */
 675     bf      30,L(du2_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 676
 677     /* there are at least two DWs to copy */
 678     sldi     0,6, 16
 679     srdi     8,7, 64-16
 680     or      0,0,8
 681     ld      6,16(5)
 682     std     0,0(4)
 683     sldi     0,7, 16
 684     srdi     8,6, 64-16
 685     or      0,0,8
 686     ld      7,24(5)
 687     std     0,8(4)
 688     addi    4,4,16
 689     addi    5,5,32
 690     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
 691     bf      31,L(du2_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
 692     /* there is a third DW to copy */
 693     sldi     0,6, 16
 694     srdi     8,7, 64-16
 695     or      0,0,8
 696     std     0,0(4)
 697     mr      6,7
 698     ld      7,0(5)
 699     addi    5,5,8
 700     addi    4,4,8
 701     beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
 702     b       L(du2_loop)
 703     .align 4
 704 L(du2_1dw):
 705     sldi     0,6, 16
 706     srdi     8,7, 64-16
 707     addi    5,5,16
 708     or      0,0,8
 709     bf      31,L(du2_loop)   /* r0 is not stored here; the loop recomputes it.  */
 710     mr      6,7
 711     ld      7,0(5)
 712     addi    5,5,8
 713     std     0,0(4)
 714     addi    4,4,8
 715     .align 4
 716 /* copy 32 bytes at a time */
 717 L(du2_loop):
 718     sldi   0,6, 16
 719     srdi   8,7, 64-16
 720     or    0,0,8
 721     ld    6,0(5)
 722     std   0,0(4)
 723     sldi   0,7, 16
 724     srdi   8,6, 64-16
 725     or    0,0,8
 726     ld    7,8(5)
 727     std   0,8(4)
 728     sldi   0,6, 16
 729     srdi   8,7, 64-16
 730     or    0,0,8
 731     ld    6,16(5)
 732     std   0,16(4)
 733     sldi   0,7, 16
 734     srdi   8,6, 64-16
 735     or    0,0,8
 736     ld    7,24(5)
 737     std   0,24(4)
 738     addi  5,5,32
 739     addi  4,4,32
 740     bdnz+ L(du2_loop)
 741     .align 4
 742 L(du2_fini):
 743     /* calculate and store the final DW */
 744     sldi   0,6, 16
 745     srdi   8,7, 64-16
 746     or    0,0,8
 747     std   0,0(4)
 748     b     L(du_done)
 749
 750     .align 4
 751 L(du3_do):                  /* Source misaligned by 3 bytes: merge with 24-bit shifts.  */
 752     bf      30,L(du3_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 753
 754     /* there are at least two DWs to copy */
 755     sldi     0,6, 24
 756     srdi     8,7, 64-24
 757     or      0,0,8
 758     ld      6,16(5)
 759     std     0,0(4)
 760     sldi     0,7, 24
 761     srdi     8,6, 64-24
 762     or      0,0,8
 763     ld      7,24(5)
 764     std     0,8(4)
 765     addi    4,4,16
 766     addi    5,5,32
 767     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
 768     bf      31,L(du3_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
 769     /* there is a third DW to copy */
 770     sldi     0,6, 24
 771     srdi     8,7, 64-24
 772     or      0,0,8
 773     std     0,0(4)
 774     mr      6,7
 775     ld      7,0(5)
 776     addi    5,5,8
 777     addi    4,4,8
 778     beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
 779     b       L(du3_loop)
 780     .align 4
 781 L(du3_1dw):
 782     sldi     0,6, 24
 783     srdi     8,7, 64-24
 784     addi    5,5,16
 785     or      0,0,8
 786     bf      31,L(du3_loop)   /* r0 is not stored here; the loop recomputes it.  */
 787     mr      6,7
 788     ld      7,0(5)
 789     addi    5,5,8
 790     std     0,0(4)
 791     addi    4,4,8
 792     .align 4
 793 /* copy 32 bytes at a time */
 794 L(du3_loop):
 795     sldi   0,6, 24
 796     srdi   8,7, 64-24
 797     or    0,0,8
 798     ld    6,0(5)
 799     std   0,0(4)
 800     sldi   0,7, 24
 801     srdi   8,6, 64-24
 802     or    0,0,8
 803     ld    7,8(5)
 804     std   0,8(4)
 805     sldi   0,6, 24
 806     srdi   8,7, 64-24
 807     or    0,0,8
 808     ld    6,16(5)
 809     std   0,16(4)
 810     sldi   0,7, 24
 811     srdi   8,6, 64-24
 812     or    0,0,8
 813     ld    7,24(5)
 814     std   0,24(4)
 815     addi  5,5,32
 816     addi  4,4,32
 817     bdnz+ L(du3_loop)
 818     .align 4
 819 L(du3_fini):
 820     /* calculate and store the final DW */
 821     sldi   0,6, 24
 822     srdi   8,7, 64-24
 823     or    0,0,8
 824     std   0,0(4)
 825     b     L(du_done)
 826
 827     .align 4
 828 L(du4_do):                  /* Misalignment is 4 or more: pick the 4/5/6/7 variant.  */
 829     cmpldi  cr5, 10, 6      /* cr5: source misalignment (r10) versus 6.  */
 830     beq     cr0, L(du4_dox) /* cr0: misalignment is exactly 4.  */
 831     blt     cr5, L(du5_do)
 832     beq     cr5, L(du6_do)
 833     b       L(du7_do)
 834 L(du4_dox):                 /* Source misaligned by 4 bytes: merge with 32-bit shifts.  */
 835     bf      30,L(du4_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 836
 837     /* there are at least two DWs to copy */
 838     sldi     0,6, 32
 839     srdi     8,7, 64-32
 840     or      0,0,8
 841     ld      6,16(5)
 842     std     0,0(4)
 843     sldi     0,7, 32
 844     srdi     8,6, 64-32
 845     or      0,0,8
 846     ld      7,24(5)
 847     std     0,8(4)
 848     addi    4,4,16
 849     addi    5,5,32
 850     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
 851     bf      31,L(du4_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
 852     /* there is a third DW to copy */
 853     sldi     0,6, 32
 854     srdi     8,7, 64-32
 855     or      0,0,8
 856     std     0,0(4)
 857     mr      6,7
 858     ld      7,0(5)
 859     addi    5,5,8
 860     addi    4,4,8
 861     beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
 862     b       L(du4_loop)
 863     .align 4
 864 L(du4_1dw):
 865     sldi     0,6, 32
 866     srdi     8,7, 64-32
 867     addi    5,5,16
 868     or      0,0,8
 869     bf      31,L(du4_loop)   /* r0 is not stored here; the loop recomputes it.  */
 870     mr      6,7
 871     ld      7,0(5)
 872     addi    5,5,8
 873     std     0,0(4)
 874     addi    4,4,8
 875     .align 4
 876 /* copy 32 bytes at a time */
 877 L(du4_loop):
 878     sldi   0,6, 32
 879     srdi   8,7, 64-32
 880     or    0,0,8
 881     ld    6,0(5)
 882     std   0,0(4)
 883     sldi   0,7, 32
 884     srdi   8,6, 64-32
 885     or    0,0,8
 886     ld    7,8(5)
 887     std   0,8(4)
 888     sldi   0,6, 32
 889     srdi   8,7, 64-32
 890     or    0,0,8
 891     ld    6,16(5)
 892     std   0,16(4)
 893     sldi   0,7, 32
 894     srdi   8,6, 64-32
 895     or    0,0,8
 896     ld    7,24(5)
 897     std   0,24(4)
 898     addi  5,5,32
 899     addi  4,4,32
 900     bdnz+ L(du4_loop)
 901     .align 4
 902 L(du4_fini):
 903     /* calculate and store the final DW */
 904     sldi   0,6, 32
 905     srdi   8,7, 64-32
 906     or    0,0,8
 907     std   0,0(4)
 908     b     L(du_done)
 909
 910     .align 4
 911 L(du5_do):                  /* Source misaligned by 5 bytes: merge with 40-bit shifts.  */
 912     bf      30,L(du5_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 913
 914     /* there are at least two DWs to copy */
 915     sldi     0,6, 40
 916     srdi     8,7, 64-40
 917     or      0,0,8
 918     ld      6,16(5)
 919     std     0,0(4)
 920     sldi     0,7, 40
 921     srdi     8,6, 64-40
 922     or      0,0,8
 923     ld      7,24(5)
 924     std     0,8(4)
 925     addi    4,4,16
 926     addi    5,5,32
 927     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
 928     bf      31,L(du5_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
 929     /* there is a third DW to copy */
 930     sldi     0,6, 40
 931     srdi     8,7, 64-40
 932     or      0,0,8
 933     std     0,0(4)
 934     mr      6,7
 935     ld      7,0(5)
 936     addi    5,5,8
 937     addi    4,4,8
 938     beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
 939     b       L(du5_loop)
 940     .align 4
 941 L(du5_1dw):
 942     sldi     0,6, 40
 943     srdi     8,7, 64-40
 944     addi    5,5,16
 945     or      0,0,8
 946     bf      31,L(du5_loop)   /* r0 is not stored here; the loop recomputes it.  */
 947     mr      6,7
 948     ld      7,0(5)
 949     addi    5,5,8
 950     std     0,0(4)
 951     addi    4,4,8
 952     .align 4
 953 /* copy 32 bytes at a time */
 954 L(du5_loop):
 955     sldi   0,6, 40
 956     srdi   8,7, 64-40
 957     or    0,0,8
 958     ld    6,0(5)
 959     std   0,0(4)
 960     sldi   0,7, 40
 961     srdi   8,6, 64-40
 962     or    0,0,8
 963     ld    7,8(5)
 964     std   0,8(4)
 965     sldi   0,6, 40
 966     srdi   8,7, 64-40
 967     or    0,0,8
 968     ld    6,16(5)
 969     std   0,16(4)
 970     sldi   0,7, 40
 971     srdi   8,6, 64-40
 972     or    0,0,8
 973     ld    7,24(5)
 974     std   0,24(4)
 975     addi  5,5,32
 976     addi  4,4,32
 977     bdnz+ L(du5_loop)
 978     .align 4
 979 L(du5_fini):
 980     /* calculate and store the final DW */
 981     sldi   0,6, 40
 982     srdi   8,7, 64-40
 983     or    0,0,8
 984     std   0,0(4)
 985     b     L(du_done)
 986
 987     .align 4
 988 L(du6_do):                  /* Source misaligned by 6 bytes: merge with 48-bit shifts.  */
 989     bf      30,L(du6_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
 990
 991     /* there are at least two DWs to copy */
 992     sldi     0,6, 48
 993     srdi     8,7, 64-48
 994     or      0,0,8
 995     ld      6,16(5)
 996     std     0,0(4)
 997     sldi     0,7, 48
 998     srdi     8,6, 64-48
 999     or      0,0,8
1000     ld      7,24(5)
1001     std     0,8(4)
1002     addi    4,4,16
1003     addi    5,5,32
1004     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
1005     bf      31,L(du6_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
1006     /* there is a third DW to copy */
1007     sldi     0,6, 48
1008     srdi     8,7, 64-48
1009     or      0,0,8
1010     std     0,0(4)
1011     mr      6,7
1012     ld      7,0(5)
1013     addi    5,5,8
1014     addi    4,4,8
1015     beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
1016     b       L(du6_loop)
1017     .align 4
1018 L(du6_1dw):
1019     sldi     0,6, 48
1020     srdi     8,7, 64-48
1021     addi    5,5,16
1022     or      0,0,8
1023     bf      31,L(du6_loop)   /* r0 is not stored here; the loop recomputes it.  */
1024     mr      6,7
1025     ld      7,0(5)
1026     addi    5,5,8
1027     std     0,0(4)
1028     addi    4,4,8
1029     .align 4
1030 /* copy 32 bytes at a time */
1031 L(du6_loop):
1032     sldi   0,6, 48
1033     srdi   8,7, 64-48
1034     or    0,0,8
1035     ld    6,0(5)
1036     std   0,0(4)
1037     sldi   0,7, 48
1038     srdi   8,6, 64-48
1039     or    0,0,8
1040     ld    7,8(5)
1041     std   0,8(4)
1042     sldi   0,6, 48
1043     srdi   8,7, 64-48
1044     or    0,0,8
1045     ld    6,16(5)
1046     std   0,16(4)
1047     sldi   0,7, 48
1048     srdi   8,6, 64-48
1049     or    0,0,8
1050     ld    7,24(5)
1051     std   0,24(4)
1052     addi  5,5,32
1053     addi  4,4,32
1054     bdnz+ L(du6_loop)
1055     .align 4
1056 L(du6_fini):
1057     /* calculate and store the final DW */
1058     sldi   0,6, 48
1059     srdi   8,7, 64-48
1060     or    0,0,8
1061     std   0,0(4)
1062     b     L(du_done)
1063
1064     .align 4
1065 L(du7_do):                  /* Source misaligned by 7 bytes: merge with 56-bit shifts.  */
1066     bf      30,L(du7_1dw)   /* cr7 bit 30 is bit 1 of (DW count - 1).  */
1067
1068     /* there are at least two DWs to copy */
1069     sldi     0,6, 56
1070     srdi     8,7, 64-56
1071     or      0,0,8
1072     ld      6,16(5)
1073     std     0,0(4)
1074     sldi     0,7, 56
1075     srdi     8,6, 64-56
1076     or      0,0,8
1077     ld      7,24(5)
1078     std     0,8(4)
1079     addi    4,4,16
1080     addi    5,5,32
1081     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
1082     bf      31,L(du7_loop)   /* cr7 bit 31 is bit 0 of (DW count - 1).  */
1083     /* there is a third DW to copy */
1084     sldi     0,6, 56
1085     srdi     8,7, 64-56
1086     or      0,0,8
1087     std     0,0(4)
1088     mr      6,7
1089     ld      7,0(5)
1090     addi    5,5,8
1091     addi    4,4,8
1092     beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
1093     b       L(du7_loop)
1094     .align 4
1095 L(du7_1dw):
1096     sldi     0,6, 56
1097     srdi     8,7, 64-56
1098     addi    5,5,16
1099     or      0,0,8
1100     bf      31,L(du7_loop)   /* r0 is not stored here; the loop recomputes it.  */
1101     mr      6,7
1102     ld      7,0(5)
1103     addi    5,5,8
1104     std     0,0(4)
1105     addi    4,4,8
1106     .align 4
1107 /* copy 32 bytes at a time */
1108 L(du7_loop):
1109     sldi   0,6, 56
1110     srdi   8,7, 64-56
1111     or    0,0,8
1112     ld    6,0(5)
1113     std   0,0(4)
1114     sldi   0,7, 56
1115     srdi   8,6, 64-56
1116     or    0,0,8
1117     ld    7,8(5)
1118     std   0,8(4)
1119     sldi   0,6, 56
1120     srdi   8,7, 64-56
1121     or    0,0,8
1122     ld    6,16(5)
1123     std   0,16(4)
1124     sldi   0,7, 56
1125     srdi   8,6, 64-56
1126     or    0,0,8
1127     ld    7,24(5)
1128     std   0,24(4)
1129     addi  5,5,32
1130     addi  4,4,32
1131     bdnz+ L(du7_loop)
1132     .align 4
1133 L(du7_fini):
1134     /* calculate and store the final DW */
1135     sldi   0,6, 56
1136     srdi   8,7, 64-56
1137     or    0,0,8
1138     std   0,0(4)
1139     b     L(du_done)
1140
1141     .align 4
1142 L(du_done):
1143     rldicr 0,31,0,60    /* r0 = r31 with the low 3 bits cleared: bytes copied as DWs.  */
1144     mtcrf 0x01,31       /* cr7 = low 4 bits of the saved length, for the tail tests.  */
1145     beq   cr1,0f        /* If the tail is 0 bytes we are done!  */
1146
1147     add   3,3,0         /* Advance dst and the saved source pointer past the  */
1148     add   12,12,0       /* doublewords already copied.  */
1149 /*  At this point we have a tail of 0-7 bytes and we know that the
1150     destination is double word aligned.  */
1151 4:  bf    29,2f         /* cr7 bit 29 clear: no word in the tail.  */
1152     lwz   6,0(12)
1153     addi  12,12,4
1154     stw   6,0(3)
1155     addi  3,3,4
1156 2:  bf    30,1f         /* cr7 bit 30 clear: no halfword in the tail.  */
1157     lhz   6,0(12)
1158     addi  12,12,2
1159     sth   6,0(3)
1160     addi  3,3,2
1161 1:  bf    31,0f         /* cr7 bit 31 clear: no final byte.  */
1162     lbz   6,0(12)
1163     stb   6,0(3)
1164 0:
1165   /* Return original dst pointer.  */
1166     ld 31,-8(1)         /* Restore r31, saved at function entry.  */
1167     ld 3,-16(1)
1168     blr
1169 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
1170 libc_hidden_builtin_def (memcpy)