sysdeps/powerpc/powerpc64/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.
  35
  36    For POWER6 unaligned loads will take a 20+ cycle hiccup for any
  37    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
  38    is more forgiving and does not take a hiccup until page or
  39    segment boundaries.  So we require doubleword alignment for
  40    the source but may take a risk and only require word alignment
  41    for the destination.  */
  42
  43         .machine        "power6"
  44 EALIGN (memcpy, 7, 0)
  45         CALL_MCOUNT 3
  46     /* On entry r3 = dst, r4 = src, r5 = len; dispatch on length and alignment.  */
  47     cmpldi cr1,5,31     /* cr1 = len compared with 31.  */
  48     neg   0,3
  49     std   3,-16(1)      /* Save dst below the stack pointer; every exit reloads it as the return value.  */
  50     std   31,-8(1)      /* Save r31 below the stack pointer; the .L6 path uses r31 as scratch.  */
  51     andi. 11,3,7        /* check alignment of dst.  */
  52     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  53     clrldi 10,4,61      /* check alignment of src.  */
  54     cmpldi cr6,5,8      /* cr6 = len compared with 8; consumed by .L2 and .LE8.  */
  55     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  56     mtcrf 0x01,0        /* cr7 = low 4 bits of r0, the byte count needed to align dst.  */
  57     cmpld cr6,10,11     /* cr6 eq iff src and dst share the same offset within a doubleword, so it stays valid after dst is aligned.  */
  58     srdi  9,5,3         /* Number of full double words remaining.  */
  59     beq   .L0           /* cr0 from the andi. above: dst is already doubleword aligned.  */
  60
  61     subf  5,0,5         /* len -= bytes moved below to align dst.  */
  62   /* Move 0-7 bytes as needed to get the destination doubleword aligned.
  63      Duplicate some code to maximize fall-through and minimize agen delays.  */
  64 1:  bf    31,2f         /* cr7 bit 31 clear: no single byte to move.  */
  65     lbz   6,0(4)
  66     stb   6,0(3)
  67     bf    30,5f         /* cr7 bit 30 clear: no halfword to move.  */
  68     lhz   6,1(4)
  69     sth   6,1(3)
  70     bf    29,0f         /* cr7 bit 29 clear: no word to move.  */
  71     lwz   6,3(4)        /* Offset 3 = the byte and halfword already moved; r3/r4 only advance at 0: below.  */
  72     stw   6,3(3)
  73     b     0f
  74 5:
  75     bf    29,0f         /* Byte moved, no halfword: optional word at offset 1.  */
  76     lwz   6,1(4)
  77     stw   6,1(3)
  78     b     0f
  79
  80 2:  bf    30,4f         /* No byte moved: optional halfword at offset 0.  */
  81     lhz   6,0(4)
  82     sth   6,0(3)
  83     bf    29,0f
  84     lwz   6,2(4)
  85     stw   6,2(3)
  86     b     0f
  87
  88 4:  bf    29,0f         /* Neither byte nor halfword: optional word at offset 0.  */
  89     lwz   6,0(4)
  90     stw   6,0(3)
  91 0:
  92 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
  93     add   4,4,0
  94     add   3,3,0
  95
  96     clrldi 10,4,61      /* check alignment of src again.  */
  97     srdi  9,5,3 /* Number of full double words remaining.  */
  98
  99   /* Copy doublewords from source to destination, assuming the
 100      destination is aligned on a doubleword boundary.
 101
 102      At this point we know there are at least 25 bytes left (32-7) to copy.
 103      The next step is to determine if the source is also doubleword aligned.
 104      If not branch to the unaligned move code at .L6, which uses
 105      a load, shift, store strategy.
 106
 107      Otherwise source and destination are doubleword aligned, and we can
 108      use the optimized doubleword copy loop.  */
 109     .align  4
 110 .L0:
 111     clrldi  11,5,61     /* r11 = len & 7, the byte tail left after all doublewords.  */
 112     andi.   0,5,0x78    /* cr0 eq iff len bits 3-6 are clear, i.e. no 1-15 doubleword remainder.  */
 113     srdi    12,5,7      /* Number of 128-byte blocks to move.  */
 114     cmpldi  cr1,11,0    /* If the tail is 0 bytes  */
 115     bne-    cr6,.L6     /* If source is not DW aligned.  */
 116
 117   /* Move doublewords where destination and source are DW aligned.
 118      Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
 119      If the copy is not an exact multiple of 128 bytes, 1-15
 120      doublewords are copied as needed to set up the main loop.  After
 121      the main loop exits there may be a tail of 1-7 bytes. These bytes
 122      are copied a word/halfword/byte at a time as needed to preserve
 123      alignment.
 124
 125      For POWER6 the L1 is store-through and the L2 is store-in.  The
 126      L2 is clocked at half CPU clock so we can store 16 bytes every
 127      other cycle.  POWER6 also has a load/store bypass so we can do
 128      load, load, store, store every 2 cycles.
 129
 130      The following code is sensitive to cache line alignment.  Do not
 131      make any change without first making sure they don't result in
 132      splitting ld/std pairs across a cache line.  */
 133
 134     mtcrf 0x02,5        /* cr6 = len bits weighted 128/64/32/16 (the alignment result in cr6 was consumed above).  */
 135     mtcrf 0x01,5        /* cr7 = len bits weighted 8/4/2/1.  */
 136     cmpldi  cr5,12,1    /* cr5 = 128-byte block count compared with 1.  */
 137     beq   L(das_loop)   /* cr0 from the andi. above: no doubleword remainder; with len >= 25 that implies at least one 128-byte block.  */
 138
 139     bf    25,4f         /* cr6 bit 25 (len & 64) clear: skip the 64-byte move.  */
 140     .align  3
 141     ld    6,0(4)
 142     ld    7,8(4)
 143     mr    11,4          /* r11/r10 keep the src/dst bases so later accesses do not wait on the addi updates.  */
 144     mr    10,3
 145     std   6,0(3)
 146     std   7,8(3)
 147     ld    6,16(4)
 148     ld    7,24(4)
 149     std   6,16(3)
 150     std   7,24(3)
 151     ld    6,0+32(4)
 152     ld    7,8+32(4)
 153     addi  4,4,64
 154     addi  3,3,64
 155     std   6,0+32(10)
 156     std   7,8+32(10)
 157     ld    6,16+32(11)
 158     ld    7,24+32(11)
 159     std   6,16+32(10)
 160     std   7,24+32(10)
 161 4:
 162     mr    10,3          /* r10 = dst base for the 32/16/8-byte moves below.  */
 163     bf    26,2f         /* cr6 bit 26 (len & 32) clear: skip the 32-byte move.  */
 164     ld    6,0(4)
 165     ld    7,8(4)
 166     mr    11,4          /* r11 = src base from before the 32-byte move.  */
 167     nop
 168     std   6,0(3)
 169     std   7,8(3)
 170     ld    6,16(4)
 171     ld    7,24(4)
 172     addi  4,4,32
 173     std   6,16(3)
 174     std   7,24(3)
 175     addi  3,3,32
 176 6:
 177     nop
 178     bf    27,5f         /* cr6 bit 27 (len & 16) clear: skip the 16-byte move.  */
 179     ld    6,0+32(11)    /* Offsets include +32 because r10/r11 still point before the 32-byte move.  */
 180     ld    7,8+32(11)
 181     addi  4,4,16
 182     addi  3,3,16
 183     std   6,0+32(10)
 184     std   7,8+32(10)
 185     bf    28,L(das_loop_s)  /* cr7 bit 28 (len & 8) clear: no single doubleword left.  */
 186     ld    0,16+32(11)
 187     addi  4,4,8
 188     addi  3,3,8
 189     std   0,16+32(10)
 190     blt   cr5,L(das_tail)   /* No 128-byte blocks (count < 1): only the byte tail remains.  */
 191     b     L(das_loop)
 192     .align  3
 193 5:
 194     nop
 195     bf    28,L(das_loop_s)  /* 32 bytes moved, no 16: optional doubleword at base+32.  */
 196     ld    6,32(11)
 197     addi  4,4,8
 198     addi  3,3,8
 199     std   6,32(10)
 200     blt   cr5,L(das_tail)
 201     b     L(das_loop)
 202     .align  3
 203 2:
 204     mr    11,4          /* No 32-byte move: r11 = current src, r10 = current dst (set at 4: above).  */
 205     bf    27,1f
 206     ld    6,0(4)
 207     ld    7,8(4)
 208     addi  4,4,16
 209     addi  3,3,16
 210     std   6,0(10)
 211     std   7,8(10)
 212     bf    28,L(das_loop_s)
 213     ld    0,16(11)
 214     addi  4,11,24       /* Recompute src/dst from the saved bases: 16 + 8 bytes moved.  */
 215     addi  3,10,24
 216     std   0,16(10)
 217     blt   cr5,L(das_tail)
 218     b     L(das_loop)
 219     .align  3
 220 1:
 221     nop
 222     bf    28,L(das_loop_s)  /* No 32- or 16-byte move: optional single doubleword.  */
 223     ld    6,0(4)
 224     addi  4,4,8
 225     addi  3,3,8
 226     std   6,0(10)
 227 L(das_loop_s):
 228     nop
 229     blt   cr5,L(das_tail)   /* No 128-byte blocks to move; otherwise fall into the main loop.  */
 230     .align  4
 231 L(das_loop):          /* Move the first 128-byte block; r12 = number of 128-byte blocks.  */
 232     ld    6,0(4)
 233     ld    7,8(4)
 234     mr    10,3          /* r10/r11 = dst/src bases, used after r3/r4 are advanced below.  */
 235     mr    11,4
 236     std   6,0(3)
 237     std   7,8(3)
 238     addi  12,12,-1      /* Blocks remaining after this one; loaded into ctr if any are left.  */
 239     nop
 240     ld    8,16(4)
 241     ld    0,24(4)
 242     std   8,16(3)
 243     std   0,24(3)
 244
 245     ld    6,0+32(4)
 246     ld    7,8+32(4)
 247     std   6,0+32(3)
 248     std   7,8+32(3)
 249     ld    8,16+32(4)
 250     ld    0,24+32(4)
 251     std   8,16+32(3)
 252     std   0,24+32(3)
 253
 254     ld    6,0+64(11)
 255     ld    7,8+64(11)
 256     std   6,0+64(10)
 257     std   7,8+64(10)
 258     ld    8,16+64(11)
 259     ld    0,24+64(11)
 260     std   8,16+64(10)
 261     std   0,24+64(10)
 262
 263     ld    6,0+96(11)
 264     ld    7,8+96(11)
 265     addi  4,4,128
 266     addi  3,3,128
 267     std   6,0+96(10)
 268     std   7,8+96(10)
 269     ld    8,16+96(11)
 270     ld    0,24+96(11)
 271     std   8,16+96(10)
 272     std   0,24+96(10)
 273     ble   cr5,L(das_loop_e)   /* cr5 compared the original block count with 1: that was the only block.  */
 274
 275     mtctr   12          /* ctr = remaining 128-byte blocks (r12 was decremented in the first block).  */
 276     .align  4
 277 L(das_loop2):         /* Move one 128-byte block per iteration.  */
 278     ld    6,0(4)
 279     ld    7,8(4)
 280     mr    10,3          /* r10/r11 = dst/src bases, used after r3/r4 are advanced below.  */
 281     mr    11,4
 282     std   6,0(3)
 283     std   7,8(3)
 284     ld    8,16(4)
 285     ld    0,24(4)
 286     std   8,16(3)
 287     std   0,24(3)
 288
 289     ld    6,0+32(4)
 290     ld    7,8+32(4)
 291     std   6,0+32(3)
 292     std   7,8+32(3)
 293     ld    8,16+32(4)
 294     ld    0,24+32(4)
 295     std   8,16+32(3)
 296     std   0,24+32(3)
 297
 298     ld    6,0+64(11)
 299     ld    7,8+64(11)
 300     std   6,0+64(10)
 301     std   7,8+64(10)
 302     ld    8,16+64(11)
 303     ld    0,24+64(11)
 304     std   8,16+64(10)
 305     std   0,24+64(10)
 306
 307     ld    6,0+96(11)
 308     ld    7,8+96(11)
 309     addi  4,4,128
 310     addi  3,3,128
 311     std   6,0+96(10)
 312     std   7,8+96(10)
 313     ld    8,16+96(11)
 314     ld    0,24+96(11)
 315     std   8,16+96(10)
 316     std   0,24+96(10)
 317     bdnz  L(das_loop2)
 318 L(das_loop_e):
 319 /* Check for a 1-7 byte tail, return if none.  */
 320     bne   cr1,L(das_tail2)    /* cr1 was set at .L0 from (len & 7) compared with 0.  */
 321 /* Return original dst pointer.  */
 322     ld 3,-16(1)
 323     blr
 324     .align  4
 325 L(das_tail):
 326     beq   cr1,0f        /* cr1 eq (set at .L0): the tail is 0 bytes, just return.  */
 327
 328 L(das_tail2):
 329 /*  At this point we have a tail of 0-7 bytes and we know that the
 330     destination is double word aligned.  */
 331 4:  bf    29,2f         /* cr7 bit 29 (len & 4) clear: no word to move.  */
 332     lwz   6,0(4)
 333     stw   6,0(3)
 334     bf    30,5f         /* cr7 bit 30 (len & 2) clear: no halfword to move.  */
 335     lhz   6,4(4)
 336     sth   6,4(3)
 337     bf    31,0f         /* cr7 bit 31 (len & 1) clear: no byte to move.  */
 338     lbz   6,6(4)
 339     stb   6,6(3)
 340     b     0f
 341 5:  bf    31,0f         /* Word moved, no halfword: optional byte at offset 4.  */
 342     lbz   6,4(4)
 343     stb   6,4(3)
 344     b     0f
 345
 346 2:  bf    30,1f         /* No word moved: optional halfword at offset 0.  */
 347     lhz   6,0(4)
 348     sth   6,0(3)
 349     bf    31,0f
 350     lbz   6,2(4)
 351     stb   6,2(3)
 352     b     0f
 353
 354 1:  bf    31,0f         /* Neither word nor halfword: optional byte at offset 0.  */
 355     lbz   6,0(4)
 356     stb   6,0(3)
 357 0:
 358   /* Return original dst pointer.  */
 359     ld 3,-16(1)
 360     blr
 361
 362 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 363    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 364    tests.
 365
 366    In the short (0-8 byte) case no attempt is made to force alignment
 367    of either source or destination.  The hardware will handle the
 368    unaligned load/stores with small delays for crossing 32-, 128-byte,
 369    and 4096-byte boundaries. Since these short moves are unlikely to be
 370    unaligned or cross these boundaries, the overhead to force
 371    alignment is not justified.
 372
 373    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 374    boundaries.  Since only loads are sensitive to the 32-/128-byte
 375    boundaries it is more important to align the source than the
 376    destination.  If the source is not already word aligned, we first
 377    move 1-3 bytes as needed.  Since we are only word aligned we don't
 378    use double word load/stores to ensure that all loads are aligned.
 379    While the destination and stores may still be unaligned, this
 380    is only an issue for page (4096 byte boundary) crossing, which
 381    should be rare for these short moves.  The hardware handles this
 382    case automatically with a small (~20 cycle) delay.  */
 383     .align  4
 384 .L2:
 385     mtcrf 0x01,5        /* cr7 = len bits weighted 8/4/2/1.  */
 386     neg   8,4
 387     clrrdi      11,4,2  /* r11 = src rounded down to a word boundary.  */
 388     andi. 0,8,3         /* r0 = bytes until src is word aligned; cr0 eq if it already is.  */
 389     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 390 /* At least 9 bytes left.  Get the source word aligned.  */
 391     cmpldi      cr1,5,16
 392     mr    10,5          /* r10 = bytes remaining.  */
 393     mr    12,4          /* r12 = working source pointer.  */
 394     cmpldi      cr6,0,2 /* cr6 = alignment byte count (1-3) compared with 2.  */
 395     beq   L(dus_tail)   /* If the source is already word aligned skip this.  */
 396 /* Copy 1-3 bytes to get source address word aligned.  */
 397     lwz   6,0(11)       /* Aligned word that contains the first 1-3 source bytes.  */
 398     subf  10,0,5        /* remaining = len - alignment bytes.  */
 399     add   12,4,0        /* r12 = src advanced to the next word boundary.  */
 400     blt   cr6,5f        /* 1 byte to move.  */
 401     srdi  7,6,16
 402     bgt   cr6,3f        /* 3 bytes to move.  */
 403     sth   6,0(3)        /* 2 bytes: the low-order halfword of r6; taking the trailing bytes from the low-order end assumes big-endian byte order.  */
 404     b     7f
 405     .align  4
 406 3:
 407     stb   7,0(3)        /* 3 bytes: low byte of (r6 >> 16), then the low-order halfword of r6.  */
 408     sth   6,1(3)
 409     b     7f
 410     .align  4
 411 5:
 412     stb   6,0(3)        /* 1 byte: the low-order byte of r6.  */
 413 7:
 414     cmpldi      cr1,10,16   /* Redo the 16-byte test against the reduced count.  */
 415     add   3,3,0         /* Advance dst by the bytes just stored.  */
 416     mtcrf 0x01,10       /* cr7 = low 4 bits of the reduced count.  */
 417     .align  4
 418 L(dus_tail):
 419 /* At least 6 bytes left and the source is word aligned.  This allows
 420    some speculative loads up front.  */
 421 /* We need to special case the fall-through because the biggest delays
 422    are due to address computation not being ready in time for the
 423    AGEN.  */
 424     lwz   6,0(12)
 425     lwz   7,4(12)
 426     blt   cr1,L(dus_tail8)    /* cr1 = remaining (r10) compared with 16.  */
 427     cmpldi      cr0,10,24
 428 L(dus_tail16): /* Move 16 bytes.  */
 429     stw   6,0(3)
 430     stw   7,4(3)
 431     lwz   6,8(12)
 432     lwz   7,12(12)
 433     stw   6,8(3)
 434     stw   7,12(3)
 435 /* Move 8 bytes more.  */
 436     bf    28,L(dus_tail16p8)  /* cr7 bit 28 (remaining & 8) clear.  */
 437     cmpldi      cr1,10,28
 438     lwz   6,16(12)
 439     lwz   7,20(12)
 440     stw   6,16(3)
 441     stw   7,20(3)
 442 /* Move 4 bytes more.  */
 443     bf    29,L(dus_tail16p4)  /* cr7 bit 29 (remaining & 4) clear.  */
 444     lwz   6,24(12)
 445     stw   6,24(3)
 446     addi  12,12,28
 447     addi  3,3,28
 448     bgt   cr1,L(dus_tail2)    /* More than 28 bytes: 1-3 bytes still to move.  */
 449  /* exactly 28 bytes.  Return original dst pointer and exit.  */
 450     ld    3,-16(1)
 451     blr
 452     .align  4
 453 L(dus_tail16p8):  /* less than 8 bytes left.  */
 454     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
 455     cmpldi      cr1,10,20
 456     bf    29,L(dus_tail16p2)
 457 /* Move 4 bytes more.  */
 458     lwz   6,16(12)
 459     stw   6,16(3)
 460     addi  12,12,20
 461     addi  3,3,20
 462     bgt   cr1,L(dus_tail2)    /* More than 20 bytes: 1-3 bytes still to move.  */
 463  /* exactly 20 bytes.  Return original dst pointer and exit.  */
 464     ld    3,-16(1)
 465     blr
 466     .align  4
 467 L(dus_tail16p4):  /* less than 4 bytes left.  */
 468     addi  12,12,24
 469     addi  3,3,24
 470     bgt   cr0,L(dus_tail2)    /* cr0 = remaining compared with 24, set before L(dus_tail16).  */
 471  /* exactly 24 bytes.  Return original dst pointer and exit.  */
 472     ld    3,-16(1)
 473     blr
 474     .align  4
 475 L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 476     addi  12,12,16
 477     addi  3,3,16
 478     b     L(dus_tail2)
 479
 480     .align  4
 481 L(dus_tail8):  /* Move 8 bytes.  */
 482 /*  r6, r7 already loaded speculatively.  */
 483     cmpldi      cr1,10,8
 484     cmpldi      cr0,10,12
 485     bf    28,L(dus_tail4)     /* cr7 bit 28 (remaining & 8) clear: fewer than 8 bytes left.  */
 486     .align  2
 487     stw   6,0(3)
 488     stw   7,4(3)
 489 /* Move 4 bytes more.  */
 490     bf    29,L(dus_tail8p4)   /* cr7 bit 29 (remaining & 4) clear.  */
 491     lwz   6,8(12)
 492     stw   6,8(3)
 493     addi  12,12,12
 494     addi  3,3,12
 495     bgt   cr0,L(dus_tail2)    /* More than 12 bytes: 1-3 bytes still to move.  */
 496  /* exactly 12 bytes.  Return original dst pointer and exit.  */
 497     ld    3,-16(1)
 498     blr
 499     .align  4
 500 L(dus_tail8p4):  /* less than 4 bytes left.  */
 501     addi  12,12,8
 502     addi  3,3,8
 503     bgt   cr1,L(dus_tail2)    /* More than 8 bytes: 1-3 bytes still to move.  */
 504  /* exactly 8 bytes.  Return original dst pointer and exit.  */
 505     ld    3,-16(1)
 506     blr
 507
 508     .align  4
 509 L(dus_tail4):  /* Move 4 bytes.  */
 510 /*  r6 already loaded speculatively.  If we are here we know there is
 511     more than 4 bytes left.  So there is no need to test.  */
 512     addi  12,12,4
 513     stw   6,0(3)
 514     addi  3,3,4
 515 L(dus_tail2):  /* Move 2-3 bytes.  */
 516     bf    30,L(dus_tail1)     /* cr7 bit 30 (count & 2) clear: at most one byte left.  */
 517     lhz   6,0(12)
 518     sth   6,0(3)
 519     bf    31,L(dus_tailX)     /* cr7 bit 31 (count & 1) clear: done.  */
 520     lbz   7,2(12)
 521     stb   7,2(3)
 522     ld 3,-16(1)
 523     blr
 524 L(dus_tail1):  /* Move 1 byte.  */
 525     bf    31,L(dus_tailX)
 526     lbz   6,0(12)
 527     stb   6,0(3)
 528 L(dus_tailX):
 529   /* Return original dst pointer.  */
 530     ld    3,-16(1)
 531     blr
 532
 533 /* Special case to copy 0-8 bytes.  */
 534     .align  4
 535 .LE8:
 536     mr    12,4          /* L(dus_tail2) reads the source through r12.  */
 537     bne   cr6,L(dus_4)  /* cr6 = len compared with 8, set at function entry.  */
 538 /* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
 539    cycle delay.  This case should be rare and any attempt to avoid this
 540    would take most of 20 cycles any way.  */
 541     ld   6,0(4)
 542     std   6,0(3)
 543   /* Return original dst pointer.  */
 544     ld    3,-16(1)
 545     blr
 546     .align  4
 547 L(dus_4):
 548     bf    29,L(dus_tail2)     /* cr7 bit 29 (len & 4) clear: 0-3 bytes, use the shared tail code.  */
 549     lwz   6,0(4)
 550     stw   6,0(3)
 551     bf    30,L(dus_5)         /* cr7 bit 30 (len & 2) clear: no halfword to move.  */
 552     lhz   7,4(4)
 553     sth   7,4(3)
 554     bf    31,L(dus_0)         /* cr7 bit 31 (len & 1) clear: no byte to move.  */
 555     lbz   8,6(4)
 556     stb   8,6(3)
 557     ld 3,-16(1)
 558     blr
 559     .align  4
 560 L(dus_5):
 561     bf    31,L(dus_0)         /* Word moved, no halfword: optional byte at offset 4.  */
 562     lbz   6,4(4)
 563     stb   6,4(3)
 564 L(dus_0):
 565   /* Return original dst pointer.  */
 566     ld    3,-16(1)
 567     blr
 568
 569     .align  4
 570 .L6:
 571     cfi_offset(31,-8)   /* r31 was stored at -8(r1) on entry; it is overwritten below.  */
 572     mr    12,4          /* r12 = src at the start of the doubleword region (used again at L(du_done)).  */
 573     mr    31,5          /* r31 = len; r5 is reused as the aligned source pointer.  */
 574   /* Copy doublewords where the destination is aligned but the source is
 575      not.  Use aligned doubleword loads from the source, shifted to realign
 576      the data, to allow aligned destination stores.  */
 577     addi    11,9,-1  /* loop DW count is one less than total */
 578     subf    5,10,12  /* Move source addr to previous full double word.  */
 579     cmpldi  cr5, 10, 2       /* r10 = src offset within its doubleword (1-7 here).  */
 580     cmpldi  cr0, 10, 4
 581     mr      4,3      /* r4 = working dst pointer; r3 stays at the region start for L(du_done).  */
 582     srdi    8,11,2   /* calculate the 32 byte loop count */
 583     ld      6,0(5)   /* pre load 1st full doubleword.  */
 584     mtcrf   0x01,11  /* cr7 = low 4 bits of (DW count - 1); bits 30/31 are tested by the du*_do code.  */
 585     cmpldi  cr6,9,4  /* cr6 = total DW count compared with 4.  */
 586     mtctr   8
 587     ld      7,8(5)   /* pre load 2nd full doubleword.  */
 588     bge     cr0, L(du4_do)   /* Offsets 4-7 are dispatched further at L(du4_do).  */
 589     blt     cr5, L(du1_do)   /* offset 1 */
 590     beq     cr5, L(du2_do)   /* offset 2 */
 591     b       L(du3_do)        /* offset 3 */
 592
 593     .align 4
 594 L(du1_do):          /* src is 1 byte past a doubleword boundary.  */
 595     bf      30,L(du1_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 596
 597     /* there are at least two DWs to copy */
 598     sldi     0,6, 8          /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 599     srdi     8,7, 64-8
 600     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 601     ld      6,16(5)
 602     std     0,0(4)
 603     sldi     0,7, 8
 604     srdi     8,6, 64-8
 605     or      0,0,8
 606     ld      7,24(5)
 607     std     0,8(4)
 608     addi    4,4,16
 609     addi    5,5,32
 610     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
 611     bf      31,L(du1_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
 612     /* there is a third DW to copy */
 613     sldi     0,6, 8
 614     srdi     8,7, 64-8
 615     or      0,0,8
 616     std     0,0(4)
 617     mr      6,7
 618     ld      7,0(5)
 619     addi    5,5,8
 620     addi    4,4,8
 621     beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
 622     b       L(du1_loop)
 623     .align 4
 624 L(du1_1dw):
 625     sldi     0,6, 8
 626     srdi     8,7, 64-8
 627     addi    5,5,16
 628     or      0,0,8
 629     bf      31,L(du1_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
 630     mr      6,7
 631     ld      7,0(5)
 632     addi    5,5,8
 633     std     0,0(4)
 634     addi    4,4,8
 635     .align 4
 636 /* copy 32 bytes at a time */
 637 L(du1_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
 638     sldi   0,6, 8
 639     srdi   8,7, 64-8
 640     or    0,0,8
 641     ld    6,0(5)
 642     std   0,0(4)
 643     sldi   0,7, 8
 644     srdi   8,6, 64-8
 645     or    0,0,8
 646     ld    7,8(5)
 647     std   0,8(4)
 648     sldi   0,6, 8
 649     srdi   8,7, 64-8
 650     or    0,0,8
 651     ld    6,16(5)
 652     std   0,16(4)
 653     sldi   0,7, 8
 654     srdi   8,6, 64-8
 655     or    0,0,8
 656     ld    7,24(5)
 657     std   0,24(4)
 658     addi  5,5,32
 659     addi  4,4,32
 660     bdnz+ L(du1_loop)
 661     .align 4
 662 L(du1_fini):
 663     /* calculate and store the final DW */
 664     sldi   0,6, 8
 665     srdi   8,7, 64-8
 666     or    0,0,8
 667     std   0,0(4)
 668     b     L(du_done)
 669
 670     .align 4
 671 L(du2_do):          /* src is 2 bytes past a doubleword boundary.  */
 672     bf      30,L(du2_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 673
 674     /* there are at least two DWs to copy */
 675     sldi     0,6, 16         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 676     srdi     8,7, 64-16
 677     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 678     ld      6,16(5)
 679     std     0,0(4)
 680     sldi     0,7, 16
 681     srdi     8,6, 64-16
 682     or      0,0,8
 683     ld      7,24(5)
 684     std     0,8(4)
 685     addi    4,4,16
 686     addi    5,5,32
 687     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
 688     bf      31,L(du2_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
 689     /* there is a third DW to copy */
 690     sldi     0,6, 16
 691     srdi     8,7, 64-16
 692     or      0,0,8
 693     std     0,0(4)
 694     mr      6,7
 695     ld      7,0(5)
 696     addi    5,5,8
 697     addi    4,4,8
 698     beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
 699     b       L(du2_loop)
 700     .align 4
 701 L(du2_1dw):
 702     sldi     0,6, 16
 703     srdi     8,7, 64-16
 704     addi    5,5,16
 705     or      0,0,8
 706     bf      31,L(du2_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
 707     mr      6,7
 708     ld      7,0(5)
 709     addi    5,5,8
 710     std     0,0(4)
 711     addi    4,4,8
 712     .align 4
 713 /* copy 32 bytes at a time */
 714 L(du2_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
 715     sldi   0,6, 16
 716     srdi   8,7, 64-16
 717     or    0,0,8
 718     ld    6,0(5)
 719     std   0,0(4)
 720     sldi   0,7, 16
 721     srdi   8,6, 64-16
 722     or    0,0,8
 723     ld    7,8(5)
 724     std   0,8(4)
 725     sldi   0,6, 16
 726     srdi   8,7, 64-16
 727     or    0,0,8
 728     ld    6,16(5)
 729     std   0,16(4)
 730     sldi   0,7, 16
 731     srdi   8,6, 64-16
 732     or    0,0,8
 733     ld    7,24(5)
 734     std   0,24(4)
 735     addi  5,5,32
 736     addi  4,4,32
 737     bdnz+ L(du2_loop)
 738     .align 4
 739 L(du2_fini):
 740     /* calculate and store the final DW */
 741     sldi   0,6, 16
 742     srdi   8,7, 64-16
 743     or    0,0,8
 744     std   0,0(4)
 745     b     L(du_done)
 746
 747     .align 4
 748 L(du3_do):          /* src is 3 bytes past a doubleword boundary.  */
 749     bf      30,L(du3_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 750
 751     /* there are at least two DWs to copy */
 752     sldi     0,6, 24         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 753     srdi     8,7, 64-24
 754     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 755     ld      6,16(5)
 756     std     0,0(4)
 757     sldi     0,7, 24
 758     srdi     8,6, 64-24
 759     or      0,0,8
 760     ld      7,24(5)
 761     std     0,8(4)
 762     addi    4,4,16
 763     addi    5,5,32
 764     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
 765     bf      31,L(du3_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
 766     /* there is a third DW to copy */
 767     sldi     0,6, 24
 768     srdi     8,7, 64-24
 769     or      0,0,8
 770     std     0,0(4)
 771     mr      6,7
 772     ld      7,0(5)
 773     addi    5,5,8
 774     addi    4,4,8
 775     beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
 776     b       L(du3_loop)
 777     .align 4
 778 L(du3_1dw):
 779     sldi     0,6, 24
 780     srdi     8,7, 64-24
 781     addi    5,5,16
 782     or      0,0,8
 783     bf      31,L(du3_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
 784     mr      6,7
 785     ld      7,0(5)
 786     addi    5,5,8
 787     std     0,0(4)
 788     addi    4,4,8
 789     .align 4
 790 /* copy 32 bytes at a time */
 791 L(du3_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
 792     sldi   0,6, 24
 793     srdi   8,7, 64-24
 794     or    0,0,8
 795     ld    6,0(5)
 796     std   0,0(4)
 797     sldi   0,7, 24
 798     srdi   8,6, 64-24
 799     or    0,0,8
 800     ld    7,8(5)
 801     std   0,8(4)
 802     sldi   0,6, 24
 803     srdi   8,7, 64-24
 804     or    0,0,8
 805     ld    6,16(5)
 806     std   0,16(4)
 807     sldi   0,7, 24
 808     srdi   8,6, 64-24
 809     or    0,0,8
 810     ld    7,24(5)
 811     std   0,24(4)
 812     addi  5,5,32
 813     addi  4,4,32
 814     bdnz+ L(du3_loop)
 815     .align 4
 816 L(du3_fini):
 817     /* calculate and store the final DW */
 818     sldi   0,6, 24
 819     srdi   8,7, 64-24
 820     or    0,0,8
 821     std   0,0(4)
 822     b     L(du_done)
 823
 824     .align 4
 825 L(du4_do):          /* src offset within its doubleword (r10) is 4-7: dispatch further.  */
 826     cmpldi  cr5, 10, 6
 827     beq     cr0, L(du4_dox)  /* cr0 = offset compared with 4, set at .L6.  */
 828     blt     cr5, L(du5_do)   /* offset 5 */
 829     beq     cr5, L(du6_do)   /* offset 6 */
 830     b       L(du7_do)        /* offset 7 */
 831 L(du4_dox):         /* src is 4 bytes past a doubleword boundary.  */
 832     bf      30,L(du4_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 833
 834     /* there are at least two DWs to copy */
 835     sldi     0,6, 32         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 836     srdi     8,7, 64-32
 837     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 838     ld      6,16(5)
 839     std     0,0(4)
 840     sldi     0,7, 32
 841     srdi     8,6, 64-32
 842     or      0,0,8
 843     ld      7,24(5)
 844     std     0,8(4)
 845     addi    4,4,16
 846     addi    5,5,32
 847     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
 848     bf      31,L(du4_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
 849     /* there is a third DW to copy */
 850     sldi     0,6, 32
 851     srdi     8,7, 64-32
 852     or      0,0,8
 853     std     0,0(4)
 854     mr      6,7
 855     ld      7,0(5)
 856     addi    5,5,8
 857     addi    4,4,8
 858     beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
 859     b       L(du4_loop)
 860     .align 4
 861 L(du4_1dw):
 862     sldi     0,6, 32
 863     srdi     8,7, 64-32
 864     addi    5,5,16
 865     or      0,0,8
 866     bf      31,L(du4_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
 867     mr      6,7
 868     ld      7,0(5)
 869     addi    5,5,8
 870     std     0,0(4)
 871     addi    4,4,8
 872     .align 4
 873 /* copy 32 bytes at a time */
 874 L(du4_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
 875     sldi   0,6, 32
 876     srdi   8,7, 64-32
 877     or    0,0,8
 878     ld    6,0(5)
 879     std   0,0(4)
 880     sldi   0,7, 32
 881     srdi   8,6, 64-32
 882     or    0,0,8
 883     ld    7,8(5)
 884     std   0,8(4)
 885     sldi   0,6, 32
 886     srdi   8,7, 64-32
 887     or    0,0,8
 888     ld    6,16(5)
 889     std   0,16(4)
 890     sldi   0,7, 32
 891     srdi   8,6, 64-32
 892     or    0,0,8
 893     ld    7,24(5)
 894     std   0,24(4)
 895     addi  5,5,32
 896     addi  4,4,32
 897     bdnz+ L(du4_loop)
 898     .align 4
 899 L(du4_fini):
 900     /* calculate and store the final DW */
 901     sldi   0,6, 32
 902     srdi   8,7, 64-32
 903     or    0,0,8
 904     std   0,0(4)
 905     b     L(du_done)
 906
 907     .align 4
 908 L(du5_do):          /* src is 5 bytes past a doubleword boundary.  */
 909     bf      30,L(du5_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 910
 911     /* there are at least two DWs to copy */
 912     sldi     0,6, 40         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 913     srdi     8,7, 64-40
 914     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 915     ld      6,16(5)
 916     std     0,0(4)
 917     sldi     0,7, 40
 918     srdi     8,6, 64-40
 919     or      0,0,8
 920     ld      7,24(5)
 921     std     0,8(4)
 922     addi    4,4,16
 923     addi    5,5,32
 924     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
 925     bf      31,L(du5_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
 926     /* there is a third DW to copy */
 927     sldi     0,6, 40
 928     srdi     8,7, 64-40
 929     or      0,0,8
 930     std     0,0(4)
 931     mr      6,7
 932     ld      7,0(5)
 933     addi    5,5,8
 934     addi    4,4,8
 935     beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
 936     b       L(du5_loop)
 937     .align 4
 938 L(du5_1dw):
 939     sldi     0,6, 40
 940     srdi     8,7, 64-40
 941     addi    5,5,16
 942     or      0,0,8
 943     bf      31,L(du5_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
 944     mr      6,7
 945     ld      7,0(5)
 946     addi    5,5,8
 947     std     0,0(4)
 948     addi    4,4,8
 949     .align 4
 950 /* copy 32 bytes at a time */
 951 L(du5_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
 952     sldi   0,6, 40
 953     srdi   8,7, 64-40
 954     or    0,0,8
 955     ld    6,0(5)
 956     std   0,0(4)
 957     sldi   0,7, 40
 958     srdi   8,6, 64-40
 959     or    0,0,8
 960     ld    7,8(5)
 961     std   0,8(4)
 962     sldi   0,6, 40
 963     srdi   8,7, 64-40
 964     or    0,0,8
 965     ld    6,16(5)
 966     std   0,16(4)
 967     sldi   0,7, 40
 968     srdi   8,6, 64-40
 969     or    0,0,8
 970     ld    7,24(5)
 971     std   0,24(4)
 972     addi  5,5,32
 973     addi  4,4,32
 974     bdnz+ L(du5_loop)
 975     .align 4
 976 L(du5_fini):
 977     /* calculate and store the final DW */
 978     sldi   0,6, 40
 979     srdi   8,7, 64-40
 980     or    0,0,8
 981     std   0,0(4)
 982     b     L(du_done)
 983
 984     .align 4
 985 L(du6_do):          /* src is 6 bytes past a doubleword boundary.  */
 986     bf      30,L(du6_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
 987
 988     /* there are at least two DWs to copy */
 989     sldi     0,6, 48         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
 990     srdi     8,7, 64-48
 991     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
 992     ld      6,16(5)
 993     std     0,0(4)
 994     sldi     0,7, 48
 995     srdi     8,6, 64-48
 996     or      0,0,8
 997     ld      7,24(5)
 998     std     0,8(4)
 999     addi    4,4,16
1000     addi    5,5,32
1001     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
1002     bf      31,L(du6_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
1003     /* there is a third DW to copy */
1004     sldi     0,6, 48
1005     srdi     8,7, 64-48
1006     or      0,0,8
1007     std     0,0(4)
1008     mr      6,7
1009     ld      7,0(5)
1010     addi    5,5,8
1011     addi    4,4,8
1012     beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
1013     b       L(du6_loop)
1014     .align 4
1015 L(du6_1dw):
1016     sldi     0,6, 48
1017     srdi     8,7, 64-48
1018     addi    5,5,16
1019     or      0,0,8
1020     bf      31,L(du6_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
1021     mr      6,7
1022     ld      7,0(5)
1023     addi    5,5,8
1024     std     0,0(4)
1025     addi    4,4,8
1026     .align 4
1027 /* copy 32 bytes at a time */
1028 L(du6_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
1029     sldi   0,6, 48
1030     srdi   8,7, 64-48
1031     or    0,0,8
1032     ld    6,0(5)
1033     std   0,0(4)
1034     sldi   0,7, 48
1035     srdi   8,6, 64-48
1036     or    0,0,8
1037     ld    7,8(5)
1038     std   0,8(4)
1039     sldi   0,6, 48
1040     srdi   8,7, 64-48
1041     or    0,0,8
1042     ld    6,16(5)
1043     std   0,16(4)
1044     sldi   0,7, 48
1045     srdi   8,6, 64-48
1046     or    0,0,8
1047     ld    7,24(5)
1048     std   0,24(4)
1049     addi  5,5,32
1050     addi  4,4,32
1051     bdnz+ L(du6_loop)
1052     .align 4
1053 L(du6_fini):
1054     /* calculate and store the final DW */
1055     sldi   0,6, 48
1056     srdi   8,7, 64-48
1057     or    0,0,8
1058     std   0,0(4)
1059     b     L(du_done)
1060
1061     .align 4
1062 L(du7_do):          /* src is 7 bytes past a doubleword boundary.  */
1063     bf      30,L(du7_1dw)    /* cr7 = low 4 bits of (DW count - 1), set at .L6.  */
1064
1065     /* there are at least two DWs to copy */
1066     sldi     0,6, 56         /* r6, r7 = adjacent aligned source DWs (preloaded at .L6).  */
1067     srdi     8,7, 64-56
1068     or      0,0,8            /* r0 = 8 source bytes merged across the DW boundary; this shift direction is correct for big-endian byte order.  */
1069     ld      6,16(5)
1070     std     0,0(4)
1071     sldi     0,7, 56
1072     srdi     8,6, 64-56
1073     or      0,0,8
1074     ld      7,24(5)
1075     std     0,8(4)
1076     addi    4,4,16
1077     addi    5,5,32
1078     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
1079     bf      31,L(du7_loop)   /* (DW count - 1) is even: no extra DW before the loop.  */
1080     /* there is a third DW to copy */
1081     sldi     0,6, 56
1082     srdi     8,7, 64-56
1083     or      0,0,8
1084     std     0,0(4)
1085     mr      6,7
1086     ld      7,0(5)
1087     addi    5,5,8
1088     addi    4,4,8
1089     beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
1090     b       L(du7_loop)
1091     .align 4
1092 L(du7_1dw):
1093     sldi     0,6, 56
1094     srdi     8,7, 64-56
1095     addi    5,5,16
1096     or      0,0,8
1097     bf      31,L(du7_loop)   /* If taken, r0 is not stored here; the loop recomputes it from r6/r7.  */
1098     mr      6,7
1099     ld      7,0(5)
1100     addi    5,5,8
1101     std     0,0(4)
1102     addi    4,4,8
1103     .align 4
1104 /* copy 32 bytes at a time */
1105 L(du7_loop):        /* ctr = (DW count - 1) / 4, set at .L6; r5 = next aligned source DW, r4 = next dst DW.  */
1106     sldi   0,6, 56
1107     srdi   8,7, 64-56
1108     or    0,0,8
1109     ld    6,0(5)
1110     std   0,0(4)
1111     sldi   0,7, 56
1112     srdi   8,6, 64-56
1113     or    0,0,8
1114     ld    7,8(5)
1115     std   0,8(4)
1116     sldi   0,6, 56
1117     srdi   8,7, 64-56
1118     or    0,0,8
1119     ld    6,16(5)
1120     std   0,16(4)
1121     sldi   0,7, 56
1122     srdi   8,6, 64-56
1123     or    0,0,8
1124     ld    7,24(5)
1125     std   0,24(4)
1126     addi  5,5,32
1127     addi  4,4,32
1128     bdnz+ L(du7_loop)
1129     .align 4
1130 L(du7_fini):
1131     /* calculate and store the final DW */
1132     sldi   0,6, 56
1133     srdi   8,7, 64-56
1134     or    0,0,8
1135     std   0,0(4)
1136     b     L(du_done)
1137
1138     .align 4
1139 L(du_done):         /* All full doublewords are stored; finish the 0-7 byte tail.  */
1140     rldicr 0,31,0,60    /* r0 = len (kept in r31 since .L6) with the low 3 bits cleared.  */
1141     mtcrf 0x01,31       /* cr7 = low 4 bits of len; bits 29/30/31 select the 4/2/1 byte moves.  */
1142     beq   cr1,0f        /* If the tail is 0 bytes we are done!  */
1143
1144     add   3,3,0         /* r3 and r12 still point at the start of the doubleword region; skip past it.  */
1145     add   12,12,0
1146 /*  At this point we have a tail of 0-7 bytes and we know that the
1147     destination is double word aligned.  */
1148 4:  bf    29,2f
1149     lwz   6,0(12)
1150     addi  12,12,4
1151     stw   6,0(3)
1152     addi  3,3,4
1153 2:  bf    30,1f
1154     lhz   6,0(12)
1155     addi  12,12,2
1156     sth   6,0(3)
1157     addi  3,3,2
1158 1:  bf    31,0f
1159     lbz   6,0(12)
1160     stb   6,0(3)
1161 0:
1162   /* Return original dst pointer.  */
1163     ld 31,-8(1)         /* Restore the caller's r31 saved at function entry.  */
1164     ld 3,-16(1)
1165     blr
1166 END_GEN_TB (memcpy,TB_TOCLESS)
1167 libc_hidden_builtin_def (memcpy)