sysdeps/powerpc/powerpc64/power7/memmove.S

   1 /* Optimized memmove implementation for PowerPC64/POWER7.
   2    Copyright (C) 2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21
  22 /* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
  23
  24    This optimization check if memory 'dest'  overlaps with 'src'. If it does
  25    not then it calls an optimized memcpy call (similar to memcpy for POWER7,
  26    embedded here to gain some cycles).
  27    If source and destiny overlaps, a optimized backwards memcpy is used
  28    instead.  */
  29
  30         .machine power7
  31 EALIGN (memmove, 5, 0)
  32         CALL_MCOUNT 3
  33
  34 L(_memmove):
  35         subf    r9,r4,r3
  36         cmpld   cr7,r9,r5
  37         blt     cr7,L(memmove_bwd)
  38
  39         cmpldi  cr1,r5,31
  40         neg     0,3
  41         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  42                                        code.  */
  43
  44         andi.   10,3,15
  45         clrldi  11,4,60
  46         cmpld   cr6,10,11       /* SRC and DST alignments match?  */
  47
  48         mr      r11,3
  49         bne     cr6,L(copy_GE_32_unaligned)
  50         beq     L(aligned_copy)
  51
  52         mtocrf  0x01,0
  53         clrldi  0,0,60
  54
  55 /* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
  56 1:
  57         bf      31,2f
  58         lbz     6,0(r4)
  59         addi    r4,r4,1
  60         stb     6,0(r11)
  61         addi    r11,r11,1
  62 2:
  63         bf      30,4f
  64         lhz     6,0(r4)
  65         addi    r4,r4,2
  66         sth     6,0(r11)
  67         addi    r11,r11,2
  68 4:
  69         bf      29,8f
  70         lwz     6,0(r4)
  71         addi    r4,r4,4
  72         stw     6,0(r11)
  73         addi    r11,r11,4
  74 8:
  75         bf      28,16f
  76         ld      6,0(r4)
  77         addi    r4,r4,8
  78         std     6,0(r11)
  79         addi    r11,r11,8
  80 16:
  81         subf    r5,0,r5
  82
  83 /* Main aligned copy loop. Copies 128 bytes at a time. */
  84 L(aligned_copy):
  85         li      6,16
  86         li      7,32
  87         li      8,48
  88         mtocrf  0x02,r5
  89         srdi    12,r5,7
  90         cmpdi   12,0
  91         beq     L(aligned_tail)
  92         lxvd2x  6,0,r4
  93         lxvd2x  7,r4,6
  94         mtctr   12
  95         b       L(aligned_128loop)
  96
  97         .align  4
  98 L(aligned_128head):
  99         /* for the 2nd + iteration of this loop. */
 100         lxvd2x  6,0,r4
 101         lxvd2x  7,r4,6
 102 L(aligned_128loop):
 103         lxvd2x  8,r4,7
 104         lxvd2x  9,r4,8
 105         stxvd2x 6,0,r11
 106         addi    r4,r4,64
 107         stxvd2x 7,r11,6
 108         stxvd2x 8,r11,7
 109         stxvd2x 9,r11,8
 110         lxvd2x  6,0,r4
 111         lxvd2x  7,r4,6
 112         addi    r11,r11,64
 113         lxvd2x  8,r4,7
 114         lxvd2x  9,r4,8
 115         addi    r4,r4,64
 116         stxvd2x 6,0,r11
 117         stxvd2x 7,r11,6
 118         stxvd2x 8,r11,7
 119         stxvd2x 9,r11,8
 120         addi    r11,r11,64
 121         bdnz    L(aligned_128head)
 122
 123 L(aligned_tail):
 124         mtocrf  0x01,r5
 125         bf      25,32f
 126         lxvd2x  6,0,r4
 127         lxvd2x  7,r4,6
 128         lxvd2x  8,r4,7
 129         lxvd2x  9,r4,8
 130         addi    r4,r4,64
 131         stxvd2x 6,0,r11
 132         stxvd2x 7,r11,6
 133         stxvd2x 8,r11,7
 134         stxvd2x 9,r11,8
 135         addi    r11,r11,64
 136 32:
 137         bf      26,16f
 138         lxvd2x  6,0,r4
 139         lxvd2x  7,r4,6
 140         addi    r4,r4,32
 141         stxvd2x 6,0,r11
 142         stxvd2x 7,r11,6
 143         addi    r11,r11,32
 144 16:
 145         bf      27,8f
 146         lxvd2x  6,0,r4
 147         addi    r4,r4,16
 148         stxvd2x 6,0,r11
 149         addi    r11,r11,16
 150 8:
 151         bf      28,4f
 152         ld      6,0(r4)
 153         addi    r4,r4,8
 154         std     6,0(r11)
 155         addi    r11,r11,8
 156 4:      /* Copies 4~7 bytes.  */
 157         bf      29,L(tail2)
 158         lwz     6,0(r4)
 159         stw     6,0(r11)
 160         bf      30,L(tail5)
 161         lhz     7,4(r4)
 162         sth     7,4(r11)
 163         bflr    31
 164         lbz     8,6(r4)
 165         stb     8,6(r11)
 166         /* Return original DST pointer.  */
 167         blr
 168
 169 /* Handle copies of 0~31 bytes.  */
 170         .align  4
 171 L(copy_LT_32):
 172         mr      r11,3
 173         cmpldi  cr6,r5,8
 174         mtocrf  0x01,r5
 175         ble     cr6,L(copy_LE_8)
 176
 177         /* At least 9 bytes to go.  */
 178         neg     8,4
 179         andi.   0,8,3
 180         cmpldi  cr1,r5,16
 181         beq     L(copy_LT_32_aligned)
 182
 183         /* Force 4-byte alignment for SRC.  */
 184         mtocrf  0x01,0
 185         subf    r5,0,r5
 186 2:
 187         bf      30,1f
 188         lhz     6,0(r4)
 189         addi    r4,r4,2
 190         sth     6,0(r11)
 191         addi    r11,r11,2
 192 1:
 193         bf      31,L(end_4bytes_alignment)
 194         lbz     6,0(r4)
 195         addi    r4,r4,1
 196         stb     6,0(r11)
 197         addi    r11,r11,1
 198
 199         .align  4
 200 L(end_4bytes_alignment):
 201         cmpldi  cr1,r5,16
 202         mtocrf  0x01,r5
 203
 204 L(copy_LT_32_aligned):
 205         /* At least 6 bytes to go, and SRC is word-aligned.  */
 206         blt     cr1,8f
 207
 208         /* Copy 16 bytes.  */
 209         lwz     6,0(r4)
 210         lwz     7,4(r4)
 211         stw     6,0(r11)
 212         lwz     8,8(r4)
 213         stw     7,4(r11)
 214         lwz     6,12(r4)
 215         addi    r4,r4,16
 216         stw     8,8(r11)
 217         stw     6,12(r11)
 218         addi    r11,r11,16
 219 8:      /* Copy 8 bytes.  */
 220         bf      28,L(tail4)
 221         lwz     6,0(r4)
 222         lwz     7,4(r4)
 223         addi    r4,r4,8
 224         stw     6,0(r11)
 225         stw     7,4(r11)
 226         addi    r11,r11,8
 227
 228         .align  4
 229 /* Copies 4~7 bytes.  */
 230 L(tail4):
 231         bf      29,L(tail2)
 232         lwz     6,0(r4)
 233         stw     6,0(r11)
 234         bf      30,L(tail5)
 235         lhz     7,4(r4)
 236         sth     7,4(r11)
 237         bflr    31
 238         lbz     8,6(r4)
 239         stb     8,6(r11)
 240         /* Return original DST pointer.  */
 241         blr
 242
 243         .align  4
 244 /* Copies 2~3 bytes.  */
 245 L(tail2):
 246         bf      30,1f
 247         lhz     6,0(r4)
 248         sth     6,0(r11)
 249         bflr    31
 250         lbz     7,2(r4)
 251         stb     7,2(r11)
 252         blr
 253
 254         .align  4
 255 L(tail5):
 256         bflr    31
 257         lbz     6,4(r4)
 258         stb     6,4(r11)
 259         blr
 260
 261         .align  4
 262 1:
 263         bflr    31
 264         lbz     6,0(r4)
 265         stb     6,0(r11)
 266         /* Return original DST pointer.  */
 267         blr
 268
 269 /* Handles copies of 0~8 bytes.  */
 270         .align  4
 271 L(copy_LE_8):
 272         bne     cr6,L(tail4)
 273
 274         /* Though we could've used ld/std here, they are still
 275         slow for unaligned cases.  */
 276
 277         lwz     6,0(r4)
 278         lwz     7,4(r4)
 279         stw     6,0(r11)
 280         stw     7,4(r11)
 281         blr
 282
 283
 284 /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
 285    SRC is not.  Use aligned quadword loads from SRC, shifted to realign
 286    the data, allowing for aligned DST stores.  */
 287         .align  4
 288 L(copy_GE_32_unaligned):
 289         clrldi  0,0,60        /* Number of bytes until the 1st r11 quadword.  */
 290         srdi    9,r5,4        /* Number of full quadwords remaining.  */
 291
 292         beq     L(copy_GE_32_unaligned_cont)
 293
 294         /* DST is not quadword aligned, get it aligned.  */
 295
 296         mtocrf  0x01,0
 297         subf    r5,0,r5
 298
 299         /* Vector instructions work best when proper alignment (16-bytes)
 300         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 301 1:
 302         bf      31,2f
 303         lbz     6,0(r4)
 304         addi    r4,r4,1
 305         stb     6,0(r11)
 306         addi    r11,r11,1
 307 2:
 308         bf      30,4f
 309         lhz     6,0(r4)
 310         addi    r4,r4,2
 311         sth     6,0(r11)
 312         addi    r11,r11,2
 313 4:
 314         bf      29,8f
 315         lwz     6,0(r4)
 316         addi    r4,r4,4
 317         stw     6,0(r11)
 318         addi    r11,r11,4
 319 8:
 320         bf      28,0f
 321         ld      6,0(r4)
 322         addi    r4,r4,8
 323         std     6,0(r11)
 324         addi    r11,r11,8
 325 0:
 326         srdi    9,r5,4        /* Number of full quadwords remaining.  */
 327
 328         /* The proper alignment is present, it is OK to copy the bytes now.  */
 329 L(copy_GE_32_unaligned_cont):
 330
 331         /* Setup two indexes to speed up the indexed vector operations.  */
 332         clrldi  10,r5,60
 333         li      6,16          /* Index for 16-bytes offsets.  */
 334         li      7,32          /* Index for 32-bytes offsets.  */
 335         cmpldi  cr1,10,0
 336         srdi    8,r5,5        /* Setup the loop counter.  */
 337         mtocrf  0x01,9
 338         cmpldi  cr6,9,1
 339 #ifdef __LITTLE_ENDIAN__
 340         lvsr    5,0,r4
 341 #else
 342         lvsl    5,0,r4
 343 #endif
 344         lvx     3,0,r4
 345         li      0,0
 346         bf      31,L(setup_unaligned_loop)
 347
 348         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 349         lvx     4,r4,6
 350 #ifdef __LITTLE_ENDIAN__
 351         vperm   6,4,3,5
 352 #else
 353         vperm   6,3,4,5
 354 #endif
 355         addi    r4,r4,16
 356         stvx    6,0,r11
 357         addi    r11,r11,16
 358         vor     3,4,4
 359         clrrdi  0,r4,60
 360
 361 L(setup_unaligned_loop):
 362         mtctr   8
 363         ble     cr6,L(end_unaligned_loop)
 364
 365         /* Copy 32 bytes at a time using vector instructions.  */
 366         .align  4
 367 L(unaligned_loop):
 368
 369         /* Note: vr6/vr10 may contain data that was already copied,
 370         but in order to get proper alignment, we may have to copy
 371         some portions again. This is faster than having unaligned
 372         vector instructions though.  */
 373
 374         lvx     4,r4,6
 375 #ifdef __LITTLE_ENDIAN__
 376         vperm   6,4,3,5
 377 #else
 378         vperm   6,3,4,5
 379 #endif
 380         lvx     3,r4,7
 381 #ifdef __LITTLE_ENDIAN__
 382         vperm   10,3,4,5
 383 #else
 384         vperm   10,4,3,5
 385 #endif
 386         addi    r4,r4,32
 387         stvx    6,0,r11
 388         stvx    10,r11,6
 389         addi    r11,r11,32
 390         bdnz    L(unaligned_loop)
 391
 392         clrrdi  0,r4,60
 393
 394         .align  4
 395 L(end_unaligned_loop):
 396
 397         /* Check for tail bytes.  */
 398         mtocrf  0x01,r5
 399         beqlr   cr1
 400
 401         add     r4,r4,0
 402
 403         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 404         /* Copy 8 bytes.  */
 405         bf      28,4f
 406         lwz     6,0(r4)
 407         lwz     7,4(r4)
 408         addi    r4,r4,8
 409         stw     6,0(r11)
 410         stw     7,4(r11)
 411         addi    r11,r11,8
 412 4:      /* Copy 4~7 bytes.  */
 413         bf      29,L(tail2)
 414         lwz     6,0(r4)
 415         stw     6,0(r11)
 416         bf      30,L(tail5)
 417         lhz     7,4(r4)
 418         sth     7,4(r11)
 419         bflr    31
 420         lbz     8,6(r4)
 421         stb     8,6(r11)
 422         /* Return original DST pointer.  */
 423         blr
 424
 425         /* Start to memcpy backward implementation: the algorith first check if
 426            src and dest have the same alignment and if it does align both to 16
 427            bytes and copy using VSX instructions.
 428            If does not, align dest to 16 bytes and use VMX (altivec) instruction
 429            to read two 16 bytes at time, shift/permute the bytes read and write
 430            aligned to dest.  */
 431 L(memmove_bwd):
 432         cmpldi  cr1,r5,31
 433         /* Copy is done backwards: update the pointers and check alignment.  */
 434         add     r11,r3,r5
 435         add     r4,r4,r5
 436         mr      r0,r11
 437         ble     cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
 438                                            code.  */
 439
 440         andi.   r10,r11,15          /* Check if r11 is aligned to 16 bytes  */
 441         clrldi  r9,r4,60            /* Check if r4 is aligned to 16 bytes  */
 442         cmpld   cr6,r10,r9          /* SRC and DST alignments match?  */
 443
 444         bne     cr6,L(copy_GE_32_unaligned_bwd)
 445         beq     L(aligned_copy_bwd)
 446
 447         mtocrf  0x01,r0
 448         clrldi  r0,r0,60
 449
 450 /* Get the DST and SRC aligned to 16 bytes.  */
 451 1:
 452         bf      31,2f
 453         lbz     r6,-1(r4)
 454         subi    r4,r4,1
 455         stb     r6,-1(r11)
 456         subi    r11,r11,1
 457 2:
 458         bf      30,4f
 459         lhz     r6,-2(r4)
 460         subi    r4,r4,2
 461         sth     r6,-2(r11)
 462         subi    r11,r11,2
 463 4:
 464         bf      29,8f
 465         lwz     r6,-4(r4)
 466         subi    r4,r4,4
 467         stw     r6,-4(r11)
 468         subi    r11,r11,4
 469 8:
 470         bf      28,16f
 471         ld      r6,-8(r4)
 472         subi    r4,r4,8
 473         std     r6,-8(r11)
 474         subi    r11,r11,8
 475 16:
 476         subf    r5,0,r5
 477
 478 /* Main aligned copy loop. Copies 128 bytes at a time. */
 479 L(aligned_copy_bwd):
 480         li      r6,-16
 481         li      r7,-32
 482         li      r8,-48
 483         li      r9,-64
 484         mtocrf  0x02,r5
 485         srdi    r12,r5,7
 486         cmpdi   r12,0
 487         beq     L(aligned_tail_bwd)
 488         lxvd2x  v6,r4,r6
 489         lxvd2x  v7,r4,r7
 490         mtctr   12
 491         b       L(aligned_128loop_bwd)
 492
 493         .align  4
 494 L(aligned_128head_bwd):
 495         /* for the 2nd + iteration of this loop. */
 496         lxvd2x  v6,r4,r6
 497         lxvd2x  v7,r4,r7
 498 L(aligned_128loop_bwd):
 499         lxvd2x  v8,r4,r8
 500         lxvd2x  v9,r4,r9
 501         stxvd2x v6,r11,r6
 502         subi    r4,r4,64
 503         stxvd2x v7,r11,r7
 504         stxvd2x v8,r11,r8
 505         stxvd2x v9,r11,r9
 506         lxvd2x  v6,r4,r6
 507         lxvd2x  v7,r4,7
 508         subi    r11,r11,64
 509         lxvd2x  v8,r4,r8
 510         lxvd2x  v9,r4,r9
 511         subi    r4,r4,64
 512         stxvd2x v6,r11,r6
 513         stxvd2x v7,r11,r7
 514         stxvd2x v8,r11,r8
 515         stxvd2x v9,r11,r9
 516         subi    r11,r11,64
 517         bdnz    L(aligned_128head_bwd)
 518
 519 L(aligned_tail_bwd):
 520         mtocrf  0x01,r5
 521         bf      25,32f
 522         lxvd2x  v6,r4,r6
 523         lxvd2x  v7,r4,r7
 524         lxvd2x  v8,r4,r8
 525         lxvd2x  v9,r4,r9
 526         subi    r4,r4,64
 527         stxvd2x v6,r11,r6
 528         stxvd2x v7,r11,r7
 529         stxvd2x v8,r11,r8
 530         stxvd2x v9,r11,r9
 531         subi    r11,r11,64
 532 32:
 533         bf      26,16f
 534         lxvd2x  v6,r4,r6
 535         lxvd2x  v7,r4,r7
 536         subi    r4,r4,32
 537         stxvd2x v6,r11,r6
 538         stxvd2x v7,r11,r7
 539         subi    r11,r11,32
 540 16:
 541         bf      27,8f
 542         lxvd2x  v6,r4,r6
 543         subi    r4,r4,16
 544         stxvd2x v6,r11,r6
 545         subi    r11,r11,16
 546 8:
 547         bf      28,4f
 548         ld      r6,-8(r4)
 549         subi    r4,r4,8
 550         std     r6,-8(r11)
 551         subi    r11,r11,8
 552 4:      /* Copies 4~7 bytes.  */
 553         bf      29,L(tail2_bwd)
 554         lwz     r6,-4(r4)
 555         stw     r6,-4(r11)
 556         bf      30,L(tail5_bwd)
 557         lhz     r7,-6(r4)
 558         sth     r7,-6(r11)
 559         bflr    31
 560         lbz     r8,-7(r4)
 561         stb     r8,-7(r11)
 562         /* Return original DST pointer.  */
 563         blr
 564
 565 /* Handle copies of 0~31 bytes.  */
 566         .align  4
 567 L(copy_LT_32_bwd):
 568         cmpldi  cr6,r5,8
 569         mtocrf  0x01,r5
 570         ble     cr6,L(copy_LE_8_bwd)
 571
 572         /* At least 9 bytes to go.  */
 573         neg     r8,r4
 574         andi.   r0,r8,3
 575         cmpldi  cr1,r5,16
 576         beq     L(copy_LT_32_aligned_bwd)
 577
 578         /* Force 4-byte alignment for SRC.  */
 579         mtocrf  0x01,0
 580         subf    r5,0,r5
 581 2:
 582         bf      30,1f
 583         lhz     r6,-2(r4)
 584         subi    r4,r4,2
 585         sth     r6,-2(r11)
 586         subi    r11,r11,2
 587 1:
 588         bf      31,L(end_4bytes_alignment_bwd)
 589         lbz     6,-1(r4)
 590         subi    r4,r4,1
 591         stb     6,-1(r11)
 592         subi    r11,r11,1
 593
 594         .align  4
 595 L(end_4bytes_alignment_bwd):
 596         cmpldi  cr1,r5,16
 597         mtocrf  0x01,r5
 598
 599 L(copy_LT_32_aligned_bwd):
 600         /* At least 6 bytes to go, and SRC is word-aligned.  */
 601         blt     cr1,8f
 602
 603         /* Copy 16 bytes.  */
 604         lwz     r6,-4(r4)
 605         lwz     r7,-8(r4)
 606         stw     r6,-4(r11)
 607         lwz     r8,-12(r4)
 608         stw     r7,-8(r11)
 609         lwz     r6,-16(r4)
 610         subi    r4,r4,16
 611         stw     r8,-12(r11)
 612         stw     r6,-16(r11)
 613         subi    r11,r11,16
 614 8:      /* Copy 8 bytes.  */
 615         bf      28,L(tail4_bwd)
 616         lwz     r6,-4(r4)
 617         lwz     r7,-8(r4)
 618         subi    r4,r4,8
 619         stw     r6,-4(r11)
 620         stw     r7,-8(r11)
 621         subi    r11,r11,8
 622
 623         .align  4
 624 /* Copies 4~7 bytes.  */
 625 L(tail4_bwd):
 626         bf      29,L(tail2_bwd)
 627         lwz     6,-4(r4)
 628         stw     6,-4(r11)
 629         bf      30,L(tail5_bwd)
 630         lhz     7,-6(r4)
 631         sth     7,-6(r11)
 632         bflr    31
 633         lbz     8,-7(r4)
 634         stb     8,-7(r11)
 635         /* Return original DST pointer.  */
 636         blr
 637
 638         .align  4
 639 /* Copies 2~3 bytes.  */
 640 L(tail2_bwd):
 641         bf      30,1f
 642         lhz     6,-2(r4)
 643         sth     6,-2(r11)
 644         bflr    31
 645         lbz     7,-3(r4)
 646         stb     7,-3(r11)
 647         blr
 648
 649         .align  4
 650 L(tail5_bwd):
 651         bflr    31
 652         lbz     6,-5(r4)
 653         stb     6,-5(r11)
 654         blr
 655
 656         .align  4
 657 1:
 658         bflr    31
 659         lbz     6,-1(r4)
 660         stb     6,-1(r11)
 661         /* Return original DST pointer.  */
 662         blr
 663
 664
 665 /* Handles copies of 0~8 bytes.  */
 666         .align  4
 667 L(copy_LE_8_bwd):
 668         bne     cr6,L(tail4_bwd)
 669
 670         /* Though we could've used ld/std here, they are still
 671            slow for unaligned cases.  */
 672         lwz     6,-8(r4)
 673         lwz     7,-4(r4)
 674         stw     6,-8(r11)
 675         stw     7,-4(r11)
 676         blr
 677
 678
 679 /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
 680    SRC is not.  Use aligned quadword loads from SRC, shifted to realign
 681    the data, allowing for aligned DST stores.  */
 682         .align  4
 683 L(copy_GE_32_unaligned_bwd):
 684         andi.   r10,r11,15      /* Check alignment of DST against 16 bytes..  */
 685         srdi    r9,r5,4         /* Number of full quadwords remaining.  */
 686
 687         beq     L(copy_GE_32_unaligned_cont_bwd)
 688
 689         /* DST is not quadword aligned and r10 holds the address masked to
 690            compare alignments.  */
 691         mtocrf  0x01,r10
 692         subf    r5,r10,r5
 693
 694         /* Vector instructions work best when proper alignment (16-bytes)
 695         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 696 1:
 697         bf      31,2f
 698         lbz     r6,-1(r4)
 699         subi    r4,r4,1
 700         stb     r6,-1(r11)
 701         subi    r11,r11,1
 702 2:
 703         bf      30,4f
 704         lhz     r6,-2(r4)
 705         subi    r4,r4,2
 706         sth     r6,-2(r11)
 707         subi    r11,r11,2
 708 4:
 709         bf      29,8f
 710         lwz     r6,-4(r4)
 711         subi    r4,r4,4
 712         stw     r6,-4(r11)
 713         subi    r11,r11,4
 714 8:
 715         bf      28,0f
 716         ld      r6,-8(r4)
 717         subi    r4,r4,8
 718         std     r6,-8(r11)
 719         subi    r11,r11,8
 720 0:
 721         srdi    r9,r5,4       /* Number of full quadwords remaining.  */
 722
 723         /* The proper alignment is present, it is OK to copy the bytes now.  */
 724 L(copy_GE_32_unaligned_cont_bwd):
 725
 726         /* Setup two indexes to speed up the indexed vector operations.  */
 727         clrldi  r10,r5,60
 728         li      r6,-16        /* Index for 16-bytes offsets.  */
 729         li      r7,-32        /* Index for 32-bytes offsets.  */
 730         cmpldi  cr1,10,0
 731         srdi    r8,r5,5       /* Setup the loop counter.  */
 732         mtocrf  0x01,9
 733         cmpldi  cr6,r9,1
 734 #ifdef __LITTLE_ENDIAN__
 735         lvsr    v5,r0,r4
 736 #else
 737         lvsl    v5,r0,r4
 738 #endif
 739         lvx     v3,0,r4
 740         li      r0,0
 741         bf      31,L(setup_unaligned_loop_bwd)
 742
 743         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 744         lvx     v4,r4,r6
 745 #ifdef __LITTLE_ENDIAN__
 746         vperm   v6,v3,v4,v5
 747 #else
 748         vperm   v6,v4,v3,v5
 749 #endif
 750         subi    r4,r4,16
 751         stvx    v6,r11,r6
 752         subi    r11,r11,16
 753         vor     v3,v4,v4
 754         clrrdi  r0,r4,60
 755
 756 L(setup_unaligned_loop_bwd):
 757         mtctr   r8
 758         ble     cr6,L(end_unaligned_loop_bwd)
 759
 760         /* Copy 32 bytes at a time using vector instructions.  */
 761         .align  4
 762 L(unaligned_loop_bwd):
 763
 764         /* Note: vr6/vr10 may contain data that was already copied,
 765         but in order to get proper alignment, we may have to copy
 766         some portions again. This is faster than having unaligned
 767         vector instructions though.  */
 768
 769         lvx     v4,r4,r6
 770 #ifdef __LITTLE_ENDIAN__
 771         vperm   v6,v3,v4,v5
 772 #else
 773         vperm   v6,v4,v3,v5
 774 #endif
 775         lvx     v3,r4,r7
 776 #ifdef __LITTLE_ENDIAN__
 777         vperm   v10,v4,v3,v5
 778 #else
 779         vperm   v10,v3,v4,v5
 780 #endif
 781         subi    r4,r4,32
 782         stvx    v6,r11,r6
 783         stvx    v10,r11,r7
 784         subi    r11,r11,32
 785         bdnz    L(unaligned_loop_bwd)
 786
 787         clrrdi  r0,r4,60
 788
 789         .align  4
 790 L(end_unaligned_loop_bwd):
 791
 792         /* Check for tail bytes.  */
 793         mtocrf  0x01,r5
 794         beqlr   cr1
 795
 796         add     r4,r4,0
 797
 798         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 799         /* Copy 8 bytes.  */
 800         bf      28,4f
 801         lwz     r6,-4(r4)
 802         lwz     r7,-8(r4)
 803         subi    r4,r4,8
 804         stw     r6,-4(r11)
 805         stw     r7,-8(r11)
 806         subi    r11,r11,8
 807 4:      /* Copy 4~7 bytes.  */
 808         bf      29,L(tail2_bwd)
 809         lwz     r6,-4(r4)
 810         stw     r6,-4(r11)
 811         bf      30,L(tail5_bwd)
 812         lhz     r7,-6(r4)
 813         sth     r7,-6(r11)
 814         bflr    31
 815         lbz     r8,-7(r4)
 816         stb     r8,-7(r11)
 817         /* Return original DST pointer.  */
 818         blr
 819 END_GEN_TB (memmove, TB_TOCLESS)
 820 libc_hidden_builtin_def (memmove)
 821
 822
 823 /* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
 824    Implemented in this file to avoid linker create a stub function call
 825    in the branch to '_memmove'.  */
 826 ENTRY (bcopy)
 827         mr      r6,r3
 828         mr      r3,r4
 829         mr      r4,r6
 830         b       L(_memmove)
 831 END (bcopy)