sysdeps/powerpc/powerpc64/power7/memmove.S

   1 /* Optimized memmove implementation for PowerPC64/POWER7.
   2    Copyright (C) 2014-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21
  22 /* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
  23
  24    This optimization checks whether 'dest' starts inside ['src', 'src'+len).
  25    If it does not, an optimized forward copy is used (similar to memcpy for
  26    POWER7, embedded here to gain some cycles).
  27    If it does, source and destination overlap so that a forward copy would be
  28    unsafe, and an optimized backwards memcpy is used instead.  */
  29
  30 #ifndef MEMMOVE
  31 # define MEMMOVE memmove
  32 #endif
  33         .machine power7
  34 ENTRY_TOCLESS (MEMMOVE, 5)      /* In: r3 = dest, r4 = src, r5 = len.  */
  35         CALL_MCOUNT 3
  36
  37 L(_memmove):
  38         subf    r9,r4,r3        /* r9 = dest - src.  */
  39         cmpld   cr7,r9,r5       /* Unsigned compare of (dest - src) against len.  */
  40         blt     cr7,L(memmove_bwd)  /* dest starts inside [src, src+len): copy backwards.  */
  41
  42         cmpldi  cr1,r5,31
  43         neg     0,3             /* r0 = -dest; its low 4 bits are the distance to the next 16-byte boundary.  */
  44         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  45                                        code.  */
  46
  47         andi.   10,3,15         /* r10 = dest & 15; the record form also sets cr0 for the beq below.  */
  48         clrldi  11,4,60         /* r11 = src & 15 (only needed for the compare below).  */
  49         cmpld   cr6,10,11       /* SRC and DST alignments match?  */
  50
  51         mr      r11,3           /* r11 = working DST pointer; r3 is never written in this function.  */
  52         bne     cr6,L(copy_GE_32_unaligned)
  53         beq     L(aligned_copy) /* cr0 eq: dest & 15 is 0, so both pointers are already 16-byte aligned.  */
  54
  55         mtocrf  0x01,0          /* CR bits 28-31 = low 4 bits of r0.  */
  56         clrldi  0,0,60          /* r0 = number of bytes to copy to reach 16-byte alignment.  */
  57
  58 /* Get the DST and SRC aligned to 16 bytes: CR bits 31, 30, 29 and 28 select a 1, 2, 4 and 8 byte copy.  */
  59 1:
  60         bf      31,2f           /* Skip the single byte if bit 31 is clear.  */
  61         lbz     6,0(r4)
  62         addi    r4,r4,1
  63         stb     6,0(r11)
  64         addi    r11,r11,1
  65 2:
  66         bf      30,4f           /* Skip the halfword if bit 30 is clear.  */
  67         lhz     6,0(r4)
  68         addi    r4,r4,2
  69         sth     6,0(r11)
  70         addi    r11,r11,2
  71 4:
  72         bf      29,8f           /* Skip the word if bit 29 is clear.  */
  73         lwz     6,0(r4)
  74         addi    r4,r4,4
  75         stw     6,0(r11)
  76         addi    r11,r11,4
  77 8:
  78         bf      28,16f          /* Skip the doubleword if bit 28 is clear.  */
  79         ld      6,0(r4)
  80         addi    r4,r4,8
  81         std     6,0(r11)
  82         addi    r11,r11,8
  83 16:
  84         subf    r5,0,r5         /* len -= r0, the number of bytes just copied.  */
  85
  86 /* Main aligned copy loop. Copies 128 bytes at a time, as two 64-byte halves per iteration. */
  87 L(aligned_copy):
  88         li      6,16            /* r6, r7, r8 = index offsets 16, 32, 48 for lvx and stvx.  */
  89         li      7,32
  90         li      8,48
  91         mtocrf  0x02,r5         /* CR bits 24-27 = len bits for 128, 64, 32 and 16 (used by the tail).  */
  92         srdi    12,r5,7         /* r12 = len / 128 = number of loop iterations.  */
  93         cmpdi   12,0
  94         beq     L(aligned_tail) /* Fewer than 128 bytes left: only the tail is needed.  */
  95         lvx     6,0,r4          /* Preload the first 32 bytes for the first iteration.  */
  96         lvx     7,r4,6
  97         mtctr   12
  98         b       L(aligned_128loop)
  99
 100         .align  4
 101 L(aligned_128head):
 102         /* for the 2nd + iteration of this loop. */
 103         lvx     6,0,r4
 104         lvx     7,r4,6
 105 L(aligned_128loop):
 106         lvx     8,r4,7          /* Finish loading the first 64 bytes.  */
 107         lvx     9,r4,8
 108         stvx    6,0,r11
 109         addi    r4,r4,64
 110         stvx    7,r11,6
 111         stvx    8,r11,7
 112         stvx    9,r11,8
 113         lvx     6,0,r4          /* Second 64 bytes of this iteration.  */
 114         lvx     7,r4,6
 115         addi    r11,r11,64
 116         lvx     8,r4,7
 117         lvx     9,r4,8
 118         addi    r4,r4,64
 119         stvx    6,0,r11
 120         stvx    7,r11,6
 121         stvx    8,r11,7
 122         stvx    9,r11,8
 123         addi    r11,r11,64
 124         bdnz    L(aligned_128head)  /* Decrement CTR and loop while it is non-zero.  */
 125
 126 L(aligned_tail):                /* Copy the remaining len & 127 bytes, largest chunk first.  */
 127         mtocrf  0x01,r5         /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 128         bf      25,32f          /* Bit 25 clear: no 64-byte chunk.  */
 129         lvx     6,0,r4
 130         lvx     7,r4,6
 131         lvx     8,r4,7
 132         lvx     9,r4,8
 133         addi    r4,r4,64
 134         stvx    6,0,r11
 135         stvx    7,r11,6
 136         stvx    8,r11,7
 137         stvx    9,r11,8
 138         addi    r11,r11,64
 139 32:
 140         bf      26,16f          /* Bit 26 clear: no 32-byte chunk.  */
 141         lvx     6,0,r4
 142         lvx     7,r4,6
 143         addi    r4,r4,32
 144         stvx    6,0,r11
 145         stvx    7,r11,6
 146         addi    r11,r11,32
 147 16:
 148         bf      27,8f           /* Bit 27 clear: no 16-byte chunk.  */
 149         lvx     6,0,r4
 150         addi    r4,r4,16
 151         stvx    6,0,r11
 152         addi    r11,r11,16
 153 8:
 154         bf      28,4f           /* Bit 28 clear: no 8-byte chunk.  */
 155         ld      6,0(r4)
 156         addi    r4,r4,8
 157         std     6,0(r11)
 158         addi    r11,r11,8
 159 4:      /* Copies 4~7 bytes.  */
 160         bf      29,L(tail2)     /* Bit 29 clear: fewer than 4 bytes remain.  */
 161         lwz     6,0(r4)
 162         stw     6,0(r11)
 163         bf      30,L(tail5)     /* No halfword: at most one more byte, at offset 4.  */
 164         lhz     7,4(r4)
 165         sth     7,4(r11)
 166         bflr    31              /* Return if there is no final odd byte.  */
 167         lbz     8,6(r4)
 168         stb     8,6(r11)
 169         /* Return original DST pointer (r3 was never modified).  */
 170         blr
 171
 172 /* Handle forward copies of 0~31 bytes.  */
 173         .align  4
 174 L(copy_LT_32):
 175         mr      r11,3           /* r11 = working DST pointer.  */
 176         cmpldi  cr6,r5,8
 177         mtocrf  0x01,r5         /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 178         ble     cr6,L(copy_LE_8)
 179
 180         /* At least 9 bytes to go.  */
 181         neg     8,4
 182         andi.   0,8,3           /* r0 = (-src) & 3 = bytes up to the next word boundary of SRC.  */
 183         cmpldi  cr1,r5,16
 184         beq     L(copy_LT_32_aligned)   /* cr0 eq: SRC is already word-aligned.  */
 185
 186         /* Force 4-byte alignment for SRC.  */
 187         mtocrf  0x01,0          /* CR bits 30 and 31 now select a 2 and a 1 byte copy.  */
 188         subf    r5,0,r5         /* len -= r0.  */
 189 2:
 190         bf      30,1f
 191         lhz     6,0(r4)
 192         addi    r4,r4,2
 193         sth     6,0(r11)
 194         addi    r11,r11,2
 195 1:
 196         bf      31,L(end_4bytes_alignment)
 197         lbz     6,0(r4)
 198         addi    r4,r4,1
 199         stb     6,0(r11)
 200         addi    r11,r11,1
 201
 202         .align  4
 203 L(end_4bytes_alignment):
 204         cmpldi  cr1,r5,16       /* Redo the 16-byte test with the reduced len.  */
 205         mtocrf  0x01,r5         /* Reload CR bits 28-31 from the reduced len.  */
 206
 207 L(copy_LT_32_aligned):
 208         /* At least 6 bytes to go, and SRC is word-aligned.  */
 209         blt     cr1,8f          /* Fewer than 16 bytes left: skip the 16-byte copy.  */
 210
 211         /* Copy 16 bytes as four words, with loads and stores interleaved.  */
 212         lwz     6,0(r4)
 213         lwz     7,4(r4)
 214         stw     6,0(r11)
 215         lwz     8,8(r4)
 216         stw     7,4(r11)
 217         lwz     6,12(r4)
 218         addi    r4,r4,16
 219         stw     8,8(r11)
 220         stw     6,12(r11)
 221         addi    r11,r11,16
 222 8:      /* Copy 8 bytes.  */
 223         bf      28,L(tail4)     /* Bit 28 clear: no 8-byte chunk.  */
 224         lwz     6,0(r4)
 225         lwz     7,4(r4)
 226         addi    r4,r4,8
 227         stw     6,0(r11)
 228         stw     7,4(r11)
 229         addi    r11,r11,8       /* Falls through into the 4~7 byte tail below.  */
 230
 231         .align  4
 232 /* Copies 4~7 bytes.  CR bits 29, 30 and 31 hold the 4, 2 and 1 bits of the remaining length.  */
 233 L(tail4):
 234         bf      29,L(tail2)     /* Fewer than 4 bytes remain.  */
 235         lwz     6,0(r4)
 236         stw     6,0(r11)
 237         bf      30,L(tail5)     /* No halfword: at most one more byte, at offset 4.  */
 238         lhz     7,4(r4)
 239         sth     7,4(r11)
 240         bflr    31              /* Return if there is no final odd byte.  */
 241         lbz     8,6(r4)
 242         stb     8,6(r11)
 243         /* Return original DST pointer.  */
 244         blr
 245
 246         .align  4
 247 /* Copies 2~3 bytes.  */
 248 L(tail2):
 249         bf      30,1f           /* No halfword: at most a single byte remains.  */
 250         lhz     6,0(r4)
 251         sth     6,0(r11)
 252         bflr    31
 253         lbz     7,2(r4)
 254         stb     7,2(r11)
 255         blr
 256
 257         .align  4
 258 L(tail5):                       /* A word was already copied; copy the optional 5th byte.  */
 259         bflr    31
 260         lbz     6,4(r4)
 261         stb     6,4(r11)
 262         blr
 263
 264         .align  4
 265 1:                              /* Copies 0~1 bytes.  */
 266         bflr    31
 267         lbz     6,0(r4)
 268         stb     6,0(r11)
 269         /* Return original DST pointer.  */
 270         blr
 271
 272 /* Handles forward copies of 0~8 bytes.  cr6 holds the compare of len with 8.  */
 273         .align  4
 274 L(copy_LE_8):
 275         bne     cr6,L(tail4)    /* len is below 8: use the 0~7 byte tail code.  */
 276
 277         /* Exactly 8 bytes.  Though we could've used ld/std here, they are still
 278         slow for unaligned cases.  */
 279
 280         lwz     6,0(r4)
 281         lwz     7,4(r4)
 282         stw     6,0(r11)
 283         stw     7,4(r11)
 284         blr
 285
 286
 287 /* Handle copies of 32+ bytes where SRC and DST alignments differ.  DST is
 288    first aligned to a quadword; then aligned quadword loads from SRC are
 289    permuted to realign the data, allowing for aligned DST stores.  */
 290         .align  4
 291 L(copy_GE_32_unaligned):
 292         clrldi  0,0,60        /* Number of bytes until the 1st r11 quadword.  */
 293         srdi    9,r5,4        /* Number of full quadwords remaining.  */
 294
 295         beq     L(copy_GE_32_unaligned_cont)  /* cr0 still holds the dest & 15 test done at entry.  */
 296
 297         /* DST is not quadword aligned, get it aligned.  */
 298
 299         mtocrf  0x01,0        /* CR bits 28-31 select the 8, 4, 2 and 1 byte copies below.  */
 300         subf    r5,0,r5       /* len -= r0.  */
 301
 302         /* Vector instructions work best when proper alignment (16-bytes)
 303         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 304 1:
 305         bf      31,2f
 306         lbz     6,0(r4)
 307         addi    r4,r4,1
 308         stb     6,0(r11)
 309         addi    r11,r11,1
 310 2:
 311         bf      30,4f
 312         lhz     6,0(r4)
 313         addi    r4,r4,2
 314         sth     6,0(r11)
 315         addi    r11,r11,2
 316 4:
 317         bf      29,8f
 318         lwz     6,0(r4)
 319         addi    r4,r4,4
 320         stw     6,0(r11)
 321         addi    r11,r11,4
 322 8:
 323         bf      28,0f
 324         ld      6,0(r4)
 325         addi    r4,r4,8
 326         std     6,0(r11)
 327         addi    r11,r11,8
 328 0:
 329         srdi    9,r5,4        /* Number of full quadwords remaining.  */
 330
 331         /* The proper alignment is present, it is OK to copy the bytes now.  */
 332 L(copy_GE_32_unaligned_cont):
 333
 334         /* Setup two indexes to speed up the indexed vector operations.  */
 335         clrldi  10,r5,60      /* r10 = len & 15 = tail bytes left after the quadword copies.  */
 336         li      6,16          /* Index for 16-bytes offsets.  */
 337         li      7,32          /* Index for 32-bytes offsets.  */
 338         cmpldi  cr1,10,0      /* cr1 eq means there is no tail.  */
 339         srdi    8,r5,5        /* Setup the loop counter.  */
 340         mtocrf  0x01,9        /* CR bit 31 = low bit of the quadword count in r9.  */
 341         cmpldi  cr6,9,1       /* cr6: compare the quadword count with 1.  */
 342 #ifdef __LITTLE_ENDIAN__
 343         lvsr    5,0,r4        /* v5 = permute control vector derived from the SRC address.  */
 344 #else
 345         lvsl    5,0,r4        /* v5 = permute control vector derived from the SRC address.  */
 346 #endif
 347         lvx     3,0,r4        /* v3 = aligned quadword containing the SRC address.  */
 348         li      0,0
 349         bf      31,L(setup_unaligned_loop)    /* Even quadword count: go straight to the 32-byte loop.  */
 350
 351         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 352         lvx     4,r4,6        /* v4 = next aligned quadword.  */
 353 #ifdef __LITTLE_ENDIAN__
 354         vperm   6,4,3,5
 355 #else
 356         vperm   6,3,4,5
 357 #endif
 358         addi    r4,r4,16
 359         stvx    6,0,r11
 360         addi    r11,r11,16
 361         vor     3,4,4         /* v3 = v4: the newest quadword becomes the previous one.  */
 362         clrrdi  0,r4,60       /* r0 = r4 with its low 60 bits cleared.  */
 363
 364 L(setup_unaligned_loop):
 365         mtctr   8
 366         ble     cr6,L(end_unaligned_loop)     /* At most one quadword in total: skip the loop.  */
 367
 368         /* Copy 32 bytes at a time using vector instructions.  */
 369         .align  4
 370 L(unaligned_loop):
 371
 372         /* Note: vr6/vr10 may contain data that was already copied,
 373         but in order to get proper alignment, we may have to copy
 374         some portions again. This is faster than having unaligned
 375         vector instructions though.  */
 376
 377         lvx     4,r4,6        /* v4 = aligned quadword at SRC + 16.  */
 378 #ifdef __LITTLE_ENDIAN__
 379         vperm   6,4,3,5
 380 #else
 381         vperm   6,3,4,5
 382 #endif
 383         lvx     3,r4,7        /* v3 = aligned quadword at SRC + 32, kept for the next iteration.  */
 384 #ifdef __LITTLE_ENDIAN__
 385         vperm   10,3,4,5
 386 #else
 387         vperm   10,4,3,5
 388 #endif
 389         addi    r4,r4,32
 390         stvx    6,0,r11
 391         stvx    10,r11,6
 392         addi    r11,r11,32
 393         bdnz    L(unaligned_loop)
 394
 395         clrrdi  0,r4,60       /* r0 = r4 with its low 60 bits cleared.  */
 396
 397         .align  4
 398 L(end_unaligned_loop):
 399
 400         /* Check for tail bytes.  */
 401         mtocrf  0x01,r5       /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 402         beqlr   cr1           /* Return if len & 15 was 0.  */
 403
 404         add     r4,r4,0       /* r4 += r0.  NOTE(review): r0 is 0 or the top 4 bits of r4 here, so this looks like a no-op for addresses with a zero top nibble - confirm intent.  */
 405
 406         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 407         /* Copy 8 bytes.  */
 408         bf      28,4f
 409         lwz     6,0(r4)
 410         lwz     7,4(r4)
 411         addi    r4,r4,8
 412         stw     6,0(r11)
 413         stw     7,4(r11)
 414         addi    r11,r11,8
 415 4:      /* Copy 4~7 bytes.  */
 416         bf      29,L(tail2)   /* Fewer than 4 bytes remain.  */
 417         lwz     6,0(r4)
 418         stw     6,0(r11)
 419         bf      30,L(tail5)   /* No halfword: at most one more byte, at offset 4.  */
 420         lhz     7,4(r4)
 421         sth     7,4(r11)
 422         bflr    31            /* Return if there is no final odd byte.  */
 423         lbz     8,6(r4)
 424         stb     8,6(r11)
 425         /* Return original DST pointer.  */
 426         blr
 427
 428         /* Start of the backward copy implementation: the algorithm first checks
 429            whether src and dest have the same alignment; if so it aligns both to
 430            16 bytes and copies using aligned vector loads and stores (lvx/stvx).
 431            If not, it aligns dest to 16 bytes and uses aligned vector loads to
 432            read 16 bytes at a time, permutes adjacent quadwords and writes the
 433            result aligned to dest.  */
 434 L(memmove_bwd):
 435         cmpldi  cr1,r5,31
 436         /* Copy is done backwards: update the pointers and check alignment.  */
 437         add     r11,r3,r5           /* r11 = one past the end of DST.  */
 438         add     r4,r4,r5            /* r4 = one past the end of SRC.  */
 439         mr      r0,r11              /* r0 = DST end; its low 4 bits are the bytes above the 16-byte boundary.  */
 440         ble     cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
 441                                            code.  */
 442
 443         andi.   r10,r11,15          /* Check if r11 is aligned to 16 bytes  */
 444         clrldi  r9,r4,60            /* Check if r4 is aligned to 16 bytes  */
 445         cmpld   cr6,r10,r9          /* SRC and DST alignments match?  */
 446
 447         bne     cr6,L(copy_GE_32_unaligned_bwd)
 448         beq     L(aligned_copy_bwd) /* cr0 eq: both end pointers are already 16-byte aligned.  */
 449
 450         mtocrf  0x01,r0             /* CR bits 28-31 = low 4 bits of the DST end address.  */
 451         clrldi  r0,r0,60            /* r0 = number of bytes to copy to reach 16-byte alignment.  */
 452
 453 /* Get the DST and SRC aligned to 16 bytes: CR bits 31, 30, 29 and 28 select a 1, 2, 4 and 8 byte copy.  */
 454 1:
 455         bf      31,2f
 456         lbz     r6,-1(r4)
 457         subi    r4,r4,1
 458         stb     r6,-1(r11)
 459         subi    r11,r11,1
 460 2:
 461         bf      30,4f
 462         lhz     r6,-2(r4)
 463         subi    r4,r4,2
 464         sth     r6,-2(r11)
 465         subi    r11,r11,2
 466 4:
 467         bf      29,8f
 468         lwz     r6,-4(r4)
 469         subi    r4,r4,4
 470         stw     r6,-4(r11)
 471         subi    r11,r11,4
 472 8:
 473         bf      28,16f
 474         ld      r6,-8(r4)
 475         subi    r4,r4,8
 476         std     r6,-8(r11)
 477         subi    r11,r11,8
 478 16:
 479         subf    r5,0,r5             /* len -= r0, the number of bytes just copied.  */
 480
 481 /* Main aligned backward copy loop.  Copies 128 bytes at a time, as two 64-byte halves per iteration.  */
 482 L(aligned_copy_bwd):
 483         li      r6,-16          /* r6..r9 = index offsets -16, -32, -48, -64 below the end pointers.  */
 484         li      r7,-32
 485         li      r8,-48
 486         li      r9,-64
 487         mtocrf  0x02,r5         /* CR bits 24-27 = len bits for 128, 64, 32 and 16 (used by the tail).  */
 488         srdi    r12,r5,7        /* r12 = len / 128 = number of loop iterations.  */
 489         cmpdi   r12,0
 490         beq     L(aligned_tail_bwd)     /* Fewer than 128 bytes left: only the tail is needed.  */
 491         lvx     v6,r4,r6        /* Preload the top 32 bytes for the first iteration.  */
 492         lvx     v7,r4,r7
 493         mtctr   r12
 494         b       L(aligned_128loop_bwd)
 495
 496         .align  4
 497 L(aligned_128head_bwd):
 498         /* for the 2nd + iteration of this loop. */
 499         lvx     v6,r4,r6
 500         lvx     v7,r4,r7
 501 L(aligned_128loop_bwd):
 502         lvx     v8,r4,r8        /* Finish loading the upper 64 bytes.  */
 503         lvx     v9,r4,r9
 504         stvx    v6,r11,r6
 505         subi    r4,r4,64
 506         stvx    v7,r11,r7
 507         stvx    v8,r11,r8
 508         stvx    v9,r11,r9
 509         lvx     v6,r4,r6        /* Lower 64 bytes of this iteration.  */
 510         lvx     v7,r4,r7        /* Index register spelled r7 like its neighbours; same encoding as a bare 7.  */
 511         subi    r11,r11,64
 512         lvx     v8,r4,r8
 513         lvx     v9,r4,r9
 514         subi    r4,r4,64
 515         stvx    v6,r11,r6
 516         stvx    v7,r11,r7
 517         stvx    v8,r11,r8
 518         stvx    v9,r11,r9
 519         subi    r11,r11,64
 520         bdnz    L(aligned_128head_bwd)  /* Decrement CTR and loop while it is non-zero.  */
 521
 522 L(aligned_tail_bwd):            /* Copy the remaining len & 127 bytes downwards, largest chunk first.  */
 523         mtocrf  0x01,r5         /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 524         bf      25,32f          /* Bit 25 clear: no 64-byte chunk.  */
 525         lvx     v6,r4,r6
 526         lvx     v7,r4,r7
 527         lvx     v8,r4,r8
 528         lvx     v9,r4,r9
 529         subi    r4,r4,64
 530         stvx    v6,r11,r6
 531         stvx    v7,r11,r7
 532         stvx    v8,r11,r8
 533         stvx    v9,r11,r9
 534         subi    r11,r11,64
 535 32:
 536         bf      26,16f          /* Bit 26 clear: no 32-byte chunk.  */
 537         lvx     v6,r4,r6
 538         lvx     v7,r4,r7
 539         subi    r4,r4,32
 540         stvx    v6,r11,r6
 541         stvx    v7,r11,r7
 542         subi    r11,r11,32
 543 16:
 544         bf      27,8f           /* Bit 27 clear: no 16-byte chunk.  */
 545         lvx     v6,r4,r6
 546         subi    r4,r4,16
 547         stvx    v6,r11,r6
 548         subi    r11,r11,16
 549 8:
 550         bf      28,4f           /* Bit 28 clear: no 8-byte chunk.  The load below reuses r6, which is no longer needed as an index.  */
 551         ld      r6,-8(r4)
 552         subi    r4,r4,8
 553         std     r6,-8(r11)
 554         subi    r11,r11,8
 555 4:      /* Copies 4~7 bytes.  */
 556         bf      29,L(tail2_bwd) /* Fewer than 4 bytes remain.  */
 557         lwz     r6,-4(r4)
 558         stw     r6,-4(r11)
 559         bf      30,L(tail5_bwd) /* No halfword: at most one more byte, at offset -5.  */
 560         lhz     r7,-6(r4)
 561         sth     r7,-6(r11)
 562         bflr    31              /* Return if there is no final odd byte.  */
 563         lbz     r8,-7(r4)
 564         stb     r8,-7(r11)
 565         /* Return original DST pointer.  */
 566         blr
 567
 568 /* Handle backward copies of 0~31 bytes.  r4 and r11 point one past the end of SRC and DST.  */
 569         .align  4
 570 L(copy_LT_32_bwd):
 571         cmpldi  cr6,r5,8
 572         mtocrf  0x01,r5         /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 573         ble     cr6,L(copy_LE_8_bwd)
 574
 575         /* At least 9 bytes to go.  */
 576         /* Copying downwards, so the bytes to peel off are the low bits of the SRC end itself.  */
 577         andi.   r0,r4,3         /* r0 = SRC end & 3 = bytes above the previous word boundary.  */
 578         cmpldi  cr1,r5,16
 579         beq     L(copy_LT_32_aligned_bwd)       /* cr0 eq: SRC end is already word-aligned.  */
 580
 581         /* Force 4-byte alignment for SRC.  */
 582         mtocrf  0x01,0          /* CR bits 30 and 31 now select a 2 and a 1 byte copy.  */
 583         subf    r5,0,r5         /* len -= r0.  */
 584 2:
 585         bf      30,1f
 586         lhz     r6,-2(r4)
 587         subi    r4,r4,2
 588         sth     r6,-2(r11)
 589         subi    r11,r11,2
 590 1:
 591         bf      31,L(end_4bytes_alignment_bwd)
 592         lbz     6,-1(r4)
 593         subi    r4,r4,1
 594         stb     6,-1(r11)
 595         subi    r11,r11,1
 596
 597         .align  4
 598 L(end_4bytes_alignment_bwd):
 599         cmpldi  cr1,r5,16       /* Redo the 16-byte test with the reduced len.  */
 600         mtocrf  0x01,r5         /* Reload CR bits 28-31 from the reduced len.  */
 601
 602 L(copy_LT_32_aligned_bwd):
 603         /* At least 6 bytes to go, and SRC is word-aligned.  */
 604         blt     cr1,8f          /* Fewer than 16 bytes left: skip the 16-byte copy.  */
 605
 606         /* Copy 16 bytes as four words, highest address first.  */
 607         lwz     r6,-4(r4)
 608         lwz     r7,-8(r4)
 609         stw     r6,-4(r11)
 610         lwz     r8,-12(r4)
 611         stw     r7,-8(r11)
 612         lwz     r6,-16(r4)
 613         subi    r4,r4,16
 614         stw     r8,-12(r11)
 615         stw     r6,-16(r11)
 616         subi    r11,r11,16
 617 8:      /* Copy 8 bytes.  */
 618         bf      28,L(tail4_bwd) /* Bit 28 clear: no 8-byte chunk.  */
 619         lwz     r6,-4(r4)
 620         lwz     r7,-8(r4)
 621         subi    r4,r4,8
 622         stw     r6,-4(r11)
 623         stw     r7,-8(r11)
 624         subi    r11,r11,8       /* Falls through into the 4~7 byte tail below.  */
 625
 626         .align  4
 627 /* Copies 4~7 bytes downwards.  CR bits 29, 30 and 31 hold the 4, 2 and 1 bits of the remaining length.  */
 628 L(tail4_bwd):
 629         bf      29,L(tail2_bwd) /* Fewer than 4 bytes remain.  */
 630         lwz     6,-4(r4)
 631         stw     6,-4(r11)
 632         bf      30,L(tail5_bwd) /* No halfword: at most one more byte, at offset -5.  */
 633         lhz     7,-6(r4)
 634         sth     7,-6(r11)
 635         bflr    31              /* Return if there is no final odd byte.  */
 636         lbz     8,-7(r4)
 637         stb     8,-7(r11)
 638         /* Return original DST pointer.  */
 639         blr
 640
 641         .align  4
 642 /* Copies 2~3 bytes.  */
 643 L(tail2_bwd):
 644         bf      30,1f           /* No halfword: at most a single byte remains.  */
 645         lhz     6,-2(r4)
 646         sth     6,-2(r11)
 647         bflr    31
 648         lbz     7,-3(r4)
 649         stb     7,-3(r11)
 650         blr
 651
 652         .align  4
 653 L(tail5_bwd):                   /* A word was already copied; copy the optional 5th byte.  */
 654         bflr    31
 655         lbz     6,-5(r4)
 656         stb     6,-5(r11)
 657         blr
 658
 659         .align  4
 660 1:                              /* Copies 0~1 bytes.  */
 661         bflr    31
 662         lbz     6,-1(r4)
 663         stb     6,-1(r11)
 664         /* Return original DST pointer.  */
 665         blr
 666
 667
 668 /* Handles backward copies of 0~8 bytes.  cr6 holds the compare of len with 8.  */
 669         .align  4
 670 L(copy_LE_8_bwd):
 671         bne     cr6,L(tail4_bwd)        /* len is below 8: use the 0~7 byte tail code.  */
 672
 673         /* Exactly 8 bytes.  Though we could've used ld/std here, they are still
 674            slow for unaligned cases.  Both words are loaded before either store.  */
 675         lwz     6,-8(r4)
 676         lwz     7,-4(r4)
 677         stw     6,-8(r11)
 678         stw     7,-4(r11)
 679         blr
 680
 681
 682 /* Handle backward copies of 32+ bytes where SRC and DST alignments differ.
 683    The DST end is first aligned to a quadword; then aligned quadword loads
 684    from SRC are permuted to realign the data, allowing aligned DST stores.  */
 685         .align  4
 686 L(copy_GE_32_unaligned_bwd):
 687         andi.   r10,r11,15      /* Check alignment of DST against 16 bytes..  */
 688         srdi    r9,r5,4         /* Number of full quadwords remaining.  */
 689
 690         beq     L(copy_GE_32_unaligned_cont_bwd)
 691
 692         /* DST is not quadword aligned and r10 holds the address masked to
 693            compare alignments, i.e. the number of bytes to copy first.  */
 694         mtocrf  0x01,r10        /* CR bits 28-31 select the 8, 4, 2 and 1 byte copies below.  */
 695         subf    r5,r10,r5       /* len -= r10.  */
 696
 697         /* Vector instructions work best when proper alignment (16-bytes)
 698         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 699 1:
 700         bf      31,2f
 701         lbz     r6,-1(r4)
 702         subi    r4,r4,1
 703         stb     r6,-1(r11)
 704         subi    r11,r11,1
 705 2:
 706         bf      30,4f
 707         lhz     r6,-2(r4)
 708         subi    r4,r4,2
 709         sth     r6,-2(r11)
 710         subi    r11,r11,2
 711 4:
 712         bf      29,8f
 713         lwz     r6,-4(r4)
 714         subi    r4,r4,4
 715         stw     r6,-4(r11)
 716         subi    r11,r11,4
 717 8:
 718         bf      28,0f
 719         ld      r6,-8(r4)
 720         subi    r4,r4,8
 721         std     r6,-8(r11)
 722         subi    r11,r11,8
 723 0:
 724         srdi    r9,r5,4       /* Number of full quadwords remaining.  */
 725
 726         /* The proper alignment is present, it is OK to copy the bytes now.  */
 727 L(copy_GE_32_unaligned_cont_bwd):
 728
 729         /* Setup two indexes to speed up the indexed vector operations.  */
 730         clrldi  r10,r5,60     /* r10 = len & 15 = tail bytes left after the quadword copies.  */
 731         li      r6,-16        /* Index for 16-bytes offsets.  */
 732         li      r7,-32        /* Index for 32-bytes offsets.  */
 733         cmpldi  cr1,10,0      /* cr1 eq means there is no tail.  */
 734         srdi    r8,r5,5       /* Setup the loop counter.  */
 735         mtocrf  0x01,9        /* CR bit 31 = low bit of the quadword count in r9.  */
 736         cmpldi  cr6,r9,1      /* cr6: compare the quadword count with 1.  */
 737 #ifdef __LITTLE_ENDIAN__
 738         lvsr    v5,r0,r4      /* v5 = permute control vector; a 0 in the RA field means base 0, not the contents of r0.  */
 739 #else
 740         lvsl    v5,r0,r4      /* v5 = permute control vector; a 0 in the RA field means base 0, not the contents of r0.  */
 741 #endif
 742         lvx     v3,0,r4       /* v3 = aligned quadword containing the SRC end address.  */
 743         li      r0,0
 744         bf      31,L(setup_unaligned_loop_bwd)  /* Even quadword count: go straight to the 32-byte loop.  */
 745
 746         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 747         lvx     v4,r4,r6      /* v4 = aligned quadword 16 bytes below.  */
 748 #ifdef __LITTLE_ENDIAN__
 749         vperm   v6,v3,v4,v5
 750 #else
 751         vperm   v6,v4,v3,v5
 752 #endif
 753         subi    r4,r4,16
 754         stvx    v6,r11,r6
 755         subi    r11,r11,16
 756         vor     v3,v4,v4      /* v3 = v4: the newest quadword becomes the previous one.  */
 757         clrrdi  r0,r4,60      /* r0 = r4 with its low 60 bits cleared.  */
 758
 759 L(setup_unaligned_loop_bwd):
 760         mtctr   r8
 761         ble     cr6,L(end_unaligned_loop_bwd)   /* At most one quadword in total: skip the loop.  */
 762
 763         /* Copy 32 bytes at a time using vector instructions.  */
 764         .align  4
 765 L(unaligned_loop_bwd):
 766
 767         /* Note: vr6/vr10 may contain data that was already copied,
 768         but in order to get proper alignment, we may have to copy
 769         some portions again. This is faster than having unaligned
 770         vector instructions though.  */
 771
 772         lvx     v4,r4,r6      /* v4 = aligned quadword at SRC - 16.  */
 773 #ifdef __LITTLE_ENDIAN__
 774         vperm   v6,v3,v4,v5
 775 #else
 776         vperm   v6,v4,v3,v5
 777 #endif
 778         lvx     v3,r4,r7      /* v3 = aligned quadword at SRC - 32, kept for the next iteration.  */
 779 #ifdef __LITTLE_ENDIAN__
 780         vperm   v10,v4,v3,v5
 781 #else
 782         vperm   v10,v3,v4,v5
 783 #endif
 784         subi    r4,r4,32
 785         stvx    v6,r11,r6
 786         stvx    v10,r11,r7
 787         subi    r11,r11,32
 788         bdnz    L(unaligned_loop_bwd)
 789
 790         clrrdi  r0,r4,60      /* r0 = r4 with its low 60 bits cleared.  */
 791
 792         .align  4
 793 L(end_unaligned_loop_bwd):
 794
 795         /* Check for tail bytes.  */
 796         mtocrf  0x01,r5       /* CR bits 28-31 = len bits for 8, 4, 2 and 1.  */
 797         beqlr   cr1           /* Return if len & 15 was 0.  */
 798
 799         add     r4,r4,0       /* r4 += r0.  NOTE(review): r0 is 0 or the top 4 bits of r4 here, so this looks like a no-op for addresses with a zero top nibble - confirm intent.  */
 800
 801         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 802         /* Copy 8 bytes.  */
 803         bf      28,4f
 804         lwz     r6,-4(r4)
 805         lwz     r7,-8(r4)
 806         subi    r4,r4,8
 807         stw     r6,-4(r11)
 808         stw     r7,-8(r11)
 809         subi    r11,r11,8
 810 4:      /* Copy 4~7 bytes.  */
 811         bf      29,L(tail2_bwd)       /* Fewer than 4 bytes remain.  */
 812         lwz     r6,-4(r4)
 813         stw     r6,-4(r11)
 814         bf      30,L(tail5_bwd)       /* No halfword: at most one more byte, at offset -5.  */
 815         lhz     r7,-6(r4)
 816         sth     r7,-6(r11)
 817         bflr    31            /* Return if there is no final odd byte.  */
 818         lbz     r8,-7(r4)
 819         stb     r8,-7(r11)
 820         /* Return original DST pointer.  */
 821         blr
 822 END_GEN_TB (MEMMOVE, TB_TOCLESS)
 823 libc_hidden_builtin_def (memmove)