sysdeps/powerpc/powerpc32/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32/POWER7.
   2    Copyright (C) 2010-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] memcpy (void *dst [r3], const void *src [r4], size_t len [r5]);
  22    Returns 'dst'.  */
  23
  24         .machine  power7
  25 EALIGN (memcpy, 5, 0)
  26         CALL_MCOUNT
  27         /* Entry: build the frame, then dispatch on LEN and on the relative alignment of SRC and DST.  */
  28         stwu    1,-32(1)      /* Allocate a 32-byte stack frame.  */
  29         cfi_adjust_cfa_offset(32)
  30         stw     30,20(1)      /* Save r30; it is used to keep the original DST.  */
  31         cfi_offset(30,(20-32))
  32         stw     31,24(1)      /* Save r31; the 32+ byte paths keep the remaining length in it.  */
  33         mr      30,3          /* r30 = DST, moved back into r3 before every blr.  */
  34         cmplwi  cr1,5,31      /* cr1 = LEN compared with 31.  */
  35         neg     0,3           /* r0 = -DST; its low bits are the byte count up to the next aligned DST address.  */
  36         cfi_offset(31,-8)
  37         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  38                                     code.  */
  39
  40         andi.   11,3,15       /* Check alignment of DST.  */
  41         clrlwi  10,4,28       /* Check alignment of SRC.  */
  42         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  43         mr      12,4          /* r12 = working SRC pointer.  */
  44         mr      31,5          /* r31 = bytes remaining.  */
  45         bne     cr6,L(copy_GE_32_unaligned)
  46
  47         srwi    9,5,3         /* Number of full doublewords remaining.  */
  48
  49         beq     L(copy_GE_32_aligned_cont)  /* cr0 from the andi. above: DST is already quadword-aligned.  */
  50
  51         clrlwi  0,0,29        /* r0 = (-DST) & 7: bytes needed to reach 8-byte alignment.  */
  52         mtcrf   0x01,0        /* Low 4 bits of r0 -> cr7, tested by the bf instructions below.  */
  53         subf    31,0,5        /* r31 = LEN - r0.  */
  54
  55         /* Get DST aligned to 8 bytes; on this path SRC has the same alignment as DST.  */
  56
  57 1:      bf      31,2f         /* Copy 1 byte.  */
  58         lbz     6,0(12)
  59         addi    12,12,1
  60         stb     6,0(3)
  61         addi    3,3,1
  62 2:      bf      30,4f         /* Copy 2 bytes.  */
  63         lhz     6,0(12)
  64         addi    12,12,2
  65         sth     6,0(3)
  66         addi    3,3,2
  67 4:      bf      29,0f         /* Copy 4 bytes.  */
  68         lwz     6,0(12)
  69         addi    12,12,4
  70         stw     6,0(3)
  71         addi    3,3,4
  72 0:
  73         clrlwi  10,12,29      /* Check alignment of SRC again.  */
  74         srwi    9,31,3        /* Number of full doublewords remaining.  */
  75
  76 L(copy_GE_32_aligned_cont):
  77         /* Here r31 = bytes left, r9 = doublewords left, r12/r3 = SRC/DST, both at least 8-byte aligned.  */
  78         clrlwi  11,31,29      /* r11 = r31 & 7: tail bytes left after the doublewords.  */
  79         mtcrf   0x01,9        /* Low 4 bits of the doubleword count -> cr7.  */
  80
  81         srwi    8,31,5        /* r8 = number of 32-byte lumps.  */
  82         cmplwi  cr1,9,4       /* cr1: fewer than 4 doublewords in total?  */
  83         cmplwi  cr6,11,0      /* cr6: any tail bytes?  Tested again at label 3.  */
  84         mr      11,12         /* r11 = SRC read pointer for the copy code below.  */
  85
  86         /* Copy 1~3 doublewords so the main loop starts
  87         at a multiple of 32 bytes.  */
  88
  89         bf      30,1f         /* Doubleword count mod 4 is 0 or 1: handled at label 1.  */
  90         lfd     6,0(12)
  91         lfd     7,8(12)
  92         addi    11,12,16
  93         mtctr   8
  94         stfd    6,0(3)
  95         stfd    7,8(3)
  96         addi    10,3,16       /* r10 = DST write pointer for the copy code below.  */
  97         bf      31,4f         /* No third doubleword to copy.  */
  98         lfd     0,16(12)
  99         stfd    0,16(3)
 100         blt     cr1,3f        /* Fewer than 4 doublewords in total: go straight to the tail.  */
 101         addi    11,12,24      /* Step both pointers past the third doubleword.  */
 102         addi    10,3,24
 103         b       4f
 104
 105         .align  4
 106 1:      /* Copy 1 doubleword and set the counter.  */
 107         mr      10,3          /* r10 = DST write pointer for the copy code below.  */
 108         mtctr   8
 109         bf      31,4f         /* No odd doubleword to copy.  */
 110         lfd     6,0(12)
 111         addi    11,12,8
 112         stfd    6,0(3)
 113         addi    10,3,8
 114
 115 L(aligned_copy):
 116         /* Main aligned copy loop. Copies up to 128-bytes at a time. */
 117         .align  4
 118 4:
 119         /* Check for any 32-byte or 64-byte lumps that are outside of a
 120            nice 128-byte range.  R8 contains the number of 32-byte
 121            lumps, so drop this into the CR, and use the SO/EQ bits to help
 122            handle the 32- or 64- byte lumps.  Then handle the rest with an
 123            unrolled 128-bytes-at-a-time copy loop. */
 124         mtocrf  1,8           /* cr7 = low 4 bits of r8.  */
 125         li      6,16    # 16() index
 126         li      7,32    # 32() index
 127         li      8,48    # 48() index
 128
 129 L(aligned_32byte):
 130         /* If the SO bit (indicating a 32-byte lump) is not set, move along. */
 131         bns     cr7,L(aligned_64byte)
 132         lxvd2x  6,0,11
 133         lxvd2x  7,11,6
 134         addi    11,11,32
 135         stxvd2x 6,0,10
 136         stxvd2x 7,10,6
 137         addi    10,10,32
 138
 139 L(aligned_64byte):
 140         /* If the EQ bit (indicating a 64-byte lump) is not set, move along. */
 141         bne     cr7,L(aligned_128setup)
 142         lxvd2x  6,0,11
 143         lxvd2x  7,11,6
 144         lxvd2x  8,11,7
 145         lxvd2x  9,11,8
 146         addi    11,11,64
 147         stxvd2x 6,0,10
 148         stxvd2x 7,10,6
 149         stxvd2x 8,10,7
 150         stxvd2x 9,10,8
 151         addi    10,10,64
 152
 153 L(aligned_128setup):
 154         /* Set up for the 128-byte at a time copy loop.  */
 155         srwi    8,31,7        /* r8 = number of 128-byte blocks.  */
 156         cmpwi   8,0     # Any 4x lumps left?
 157         beq     3f      # if not, move along.
 158         lxvd2x  6,0,11        /* Preload the first 32 bytes of the block.  */
 159         lxvd2x  7,11,6
 160         mtctr   8       # otherwise, load the ctr and begin.
 161         li      8,48    # 48() index
 162         b       L(aligned_128loop)
 163
 164 L(aligned_128head):
 165         /* For the 2nd + iteration of this loop. */
 166         lxvd2x  6,0,11
 167         lxvd2x  7,11,6
 168 L(aligned_128loop):
 169         lxvd2x  8,11,7        /* Bytes 32~63 of the block.  */
 170         lxvd2x  9,11,8
 171         stxvd2x 6,0,10
 172         addi    11,11,64
 173         stxvd2x 7,10,6
 174         stxvd2x 8,10,7
 175         stxvd2x 9,10,8
 176         lxvd2x  6,0,11        /* Bytes 64~127 of the block.  */
 177         lxvd2x  7,11,6
 178         addi    10,10,64
 179         lxvd2x  8,11,7
 180         lxvd2x  9,11,8
 181         addi    11,11,64
 182         stxvd2x 6,0,10
 183         stxvd2x 7,10,6
 184         stxvd2x 8,10,7
 185         stxvd2x 9,10,8
 186         addi    10,10,64
 187         bdnz    L(aligned_128head)
 188
 189 3:
 190         /* Check for tail bytes.  */
 191         clrrwi  0,31,3        /* r0 = r31 rounded down to a multiple of 8.  */
 192         mtcrf   0x01,31       /* Low 4 bits of the remaining length -> cr7.  */
 193         beq     cr6,0f        /* cr6 was set from (r31 & 7) == 0: no tail, return.  */
 194
 195 .L9:
 196         add     3,3,0         /* Advance DST and SRC past the doublewords counted in r0.  */
 197         add     12,12,0
 198
 199         /*  At this point we have a tail of 0-7 bytes and we know that the
 200         destination is doubleword-aligned.  */
 201 4:      /* Copy 4 bytes.  */
 202         bf      29,2f
 203
 204         lwz     6,0(12)
 205         addi    12,12,4
 206         stw     6,0(3)
 207         addi    3,3,4
 208 2:      /* Copy 2 bytes.  */
 209         bf      30,1f
 210
 211         lhz     6,0(12)
 212         addi    12,12,2
 213         sth     6,0(3)
 214         addi    3,3,2
 215 1:      /* Copy 1 byte.  */
 216         bf      31,0f
 217
 218         lbz     6,0(12)
 219         stb     6,0(3)
 220 0:      /* Return original DST pointer.  */
 221         mr      3,30
 222         lwz     30,20(1)      /* Restore the saved r30 and r31.  */
 223         lwz     31,24(1)
 224         addi    1,1,32        /* Release the 32-byte stack frame.  */
 225         blr
 226
 227         /* Handle copies of 0~31 bytes.  */
 228         .align  4
 229 L(copy_LT_32):
 230         cmplwi  cr6,5,8       /* cr6 = LEN compared with 8.  */
 231         mr      12,4          /* r12 = working SRC pointer.  */
 232         mtcrf   0x01,5        /* Low 4 bits of LEN -> cr7.  */
 233         ble     cr6,L(copy_LE_8)
 234
 235         /* At least 9 bytes to go.  */
 236         neg     8,4
 237         clrrwi  11,4,2        /* r11 = SRC rounded down to a word boundary.  */
 238         andi.   0,8,3         /* r0 = (-SRC) & 3: bytes needed to word-align SRC; sets cr0.  */
 239         cmplwi  cr1,5,16      /* cr1: fewer than 16 bytes?  */
 240         mr      10,5          /* r10 = bytes remaining.  */
 241         beq     L(copy_LT_32_aligned)  /* r0 == 0: SRC is already word-aligned.  */
 242
 243         /* Force 4-bytes alignment for SRC.  */
 244         mtocrf  0x01,0        /* Low 4 bits of r0 -> cr7.  */
 245         subf    10,0,5        /* r10 = LEN - r0.  */
 246 2:      bf      30,1f         /* Copy 2 bytes.  */
 247
 248         lhz     6,0(12)
 249         addi    12,12,2
 250         sth     6,0(3)
 251         addi    3,3,2
 252 1:      bf      31,L(end_4bytes_alignment)  /* Copy 1 byte.  */
 253
 254         lbz     6,0(12)
 255         addi    12,12,1
 256         stb     6,0(3)
 257         addi    3,3,1
 258
 259         .align  4
 260 L(end_4bytes_alignment):
 261         cmplwi  cr1,10,16     /* Redo the 16-byte test for the reduced length.  */
 262         mtcrf   0x01,10       /* Low 4 bits of the reduced length -> cr7.  */
 263
 264 L(copy_LT_32_aligned):
 265         /* At least 6 bytes to go, and SRC is word-aligned.  */
 266         blt     cr1,8f
 267
 268         /* Copy 16 bytes.  */
 269         lwz     6,0(12)
 270         lwz     7,4(12)
 271         stw     6,0(3)
 272         lwz     8,8(12)
 273         stw     7,4(3)
 274         lwz     6,12(12)
 275         addi    12,12,16
 276         stw     8,8(3)
 277         stw     6,12(3)
 278         addi    3,3,16
 279 8:      /* Copy 8 bytes.  */
 280         bf      28,4f
 281
 282         lwz     6,0(12)
 283         lwz     7,4(12)
 284         addi    12,12,8
 285         stw     6,0(3)
 286         stw     7,4(3)
 287         addi    3,3,8
 288 4:      /* Copy 4 bytes.  */
 289         bf      29,2f
 290
 291         lwz     6,0(12)
 292         addi    12,12,4
 293         stw     6,0(3)
 294         addi    3,3,4
 295 2:      /* Copy 2-3 bytes.  */
 296         bf      30,1f
 297
 298         lhz     6,0(12)
 299         sth     6,0(3)
 300         bf      31,0f
 301         lbz     7,2(12)
 302         stb     7,2(3)
 303
 304         /* Return original DST pointer.  */
 305         mr      3,30
 306         lwz     30,20(1)      /* r31 is not reloaded: the short-copy paths never write it.  */
 307         addi    1,1,32
 308         blr
 309
 310         .align  4
 311 1:      /* Copy 1 byte.  */
 312         bf      31,0f
 313
 314         lbz     6,0(12)
 315         stb     6,0(3)
 316 0:      /* Return original DST pointer.  */
 317         mr      3,30
 318         lwz     30,20(1)
 319         addi    1,1,32
 320         blr
 321
 322         /* Handles copies of 0~8 bytes.  */
 323         .align  4
 324 L(copy_LE_8):
 325         bne     cr6,4f        /* cr6 holds LEN compared with 8: not exactly 8 bytes.  */
 326
 327         /* Though we could've used lfd/stfd here, they are still
 328         slow for unaligned cases.  */
 329
 330         lwz     6,0(4)
 331         lwz     7,4(4)
 332         stw     6,0(3)
 333         stw     7,4(3)
 334
 335         /* Return original DST pointer.  */
 336         mr      3,30
 337         lwz     30,20(1)      /* r31 is not reloaded: the short-copy paths never write it.  */
 338         addi    1,1,32
 339         blr
 340
 341         .align  4
 342 4:      /* Copies 4~7 bytes.  */
 343         bf      29,2b         /* Fewer than 4 bytes: reuse the 2-3 byte code at the preceding label 2.  */
 344
 345         lwz     6,0(4)
 346         stw     6,0(3)
 347         bf      30,5f
 348         lhz     7,4(4)
 349         sth     7,4(3)
 350         bf      31,0f
 351         lbz     8,6(4)
 352         stb     8,6(3)
 353
 354         /* Return original DST pointer.  */
 355         mr      3,30
 356         lwz     30,20(1)
 357         addi    1,1,32
 358         blr
 359
 360         .align  4
 361 5:      /* Copy 1 byte.  */
 362         bf      31,0f
 363
 364         lbz     6,4(4)
 365         stb     6,4(3)
 366
 367 0:      /* Return original DST pointer.  */
 368         mr      3,30
 369         lwz     30,20(1)
 370         addi    1,1,32
 371         blr
 372
 373         /* Handle copies of 32+ bytes where the SRC and DST alignments differ.
 374         DST is aligned to a quadword first; SRC is then read with aligned
 375         quadword loads, shifted to realign the data, allowing for aligned DST stores.  */
 376         .align  4
 377 L(copy_GE_32_unaligned):
 378         andi.   11,3,15       /* Check alignment of DST.  */
 379         clrlwi  0,0,28        /* Number of bytes until the 1st
 380                               quadword of DST.  */
 381         srwi    9,5,4         /* Number of full quadwords remaining.  */
 382
 383         beq    L(copy_GE_32_unaligned_cont)  /* cr0 from the andi. above: DST is already quadword-aligned.  */
 384
 385         /* DST is not quadword aligned, get it aligned.  */
 386
 387         mtcrf   0x01,0        /* Low 4 bits of r0 -> cr7, tested by the bf instructions below.  */
 388         subf    31,0,5        /* r31 = LEN - r0.  */
 389
 390         /* Vector instructions work best when proper alignment (16-bytes)
 391         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 392 1:      /* Copy 1 byte.  */
 393         bf      31,2f
 394
 395         lbz     6,0(12)
 396         addi    12,12,1
 397         stb     6,0(3)
 398         addi    3,3,1
 399 2:      /* Copy 2 bytes.  */
 400         bf          30,4f
 401
 402         lhz     6,0(12)
 403         addi    12,12,2
 404         sth     6,0(3)
 405         addi    3,3,2
 406 4:      /* Copy 4 bytes.  */
 407         bf      29,8f
 408
 409         lwz     6,0(12)
 410         addi    12,12,4
 411         stw     6,0(3)
 412         addi    3,3,4
 413 8:      /* Copy 8 bytes.  */
 414         bf      28,0f
 415
 416         lfd     6,0(12)
 417         addi    12,12,8
 418         stfd    6,0(3)
 419         addi    3,3,8
 420 0:
 421         clrlwi  10,12,28      /* Check alignment of SRC.  */
 422         srwi    9,31,4        /* Number of full quadwords remaining.  */
 423
 424         /* The proper alignment is present, it is OK to copy the bytes now.  */
 425 L(copy_GE_32_unaligned_cont):
 426
 427         /* Setup two indexes to speed up the indexed vector operations.  */
 428         clrlwi  11,31,28      /* r11 = r31 & 15: tail bytes left after the quadwords.  */
 429         li      6,16          /* Index for 16-bytes offsets.  */
 430         li      7,32          /* Index for 32-bytes offsets.  */
 431         cmplwi  cr1,11,0      /* cr1: any tail bytes?  Tested after the loop.  */
 432         srwi    8,31,5        /* Setup the loop counter.  */
 433         mr      10,3          /* r10 = DST write pointer.  */
 434         mr      11,12         /* r11 = SRC read pointer.  */
 435         mtcrf   0x01,9        /* Low 4 bits of the quadword count -> cr7.  */
 436         cmplwi  cr6,9,1       /* cr6: at most one quadword in total?  */
 437 #ifdef __LITTLE_ENDIAN__
 438         lvsr    5,0,12
 439 #else
 440         lvsl    5,0,12
 441 #endif
 442         lvx     3,0,12        /* vr3 = first SRC quadword; vr5 (lvsl/lvsr above) is the vperm control.  */
 443         bf      31,L(setup_unaligned_loop)  /* Even quadword count: no extra 16-byte copy needed.  */
 444
 445         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 446         lvx     4,12,6
 447 #ifdef __LITTLE_ENDIAN__
 448         vperm   6,4,3,5
 449 #else
 450         vperm   6,3,4,5
 451 #endif
 452         addi    11,12,16
 453         addi    10,3,16
 454         stvx    6,0,3
 455         vor     3,4,4         /* vr3 = vr4: the last loaded quadword feeds the next vperm.  */
 456
 457 L(setup_unaligned_loop):
 458         mtctr   8             /* CTR = number of 32-byte iterations.  */
 459         ble     cr6,L(end_unaligned_loop)  /* At most one quadword in total: skip the loop.  */
 460
 461         /* Copy 32 bytes at a time using vector instructions.  */
 462         .align  4
 463 L(unaligned_loop):
 464
 465         /* Note: vr6/vr10 may contain data that was already copied,
 466         but in order to get proper alignment, we may have to copy
 467         some portions again. This is faster than having unaligned
 468         vector instructions though.  */
 469
 470         lvx     4,11,6        /* vr4 = r11+16.  */
 471 #ifdef __LITTLE_ENDIAN__
 472         vperm   6,4,3,5
 473 #else
 474         vperm   6,3,4,5
 475 #endif
 476         lvx     3,11,7        /* vr3 = r11+32.  */
 477 #ifdef __LITTLE_ENDIAN__
 478         vperm   10,3,4,5
 479 #else
 480         vperm   10,4,3,5
 481 #endif
 482         addi    11,11,32
 483         stvx    6,0,10
 484         stvx    10,10,6
 485         addi    10,10,32
 486
 487         bdnz    L(unaligned_loop)
 488
 489         .align  4
 490 L(end_unaligned_loop):
 491
 492         /* Check for tail bytes.  */
 493         clrrwi  0,31,4        /* r0 = r31 rounded down to a multiple of 16.  */
 494         mtcrf   0x01,31       /* Low 4 bits of the remaining length -> cr7.  */
 495         beq     cr1,0f        /* cr1 was set from (r31 & 15) == 0: no tail, return.  */
 496
 497         add     3,3,0         /* Advance DST and SRC past the quadwords counted in r0.  */
 498         add     12,12,0
 499
 500         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 501 8:      /* Copy 8 bytes.  */
 502         bf      28,4f
 503
 504         lwz     6,0(12)
 505         lwz     7,4(12)
 506         addi    12,12,8
 507         stw     6,0(3)
 508         stw     7,4(3)
 509         addi    3,3,8
 510 4:      /* Copy 4 bytes.  */
 511         bf      29,2f
 512
 513         lwz     6,0(12)
 514         addi    12,12,4
 515         stw     6,0(3)
 516         addi    3,3,4
 517 2:      /* Copy 2~3 bytes.  */
 518         bf      30,1f
 519
 520         lhz     6,0(12)
 521         addi    12,12,2
 522         sth     6,0(3)
 523         addi    3,3,2
 524 1:      /* Copy 1 byte.  */
 525         bf      31,0f
 526
 527         lbz     6,0(12)
 528         stb     6,0(3)
 529 0:      /* Return original DST pointer.  */
 530         mr      3,30
 531         lwz     30,20(1)      /* Restore the saved r30 and r31.  */
 532         lwz     31,24(1)
 533         addi    1,1,32        /* Release the 32-byte stack frame.  */
 534         blr
 535
 536 END (memcpy)
 537 libc_hidden_builtin_def (memcpy)