sysdeps/powerpc/powerpc32/power7/mempcpy.S

   1 /* Optimized mempcpy implementation for POWER7.
   2    Copyright (C) 2010-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  22         Returns 'dst' + 'len'; r30 holds the original 'dst' for that sum.  */
  23
  24         .machine  power7
  25 EALIGN (__mempcpy, 5, 0)
  26         CALL_MCOUNT
  27         /* Open a 32-byte frame: r30 is saved at 20(r1), r31 at 24(r1).  */
  28         stwu    1,-32(1)
  29         cfi_adjust_cfa_offset(32)
  30         stw     30,20(1)
  31         cfi_offset(30,(20-32))
  32         stw     31,24(1)
  33         mr      30,3          /* r30 = original DST for the result.  */
  34         cmplwi  cr1,5,31      /* cr1 = LEN compared with 31.  */
  35         neg     0,3           /* r0 = -DST; low bits = bytes to align.  */
  36         cfi_offset(31,-8)
  37         ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
  38                                         code.  */
  39
  40         andi.   11,3,7        /* Check alignment of DST.  */
  41         clrlwi  10,4,29       /* Check alignment of SRC.  */
  42         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  43         mr      12,4          /* r12 = running SRC pointer.  */
  44         mr      31,5          /* r31 = bytes left (adjusted below).  */
  45         bne     cr6,L(copy_GE_32_unaligned)  /* Alignments differ.  */
  46
  47         srwi    9,5,3         /* Number of full doublewords remaining.  */
  48
  49         beq     L(copy_GE_32_aligned_cont)  /* cr0 from andi.: DST aligned.  */
  50
  51         clrlwi  0,0,29        /* r0 = (-DST) & 7, bytes to next doubleword.  */
  52         mtcrf   0x01,0        /* cr7 = low four bits of r0 for bf below.  */
  53         subf    31,0,5        /* r31 = LEN - r0.  */
  54
  55         /* Get DST (and SRC, whose low three bits match) aligned to 8 bytes.  */
  56
  57 1:      bf      31,2f         /* Copy 1 byte if (r0 & 1).  */
  58         lbz     6,0(12)
  59         addi    12,12,1
  60         stb     6,0(3)
  61         addi    3,3,1
  62 2:      bf      30,4f         /* Copy 2 bytes if (r0 & 2).  */
  63         lhz     6,0(12)
  64         addi    12,12,2
  65         sth     6,0(3)
  66         addi    3,3,2
  67 4:      bf      29,0f         /* Copy 4 bytes if (r0 & 4).  */
  68         lwz     6,0(12)
  69         addi    12,12,4
  70         stw     6,0(3)
  71         addi    3,3,4
  72 0:
  73         clrlwi  10,12,29      /* Check alignment of SRC again (r10 is overwritten before use).  */
  74         srwi    9,31,3        /* Number of full doublewords remaining.  */
  75
  76 L(copy_GE_32_aligned_cont):
  77         /* Here r3/r12 = DST/SRC, r31 = bytes left, r9 = doublewords left.  */
  78         clrlwi  11,31,29      /* r11 = tail bytes (r31 & 7).  */
  79         mtcrf   0x01,9        /* cr7 = low four bits of doubleword count.  */
  80
  81         srwi    8,31,5        /* r8 = number of 32-byte blocks.  */
  82         cmplwi  cr1,9,4       /* cr1: fewer than 4 doublewords?  */
  83         cmplwi  cr6,11,0      /* cr6: any tail bytes?  */
  84         mr      11,12         /* r11 = SRC cursor for the loop.  */
  85
  86         /* Copy 1~3 doublewords so the main loop starts
  87         at a multiple of 32 bytes.  */
  88
  89         bf      30,1f         /* (count & 2) == 0: at most one to copy.  */
  90         lfd     6,0(12)
  91         lfd     7,8(12)
  92         addi    11,12,16
  93         mtctr   8             /* CTR = 32-byte block count.  */
  94         stfd    6,0(3)
  95         stfd    7,8(3)
  96         addi    10,3,16       /* r10 = DST cursor for the loop.  */
  97         bf      31,4f         /* (count & 1) == 0: no third doubleword.  */
  98         lfd     0,16(12)
  99         stfd    0,16(3)
 100         blt     cr1,3f        /* Fewer than 4 doublewords: skip the loop.  */
 101         addi    11,12,24
 102         addi    10,3,24
 103         b       4f
 104
 105         .align  4
 106 1:      /* Copy 1 doubleword and set the counter.  */
 107         mr      10,3          /* r10 = DST cursor for the loop.  */
 108         mtctr   8             /* CTR = 32-byte block count.  */
 109         bf      31,4f         /* (count & 1) == 0: nothing to pre-copy.  */
 110         lfd     6,0(12)
 111         addi    11,12,8
 112         stfd    6,0(3)
 113         addi    10,3,8
 114
 115         .align  4
 116 4:      /* Main aligned copy loop. Copies 32-bytes at a time.  */
 117         lfd     6,0(11)
 118         lfd     7,8(11)
 119         lfd     8,16(11)
 120         lfd     0,24(11)
 121         addi    11,11,32
 122
 123         stfd    6,0(10)
 124         stfd    7,8(10)
 125         stfd    8,16(10)
 126         stfd    0,24(10)
 127         addi    10,10,32
 128         bdnz    4b
 129 3:
 130
 131         /* Check for tail bytes.  */
 132
 133         clrrwi  0,31,3        /* r0 = r31 & ~7, bytes copied as doublewords.  */
 134         mtcrf   0x01,31       /* cr7 = low four bits of r31 for bf below.  */
 135         beq     cr6,0f        /* No tail bytes.  */
 136
 137 .L9:
 138         add     3,3,0         /* Advance DST past the doublewords.  */
 139         add     12,12,0       /* Advance SRC by the same amount.  */
 140
 141         /*  At this point we have a tail of 1-7 bytes and we know that the
 142         destination is doubleword-aligned.  */
 143 4:      /* Copy 4 bytes.  */
 144         bf      29,2f
 145
 146         lwz     6,0(12)
 147         addi    12,12,4
 148         stw     6,0(3)
 149         addi    3,3,4
 150 2:      /* Copy 2 bytes.  */
 151         bf      30,1f
 152
 153         lhz     6,0(12)
 154         addi    12,12,2
 155         sth     6,0(3)
 156         addi    3,3,2
 157 1:      /* Copy 1 byte.  */
 158         bf      31,0f
 159
 160         lbz     6,0(12)
 161         stb     6,0(3)
 162 0:      /* Return DST + LEN pointer.  */
 163         add     3,30,5        /* r3 = original DST + LEN.  */
 164         lwz     30,20(1)      /* Reload saved r30.  */
 165         lwz     31,24(1)      /* Reload saved r31.  */
 166         addi    1,1,32        /* Pop the 32-byte frame.  */
 167         blr
 168
 169         /* Handle copies of 0~31 bytes.  */
 170         .align  4
 171 L(copy_LT_32):
 172         cmplwi  cr6,5,8       /* cr6 = LEN compared with 8.  */
 173         mr      12,4          /* r12 = running SRC pointer.  */
 174         mtcrf   0x01,5        /* cr7 = low four bits of LEN.  */
 175         ble     cr6,L(copy_LE_8)
 176
 177         /* At least 9 bytes to go.  */
 178         neg     8,4           /* r8 = -SRC.  */
 179         clrrwi  11,4,2        /* r11 = SRC & ~3 (not read on this path).  */
 180         andi.   0,8,3         /* r0 = bytes to word-align SRC; sets cr0.  */
 181         cmplwi  cr1,5,16      /* cr1 = LEN compared with 16.  */
 182         mr      10,5          /* r10 = bytes left.  */
 183         beq     L(copy_LT_32_aligned)  /* SRC already word-aligned.  */
 184
 185         /* Force 4-bytes alignment for SRC.  */
 186         mtocrf  0x01,0        /* cr7 = r0 for the bf tests below.  */
 187         subf    10,0,5        /* r10 = LEN - r0.  */
 188 2:      bf      30,1f         /* Copy 2 bytes if (r0 & 2).  */
 189
 190         lhz     6,0(12)
 191         addi    12,12,2
 192         sth     6,0(3)
 193         addi    3,3,2
 194 1:      bf      31,L(end_4bytes_alignment)  /* Copy 1 byte if (r0 & 1).  */
 195
 196         lbz     6,0(12)
 197         addi    12,12,1
 198         stb     6,0(3)
 199         addi    3,3,1
 200
 201         .align  4
 202 L(end_4bytes_alignment):
 203         cmplwi  cr1,10,16     /* Redo the 16-byte test on the reduced count.  */
 204         mtcrf   0x01,10       /* cr7 = low four bits of the bytes left.  */
 205
 206 L(copy_LT_32_aligned):
 207         /* At least 6 bytes to go, and SRC is word-aligned.  */
 208         blt     cr1,8f        /* Fewer than 16 bytes left.  */
 209
 210         /* Copy 16 bytes.  */
 211         lwz     6,0(12)
 212         lwz     7,4(12)
 213         stw     6,0(3)
 214         lwz     8,8(12)
 215         stw     7,4(3)
 216         lwz     6,12(12)
 217         addi    12,12,16
 218         stw     8,8(3)
 219         stw     6,12(3)
 220         addi    3,3,16
 221 8:      /* Copy 8 bytes.  */
 222         bf      28,4f
 223
 224         lwz     6,0(12)
 225         lwz     7,4(12)
 226         addi    12,12,8
 227         stw     6,0(3)
 228         stw     7,4(3)
 229         addi    3,3,8
 230 4:      /* Copy 4 bytes.  */
 231         bf      29,2f
 232
 233         lwz     6,0(12)
 234         addi    12,12,4
 235         stw     6,0(3)
 236         addi    3,3,4
 237 2:      /* Copy 2-3 bytes (also entered from L(copy_LE_8) via 2b).  */
 238         bf      30,1f
 239
 240         lhz     6,0(12)
 241         sth     6,0(3)
 242         bf      31,0f         /* No odd byte after the halfword.  */
 243         lbz     7,2(12)
 244         stb     7,2(3)
 245
 246         /* Return DST + LEN pointer (r31 is untouched on this path).  */
 247         add     3,30,5
 248         lwz     30,20(1)
 249         addi    1,1,32
 250         blr
 251
 252         .align  4
 253 1:      /* Copy 1 byte.  */
 254         bf      31,0f
 255
 256         lbz     6,0(12)
 257         stb     6,0(3)
 258 0:      /* Return DST + LEN pointer.  */
 259         add     3,30,5        /* r3 = original DST + LEN.  */
 260         lwz     30,20(1)      /* Reload saved r30.  */
 261         addi    1,1,32        /* Pop the 32-byte frame.  */
 262         blr
 263
 264         /* Handles copies of 0~8 bytes.  */
 265         .align  4
 266 L(copy_LE_8):
 267         bne     cr6,4f        /* cr6 (LEN vs 8) was set in L(copy_LT_32).  */
 268
 269         /* Though we could've used lfd/stfd here, they are still
 270         slow for unaligned cases.  */
 271
 272         lwz     6,0(4)
 273         lwz     7,4(4)
 274         stw     6,0(3)
 275         stw     7,4(3)
 276
 277         /* Return DST + LEN pointer (r31 is untouched on this path).  */
 278         add     3,30,5
 279         lwz     30,20(1)
 280         addi    1,1,32
 281         blr
 282
 283         .align  4
 284 4:      /* Copies 4~7 bytes; cr7 holds the low four bits of LEN.  */
 285         bf      29,2b         /* LEN < 4: reuse the tail code in L(copy_LT_32).  */
 286
 287         lwz     6,0(4)
 288         stw     6,0(3)
 289         bf      30,5f         /* No halfword: LEN is 4 or 5.  */
 290         lhz     7,4(4)
 291         sth     7,4(3)
 292         bf      31,0f         /* LEN is 6: done.  */
 293         lbz     8,6(4)
 294         stb     8,6(3)
 295
 296         /* Return DST + LEN pointer.  */
 297         add     3,30,5
 298         lwz     30,20(1)
 299         addi    1,1,32
 300         blr
 301
 302         .align  4
 303 5:      /* Copy 1 byte.  */
 304         bf      31,0f         /* LEN is 4: done.  */
 305
 306         lbz     6,4(4)
 307         stb     6,4(3)
 308
 309 0:      /* Return DST + LEN pointer.  */
 310         add     3,30,5        /* r3 = original DST + LEN.  */
 311         lwz     30,20(1)      /* Reload saved r30.  */
 312         addi    1,1,32        /* Pop the 32-byte frame.  */
 313         blr
 314
 315         /* Handle copies of 32+ bytes where the low bits of SRC and DST differ.
 316         DST is first aligned to a quadword; aligned quadword loads from SRC are
 317         then shifted to realign the data, allowing for aligned DST stores.  */
 318         .align  4
 319 L(copy_GE_32_unaligned):
 320         andi.   11,3,15       /* Check alignment of DST.  */
 321         clrlwi  0,0,28        /* Number of bytes until the 1st
 322                                  quadword of DST.  */
 323         srwi    9,5,4         /* Number of full quadwords remaining.  */
 324
 325         beq     L(copy_GE_32_unaligned_cont)  /* DST already aligned.  */
 326
 327         /* DST is not quadword aligned, get it aligned.  */
 328
 329         mtcrf   0x01,0        /* cr7 = r0 for the bf tests below.  */
 330         subf    31,0,5        /* r31 = LEN - r0.  */
 331
 332         /* Vector instructions work best when proper alignment (16-bytes)
 333         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 334 1:      /* Copy 1 byte.  */
 335         bf      31,2f
 336
 337         lbz     6,0(12)
 338         addi    12,12,1
 339         stb     6,0(3)
 340         addi    3,3,1
 341 2:      /* Copy 2 bytes.  */
 342         bf              30,4f
 343
 344         lhz     6,0(12)
 345         addi    12,12,2
 346         sth     6,0(3)
 347         addi    3,3,2
 348 4:      /* Copy 4 bytes.  */
 349         bf      29,8f
 350
 351         lwz     6,0(12)
 352         addi    12,12,4
 353         stw     6,0(3)
 354         addi    3,3,4
 355 8:      /* Copy 8 bytes.  */
 356         bf      28,0f
 357
 358         lfd     6,0(12)
 359         addi    12,12,8
 360         stfd    6,0(3)
 361         addi    3,3,8
 362 0:
 363         clrlwi  10,12,28      /* Check alignment of SRC (r10 is overwritten before use).  */
 364         srwi    9,31,4        /* Number of full quadwords remaining.  */
 365
 366         /* The proper alignment is present, it is OK to copy the bytes now.  */
 367 L(copy_GE_32_unaligned_cont):
 368
 369         /* Setup two indexes to speed up the indexed vector operations.  */
 370         clrlwi  11,31,28      /* r11 = tail bytes (r31 & 15).  */
 371         li      6,16          /* Index for 16-bytes offsets.  */
 372         li      7,32          /* Index for 32-bytes offsets.  */
 373         cmplwi  cr1,11,0      /* cr1: any tail bytes?  */
 374         srwi    8,31,5        /* Setup the loop counter.  */
 375         mr      10,3          /* r10 = DST cursor for the loop.  */
 376         mr      11,12         /* r11 = SRC cursor for the loop.  */
 377         mtcrf   0x01,9        /* cr7 = low four bits of quadword count.  */
 378         cmplwi  cr6,9,1       /* cr6: at most one quadword?  */
 379 #ifdef __LITTLE_ENDIAN__
 380         lvsr    5,0,12
 381 #else
 382         lvsl    5,0,12
 383 #endif
 384         lvx     3,0,12        /* vr3 = aligned quadword holding SRC.  */
 385         bf      31,L(setup_unaligned_loop)  /* Even quadword count.  */
 386
 387         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 388         lvx     4,12,6
 389 #ifdef __LITTLE_ENDIAN__
 390         vperm   6,4,3,5
 391 #else
 392         vperm   6,3,4,5
 393 #endif
 394         addi    11,12,16
 395         addi    10,3,16
 396         stvx    6,0,3
 397         vor     3,4,4         /* vr3 = vr4, carried into the loop.  */
 398
 399 L(setup_unaligned_loop):
 400         mtctr   8             /* CTR = 32-byte block count.  */
 401         ble     cr6,L(end_unaligned_loop)  /* At most one quadword: no loop.  */
 402
 403         /* Copy 32 bytes at a time using vector instructions.  */
 404         .align  4
 405 L(unaligned_loop):
 406
 407         /* Note: vr6/vr10 may contain data that was already copied,
 408         but in order to get proper alignment, we may have to copy
 409         some portions again. This is faster than having unaligned
 410         vector instructions though.  */
 411
 412         lvx     4,11,6        /* vr4 = r11+16.  */
 413 #ifdef __LITTLE_ENDIAN__
 414         vperm   6,4,3,5
 415 #else
 416         vperm   6,3,4,5
 417 #endif
 418         lvx     3,11,7        /* vr3 = r11+32.  */
 419 #ifdef __LITTLE_ENDIAN__
 420         vperm   10,3,4,5
 421 #else
 422         vperm   10,4,3,5
 423 #endif
 424         addi    11,11,32
 425         stvx    6,0,10
 426         stvx    10,10,6
 427         addi    10,10,32
 428
 429         bdnz    L(unaligned_loop)
 430
 431         .align  4
 432 L(end_unaligned_loop):
 433
 434         /* Check for tail bytes.  */
 435         clrrwi  0,31,4        /* r0 = r31 & ~15, bytes copied as quadwords.  */
 436         mtcrf   0x01,31       /* cr7 = low four bits of r31 for bf below.  */
 437         beq     cr1,0f        /* No tail bytes.  */
 438
 439         add     3,3,0         /* Advance DST past the quadwords.  */
 440         add     12,12,0       /* Advance SRC by the same amount.  */
 441
 442         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 443 8:      /* Copy 8 bytes.  */
 444         bf      28,4f
 445
 446         lwz     6,0(12)
 447         lwz     7,4(12)
 448         addi    12,12,8
 449         stw     6,0(3)
 450         stw     7,4(3)
 451         addi    3,3,8
 452 4:      /* Copy 4 bytes.  */
 453         bf      29,2f
 454
 455         lwz     6,0(12)
 456         addi    12,12,4
 457         stw     6,0(3)
 458         addi    3,3,4
 459 2:      /* Copy 2 bytes.  */
 460         bf      30,1f
 461
 462         lhz     6,0(12)
 463         addi    12,12,2
 464         sth     6,0(3)
 465         addi    3,3,2
 466 1:      /* Copy 1 byte.  */
 467         bf      31,0f
 468
 469         lbz     6,0(12)
 470         stb     6,0(3)
 471 0:      /* Return DST + LEN pointer.  */
 472         add     3,30,5        /* r3 = original DST + LEN.  */
 473         lwz     30,20(1)      /* Reload saved r30.  */
 474         lwz     31,24(1)      /* Reload saved r31.  */
 475         addi    1,1,32        /* Pop the 32-byte frame.  */
 476         blr
 477
 478 END (__mempcpy)
 479 libc_hidden_def (__mempcpy)
 480 weak_alias (__mempcpy, mempcpy)
 481 libc_hidden_builtin_def (mempcpy)