sysdeps/powerpc/powerpc64/power4/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.   */
  35
  36 #ifndef MEMCPY  /* Entry-point name defaults to memcpy unless already defined.  */
  37 # define MEMCPY memcpy
  38 #endif
  39         .machine power4  /* Assemble the code below for the POWER4 instruction set.  */
  40 ENTRY_TOCLESS (MEMCPY, 5)  /* In: r3 = dst, r4 = src, r5 = len.  Out: r3 = dst.  */
  41         CALL_MCOUNT 3
  42
  43     cmpldi cr1,5,31     /* cr1 = len vs 31; selects the short-move path below.  */
  44     neg   0,3           /* r0 = -dst; masked to its low 3 bits below.  */
  45     std   3,-16(1)      /* Save the original dst at -16(r1); reloaded into r3 before each blr.  */
  46     std   31,-8(1)      /* Save r31 at -8(r1); the long path uses it for the remaining length.  */
  47     cfi_offset(31,-8)
  48     andi. 11,3,7        /* r11 = dst & 7 (alignment of dst); also sets cr0 for the beq below.  */
  49     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  50     clrldi 10,4,61      /* r10 = src & 7 (alignment of src).  */
  51     cmpldi cr6,5,8      /* cr6 = len vs 8; consumed by the short-move code at .L2.  */
  52     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  53     cmpld cr6,10,11     /* cr6 = (src & 7) vs (dst & 7); tested at .L0.  */
  54     mr    12,4          /* r12 = working source pointer.  */
  55     srdi  9,5,3         /* Number of full double words remaining.  */
  56     mtcrf 0x01,0        /* CR7 = low 4 bits of r0, tested by the bf instructions below.  */
  57     mr    31,5          /* r31 = bytes remaining (whole length if dst is already aligned).  */
  58     beq   .L0           /* cr0 eq from andi.: dst is already doubleword aligned.  */
  59
  60     subf  31,0,5        /* r31 = len - bytes needed to align dst.  */
  61   /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
  62 1:  bf    31,2f         /* Skip the byte move unless the 1s bit of r0 is set.  */
  63     lbz   6,0(12)
  64     addi  12,12,1
  65     stb   6,0(3)
  66     addi  3,3,1
  67 2:  bf    30,4f         /* Skip the halfword move unless the 2s bit of r0 is set.  */
  68     lhz   6,0(12)
  69     addi  12,12,2
  70     sth   6,0(3)
  71     addi  3,3,2
  72 4:  bf    29,0f         /* Skip the word move unless the 4s bit of r0 is set.  */
  73     lwz   6,0(12)
  74     addi  12,12,4
  75     stw   6,0(3)
  76     addi  3,3,4
  77 0:
  78     clrldi 10,12,61     /* r10 = advanced src & 7 (check alignment of src again).  */
  79     srdi  9,31,3        /* Number of full double words remaining.  */
  80
  81   /* Copy doublewords from source to destination, assuming the
  82      destination is aligned on a doubleword boundary.
  83
  84      At this point we know there are at least 25 bytes left (32-7) to copy.
  85      The next step is to determine if the source is also doubleword aligned.
  86      If not, branch to the unaligned move code at .L6, which uses
  87      a load, shift, store strategy.
  88
  89      Otherwise source and destination are doubleword aligned, and we can
  90      use the optimized doubleword copy loop.  */
  91 .L0:
  92     clrldi  11,31,61      /* r11 = remaining length & 7 (tail byte count).  */
  93     mtcrf   0x01,9        /* CR7 = low 4 bits of the doubleword count in r9.  */
  94     cmpldi  cr1,11,0      /* cr1 = tail count vs 0; tested at the end of the .L6 path.  */
  95     bne-    cr6,.L6   /* If source is not DW aligned.  */
  96
  97   /* Move doublewords where destination and source are DW aligned.
  98      Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
  99      If the copy is not an exact multiple of 32 bytes, 1-3
 100      doublewords are copied as needed to set up the main loop.  After
 101      the main loop exits there may be a tail of 1-7 bytes. These bytes are
 102      copied a word/halfword/byte at a time as needed to preserve alignment.  */
 103
 104     srdi  8,31,5          /* r8 = remaining length / 32 = main loop iteration count.  */
 105     cmpldi      cr1,9,4   /* cr1 = doubleword count vs 4.  */
 106     cmpldi      cr6,11,0  /* cr6 = tail count vs 0; tested after the loop at label 3.  */
 107     mr    11,12           /* r11 = source cursor for the loop.  */
 108
 109     bf    30,1f           /* 2s bit of the doubleword count clear: no leading pair.  */
 110     ld    6,0(12)
 111     ld    7,8(12)
 112     addi  11,12,16
 113     mtctr 8               /* CTR = main loop iteration count.  */
 114     std   6,0(3)
 115     std   7,8(3)
 116     addi  10,3,16         /* r10 = destination cursor for the loop.  */
 117     bf    31,4f           /* 1s bit clear: no third leading doubleword.  */
 118     ld    0,16(12)
 119     std   0,16(3)
 120     blt   cr1,3f          /* Fewer than 4 doublewords in total: skip the main loop.  */
 121     addi  11,12,24
 122     addi  10,3,24
 123     b     4f
 124     .align  4
 125 1:
 126     mr    10,3            /* r10 = destination cursor for the loop.  */
 127     mtctr 8               /* CTR = main loop iteration count.  */
 128     bf    31,4f           /* 1s bit clear: no single leading doubleword.  */
 129     ld    6,0(12)
 130     addi  11,12,8
 131     std   6,0(3)
 132     addi  10,3,8
 133
 134     .align  4
 135 4:                        /* Main loop: copy 32 bytes per iteration via r11 -> r10.  */
 136     ld    6,0(11)
 137     ld    7,8(11)
 138     ld    8,16(11)
 139     ld    0,24(11)
 140     addi  11,11,32
 141 2:
 142     std   6,0(10)
 143     std   7,8(10)
 144     std   8,16(10)
 145     std   0,24(10)
 146     addi  10,10,32
 147     bdnz  4b
 148 3:
 149
 150     rldicr 0,31,0,60      /* r0 = remaining length with the low 3 bits cleared.  */
 151     mtcrf 0x01,31         /* CR7 = low 4 bits of the remaining length (tail bits).  */
 152     beq   cr6,0f          /* No tail bytes: go straight to the return.  */
 153 .L9:
 154     add   3,3,0           /* Advance dst past the bytes moved as doublewords.  */
 155     add   12,12,0         /* Advance src by the same amount.  */
 156
 157 /*  At this point we have a tail of 0-7 bytes and we know that the
 158     destination is double word aligned.  */
 159 4:  bf    29,2f           /* Skip the word move unless the 4s bit of the tail is set.  */
 160     lwz   6,0(12)
 161     addi  12,12,4
 162     stw   6,0(3)
 163     addi  3,3,4
 164 2:  bf    30,1f           /* Skip the halfword move unless the 2s bit is set.  */
 165     lhz   6,0(12)
 166     addi  12,12,2
 167     sth   6,0(3)
 168     addi  3,3,2
 169 1:  bf    31,0f           /* Skip the byte move unless the 1s bit is set.  */
 170     lbz   6,0(12)
 171     stb   6,0(3)
 172 0:
 173   /* Return original dst pointer.  */
 174     ld 31,-8(1)           /* Restore the r31 value saved at entry.  */
 175     ld 3,-16(1)
 176     blr
 177
 178 /* Copy up to 31 bytes.  This is divided into two cases, 0-8 bytes and 9-31
 179    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 180    tests.
 181
 182    In the short (0-8 byte) case no attempt is made to force alignment
 183    of either source or destination.  The hardware will handle the
 184    unaligned load/stores with small delays for crossing 32-, 64-byte, and
 185    4096-byte boundaries. Since these short moves are unlikely to be
 186    unaligned or cross these boundaries, the overhead to force
 187    alignment is not justified.
 188
 189    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 190    boundaries.  Since only loads are sensitive to the 32-/64-byte
 191    boundaries it is more important to align the source than the
 192    destination.  If the source is not already word aligned, we first
 193    move 1-3 bytes as needed.  Since we are only word aligned we don't
 194    use double word load/stores to ensure that all loads are aligned.
 195    While the destination and stores may still be unaligned, this
 196    is only an issue for page (4096 byte boundary) crossing, which
 197    should be rare for these short moves.  The hardware handles this
 198    case automatically with a small delay.  */
 199
 200     .align  4
 201 .L2:
 202     mtcrf 0x01,5          /* CR7 = low 4 bits of len, tested by the bf instructions below.  */
 203     neg   8,4             /* r8 = -src.  */
 204     clrrdi      11,4,2    /* r11 = src with the low 2 bits cleared (word-aligned address).  */
 205     andi. 0,8,3           /* r0 = (-src) & 3 = bytes needed to word-align src; sets cr0.  */
 206     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 207 /* At least 9 bytes left.  Get the source word aligned.  */
 208     cmpldi      cr1,5,16  /* cr1 = len vs 16; tested at .L3.  */
 209     mr    10,5            /* r10 = bytes remaining.  */
 210     mr    12,4            /* r12 = working source pointer.  */
 211     cmpldi      cr6,0,2   /* cr6 = alignment byte count vs 2 (selects the 1/2/3 byte case).  */
 212     beq   .L3   /* If the source is already word aligned skip this.  */
 213 /* Copy 1-3 bytes to get source address word aligned.  */
 214     lwz   6,0(11)         /* Load the aligned word that contains the leading bytes.  */
 215     subf  10,0,5          /* r10 = len - leading bytes.  */
 216     add   12,4,0          /* r12 = src advanced to the next word boundary.  */
 217     blt   cr6,5f          /* One leading byte.  */
 218     srdi  7,6,16          /* r7 = upper halfword of the loaded word.  */
 219     bgt   cr6,3f          /* Three leading bytes.  */
 220 #ifdef __LITTLE_ENDIAN__
 221     sth   7,0(3)
 222 #else
 223     sth   6,0(3)
 224 #endif
 225     b     7f
 226     .align  4
 227 3:
 228 #ifdef __LITTLE_ENDIAN__
 229     rotlwi 6,6,24
 230     stb   6,0(3)
 231     sth   7,1(3)
 232 #else
 233     stb   7,0(3)
 234     sth   6,1(3)
 235 #endif
 236     b     7f
 237     .align  4
 238 5:
 239 #ifdef __LITTLE_ENDIAN__
 240     rotlwi 6,6,8
 241 #endif
 242     stb   6,0(3)
 243 7:
 244     cmpldi      cr1,10,16 /* cr1 = remaining length vs 16, recomputed after the leading bytes.  */
 245     add   3,3,0           /* Advance dst past the leading bytes just stored.  */
 246     mtcrf 0x01,10         /* CR7 = low 4 bits of the remaining length.  */
 247     .align  4
 248 .L3:
 249 /* At least 6 bytes left and the source is word aligned.  */
 250     blt   cr1,8f          /* Fewer than 16 bytes remain: skip the 16-byte move.  */
 251 16: /* Move 16 bytes.  */
 252     lwz   6,0(12)
 253     lwz   7,4(12)
 254     stw   6,0(3)
 255     lwz   6,8(12)
 256     stw   7,4(3)
 257     lwz   7,12(12)
 258     addi  12,12,16
 259     stw   6,8(3)
 260     stw   7,12(3)
 261     addi  3,3,16
 262 8:  /* Move 8 bytes.  */
 263     bf    28,4f           /* Skip unless the 8s bit of the remaining length is set.  */
 264     lwz   6,0(12)
 265     lwz   7,4(12)
 266     addi  12,12,8
 267     stw   6,0(3)
 268     stw   7,4(3)
 269     addi  3,3,8
 270 4:  /* Move 4 bytes.  */
 271     bf    29,2f           /* Skip unless the 4s bit is set.  */
 272     lwz   6,0(12)
 273     addi  12,12,4
 274     stw   6,0(3)
 275     addi  3,3,4
 276 2:  /* Move 2-3 bytes.  */
 277     bf    30,1f           /* 2s bit clear: at most one byte left.  */
 278     lhz   6,0(12)
 279     sth   6,0(3)
 280     bf    31,0f           /* 1s bit clear: done after the halfword.  */
 281     lbz   7,2(12)
 282     stb   7,2(3)
 283     ld 3,-16(1)           /* Return original dst pointer.  */
 284     blr
 285 1:  /* Move 1 byte.  */
 286     bf    31,0f
 287     lbz   6,0(12)
 288     stb   6,0(3)
 289 0:
 290   /* Return original dst pointer.  */
 291     ld    3,-16(1)
 292     blr
 293
 294 /* Special case to copy 0-8 bytes.  */
 295     .align  4
 296 .LE8:
 297     mr    12,4            /* r12 = src, as used by the shared code reached through 2b below.  */
 298     bne   cr6,4f          /* cr6 is len vs 8: fewer than 8 bytes go to label 4.  */
 299 /* Would have liked to use ld/std here but the 630 processors are
 300    slow for load/store doubles that are not at least word aligned.
 301    Unaligned Load/Store word execute with only a 1 cycle penalty.  */
 302     lwz   6,0(4)
 303     lwz   7,4(4)
 304     stw   6,0(3)
 305     stw   7,4(3)
 306   /* Return original dst pointer.  */
 307     ld    3,-16(1)
 308     blr
 309     .align  4
 310 4:  bf    29,2b           /* 4s bit of len clear (0-3 bytes): reuse the preceding "Move 2-3 bytes" code.  */
 311     lwz   6,0(4)
 312     stw   6,0(3)
 313 6:
 314     bf    30,5f           /* 2s bit clear: no halfword at offset 4.  */
 315     lhz   7,4(4)
 316     sth   7,4(3)
 317     bf    31,0f           /* 1s bit clear: done after the halfword.  */
 318     lbz   8,6(4)
 319     stb   8,6(3)
 320     ld 3,-16(1)           /* Return original dst pointer.  */
 321     blr
 322     .align  4
 323 5:
 324     bf    31,0f           /* 1s bit clear: nothing left after the word.  */
 325     lbz   6,4(4)
 326     stb   6,4(3)
 327     .align  4
 328 0:
 329   /* Return original dst pointer.  */
 330     ld    3,-16(1)
 331     blr
 332
 333     .align  4
 334 .L6:
 335
 336   /* Copy doublewords where the destination is aligned but the source is
 337      not.  Use aligned doubleword loads from the source, shifted to realign
 338      the data, to allow aligned destination stores.  */
 339     addi    11,9,-1  /* loop DW count is one less than total */
 340     subf    5,10,12  /* r5 = src - (src & 7) = doubleword-aligned load address.  */
 341     sldi    10,10,3  /* r10 = source misalignment in bits (first shift count).  */
 342     mr      4,3      /* r4 = destination cursor; r3 is left for the tail code at .L9.  */
 343     srdi    8,11,2   /* calculate the 32 byte loop count */
 344     ld      6,0(5)   /* r6 = first aligned source doubleword.  */
 345     mtcrf   0x01,11  /* CR7 = low 4 bits of (DW count - 1).  */
 346     cmpldi  cr6,9,4  /* cr6 = total DW count vs 4.  */
 347     mtctr   8        /* CTR = 32 byte loop count.  */
 348     ld      7,8(5)   /* r7 = second aligned source doubleword.  */
 349     subfic  9,10,64  /* r9 = 64 - r10 (complementary shift count).  */
 350     bf      30,1f    /* 2s bit of (DW count - 1) clear: no leading pair.  */
 351
 352     /* there are at least two DWs to copy */
 353 #ifdef __LITTLE_ENDIAN__
 354     srd     0,6,10
 355     sld     8,7,9
 356 #else
 357     sld     0,6,10
 358     srd     8,7,9
 359 #endif
 360     or      0,0,8    /* r0 = one destination doubleword merged from r6 and r7.  */
 361     ld      6,16(5)
 362     std     0,0(4)
 363 #ifdef __LITTLE_ENDIAN__
 364     srd     0,7,10
 365     sld     8,6,9
 366 #else
 367     sld     0,7,10
 368     srd     8,6,9
 369 #endif
 370     or      0,0,8    /* r0 = next destination doubleword merged from r7 and r6.  */
 371     ld      7,24(5)
 372     std     0,8(4)
 373     addi    4,4,16
 374     addi    5,5,32
 375     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
 376     bf      31,4f   /* 1s bit of (DW count - 1) clear: enter the loop now.  */
 377     /* there is a third DW to copy */
 378 #ifdef __LITTLE_ENDIAN__
 379     srd     0,6,10
 380     sld     8,7,9
 381 #else
 382     sld     0,6,10
 383     srd     8,7,9
 384 #endif
 385     or      0,0,8
 386     std     0,0(4)
 387     mr      6,7      /* Shift the pipeline: r7 becomes the older doubleword.  */
 388     ld      7,0(5)
 389     addi    5,5,8
 390     addi    4,4,8
 391     beq     cr6,8f  /* if total DWs = 4, then bypass loop */
 392     b       4f
 393     .align 4
 394 1:
 395 #ifdef __LITTLE_ENDIAN__
 396     srd     0,6,10
 397     sld     8,7,9
 398 #else
 399     sld     0,6,10
 400     srd     8,7,9
 401 #endif
 402     addi    5,5,16   /* Step r5 past the two doublewords already loaded.  */
 403     or      0,0,8
 404     bf      31,4f    /* 1s bit of (DW count - 1) clear: enter the loop without a leading store.  */
 405     mr      6,7      /* Shift the pipeline: r7 becomes the older doubleword.  */
 406     ld      7,0(5)
 407     addi    5,5,8
 408     std     0,0(4)
 409     addi    4,4,8
 410     .align 4
 411 /* copy 32 bytes at a time */
 412 4:
 413 #ifdef __LITTLE_ENDIAN__
 414     srd   0,6,10
 415     sld   8,7,9
 416 #else
 417     sld   0,6,10
 418     srd   8,7,9
 419 #endif
 420     or    0,0,8
 421     ld    6,0(5)
 422     std   0,0(4)
 423 #ifdef __LITTLE_ENDIAN__
 424     srd   0,7,10
 425     sld   8,6,9
 426 #else
 427     sld   0,7,10
 428     srd   8,6,9
 429 #endif
 430     or    0,0,8
 431     ld    7,8(5)
 432     std   0,8(4)
 433 #ifdef __LITTLE_ENDIAN__
 434     srd   0,6,10
 435     sld   8,7,9
 436 #else
 437     sld   0,6,10
 438     srd   8,7,9
 439 #endif
 440     or    0,0,8
 441     ld    6,16(5)
 442     std   0,16(4)
 443 #ifdef __LITTLE_ENDIAN__
 444     srd   0,7,10
 445     sld   8,6,9
 446 #else
 447     sld   0,7,10
 448     srd   8,6,9
 449 #endif
 450     or    0,0,8
 451     ld    7,24(5)
 452     std   0,24(4)
 453     addi  5,5,32
 454     addi  4,4,32
 455     bdnz+ 4b
 456     .align 4
 457 8:
 458     /* calculate and store the final DW */
 459 #ifdef __LITTLE_ENDIAN__
 460     srd   0,6,10
 461     sld   8,7,9
 462 #else
 463     sld   0,6,10
 464     srd   8,7,9
 465 #endif
 466     or    0,0,8
 467     std   0,0(4)
 468 3:
 469     rldicr 0,31,0,60     /* r0 = remaining length with the low 3 bits cleared.  */
 470     mtcrf 0x01,31        /* CR7 = low 4 bits of the remaining length (tail bits).  */
 471     bne   cr1,.L9       /* If the tail is 0 bytes we are done!  */
 472   /* Return original dst pointer.  */
 473     ld 31,-8(1)          /* Restore the r31 value saved at entry.  */
 474     ld 3,-16(1)
 475     blr
 476 END_GEN_TB (MEMCPY,TB_TOCLESS)
 477 libc_hidden_builtin_def (memcpy)