sysdeps/powerpc/powerpc64/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.   */
  35
  36 #ifndef MEMCPY
  37 # define MEMCPY memcpy
  38 #endif
  39
  40 ENTRY_TOCLESS (MEMCPY, 5)
  41         CALL_MCOUNT 3
  42
  43     cmpldi cr1,5,31     /* cr1: len compared with 31.  */
  44     neg   0,3           /* r0 = -dst.  */
  45     std   3,-16(1)      /* Save the original dst; it is reloaded as the return value.  */
  46     std   31,-8(1)      /* Save r31; the long-copy path uses it as the remaining byte count.  */
  47     cfi_offset(31,-8)
  48     andi. 11,3,7        /* check alignment of dst.  */
  49     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  50     clrldi 10,4,61      /* check alignment of src.  */
  51     cmpldi cr6,5,8      /* cr6: len compared with 8 (tested at .L2).  */
  52     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  53     cmpld cr6,10,11     /* cr6: (src & 7) compared with (dst & 7).  */
  54     mr    12,4          /* r12 = running source pointer.  */
  55     srdi  9,5,3         /* Number of full double words remaining.  */
  56     mtcrf 0x01,0        /* CR field 7 = low 4 bits of r0, for the bf tests below.  */
  57     mr    31,5          /* r31 = bytes remaining.  */
  58     beq   .L0           /* cr0 from andi.: dst is already doubleword aligned.  */
  59
  60     subf  31,0,5        /* r31 = len minus the bytes moved to align dst.  */
  61   /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
  62 1:  bf    31,2f         /* Skip unless a 1-byte move is needed.  */
  63     lbz   6,0(12)
  64     addi  12,12,1
  65     stb   6,0(3)
  66     addi  3,3,1
  67 2:  bf    30,4f         /* Skip unless a 2-byte move is needed.  */
  68     lhz   6,0(12)
  69     addi  12,12,2
  70     sth   6,0(3)
  71     addi  3,3,2
  72 4:  bf    29,0f         /* Skip unless a 4-byte move is needed.  */
  73     lwz   6,0(12)
  74     addi  12,12,4
  75     stw   6,0(3)
  76     addi  3,3,4
  77 0:
  78     clrldi 10,12,61     /* check alignment of src again.  */
  79     srdi  9,31,3        /* Number of full double words remaining.  */
  80
  81   /* Copy doublewords from source to destination, assuming the
  82      destination is aligned on a doubleword boundary.
  83
  84      At this point we know there are at least 25 bytes left (32-7) to copy.
  85      The next step is to determine if the source is also doubleword aligned.
  86      If not, branch to the unaligned move code at .L6, which uses
  87      a load, shift, store strategy.
  88
  89      Otherwise source and destination are doubleword aligned, and we can
  90      use the optimized doubleword copy loop.  */
  91 .L0:
  92     clrldi      11,31,61    /* r11 = tail byte count (remaining & 7).  */
  93     mtcrf 0x01,9        /* CR field 7 = low 4 bits of the doubleword count.  */
  94     bne-  cr6,.L6   /* If source is not DW aligned.  */
  95
  96   /* Move doublewords where destination and source are DW aligned.
  97      Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
  98      If the copy is not an exact multiple of 32 bytes, 1-3
  99      doublewords are copied as needed to set up the main loop.  After
 100      the main loop exits there may be a tail of 1-7 bytes.  These bytes are
 101      copied a word/halfword/byte at a time as needed to preserve alignment.  */
 102
 103     srdi  8,31,5        /* r8 = number of 32-byte loop iterations.  */
 104     cmpldi      cr1,9,4     /* cr1: doubleword count compared with 4.  */
 105     cmpldi      cr6,11,0    /* cr6: tail byte count compared with 0.  */
 106     mr    11,12         /* r11 = source cursor for the doubleword copy.  */
 107
 108     bf    30,1f         /* Skip unless the count calls for a 2-doubleword pre-copy.  */
 109     ld    6,0(12)
 110     ld    7,8(12)
 111     addi  11,12,16
 112     mtctr 8             /* CTR = loop iteration count.  */
 113     std   6,0(3)
 114     std   7,8(3)
 115     addi  10,3,16       /* r10 = destination cursor.  */
 116     bf    31,4f         /* Even doubleword count: go to the main loop.  */
 117     ld    0,16(12)
 118     std   0,16(3)
 119     blt   cr1,3f        /* Fewer than 4 doublewords in total: skip the main loop.  */
 120     addi  11,12,24
 121     addi  10,3,24
 122     b     4f
 123     .align  4
 124 1:
 125     mr    10,3          /* r10 = destination cursor.  */
 126     mtctr 8             /* CTR = loop iteration count.  */
 127     bf    31,4f         /* Even doubleword count: go to the main loop.  */
 128     ld    6,0(12)
 129     addi  11,12,8
 130     std   6,0(3)
 131     addi  10,3,8
 132
 133     .align  4
 134 4:
 135     ld    6,0(11)
 136     ld    7,8(11)
 137     ld    8,16(11)
 138     ld    0,24(11)
 139     addi  11,11,32
 140 2:
 141     std   6,0(10)
 142     std   7,8(10)
 143     std   8,16(10)
 144     std   0,24(10)
 145     addi  10,10,32
 146     bdnz  4b            /* Decrement CTR; loop while it is non-zero.  */
 147 3:
 148
 149     rldicr 0,31,0,60    /* r0 = remaining count rounded down to a multiple of 8.  */
 150     mtcrf 0x01,31       /* CR field 7 = low 4 bits of the remaining count.  */
 151     beq   cr6,0f        /* No tail bytes: done.  */
 152 .L9:
 153     add   3,3,0         /* Advance dst past the doublewords already copied.  */
 154     add   12,12,0       /* Advance src by the same amount.  */
 155
 156 /*  At this point we have a tail of 0-7 bytes and we know that the
 157     destination is double word aligned.  */
 158 4:  bf    29,2f         /* Skip unless the tail includes a 4-byte move.  */
 159     lwz   6,0(12)
 160     addi  12,12,4
 161     stw   6,0(3)
 162     addi  3,3,4
 163 2:  bf    30,1f         /* Skip unless the tail includes a 2-byte move.  */
 164     lhz   6,0(12)
 165     addi  12,12,2
 166     sth   6,0(3)
 167     addi  3,3,2
 168 1:  bf    31,0f         /* Skip unless the tail includes a 1-byte move.  */
 169     lbz   6,0(12)
 170     stb   6,0(3)
 171 0:
 172   /* Return original dst pointer.  */
 173     ld 31,-8(1)         /* Restore the r31 value saved at entry.  */
 174     ld 3,-16(1)
 175     blr
 176
 177 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 178    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 179    tests.
 180
 181    In the short (0-8 byte) case no attempt is made to force alignment
 182    of either source or destination.  The hardware will handle the
 183    unaligned load/stores with small delays for crossing 32-, 64-, and
 184    4096-byte boundaries. Since these short moves are unlikely to be
 185    unaligned or cross these boundaries, the overhead to force
 186    alignment is not justified.
 187
 188    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 189    boundaries.  Since only loads are sensitive to the 32-/64-byte
 190    boundaries it is more important to align the source than the
 191    destination.  If the source is not already word aligned, we first
 192    move 1-3 bytes as needed.  Since we are only word aligned we don't
 193    use double word load/stores to ensure that all loads are aligned.
 194    While the destination and stores may still be unaligned, this
 195    is only an issue for page (4096 byte boundary) crossing, which
 196    should be rare for these short moves.  The hardware handles this
 197    case automatically with a small delay.  */
 198
 199     .align  4
 200 .L2:
 201     mtcrf 0x01,5        /* CR field 7 = low 4 bits of len, for the bf tests below.  */
 202     neg   8,4           /* r8 = -src.  */
 203     clrrdi      11,4,2      /* r11 = src rounded down to a word boundary.  */
 204     andi. 0,8,3         /* r0 = bytes needed to word-align src; cr0 eq if none.  */
 205     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 206 /* At least 9 bytes left.  Get the source word aligned.  */
 207     cmpldi      cr1,5,16    /* cr1: len compared with 16.  */
 208     mr    10,5          /* r10 = bytes remaining.  */
 209     mr    12,4          /* r12 = running source pointer.  */
 210     cmpldi      cr6,0,2     /* cr6: alignment byte count compared with 2.  */
 211     beq   .L3   /* If the source is already word aligned skip this.  */
 212 /* Copy 1-3 bytes to get source address word aligned.  */
 213     lwz   6,0(11)       /* Load the aligned word that contains the leading bytes.  */
 214     subf  10,0,5        /* r10 = len minus the alignment bytes.  */
 215     add   12,4,0        /* r12 = src advanced to the word boundary.  */
 216     blt   cr6,5f        /* One leading byte.  */
 217     srdi  7,6,16        /* r7 = upper halfword of the loaded word.  */
 218     bgt   cr6,3f        /* Three leading bytes; fall through for two.  */
 219 #ifdef __LITTLE_ENDIAN__
 220     sth   7,0(3)
 221 #else
 222     sth   6,0(3)
 223 #endif
 224     b     7f
 225     .align  4
 226 3:
 227 #ifdef __LITTLE_ENDIAN__
 228     rotlwi 6,6,24
 229     stb   6,0(3)
 230     sth   7,1(3)
 231 #else
 232     stb   7,0(3)
 233     sth   6,1(3)
 234 #endif
 235     b     7f
 236     .align  4
 237 5:
 238 #ifdef __LITTLE_ENDIAN__
 239     rotlwi 6,6,8
 240 #endif
 241     stb   6,0(3)
 242 7:
 243     cmpldi      cr1,10,16   /* cr1: remaining bytes compared with 16.  */
 244     add   3,3,0         /* Advance dst past the alignment bytes just stored.  */
 245     mtcrf 0x01,10       /* CR field 7 = low 4 bits of the remaining count.  */
 246     .align  4
 247 .L3:
 248 /* At least 6 bytes left and the source is word aligned.  */
 249     blt   cr1,8f        /* Fewer than 16 bytes left: skip the 16-byte move.  */
 250 16: /* Move 16 bytes.  */
 251     lwz   6,0(12)
 252     lwz   7,4(12)
 253     stw   6,0(3)
 254     lwz   6,8(12)
 255     stw   7,4(3)
 256     lwz   7,12(12)
 257     addi  12,12,16
 258     stw   6,8(3)
 259     stw   7,12(3)
 260     addi  3,3,16
 261 8:  /* Move 8 bytes.  */
 262     bf    28,4f
 263     lwz   6,0(12)
 264     lwz   7,4(12)
 265     addi  12,12,8
 266     stw   6,0(3)
 267     stw   7,4(3)
 268     addi  3,3,8
 269 4:  /* Move 4 bytes.  */
 270     bf    29,2f
 271     lwz   6,0(12)
 272     addi  12,12,4
 273     stw   6,0(3)
 274     addi  3,3,4
 275 2:  /* Move 2-3 bytes.  */
 276     bf    30,1f
 277     lhz   6,0(12)
 278     sth   6,0(3)
 279     bf    31,0f         /* No odd trailing byte: return.  */
 280     lbz   7,2(12)
 281     stb   7,2(3)
 282     ld 3,-16(1)         /* Return original dst pointer.  */
 283     blr
 284 1:  /* Move 1 byte.  */
 285     bf    31,0f
 286     lbz   6,0(12)
 287     stb   6,0(3)
 288 0:
 289   /* Return original dst pointer.  */
 290     ld    3,-16(1)
 291     blr
 292
 293 /* Special case to copy 0-8 bytes.  */
 294     .align  4
 295 .LE8:
 296     mr    12,4          /* r12 = src; the code reached through 2b below reads via r12.  */
 297     bne   cr6,4f        /* cr6 still holds len compared with 8: not exactly 8 bytes.  */
 298 /* Would have liked to use ld/std here but the 630 processors are
 299    slow for load/store doubles that are not at least word aligned.
 300    Unaligned Load/Store word execute with only a 1 cycle penalty.  */
 301     lwz   6,0(4)
 302     lwz   7,4(4)
 303     stw   6,0(3)
 304     stw   7,4(3)
 305   /* Return original dst pointer.  */
 306     ld    3,-16(1)
 307     blr
 308     .align  4
 309 4:  bf    29,2b         /* Under 4 bytes: use the 0-3 byte code at the preceding label 2.  */
 310     lwz   6,0(4)
 311     stw   6,0(3)
 312 6:
 313     bf    30,5f         /* No 2-byte piece after the word.  */
 314     lhz   7,4(4)
 315     sth   7,4(3)
 316     bf    31,0f         /* No odd trailing byte: return.  */
 317     lbz   8,6(4)
 318     stb   8,6(3)
 319     ld 3,-16(1)         /* Return original dst pointer.  */
 320     blr
 321     .align  4
 322 5:
 323     bf    31,0f         /* No odd trailing byte: return.  */
 324     lbz   6,4(4)
 325     stb   6,4(3)
 326     .align  4
 327 0:
 328   /* Return original dst pointer.  */
 329     ld    3,-16(1)
 330     blr
 331
 332     .align  4
 333 .L6:
 334
 335   /* Copy doublewords where the destination is aligned but the source is
 336      not.  Use aligned doubleword loads from the source, shifted to realign
 337      the data, to allow aligned destination stores.  */
 338     subf  5,10,12       /* r5 = source pointer rounded down to a doubleword.  */
 339     andi. 0,9,1         /* cr0: eq when the doubleword count is even.  */
 340     cmpldi cr6,11,0     /* cr6: tail byte count compared with 0.  */
 341     sldi  10,10,3       /* r10 = source misalignment in bits.  */
 342     mr    11,9          /* r11 = doublewords left to store.  */
 343     mr    4,3           /* r4 = destination cursor.  */
 344     ld    6,0(5)
 345     ld    7,8(5)
 346     subfic  9,10,64     /* r9 = 64 - r10, the complementary shift count.  */
 347     beq   2f            /* Even count: enter the loop at its top.  */
 348 #ifdef __LITTLE_ENDIAN__
 349     srd   0,6,10
 350 #else
 351     sld   0,6,10
 352 #endif
 353     cmpldi  11,1        /* cr0: eq when only one doubleword is left.  */
 354     mr    6,7
 355     addi  4,4,-8        /* Bias r4 so the store at 8(4) hits the first destination doubleword.  */
 356     addi  11,11,-1
 357     b     1f            /* Odd count: join the loop at its second half.  */
 358 2:  addi  5,5,8
 359     .align  4
 360 #ifdef __LITTLE_ENDIAN__
 361 0:  srd   0,6,10
 362     sld   8,7,9
 363 #else
 364 0:  sld   0,6,10
 365     srd   8,7,9
 366 #endif
 367     cmpldi  11,2        /* cr0: eq when these are the last two doublewords.  */
 368     ld    6,8(5)
 369     or    0,0,8         /* Merge the two shifted pieces into one doubleword.  */
 370     addi  11,11,-2
 371     std   0,0(4)
 372 #ifdef __LITTLE_ENDIAN__
 373     srd   0,7,10
 374 1:  sld   8,6,9
 375 #else
 376     sld   0,7,10
 377 1:  srd   8,6,9
 378 #endif
 379     or    0,0,8         /* Merge the two shifted pieces into one doubleword.  */
 380     beq   8f            /* Last doubleword: store it and leave the loop.  */
 381     ld    7,16(5)
 382     std   0,8(4)
 383     addi  5,5,16
 384     addi  4,4,16
 385     b     0b
 386     .align 4
 387 8:
 388     std   0,8(4)
 389     rldicr 0,31,0,60    /* r0 = remaining count rounded down to a multiple of 8.  */
 390     mtcrf 0x01,31       /* CR field 7 = low 4 bits of the remaining count.  */
 391     bne   cr6,.L9       /* Non-zero tail: copy it at .L9.  If the tail is 0 bytes we are done!  */
 392   /* Return original dst pointer.  */
 393     ld 31,-8(1)         /* Restore the r31 value saved at entry.  */
 394     ld 3,-16(1)
 395     blr
 396 END_GEN_TB (MEMCPY,TB_TOCLESS)
 397 libc_hidden_builtin_def (memcpy)