sysdeps/powerpc/powerpc64/power4/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.   */
  35
  36         .machine power4
  37 EALIGN (memcpy, 5, 0)
  38         CALL_MCOUNT 3
  39
         /* Register roles used below:
              r3  = dst cursor.  The original dst is saved at -16(r1) and is
                    reloaded into r3 before every blr.
              r4  = src as passed in; r12 = working src cursor.
              r5  = len as passed in; r31 = bytes still to copy on the long
                    (>= 32 byte) path.  The incoming r31 is saved at -8(r1)
                    and reloaded on the long-path exits.
              cr7 = low 4 bits of a count, loaded with mtcrf 0x01 so that
                    bf 28/29/30/31 test its 8/4/2/1 bits.
            The unaligned-source path at .L6 reuses r4 and r5 as its own
            dst and src cursors.  */
  40     cmpldi cr1,5,31     /* cr1: len vs 31 (short move threshold).  */
  41     neg   0,3
  42     std   3,-16(1)      /* Save the original dst; it is the return value.  */
  43     std   31,-8(1)      /* Save r31; it is used as a length counter below.  */
  44     cfi_offset(31,-8)
  45     andi. 11,3,7        /* check alignment of dst.  */
  46     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  47     clrldi 10,4,61      /* check alignment of src.  */
  48     cmpldi cr6,5,8      /* cr6: len vs 8; consumed at .L2 to select .LE8.  */
  49     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
         /* cr6 eq below means src and dst have the same offset within a
            doubleword, so aligning dst also aligns src.  It is tested
            at .L0 and is not recomputed after the alignment moves.  */
  50     cmpld cr6,10,11
  51     mr    12,4          /* r12 = working src cursor.  */
  52     srdi  9,5,3         /* Number of full double words remaining.  */
  53     mtcrf 0x01,0        /* cr7 = low bits of the dst alignment byte count.  */
  54     mr    31,5          /* r31 = bytes remaining.  */
  55     beq   .L0           /* cr0 from andi.: dst is already DW aligned.  */
  56
  57     subf  31,0,5        /* r31 = len minus the bytes used to align dst.  */
  58   /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
  59 1:  bf    31,2f         /* 1-bit of r0 clear: no byte move.  */
  60     lbz   6,0(12)
  61     addi  12,12,1
  62     stb   6,0(3)
  63     addi  3,3,1
  64 2:  bf    30,4f         /* 2-bit of r0 clear: no halfword move.  */
  65     lhz   6,0(12)
  66     addi  12,12,2
  67     sth   6,0(3)
  68     addi  3,3,2
  69 4:  bf    29,0f         /* 4-bit of r0 clear: no word move.  */
  70     lwz   6,0(12)
  71     addi  12,12,4
  72     stw   6,0(3)
  73     addi  3,3,4
  74 0:
  75     clrldi 10,12,61     /* check alignment of src again.  */
  76     srdi  9,31,3        /* Number of full double words remaining.  */
  77
  78   /* Copy doublewords from source to destination, assuming the
  79      destination is aligned on a doubleword boundary.
  80
  81      At this point we know there are at least 25 bytes left (32-7) to copy.
  82      The next step is to determine if the source is also doubleword aligned.
  83      If not branch to the unaligned move code at .L6, which uses
  84      a load, shift, store strategy.
  85
  86      Otherwise source and destination are doubleword aligned, and we can
  87      use the optimized doubleword copy loop.  */
  88 .L0:
  89     clrldi  11,31,61    /* r11 = r31 & 7: tail bytes after the doublewords.  */
  90     mtcrf   0x01,9      /* cr7 = low 4 bits of the doubleword count.  */
  91     cmpldi  cr1,11,0    /* cr1: is the tail empty?  Tested at the end of .L6.  */
  92     bne-    cr6,.L6   /* If source is not DW aligned.  */
  93
  94   /* Move doublewords where destination and source are DW aligned.
  95      Use an unrolled loop to copy 4 doubleword (32-bytes) per iteration.
  96      If the copy is not an exact multiple of 32 bytes, 1-3
  97      doublewords are copied as needed to set up the main loop.  After
  98      the main loop exits there may be a tail of 1-7 bytes. These bytes are
  99      copied a word/halfword/byte at a time as needed to preserve alignment.  */
 100
 101     srdi  8,31,5          /* r8 = number of 32-byte loop iterations.  */
 102     cmpldi      cr1,9,4   /* cr1: doubleword count vs 4.  */
 103     cmpldi      cr6,11,0  /* cr6: is the tail empty?  Tested after the loop.  */
 104     mr    11,12           /* r11 = src cursor for the loop.  */
 105
 106     bf    30,1f           /* 2-bit of the DW count clear: no 2-DW pre-copy.  */
 107     ld    6,0(12)
 108     ld    7,8(12)
 109     addi  11,12,16
 110     mtctr 8
 111     std   6,0(3)
 112     std   7,8(3)
 113     addi  10,3,16         /* r10 = dst cursor for the loop.  */
 114     bf    31,4f           /* 1-bit of the DW count clear: enter the loop.  */
 115     ld    0,16(12)
 116     std   0,16(3)
 117     blt   cr1,3f          /* Exactly 3 doublewords in total: skip the loop.  */
 118     addi  11,12,24
 119     addi  10,3,24
 120     b     4f
 121     .align  4
 122 1:
 123     mr    10,3            /* r10 = dst cursor for the loop.  */
 124     mtctr 8
 125     bf    31,4f           /* 1-bit of the DW count clear: enter the loop.  */
 126     ld    6,0(12)
 127     addi  11,12,8
 128     std   6,0(3)
 129     addi  10,3,8
 130
 131     .align  4
         /* Main loop: 4 doublewords per iteration, r11 = src, r10 = dst,
            CTR = iteration count.  r3 and r12 are not advanced here;
            they are adjusted at .L9 if a tail remains.  */
 132 4:
 133     ld    6,0(11)
 134     ld    7,8(11)
 135     ld    8,16(11)
 136     ld    0,24(11)
 137     addi  11,11,32
 138 2:
 139     std   6,0(10)
 140     std   7,8(10)
 141     std   8,16(10)
 142     std   0,24(10)
 143     addi  10,10,32
 144     bdnz  4b
 145 3:
 146
 147     rldicr 0,31,0,60      /* r0 = r31 with the low 3 bits cleared.  */
 148     mtcrf 0x01,31         /* cr7 = low bits of r31 for the tail tests.  */
 149     beq   cr6,0f          /* No tail bytes: return.  */
 150 .L9:
 151     add   3,3,0           /* Advance dst past the doublewords copied.  */
 152     add   12,12,0         /* Advance src past the doublewords copied.  */
 153
 154 /*  At this point we have a tail of 0-7 bytes and we know that the
 155     destination is double word aligned.  */
 156 4:  bf    29,2f
 157     lwz   6,0(12)
 158     addi  12,12,4
 159     stw   6,0(3)
 160     addi  3,3,4
 161 2:  bf    30,1f
 162     lhz   6,0(12)
 163     addi  12,12,2
 164     sth   6,0(3)
 165     addi  3,3,2
 166 1:  bf    31,0f
 167     lbz   6,0(12)
 168     stb   6,0(3)
 169 0:
 170   /* Return original dst pointer.  */
 171     ld 31,-8(1)
 172     ld 3,-16(1)
 173     blr
 174
 175 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 176    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 177    tests.
 178
 179    In the short (0-8 byte) case no attempt is made to force alignment
 180    of either source or destination.  The hardware will handle the
 181    unaligned load/stores with small delays for crossing 32- 64-byte, and
 182    4096-byte boundaries. Since these short moves are unlikely to be
 183    unaligned or cross these boundaries, the overhead to force
 184    alignment is not justified.
 185
 186    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 187    boundaries.  Since only loads are sensitive to the 32-/64-byte
 188    boundaries it is more important to align the source than the
 189    destination.  If the source is not already word aligned, we first
 190    move 1-3 bytes as needed.  Since we are only word aligned we don't
 191    use double word load/stores to ensure that all loads are aligned.
 192    While the destination and stores may still be unaligned, this
 193    is only an issue for page (4096 byte boundary) crossing, which
 194    should be rare for these short moves.  The hardware handles this
 195    case automatically with a small delay.  */
 196
 197     .align  4
 198 .L2:
 199     mtcrf 0x01,5          /* cr7 = low 4 bits of len.  */
 200     neg   8,4
 201     clrrdi      11,4,2    /* r11 = src rounded down to a word boundary.  */
 202     andi. 0,8,3           /* r0 = bytes (0-3) up to the next word boundary of src.  */
 203     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 204 /* At least 9 bytes left.  Get the source word aligned.  */
 205     cmpldi      cr1,5,16  /* cr1: len vs 16; tested at .L3.  */
 206     mr    10,5            /* r10 = bytes remaining.  */
 207     mr    12,4            /* r12 = working src cursor.  */
 208     cmpldi      cr6,0,2   /* cr6: lt = 1 byte, eq = 2 bytes, gt = 3 bytes.  */
 209     beq   .L3   /* If the source is already word aligned skip this.  */
 210 /* Copy 1-3 bytes to get source address word aligned.  */
 211     lwz   6,0(11)         /* Load the aligned word that holds those bytes.  */
 212     subf  10,0,5          /* r10 = len minus the alignment bytes.  */
 213     add   12,4,0          /* r12 = word-aligned src.  */
 214     blt   cr6,5f          /* 1 byte to move.  */
 215     srdi  7,6,16          /* r7 = upper halfword of the loaded word.  */
 216     bgt   cr6,3f          /* 3 bytes to move.  */
         /* 2 bytes to move: they are the last two bytes in memory of the
            loaded word, i.e. its upper halfword on little endian and its
            lower halfword on big endian.  */
 217 #ifdef __LITTLE_ENDIAN__
 218     sth   7,0(3)
 219 #else
 220     sth   6,0(3)
 221 #endif
 222     b     7f
 223     .align  4
 224 3:
         /* 3 bytes to move: one byte then one halfword, taken from the
            last three bytes in memory of the loaded word.  */
 225 #ifdef __LITTLE_ENDIAN__
 226     rotlwi 6,6,24
 227     stb   6,0(3)
 228     sth   7,1(3)
 229 #else
 230     stb   7,0(3)
 231     sth   6,1(3)
 232 #endif
 233     b     7f
 234     .align  4
 235 5:
         /* 1 byte to move: the last byte in memory of the loaded word.  On
            little endian it is rotated down into the low byte first.  */
 236 #ifdef __LITTLE_ENDIAN__
 237     rotlwi 6,6,8
 238 #endif
 239     stb   6,0(3)
 240 7:
 241     cmpldi      cr1,10,16 /* Redo the 16-byte test with the new length.  */
 242     add   3,3,0           /* Advance dst by the bytes just stored.  */
 243     mtcrf 0x01,10         /* cr7 = low 4 bits of the new length.  */
 244     .align  4
 245 .L3:
 246 /* At least 6 bytes left and the source is word aligned.  */
 247     blt   cr1,8f          /* Fewer than 16 bytes: skip the 16-byte move.  */
 248 16: /* Move 16 bytes.  */
 249     lwz   6,0(12)
 250     lwz   7,4(12)
 251     stw   6,0(3)
 252     lwz   6,8(12)
 253     stw   7,4(3)
 254     lwz   7,12(12)
 255     addi  12,12,16
 256     stw   6,8(3)
 257     stw   7,12(3)
 258     addi  3,3,16
 259 8:  /* Move 8 bytes.  */
 260     bf    28,4f
 261     lwz   6,0(12)
 262     lwz   7,4(12)
 263     addi  12,12,8
 264     stw   6,0(3)
 265     stw   7,4(3)
 266     addi  3,3,8
 267 4:  /* Move 4 bytes.  */
 268     bf    29,2f
 269     lwz   6,0(12)
 270     addi  12,12,4
 271     stw   6,0(3)
 272     addi  3,3,4
 273 2:  /* Move 2-3 bytes.  This label is also reached from .LE8 via 2b.  */
 274     bf    30,1f
 275     lhz   6,0(12)
 276     sth   6,0(3)
 277     bf    31,0f
 278     lbz   7,2(12)
 279     stb   7,2(3)
 280     ld 3,-16(1)           /* Return original dst pointer.  */
 281     blr
 282 1:  /* Move 1 byte.  */
 283     bf    31,0f
 284     lbz   6,0(12)
 285     stb   6,0(3)
 286 0:
 287   /* Return original dst pointer.  */
 288     ld    3,-16(1)
 289     blr
 290
 291 /* Special case to copy 0-8 bytes.  */
 292     .align  4
 293 .LE8:
 294     mr    12,4            /* The shared 0-3 byte code reached via 2b reads r12.  */
 295     bne   cr6,4f          /* cr6 is len vs 8; eq means len == 8.  */
 296 /* Would have liked to use ld/std here but the 630 processors are
 297    slow for load/store doubles that are not at least word aligned.
 298    Unaligned Load/Store word execute with only a 1 cycle penalty.  */
 299     lwz   6,0(4)
 300     lwz   7,4(4)
 301     stw   6,0(3)
 302     stw   7,4(3)
 303   /* Return original dst pointer.  */
 304     ld    3,-16(1)
 305     blr
 306     .align  4
 307 4:  bf    29,2b           /* 4-bit of len clear (len 0-3): use the 2-3/1 byte code above.  */
 308     lwz   6,0(4)
 309     stw   6,0(3)
 310 6:
 311     bf    30,5f           /* 2-bit of len clear: at most one more byte.  */
 312     lhz   7,4(4)
 313     sth   7,4(3)
 314     bf    31,0f
 315     lbz   8,6(4)
 316     stb   8,6(3)
 317     ld 3,-16(1)           /* Return original dst pointer.  */
 318     blr
 319     .align  4
 320 5:
 321     bf    31,0f
 322     lbz   6,4(4)
 323     stb   6,4(3)
 324     .align  4
 325 0:
 326   /* Return original dst pointer.  */
 327     ld    3,-16(1)
 328     blr
 329
 330     .align  4
 331 .L6:
 332
 333   /* Copy doublewords where the destination is aligned but the source is
 334      not.  Use aligned doubleword loads from the source, shifted to realign
 335      the data, to allow aligned destination stores.
         On entry r9 = doubleword count, r10 = src & 7, r12 = src and
         r3 = aligned dst.  Each output doubleword is built by shifting two
         adjacent source doublewords (held in r6 and r7) by r10 and r9 bits
         in opposite directions and ORing the results; the shift directions
         are swapped for little endian.  */
 336     addi    11,9,-1  /* loop DW count is one less than total */
 337     subf    5,10,12  /* r5 = src rounded down to a doubleword boundary.  */
 338     sldi    10,10,3  /* r10 = src misalignment in bits (first shift count).  */
 339     mr      4,3      /* r4 = dst cursor; r3 is left unchanged for .L9.  */
 340     srdi    8,11,2   /* calculate the 32 byte loop count */
 341     ld      6,0(5)
 342     mtcrf   0x01,11  /* cr7 = low 4 bits of (DW count - 1).  */
 343     cmpldi  cr6,9,4  /* cr6: total DW count vs 4.  */
 344     mtctr   8
 345     ld      7,8(5)
 346     subfic  9,10,64  /* r9 = 64 - r10: the complementary shift count.  */
 347     bf      30,1f
 348
 349     /* there are at least two DWs to copy */
 350 #ifdef __LITTLE_ENDIAN__
 351     srd     0,6,10
 352     sld     8,7,9
 353 #else
 354     sld     0,6,10
 355     srd     8,7,9
 356 #endif
 357     or      0,0,8
 358     ld      6,16(5)
 359     std     0,0(4)
 360 #ifdef __LITTLE_ENDIAN__
 361     srd     0,7,10
 362     sld     8,6,9
 363 #else
 364     sld     0,7,10
 365     srd     8,6,9
 366 #endif
 367     or      0,0,8
 368     ld      7,24(5)
 369     std     0,8(4)
 370     addi    4,4,16
 371     addi    5,5,32
 372     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
 373     bf      31,4f
 374     /* there is a third DW to copy */
 375 #ifdef __LITTLE_ENDIAN__
 376     srd     0,6,10
 377     sld     8,7,9
 378 #else
 379     sld     0,6,10
 380     srd     8,7,9
 381 #endif
 382     or      0,0,8
 383     std     0,0(4)
 384     mr      6,7
 385     ld      7,0(5)
 386     addi    5,5,8
 387     addi    4,4,8
 388     beq     cr6,8f  /* if total DWs = 4, then bypass loop */
 389     b       4f
 390     .align 4
 391 1:
 392 #ifdef __LITTLE_ENDIAN__
 393     srd     0,6,10
 394     sld     8,7,9
 395 #else
 396     sld     0,6,10
 397     srd     8,7,9
 398 #endif
 399     addi    5,5,16
 400     or      0,0,8
 401     bf      31,4f    /* Nothing stored yet; the loop rebuilds this DW from r6/r7.  */
 402     mr      6,7
 403     ld      7,0(5)
 404     addi    5,5,8
 405     std     0,0(4)
 406     addi    4,4,8
 407     .align 4
 408 /* copy 32 bytes at a time */
 409 4:
 410 #ifdef __LITTLE_ENDIAN__
 411     srd   0,6,10
 412     sld   8,7,9
 413 #else
 414     sld   0,6,10
 415     srd   8,7,9
 416 #endif
 417     or    0,0,8
 418     ld    6,0(5)
 419     std   0,0(4)
 420 #ifdef __LITTLE_ENDIAN__
 421     srd   0,7,10
 422     sld   8,6,9
 423 #else
 424     sld   0,7,10
 425     srd   8,6,9
 426 #endif
 427     or    0,0,8
 428     ld    7,8(5)
 429     std   0,8(4)
 430 #ifdef __LITTLE_ENDIAN__
 431     srd   0,6,10
 432     sld   8,7,9
 433 #else
 434     sld   0,6,10
 435     srd   8,7,9
 436 #endif
 437     or    0,0,8
 438     ld    6,16(5)
 439     std   0,16(4)
 440 #ifdef __LITTLE_ENDIAN__
 441     srd   0,7,10
 442     sld   8,6,9
 443 #else
 444     sld   0,7,10
 445     srd   8,6,9
 446 #endif
 447     or    0,0,8
 448     ld    7,24(5)
 449     std   0,24(4)
 450     addi  5,5,32
 451     addi  4,4,32
 452     bdnz+ 4b
 453     .align 4
 454 8:
 455     /* calculate and store the final DW */
 456 #ifdef __LITTLE_ENDIAN__
 457     srd   0,6,10
 458     sld   8,7,9
 459 #else
 460     sld   0,6,10
 461     srd   8,7,9
 462 #endif
 463     or    0,0,8
 464     std   0,0(4)
 465 3:
 466     rldicr 0,31,0,60      /* r0 = r31 with the low 3 bits cleared, used at .L9.  */
 467     mtcrf 0x01,31         /* cr7 = low bits of r31 for the tail tests at .L9.  */
 468     bne   cr1,.L9       /* If the tail is 0 bytes we are done!  */
 469   /* Return original dst pointer.  */
 470     ld 31,-8(1)
 471     ld 3,-16(1)
 472     blr
 473 END_GEN_TB (memcpy,TB_TOCLESS)
 474 libc_hidden_builtin_def (memcpy)