sysdeps/powerpc/powerpc64/power4/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  25    Returns 'dst'.
  26
  27    Memcpy handles short copies (< 32-bytes) using binary move blocks
  28    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  29    with the appropriate combination of byte and halfword load/stores.
  30    There is minimal effort to optimize the alignment of short moves.
  31    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  32    of handling unaligned load/stores that do not cross 32-byte boundaries.
  33
  34    Longer moves (>= 32-bytes) justify the effort to get at least the
  35    destination doubleword (8-byte) aligned.  Further optimization is
  36    possible when both source and destination are doubleword aligned.
  37    Each case has an optimized unrolled loop.   */
  38
  39         .machine power4
  40 EALIGN (BP_SYM (memcpy), 5, 0)
  41         CALL_MCOUNT 3       /* Macro defined outside this file (presumably the profiling hook).  */
  42 /* Entry.  Per the prototype comment above: r3 = dst, r4 = src, r5 = len.
        Lengths of 31 or less branch to .L2.  Otherwise up to 7 head bytes
        are copied so that dst is doubleword aligned, and control reaches
        .L0 with r3 = dst cursor, r12 = src cursor, r31 = bytes left,
        r9 = bytes left / 8, r10 = src cursor & 7, and cr6.eq set when the
        src cursor is doubleword aligned as well.  */
  43     cmpldi cr1,5,31     /* cr1 = len compared (unsigned) with 31.  */
  44     neg   0,3           /* r0 = -dst, masked to its low 3 bits below.  */
  45     std   3,-16(1)      /* Save the original dst at -16(r1), it is reloaded before every blr.  */
  46     std   31,-8(1)      /* Save r31 at -8(r1), the long path uses r31 as the byte counter.  */
  47     cfi_offset(31,-8)   /* Unwind annotation matching the r31 save above.  */
  48     andi. 11,3,7        /* check alignment of dst: r11 = dst & 7, result recorded in cr0.  */
  49     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  50     clrldi 10,4,61      /* check alignment of src: r10 = src & 7.  */
  51     cmpldi cr6,5,8      /* cr6 = len compared with 8, consumed by .L2 and .LE8.  */
  52     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  53     cmpld cr6,10,11     /* cr6.eq when src and dst have the same offset within a doubleword.  */
  54     mr    12,4          /* r12 = src cursor.  */
  55     srdi  9,5,3         /* Number of full double words remaining.  */
  56     mtcrf 0x01,0        /* cr7 = low 4 bits of the head byte count, tested by bf 29/30/31 below.  */
  57     mr    31,5          /* r31 = len.  */
  58     beq   .L0           /* cr0.eq from the andi. above: dst is already doubleword aligned.  */
  59
  60     subf  31,0,5        /* r31 = len minus the head bytes about to be copied.  */
  61   /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
  62 1:  bf    31,2f         /* Skip the byte move unless the head count has its 1s bit set.  */
  63     lbz   6,0(12)
  64     addi  12,12,1
  65     stb   6,0(3)
  66     addi  3,3,1
  67 2:  bf    30,4f         /* Skip the halfword move unless the head count has its 2s bit set.  */
  68     lhz   6,0(12)
  69     addi  12,12,2
  70     sth   6,0(3)
  71     addi  3,3,2
  72 4:  bf    29,0f         /* Skip the word move unless the head count has its 4s bit set.  */
  73     lwz   6,0(12)
  74     addi  12,12,4
  75     stw   6,0(3)
  76     addi  3,3,4
  77 0:                      /* cr6 from the cmpld above is left untouched and is tested at .L0.  */
  78     clrldi 10,12,61     /* check alignment of src again: r10 = src cursor & 7.  */
  79     srdi  9,31,3        /* Number of full double words remaining.  */
  80
  81   /* Copy doublewords from source to destination, assuming the
  82      destination is aligned on a doubleword boundary.
  83
  84      At this point we know there are at least 25 bytes left (32-7) to copy.
  85      The next step is to determine if the source is also doubleword aligned.
  86      If not branch to the unaligned move code at .L6, which uses
  87      a load, shift, store strategy.
  88
  89      Otherwise source and destination are doubleword aligned, and we can
  90      use the optimized doubleword copy loop.  */
  91 .L0:
  92     clrldi  11,31,61    /* r11 = bytes left & 7, the tail byte count.  */
  93     mtcrf   0x01,9      /* cr7 = low 4 bits of the doubleword count in r9.  */
  94     cmpldi  cr1,11,0    /* cr1.eq when there is no tail, tested at the end of the .L6 path.  */
  95     bne-    cr6,.L6   /* If source is not DW aligned.  */
  96
  97   /* Move doublewords where destination and source are DW aligned.
  98      Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
  99      If the copy is not an exact multiple of 32 bytes, 1-3
 100      doublewords are copied as needed to set up the main loop.  After
 101      the main loop exits there may be a tail of 1-7 bytes. These bytes are
 102      copied a word/halfword/byte at a time as needed to preserve alignment.  */
 103
 104     srdi  8,31,5        /* r8 = bytes left / 32, the main loop iteration count.  */
 105     cmpldi      cr1,9,4 /* cr1 = doubleword count compared with 4.  */
 106     cmpldi      cr6,11,0 /* cr6.eq when the tail byte count is zero.  */
 107     mr    11,12         /* r11 = src read cursor, r12 itself stays at the start.  */
 108
 109     bf    30,1f         /* Skip unless the doubleword count has its 2s bit set.  */
 110     ld    6,0(12)       /* Copy two leading doublewords.  */
 111     ld    7,8(12)
 112     addi  11,12,16
 113     mtctr 8             /* ctr = main loop iteration count.  */
 114     std   6,0(3)
 115     std   7,8(3)
 116     addi  10,3,16       /* r10 = dst write cursor, r3 itself stays at the start.  */
 117     bf    31,4f         /* 1s bit clear: no third leading doubleword, enter the loop.  */
 118     ld    0,16(12)      /* Copy the third leading doubleword.  */
 119     std   0,16(3)
 120     blt   cr1,3f        /* Count mod 4 is 3 and count < 4, so all 3 are copied: skip the loop.  */
 121     addi  11,12,24
 122     addi  10,3,24
 123     b     4f
 124     .align  4
 125 1:
 126     mr    10,3          /* r10 = dst write cursor.  */
 127     mtctr 8             /* ctr = main loop iteration count.  */
 128     bf    31,4f         /* 1s bit clear: no leading doubleword, enter the loop.  */
 129     ld    6,0(12)       /* Copy one leading doubleword.  */
 130     addi  11,12,8
 131     std   6,0(3)
 132     addi  10,3,8
 133
 134     .align  4
 135 4:                      /* Main loop: 32 bytes per iteration from r11 to r10.  */
 136     ld    6,0(11)
 137     ld    7,8(11)
 138     ld    8,16(11)
 139     ld    0,24(11)
 140     addi  11,11,32
 141 2:                      /* No branch in this file targets this label.  */
 142     std   6,0(10)
 143     std   7,8(10)
 144     std   8,16(10)
 145     std   0,24(10)
 146     addi  10,10,32
 147     bdnz  4b            /* Decrement ctr and repeat while it is nonzero.  */
 148 3:
 149
 150     rldicr 0,31,0,60    /* r0 = r31 with its low 3 bits cleared: bytes moved as doublewords.  */
 151     mtcrf 0x01,31       /* cr7 = low 4 bits of r31, its low 3 bits are the tail byte count.  */
 152     beq   cr6,0f        /* No tail bytes: return.  */
 153 .L9:                    /* Also entered from the end of the .L6 path.  */
 154     add   3,3,0         /* r3 and r12 were not advanced by the doubleword copy,  */
 155     add   12,12,0       /* so step both past the bytes already moved.  */
 156
 157 /*  At this point we have a tail of 0-7 bytes and we know that the
 158     destination is double word aligned.  */
 159 4:  bf    29,2f         /* Skip the word move unless the tail count has its 4s bit set.  */
 160     lwz   6,0(12)
 161     addi  12,12,4
 162     stw   6,0(3)
 163     addi  3,3,4
 164 2:  bf    30,1f         /* Skip the halfword move unless the tail count has its 2s bit set.  */
 165     lhz   6,0(12)
 166     addi  12,12,2
 167     sth   6,0(3)
 168     addi  3,3,2
 169 1:  bf    31,0f         /* Skip the byte move unless the tail count has its 1s bit set.  */
 170     lbz   6,0(12)
 171     stb   6,0(3)
 172 0:
 173   /* Return original dst pointer.  */
 174     ld 31,-8(1)         /* Restore the r31 value saved at entry.  */
 175     ld 3,-16(1)         /* Return value: the dst pointer saved at entry.  */
 176     blr
 177
 178 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 179    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 180    tests.
 181
 182    In the short (0-8 byte) case no attempt is made to force alignment
 183    of either source or destination.  The hardware will handle the
 184    unaligned load/stores with small delays for crossing 32- 64-byte, and
 185    4096-byte boundaries. Since these short moves are unlikely to be
 186    unaligned or cross these boundaries, the overhead to force
 187    alignment is not justified.
 188
 189    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 190    boundaries.  Since only loads are sensitive to the 32-/64-byte
 191    boundaries it is more important to align the source than the
 192    destination.  If the source is not already word aligned, we first
 193    move 1-3 bytes as needed.  Since we are only word aligned we don't
 194    use double word load/stores to ensure that all loads are aligned.
 195    While the destination and stores may still be unaligned, this
 196    is only an issue for page (4096 byte boundary) crossing, which
 197    should be rare for these short moves.  The hardware handles this
 198    case automatically with a small delay.  */
 199
 200     .align  4
 201 .L2:                    /* Short copy, reached from the entry code when len is 31 or less.  */
 202     mtcrf 0x01,5        /* cr7 = low 4 bits of len, tested by the bf 28/29/30/31 below.  */
 203     neg   8,4           /* r8 = -src.  */
 204     clrrdi      11,4,2  /* r11 = src rounded down to a word boundary.  */
 205     andi. 0,8,3         /* r0 = bytes needed to word align src, result recorded in cr0.  */
 206     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 207 /* At least 9 bytes left.  Get the source word aligned.  */
 208     cmpldi      cr1,5,16 /* cr1 = len compared with 16.  */
 209     mr    10,5          /* r10 = bytes left.  */
 210     mr    12,4          /* r12 = src cursor.  */
 211     cmpldi      cr6,0,2 /* cr6 = head byte count compared with 2.  */
 212     beq   .L3   /* If the source is already word aligned skip this.  */
 213 /* Copy 1-3 bytes to get source address word aligned.  */
 214     lwz   6,0(11)       /* Load the aligned word holding the first src bytes.  NOTE(review): the
                                byte picking below treats the low-order bytes of r6 as the ones at
                                the highest addresses, i.e. it assumes big-endian loads - confirm.  */
 215     subf  10,0,5        /* r10 = len minus the head bytes.  */
 216     add   12,4,0        /* r12 = src advanced to the word boundary.  */
 217     blt   cr6,5f        /* Head count below 2, i.e. one byte.  */
 218     srdi  7,6,16        /* r7 = r6 shifted right by 16, used by the three byte case.  */
 219     bgt   cr6,3f        /* Head count above 2, i.e. three bytes.  */
 220     sth   6,0(3)        /* Two bytes: store the low halfword of r6.  */
 221     b     7f
 222     .align  4
 223 3:
 224     stb   7,0(3)        /* Three bytes: one byte from r7, then the low halfword of r6.  */
 225     sth   6,1(3)
 226     b     7f
 227     .align  4
 228 5:
 229     stb   6,0(3)        /* One byte: store the low byte of r6.  */
 230 7:
 231     cmpldi      cr1,10,16 /* Redo the compare with 16 for the reduced count.  */
 232     add   3,3,0         /* Advance dst past the head bytes.  */
 233     mtcrf 0x01,10       /* cr7 = low 4 bits of the reduced count.  */
 234     .align  4
 235 .L3:
 236 /* At least 6 bytes left and the source is word aligned.  */
 237     blt   cr1,8f        /* Fewer than 16 bytes left: skip the 16 byte move.  */
 238 16: /* Move 16 bytes.  */
 239     lwz   6,0(12)
 240     lwz   7,4(12)
 241     stw   6,0(3)
 242     lwz   6,8(12)
 243     stw   7,4(3)
 244     lwz   7,12(12)
 245     addi  12,12,16
 246     stw   6,8(3)
 247     stw   7,12(3)
 248     addi  3,3,16
 249 8:  /* Move 8 bytes.  */
 250     bf    28,4f         /* Skip unless the count has its 8s bit set.  */
 251     lwz   6,0(12)
 252     lwz   7,4(12)
 253     addi  12,12,8
 254     stw   6,0(3)
 255     stw   7,4(3)
 256     addi  3,3,8
 257 4:  /* Move 4 bytes.  */
 258     bf    29,2f         /* Skip unless the count has its 4s bit set.  */
 259     lwz   6,0(12)
 260     addi  12,12,4
 261     stw   6,0(3)
 262     addi  3,3,4
 263 2:  /* Move 2-3 bytes.  Also the target of the bf 29,2b in the .LE8 code.  */
 264     bf    30,1f         /* Skip unless the count has its 2s bit set.  */
 265     lhz   6,0(12)
 266     sth   6,0(3)
 267     bf    31,0f         /* 1s bit clear: no final byte.  */
 268     lbz   7,2(12)
 269     stb   7,2(3)
 270     ld 3,-16(1)         /* Return value: the dst pointer saved at entry.  */
 271     blr
 272 1:  /* Move 1 byte.  */
 273     bf    31,0f         /* Skip unless the count has its 1s bit set.  */
 274     lbz   6,0(12)
 275     stb   6,0(3)
 276 0:
 277   /* Return original dst pointer.  */
 278     ld    3,-16(1)
 279     blr
 280
 281 /* Special case to copy 0-8 bytes.  Reached from .L2 with cr6 still holding
        the compare of len with 8 and cr7 holding the low 4 bits of len.  */
 282     .align  4
 283 .LE8:
 284     mr    12,4          /* r12 = src, needed by the shared code reached through 2b below.  */
 285     bne   cr6,4f        /* len is not 8, so it is 0-7.  */
 286 /* Would have liked to use ld/std here but the 630 processors are
 287    slow for load/store doubles that are not at least word aligned.
 288    Unaligned Load/Store word execute with only a 1 cycle penalty.  */
 289     lwz   6,0(4)        /* Exactly 8 bytes: two word moves.  */
 290     lwz   7,4(4)
 291     stw   6,0(3)
 292     stw   7,4(3)
 293   /* Return original dst pointer.  */
 294     ld    3,-16(1)
 295     blr
 296     .align  4
 297 4:  bf    29,2b         /* 4s bit clear (0-3 bytes): reuse the 2-3 byte code at the preceding 2 label.  */
 298     lwz   6,0(4)        /* 4-7 bytes: move the first word.  */
 299     stw   6,0(3)
 300 6:
 301     bf    30,5f         /* 2s bit clear: no halfword at offset 4.  */
 302     lhz   7,4(4)
 303     sth   7,4(3)
 304     bf    31,0f         /* 1s bit clear: no final byte.  */
 305     lbz   8,6(4)        /* Seventh byte, at offset 6.  */
 306     stb   8,6(3)
 307     ld 3,-16(1)         /* Return value: the dst pointer saved at entry.  */
 308     blr
 309     .align  4
 310 5:
 311     bf    31,0f         /* 1s bit clear: nothing more to move.  */
 312     lbz   6,4(4)        /* Fifth byte, at offset 4.  */
 313     stb   6,4(3)
 314     .align  4
 315 0:
 316   /* Return original dst pointer.  */
 317     ld    3,-16(1)
 318     blr
 319
 320     .align  4
 321 .L6:
 322
 323   /* Copy doublewords where the destination is aligned but the source is
 324      not.  Use aligned doubleword loads from the source, shifted to realign
 325      the data, to allow aligned destination stores.
         On entry r3 = dst cursor, r12 = src cursor, r10 = src cursor & 7,
         r9 = doubleword count, r31 = bytes left, and cr1.eq is set when
         there is no tail.  Each stored doubleword is built from two
         consecutive aligned source doublewords held in r6 and r7, whose
         roles alternate: one is shifted left by r10 bits, the other right
         by r9 bits, and the two are ORed.  NOTE(review): which register is
         shifted which way matches big-endian byte order - confirm.  */
 326     addi    11,9,-1  /* loop DW count is one less than total */
 327     subf    5,10,12  /* r5 = src cursor rounded down to a doubleword boundary.  */
 328     sldi    10,10,3  /* r10 = src misalignment in bits, the left shift count.  */
 329     mr      4,3      /* r4 = dst write cursor, r3 itself stays at the start.  */
 330     srdi    8,11,2   /* calculate the 32 byte loop count */
 331     ld      6,0(5)   /* First aligned source doubleword.  */
 332     mtcrf   0x01,11  /* cr7 = low 4 bits of (DW count - 1).  */
 333     cmpldi  cr6,9,4  /* cr6 = total DW count compared with 4.  */
 334     mtctr   8        /* ctr = 32 byte loop count.  */
 335     ld      7,8(5)   /* Second aligned source doubleword.  */
 336     subfic  9,10,64  /* r9 = 64 - r10, the right shift count.  */
 337     bf      30,1f    /* Skip unless (DW count - 1) has its 2s bit set.  */
 338
 339     /* there are at least two DWs to copy */
 340     sld     0,6,10
 341     srd     8,7,9
 342     or      0,0,8
 343     ld      6,16(5)
 344     std     0,0(4)
 345     sld     0,7,10
 346     srd     8,6,9
 347     or      0,0,8
 348     ld      7,24(5)
 349     std     0,8(4)
 350     addi    4,4,16
 351     addi    5,5,32
 352     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
 353     bf      31,4f   /* 1s bit of (DW count - 1) clear: no third DW, enter the loop.  */
 354     /* there is a third DW to copy */
 355     sld     0,6,10
 356     srd     8,7,9
 357     or      0,0,8
 358     std     0,0(4)
 359     mr      6,7
 360     ld      7,0(5)
 361     addi    5,5,8
 362     addi    4,4,8
 363     beq     cr6,8f  /* if total DWs = 4, then bypass loop */
 364     b       4f
 365     .align 4
 366 1:
 367     sld     0,6,10  /* Build the first output DW in r0.  */
 368     srd     8,7,9
 369     addi    5,5,16  /* Step r5 past the two DWs already loaded.  */
 370     or      0,0,8
 371     bf      31,4f   /* 1s bit clear: enter the loop, which rebuilds r0 from the same r6 and r7.  */
 372     mr      6,7
 373     ld      7,0(5)
 374     addi    5,5,8
 375     std     0,0(4)  /* Store the DW built above.  */
 376     addi    4,4,8
 377     .align 4
 378 /* copy 32 bytes at a time */
 379 4:  sld   0,6,10
 380     srd   8,7,9
 381     or    0,0,8
 382     ld    6,0(5)
 383     std   0,0(4)
 384     sld   0,7,10
 385     srd   8,6,9
 386     or    0,0,8
 387     ld    7,8(5)
 388     std   0,8(4)
 389     sld   0,6,10
 390     srd   8,7,9
 391     or    0,0,8
 392     ld    6,16(5)
 393     std   0,16(4)
 394     sld   0,7,10
 395     srd   8,6,9
 396     or    0,0,8
 397     ld    7,24(5)
 398     std   0,24(4)
 399     addi  5,5,32
 400     addi  4,4,32
 401     bdnz+ 4b            /* Decrement ctr and repeat while it is nonzero.  */
 402     .align 4
 403 8:
 404     /* calculate and store the final DW */
 405     sld   0,6,10
 406     srd   8,7,9
 407     or    0,0,8
 408     std   0,0(4)
 409 3:
 410     rldicr 0,31,0,60    /* r0 = r31 with its low 3 bits cleared: bytes moved as doublewords.  */
 411     mtcrf 0x01,31       /* cr7 = low 4 bits of r31, its low 3 bits are the tail byte count.  */
 412     bne   cr1,.L9       /* Tail bytes remain: finish at .L9.  If the tail is 0 bytes we are done!  */
 413   /* Return original dst pointer.  */
 414     ld 31,-8(1)         /* Restore the r31 value saved at entry.  */
 415     ld 3,-16(1)         /* Return value: the dst pointer saved at entry.  */
 416     blr
 417 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
 418 libc_hidden_builtin_def (memcpy)