sysdeps/powerpc/powerpc32/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32/POWER7.
   2    Copyright (C) 2010 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  19    02110-1301 USA.  */
  20
  21 #include <sysdep.h>
  22 #include <bp-sym.h>
  23 #include <bp-asm.h>
  24
  25 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  26    Returns 'dst'.  The original DST is kept in r30 while r3 is advanced
  27    by the copy code, and is moved back into r3 before each return.  */
  28         .machine  power7
  29 EALIGN (BP_SYM (memcpy), 5, 0)
  30         CALL_MCOUNT
  31         /* Open a 32-byte stack frame and save r30/r31 in it.  */
  32         stwu    1,-32(1)
  33         cfi_adjust_cfa_offset(32)
  34         stw     30,20(1)
  35         cfi_offset(30,(20-32))
  36         stw     31,24(1)
  37         mr      30,3          /* r30 = original DST (the return value).  */
  38         cmplwi  cr1,5,31
  39         neg     0,3           /* r0 = -DST, used for the alignment counts.  */
  40         cfi_offset(31,-8)
  41         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  42                                     code.  */
  43
  44         andi.   11,3,7        /* Check alignment of DST.  */
  45         clrlwi  10,4,29       /* Check alignment of SRC.  */
  46         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  47         mr      12,4          /* r12 = running SRC pointer.  */
  48         mr      31,5          /* r31 = remaining length.  */
  49         bne     cr6,L(copy_GE_32_unaligned)
  50
  51         srwi    9,5,3         /* Number of full doublewords remaining.  */
  52         /* cr0 is still from the andi. above: skip if DST is 8-byte aligned.  */
  53         beq     L(copy_GE_32_aligned_cont)
  54
  55         clrlwi  0,0,29        /* r0 = bytes needed to align DST to 8.  */
  56         mtcrf   0x01,0        /* Put those bits in cr7 for the bf tests.  */
  57         subf    31,0,5        /* r31 = length left after alignment.  */
  58
  59         /* Get DST (and SRC, which has the same alignment) aligned to 8 bytes.  */
  60
  61 1:      bf      31,2f
  62         lbz     6,0(12)
  63         addi    12,12,1
  64         stb     6,0(3)
  65         addi    3,3,1
  66 2:      bf      30,4f
  67         lhz     6,0(12)
  68         addi    12,12,2
  69         sth     6,0(3)
  70         addi    3,3,2
  71 4:      bf      29,0f
  72         lwz     6,0(12)
  73         addi    12,12,4
  74         stw     6,0(3)
  75         addi    3,3,4
  76 0:
  77         clrlwi  10,12,29      /* Check alignment of SRC again.  */
  78         srwi    9,31,3        /* Number of full doublewords remaining.  */
  79
  80 L(copy_GE_32_aligned_cont):
  81
  82         clrlwi  11,31,29      /* r11 = tail bytes (length mod 8).  */
  83         mtcrf   0x01,9        /* Low bits of the doubleword count into cr7.  */
  84
  85         srwi    8,31,5        /* r8 = number of 32-byte blocks.  */
  86         cmplwi  cr1,9,4       /* cr1: fewer than 4 doublewords?  */
  87         cmplwi  cr6,11,0      /* cr6: any tail bytes?  */
  88         mr      11,12
  89
  90         /* Copy 1~3 doublewords so the main loop starts
  91         at a multiple of 32 bytes.  */
  92
  93         bf      30,1f
  94         lfd     6,0(12)
  95         lfd     7,8(12)
  96         addi    11,12,16
  97         mtctr   8
  98         stfd    6,0(3)
  99         stfd    7,8(3)
 100         addi    10,3,16
 101         bf      31,4f
 102         lfd     0,16(12)
 103         stfd    0,16(3)
 104         blt     cr1,3f        /* No full 32-byte block left: skip the loop.  */
 105         addi    11,12,24
 106         addi    10,3,24
 107         b       4f
 108
 109         .align  4
 110 1:      /* Copy 1 doubleword and set the counter.  */
 111         mr      10,3
 112         mtctr   8
 113         bf      31,4f
 114         lfd     6,0(12)
 115         addi    11,12,8
 116         stfd    6,0(3)
 117         addi    10,3,8
 118
 119         .align  4
 120 4:      /* Main aligned copy loop. Copies 32-bytes at a time.  */
 121         lfd     6,0(11)
 122         lfd     7,8(11)
 123         lfd     8,16(11)
 124         lfd     0,24(11)
 125         addi    11,11,32
 126
 127         stfd    6,0(10)
 128         stfd    7,8(10)
 129         stfd    8,16(10)
 130         stfd    0,24(10)
 131         addi    10,10,32
 132         bdnz    4b
 133 3:
 134
 135         /* Check for tail bytes.  */
 136
 137         clrrwi  0,31,3        /* r0 = length rounded down to a multiple of 8.  */
 138         mtcrf   0x01,31       /* Low bits of the length into cr7.  */
 139         beq     cr6,0f        /* No tail bytes: done.  */
 140
 141 .L9:
 142         add     3,3,0         /* Advance DST past the doubleword copies.  */
 143         add     12,12,0       /* Likewise for SRC.  */
 144
 145         /*  At this point we have a tail of 0-7 bytes and we know that the
 146         destination is doubleword-aligned.  */
 147 4:      /* Copy 4 bytes.  */
 148         bf      29,2f
 149
 150         lwz     6,0(12)
 151         addi    12,12,4
 152         stw     6,0(3)
 153         addi    3,3,4
 154 2:      /* Copy 2 bytes.  */
 155         bf      30,1f
 156
 157         lhz     6,0(12)
 158         addi    12,12,2
 159         sth     6,0(3)
 160         addi    3,3,2
 161 1:      /* Copy 1 byte.  */
 162         bf      31,0f
 163
 164         lbz     6,0(12)
 165         stb     6,0(3)
 166 0:      /* Return original DST pointer.  */
 167         mr      3,30
 168         lwz     30,20(1)
 169         lwz     31,24(1)
 170         addi    1,1,32
 171         blr
 172
 173         /* Handle copies of 0~31 bytes.  */
 174         .align  4
 175 L(copy_LT_32):
 176         cmplwi  cr6,5,8
 177         mr      12,4          /* r12 = running SRC pointer.  */
 178         mtcrf   0x01,5        /* Low 4 bits of the length into cr7.  */
 179         ble     cr6,L(copy_LE_8)
 180
 181         /* At least 9 bytes to go.  */
 182         neg     8,4
 183         clrrwi  11,4,2
 184         andi.   0,8,3         /* r0 = bytes needed to word-align SRC.  */
 185         cmplwi  cr1,5,16
 186         mr      10,5
 187         beq     L(copy_LT_32_aligned)
 188
 189         /* Force 4-bytes alignment for SRC.  */
 190         mtocrf  0x01,0
 191         subf    10,0,5        /* r10 = length left after alignment.  */
 192 2:      bf      30,1f
 193
 194         lhz     6,0(12)
 195         addi    12,12,2
 196         sth     6,0(3)
 197         addi    3,3,2
 198 1:      bf      31,L(end_4bytes_alignment)
 199
 200         lbz     6,0(12)
 201         addi    12,12,1
 202         stb     6,0(3)
 203         addi    3,3,1
 204
 205         .align  4
 206 L(end_4bytes_alignment):
 207         cmplwi  cr1,10,16     /* Redo the tests on the reduced length.  */
 208         mtcrf   0x01,10
 209
 210 L(copy_LT_32_aligned):
 211         /* At least 6 bytes to go, and SRC is word-aligned.  */
 212         blt     cr1,8f        /* Fewer than 16 bytes left.  */
 213
 214         /* Copy 16 bytes.  */
 215         lwz     6,0(12)
 216         lwz     7,4(12)
 217         stw     6,0(3)
 218         lwz     8,8(12)
 219         stw     7,4(3)
 220         lwz     6,12(12)
 221         addi    12,12,16
 222         stw     8,8(3)
 223         stw     6,12(3)
 224         addi    3,3,16
 225 8:      /* Copy 8 bytes.  */
 226         bf      28,4f
 227
 228         lwz     6,0(12)
 229         lwz     7,4(12)
 230         addi    12,12,8
 231         stw     6,0(3)
 232         stw     7,4(3)
 233         addi    3,3,8
 234 4:      /* Copy 4 bytes.  */
 235         bf      29,2f
 236
 237         lwz     6,0(12)
 238         addi    12,12,4
 239         stw     6,0(3)
 240         addi    3,3,4
 241 2:      /* Copy 2-3 bytes (also reached from L(copy_LE_8)).  */
 242         bf      30,1f
 243
 244         lhz     6,0(12)
 245         sth     6,0(3)
 246         bf      31,0f
 247         lbz     7,2(12)
 248         stb     7,2(3)
 249
 250         /* Return original DST pointer.  r31 was not changed on this path.  */
 251         mr      3,30
 252         lwz     30,20(1)
 253         addi    1,1,32
 254         blr
 255
 256         .align  4
 257 1:      /* Copy 1 byte.  */
 258         bf      31,0f
 259
 260         lbz     6,0(12)
 261         stb     6,0(3)
 262 0:      /* Return original DST pointer.  */
 263         mr      3,30
 264         lwz     30,20(1)
 265         addi    1,1,32
 266         blr
 267
 268         /* Handles copies of 0~8 bytes.  */
 269         .align  4
 270 L(copy_LE_8):
 271         bne     cr6,4f        /* cr6 is length vs. 8: not exactly 8 bytes.  */
 272
 273         /* Though we could've used lfd/stfd here, they are still
 274         slow for unaligned cases.  */
 275
 276         lwz     6,0(4)
 277         lwz     7,4(4)
 278         stw     6,0(3)
 279         stw     7,4(3)
 280
 281         /* Return original DST pointer.  */
 282         mr      3,30
 283         lwz     30,20(1)
 284         addi    1,1,32
 285         blr
 286
 287         .align  4
 288 4:      /* Copies 4~7 bytes.  */
 289         bf      29,2b         /* Under 4 bytes: use the 2-3/1 byte code above.  */
 290
 291         lwz     6,0(4)
 292         stw     6,0(3)
 293         bf      30,5f         /* No halfword: only a possible 5th byte is left.  */
 294         lhz     7,4(4)
 295         sth     7,4(3)
 296         bf      31,0f
 297         lbz     8,6(4)
 298         stb     8,6(3)
 299
 300         /* Return original DST pointer.  */
 301         mr      3,30
 302         lwz     30,20(1)
 303         addi    1,1,32
 304         blr
 305
 306         .align  4
 307 5:      /* Copy 1 byte.  */
 308         bf      31,0f
 309
 310         lbz     6,4(4)
 311         stb     6,4(3)
 312
 313 0:      /* Return original DST pointer.  */
 314         mr      3,30
 315         lwz     30,20(1)
 316         addi    1,1,32
 317         blr
 318
 319         /* Handle copies of 32+ bytes where the SRC and DST alignments differ.
 320         DST is aligned to a quadword first; SRC is then read with aligned
 321         quadword loads, shifted to realign the data for aligned DST stores.  */
 322         .align  4
 323 L(copy_GE_32_unaligned):
 324         andi.   11,3,15       /* Check alignment of DST.  */
 325         clrlwi  0,0,28        /* Number of bytes until the 1st
 326                               quadword of DST.  */
 327         srwi    9,5,4         /* Number of full quadwords remaining.  */
 328
 329         beq    L(copy_GE_32_unaligned_cont)
 330
 331         /* DST is not quadword aligned, get it aligned.  */
 332
 333         mtcrf   0x01,0        /* Alignment byte count bits into cr7.  */
 334         subf    31,0,5        /* r31 = length left after alignment.  */
 335
 336         /* Vector instructions work best when proper alignment (16-bytes)
 337         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 338 1:      /* Copy 1 byte.  */
 339         bf      31,2f
 340
 341         lbz     6,0(12)
 342         addi    12,12,1
 343         stb     6,0(3)
 344         addi    3,3,1
 345 2:      /* Copy 2 bytes.  */
 346         bf      30,4f
 347
 348         lhz     6,0(12)
 349         addi    12,12,2
 350         sth     6,0(3)
 351         addi    3,3,2
 352 4:      /* Copy 4 bytes.  */
 353         bf      29,8f
 354
 355         lwz     6,0(12)
 356         addi    12,12,4
 357         stw     6,0(3)
 358         addi    3,3,4
 359 8:      /* Copy 8 bytes.  */
 360         bf      28,0f
 361
 362         lfd     6,0(12)
 363         addi    12,12,8
 364         stfd    6,0(3)
 365         addi    3,3,8
 366 0:
 367         clrlwi  10,12,28      /* Check alignment of SRC.  */
 368         srwi    9,31,4        /* Number of full quadwords remaining.  */
 369
 370         /* The proper alignment is present, it is OK to copy the bytes now.  */
 371 L(copy_GE_32_unaligned_cont):
 372         /* Shifts of r31 must be 32-bit (srwi, not srdi): this is 32-bit code.  */
 373         /* Setup two indexes to speed up the indexed vector operations.  */
 374         clrlwi  11,31,28      /* r11 = tail bytes (length mod 16).  */
 375         li      6,16          /* Index for 16-bytes offsets.  */
 376         li      7,32          /* Index for 32-bytes offsets.  */
 377         cmplwi  cr1,11,0      /* cr1: any tail bytes?  */
 378         srwi    8,31,5        /* Setup the loop counter.  */
 379         mr      10,3
 380         mr      11,12
 381         mtcrf   0x01,9        /* Low bits of the quadword count into cr7.  */
 382         cmplwi  cr6,9,1       /* cr6: at most one quadword to copy?  */
 383         lvsl    5,0,12        /* vr5 = permute control for SRC's misalignment.  */
 384         lvx     3,0,12
 385         bf      31,L(setup_unaligned_loop)
 386
 387         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 388         lvx     4,12,6
 389         vperm   6,3,4,5
 390         addi    11,12,16
 391         addi    10,3,16
 392         stvx    6,0,3
 393         vor     3,4,4         /* vr3 = vr4, the quadword the loop continues from.  */
 394
 395 L(setup_unaligned_loop):
 396         mtctr   8
 397         ble     cr6,L(end_unaligned_loop)
 398
 399         /* Copy 32 bytes at a time using vector instructions.  */
 400         .align  4
 401 L(unaligned_loop):
 402
 403         /* Note: vr6/vr10 may contain data that was already copied,
 404         but in order to get proper alignment, we may have to copy
 405         some portions again. This is faster than having unaligned
 406         vector instructions though.  */
 407
 408         lvx     4,11,6        /* vr4 = r11+16.  */
 409         vperm   6,3,4,5       /* Merge the correctly-aligned portions
 410                               of vr3/vr4 into vr6.  */
 411         lvx     3,11,7        /* vr3 = r11+32.  */
 412         vperm   10,4,3,5      /* Merge the correctly-aligned portions
 413                               of vr3/vr4 into vr10.  */
 414         addi    11,11,32
 415         stvx    6,0,10
 416         stvx    10,10,6
 417         addi    10,10,32
 418
 419         bdnz    L(unaligned_loop)
 420
 421         .align  4
 422 L(end_unaligned_loop):
 423
 424         /* Check for tail bytes.  */
 425         clrrwi  0,31,4        /* r0 = length rounded down to a multiple of 16.  */
 426         mtcrf   0x01,31       /* Low bits of the length into cr7.  */
 427         beq     cr1,0f        /* No tail bytes: done.  */
 428
 429         add     3,3,0         /* Advance DST past the quadword copies.  */
 430         add     12,12,0       /* Likewise for SRC.  */
 431
 432         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 433 8:      /* Copy 8 bytes.  */
 434         bf      28,4f
 435
 436         lwz     6,0(12)
 437         lwz     7,4(12)
 438         addi    12,12,8
 439         stw     6,0(3)
 440         stw     7,4(3)
 441         addi    3,3,8
 442 4:      /* Copy 4 bytes.  */
 443         bf      29,2f
 444
 445         lwz     6,0(12)
 446         addi    12,12,4
 447         stw     6,0(3)
 448         addi    3,3,4
 449 2:      /* Copy 2~3 bytes.  */
 450         bf      30,1f
 451
 452         lhz     6,0(12)
 453         addi    12,12,2
 454         sth     6,0(3)
 455         addi    3,3,2
 456 1:      /* Copy 1 byte.  */
 457         bf      31,0f
 458
 459         lbz     6,0(12)
 460         stb     6,0(3)
 461 0:      /* Return original DST pointer.  */
 462         mr      3,30
 463         lwz     30,20(1)
 464         lwz     31,24(1)
 465         addi    1,1,32
 466         blr
 467
 468 END (BP_SYM (memcpy))
 469 libc_hidden_builtin_def (memcpy)