sysdeps/powerpc/powerpc64/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64/POWER7.
   2    Copyright (C) 2010-2013 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22
  23 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  24    Returns 'dst'.  */
  25
  26         .machine power7
  27 EALIGN (memcpy, 5, 0)
  28         CALL_MCOUNT 3
  29
  30         cmpldi  cr1,5,31
  31         neg     0,3
  32         std     3,-16(1)
  33         std     31,-8(1)
  34         cfi_offset(31,-8)
  35         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  36                                     code.  */
  37
  38         andi.   11,3,7        /* Check alignment of DST.  */
  39
  40
  41         clrldi  10,4,61       /* Check alignment of SRC.  */
  42         cmpld   cr6,10,11     /* SRC and DST alignments match?  */
  43         mr      12,4
  44         mr      31,5
  45         bne     cr6,L(copy_GE_32_unaligned)
  46
  47         srdi    9,5,3         /* Number of full quadwords remaining.  */
  48
  49         beq    L(copy_GE_32_aligned_cont)
  50
  51         clrldi  0,0,61
  52         mtcrf   0x01,0
  53         subf    31,0,5
  54
  55         /* Get the SRC aligned to 8 bytes.  */
  56
  57 1:      bf      31,2f
  58         lbz     6,0(12)
  59         addi    12,12,1
  60         stb     6,0(3)
  61         addi    3,3,1
  62 2:      bf      30,4f
  63         lhz     6,0(12)
  64         addi    12,12,2
  65         sth     6,0(3)
  66         addi    3,3,2
  67 4:      bf      29,0f
  68         lwz     6,0(12)
  69         addi    12,12,4
  70         stw     6,0(3)
  71         addi    3,3,4
  72 0:
  73         clrldi  10,12,61      /* Check alignment of SRC again.  */
  74         srdi    9,31,3        /* Number of full doublewords remaining.  */
  75
  76 L(copy_GE_32_aligned_cont):
  77
  78         clrldi  11,31,61
  79         mtcrf   0x01,9
  80
  81         srdi    8,31,5
  82         cmpldi  cr1,9,4
  83         cmpldi  cr6,11,0
  84         mr      11,12
  85
  86         /* Copy 1~3 doublewords so the main loop starts
  87         at a multiple of 32 bytes.  */
  88
  89         bf      30,1f
  90         ld      6,0(12)
  91         ld      7,8(12)
  92         addi    11,12,16
  93         mtctr   8
  94         std     6,0(3)
  95         std     7,8(3)
  96         addi    10,3,16
  97         bf      31,4f
  98         ld      0,16(12)
  99         std     0,16(3)
 100         blt     cr1,3f
 101         addi    11,12,24
 102         addi    10,3,24
 103         b       4f
 104
 105         .align  4
 106 1:      /* Copy 1 doubleword and set the counter.  */
 107         mr      10,3
 108         mtctr   8
 109         bf      31,4f
 110         ld      6,0(12)
 111         addi    11,12,8
 112         std     6,0(3)
 113         addi    10,3,8
 114
 115 L(aligned_copy):
 116         /* Main aligned copy loop. Copies up to 128-bytes at a time. */
 117         .align  4
 118 4:
 119         /* check for any 32-byte or 64-byte lumps that are outside of a
 120            nice 128-byte range.  R8 contains the number of 32-byte
 121            lumps, so drop this into the CR, and use the SO/EQ bits to help
 122            handle the 32- or 64- byte lumps.  Then handle the rest with an
 123            unrolled 128-bytes-at-a-time copy loop. */
 124         mtocrf  1,8
 125         li      6,16    # 16() index
 126         li      7,32    # 32() index
 127         li      8,48    # 48() index
 128
 129 L(aligned_32byte):
 130         /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
 131         bns     cr7,L(aligned_64byte)
 132         lxvd2x  6,0,11
 133         lxvd2x  7,11,6
 134         addi    11,11,32
 135         stxvd2x 6,0,10
 136         stxvd2x 7,10,6
 137         addi    10,10,32
 138
 139 L(aligned_64byte):
 140         /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
 141         bne     cr7,L(aligned_128setup)
 142         lxvd2x  6,0,11
 143         lxvd2x  7,11,6
 144         lxvd2x  8,11,7
 145         lxvd2x  9,11,8
 146         addi    11,11,64
 147         stxvd2x 6,0,10
 148         stxvd2x 7,10,6
 149         stxvd2x 8,10,7
 150         stxvd2x 9,10,8
 151         addi    10,10,64
 152
 153 L(aligned_128setup):
 154         /* Set up for the 128-byte at a time copy loop.  */
 155         srdi    8,31,7
 156         cmpdi   8,0     # Any 4x lumps left?
 157         beq     3f      # if not, move along.
 158         lxvd2x  6,0,11
 159         lxvd2x  7,11,6
 160         mtctr   8       # otherwise, load the ctr and begin.
 161         li      8,48    # 48() index
 162         b       L(aligned_128loop)
 163
 164 L(aligned_128head):
 165         /* for the 2nd + iteration of this loop. */
 166         lxvd2x  6,0,11
 167         lxvd2x  7,11,6
 168 L(aligned_128loop):
 169         lxvd2x  8,11,7
 170         lxvd2x  9,11,8
 171         stxvd2x 6,0,10
 172         addi    11,11,64
 173         stxvd2x 7,10,6
 174         stxvd2x 8,10,7
 175         stxvd2x 9,10,8
 176         lxvd2x  6,0,11
 177         lxvd2x  7,11,6
 178         addi    10,10,64
 179         lxvd2x  8,11,7
 180         lxvd2x  9,11,8
 181         addi    11,11,64
 182         stxvd2x 6,0,10
 183         stxvd2x 7,10,6
 184         stxvd2x 8,10,7
 185         stxvd2x 9,10,8
 186         addi    10,10,64
 187         bdnz    L(aligned_128head)
 188
 189 3:
 190         /* Check for tail bytes.  */
 191         rldicr  0,31,0,60
 192         mtcrf   0x01,31
 193         beq     cr6,0f
 194
 195 .L9:
 196         add     3,3,0
 197         add     12,12,0
 198
 199         /*  At this point we have a tail of 0-7 bytes and we know that the
 200         destination is doubleword-aligned.  */
 201 4:      /* Copy 4 bytes.  */
 202         bf      29,2f
 203
 204         lwz     6,0(12)
 205         addi    12,12,4
 206         stw     6,0(3)
 207         addi    3,3,4
 208 2:      /* Copy 2 bytes.  */
 209         bf      30,1f
 210
 211         lhz     6,0(12)
 212         addi    12,12,2
 213         sth     6,0(3)
 214         addi    3,3,2
 215 1:      /* Copy 1 byte.  */
 216         bf      31,0f
 217
 218         lbz     6,0(12)
 219         stb     6,0(3)
 220 0:      /* Return original DST pointer.  */
 221         ld      31,-8(1)
 222         ld      3,-16(1)
 223         blr
 224
 225         /* Handle copies of 0~31 bytes.  */
 226         .align  4
 227 L(copy_LT_32):
 228         cmpldi  cr6,5,8
 229         mr      12,4
 230         mtcrf   0x01,5
 231         ble     cr6,L(copy_LE_8)
 232
 233         /* At least 9 bytes to go.  */
 234         neg     8,4
 235         clrrdi  11,4,2
 236         andi.   0,8,3
 237         cmpldi  cr1,5,16
 238         mr      10,5
 239         beq     L(copy_LT_32_aligned)
 240
 241         /* Force 4-bytes alignment for SRC.  */
 242         mtocrf  0x01,0
 243         subf    10,0,5
 244 2:      bf      30,1f
 245
 246         lhz     6,0(12)
 247         addi    12,12,2
 248         sth     6,0(3)
 249         addi    3,3,2
 250 1:      bf      31,L(end_4bytes_alignment)
 251
 252         lbz     6,0(12)
 253         addi    12,12,1
 254         stb     6,0(3)
 255         addi    3,3,1
 256
 257         .align  4
 258 L(end_4bytes_alignment):
 259         cmpldi  cr1,10,16
 260         mtcrf   0x01,10
 261
 262 L(copy_LT_32_aligned):
 263         /* At least 6 bytes to go, and SRC is word-aligned.  */
 264         blt     cr1,8f
 265
 266         /* Copy 16 bytes.  */
 267         lwz     6,0(12)
 268         lwz     7,4(12)
 269         stw     6,0(3)
 270         lwz     8,8(12)
 271         stw     7,4(3)
 272         lwz     6,12(12)
 273         addi    12,12,16
 274         stw     8,8(3)
 275         stw     6,12(3)
 276         addi    3,3,16
 277 8:      /* Copy 8 bytes.  */
 278         bf      28,4f
 279
 280         lwz     6,0(12)
 281         lwz     7,4(12)
 282         addi    12,12,8
 283         stw     6,0(3)
 284         stw     7,4(3)
 285         addi    3,3,8
 286 4:      /* Copy 4 bytes.  */
 287         bf      29,2f
 288
 289         lwz     6,0(12)
 290         addi    12,12,4
 291         stw     6,0(3)
 292         addi    3,3,4
 293 2:      /* Copy 2-3 bytes.  */
 294         bf      30,1f
 295
 296         lhz     6,0(12)
 297         sth     6,0(3)
 298         bf      31,0f
 299         lbz     7,2(12)
 300         stb     7,2(3)
 301         ld      3,-16(1)
 302         blr
 303
 304         .align  4
 305 1:      /* Copy 1 byte.  */
 306         bf      31,0f
 307
 308         lbz     6,0(12)
 309         stb     6,0(3)
 310 0:      /* Return original DST pointer.  */
 311         ld      3,-16(1)
 312         blr
 313
 314         /* Handles copies of 0~8 bytes.  */
 315         .align  4
 316 L(copy_LE_8):
 317         bne     cr6,4f
 318
 319         /* Though we could've used ld/std here, they are still
 320         slow for unaligned cases.  */
 321
 322         lwz     6,0(4)
 323         lwz     7,4(4)
 324         stw     6,0(3)
 325         stw     7,4(3)
 326         ld      3,-16(1)      /* Return original DST pointers.  */
 327         blr
 328
 329         .align  4
 330 4:      /* Copies 4~7 bytes.  */
 331         bf      29,2b
 332
 333         lwz     6,0(4)
 334         stw     6,0(3)
 335         bf      30,5f
 336         lhz     7,4(4)
 337         sth     7,4(3)
 338         bf      31,0f
 339         lbz     8,6(4)
 340         stb     8,6(3)
 341         ld      3,-16(1)
 342         blr
 343
 344         .align  4
 345 5:      /* Copy 1 byte.  */
 346         bf      31,0f
 347
 348         lbz     6,4(4)
 349         stb     6,4(3)
 350
 351 0:      /* Return original DST pointer.  */
 352         ld      3,-16(1)
 353         blr
 354
 355         /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
 356         SRC is not.  Use aligned quadword loads from SRC, shifted to realign
 357         the data, allowing for aligned DST stores.  */
 358         .align  4
 359 L(copy_GE_32_unaligned):
 360         clrldi  0,0,60        /* Number of bytes until the 1st
 361                               quadword.  */
 362         andi.   11,3,15       /* Check alignment of DST (against
 363                               quadwords).  */
 364         srdi    9,5,4         /* Number of full quadwords remaining.  */
 365
 366         beq     L(copy_GE_32_unaligned_cont)
 367
 368         /* SRC is not quadword aligned, get it aligned.  */
 369
 370         mtcrf   0x01,0
 371         subf    31,0,5
 372
 373         /* Vector instructions work best when proper alignment (16-bytes)
 374         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 375 1:      /* Copy 1 byte.  */
 376         bf      31,2f
 377
 378         lbz     6,0(12)
 379         addi    12,12,1
 380         stb     6,0(3)
 381         addi    3,3,1
 382 2:      /* Copy 2 bytes.  */
 383         bf      30,4f
 384
 385         lhz     6,0(12)
 386         addi    12,12,2
 387         sth     6,0(3)
 388         addi    3,3,2
 389 4:      /* Copy 4 bytes.  */
 390         bf      29,8f
 391
 392         lwz     6,0(12)
 393         addi    12,12,4
 394         stw     6,0(3)
 395         addi    3,3,4
 396 8:      /* Copy 8 bytes.  */
 397         bf      28,0f
 398
 399         ld      6,0(12)
 400         addi    12,12,8
 401         std     6,0(3)
 402         addi    3,3,8
 403 0:
 404         clrldi  10,12,60      /* Check alignment of SRC.  */
 405         srdi    9,31,4        /* Number of full quadwords remaining.  */
 406
 407         /* The proper alignment is present, it is OK to copy the bytes now.  */
 408 L(copy_GE_32_unaligned_cont):
 409
 410         /* Setup two indexes to speed up the indexed vector operations.  */
 411         clrldi  11,31,60
 412         li      6,16          /* Index for 16-bytes offsets.  */
 413         li      7,32          /* Index for 32-bytes offsets.  */
 414         cmpldi  cr1,11,0
 415         srdi    8,31,5        /* Setup the loop counter.  */
 416         mr      10,3
 417         mr      11,12
 418         mtcrf   0x01,9
 419         cmpldi  cr6,9,1
 420         lvsl    5,0,12
 421         lvx     3,0,12
 422         bf      31,L(setup_unaligned_loop)
 423
 424         /* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 425         lvx     4,12,6
 426         vperm   6,3,4,5
 427         addi    11,12,16
 428         addi    10,3,16
 429         stvx    6,0,3
 430         vor     3,4,4
 431
 432 L(setup_unaligned_loop):
 433         mtctr   8
 434         ble     cr6,L(end_unaligned_loop)
 435
 436         /* Copy 32 bytes at a time using vector instructions.  */
 437         .align  4
 438 L(unaligned_loop):
 439
 440         /* Note: vr6/vr10 may contain data that was already copied,
 441         but in order to get proper alignment, we may have to copy
 442         some portions again. This is faster than having unaligned
 443         vector instructions though.  */
 444
 445         lvx     4,11,6        /* vr4 = r11+16.  */
 446         vperm   6,3,4,5       /* Merge the correctly-aligned portions
 447                               of vr3/vr4 into vr6.  */
 448         lvx     3,11,7        /* vr3 = r11+32.  */
 449         vperm   10,4,3,5      /* Merge the correctly-aligned portions
 450                               of vr3/vr4 into vr10.  */
 451         addi    11,11,32
 452         stvx    6,0,10
 453         stvx    10,10,6
 454         addi    10,10,32
 455
 456         bdnz    L(unaligned_loop)
 457
 458         .align  4
 459 L(end_unaligned_loop):
 460
 461         /* Check for tail bytes.  */
 462         rldicr  0,31,0,59
 463         mtcrf   0x01,31
 464         beq     cr1,0f
 465
 466         add     3,3,0
 467         add     12,12,0
 468
 469         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 470 8:      /* Copy 8 bytes.  */
 471         bf      28,4f
 472
 473         lwz     6,0(12)
 474         lwz     7,4(12)
 475         addi    12,12,8
 476         stw     6,0(3)
 477         stw     7,4(3)
 478         addi    3,3,8
 479 4:      /* Copy 4 bytes.  */
 480         bf      29,2f
 481
 482         lwz     6,0(12)
 483         addi    12,12,4
 484         stw     6,0(3)
 485         addi    3,3,4
 486 2:      /* Copy 2~3 bytes.  */
 487         bf      30,1f
 488
 489         lhz     6,0(12)
 490         addi    12,12,2
 491         sth     6,0(3)
 492         addi    3,3,2
 493 1:      /* Copy 1 byte.  */
 494         bf      31,0f
 495
 496         lbz     6,0(12)
 497         stb     6,0(3)
 498 0:      /* Return original DST pointer.  */
 499         ld      31,-8(1)
 500         ld      3,-16(1)
 501         blr
 502
 503 END_GEN_TB (memcpy,TB_TOCLESS)
 504 libc_hidden_builtin_def (memcpy)