sysdeps/powerpc/powerpc64/power7/mempcpy.S

   1 /* Optimized mempcpy implementation for POWER7.
   2    Copyright (C) 2010-2014 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22
  23 /* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  24     Copies 'len' bytes from 'src' to 'dst' and returns 'dst' + 'len'.

        Entry dispatch:
          len <= 31                          -> L(copy_LT_32)
          len >= 32, (src & 7) != (dst & 7)  -> L(copy_GE_32_unaligned)
          len >= 32, (src & 7) == (dst & 7)  -> doubleword copy below

        The incoming 'dst' is stored at -16(r1) and r31 at -8(r1).  Every
        exit reloads 'dst' and adds r5; no instruction in this function
        writes GPR r5.  On the paths below r12 is the working SRC pointer
        and r31 the number of bytes left once DST has been aligned.  */
  25
  26         .machine  power7
  27 EALIGN (__mempcpy, 5, 0)
  28         CALL_MCOUNT 3
  29
  30         cmpldi  cr1,5,31      /* cr1 = len compared with 31.  */
  31         neg     0,3           /* r0 = -dst; its low bits give the distance to the next aligned DST address.  */
  32         std     3,-16(1)      /* Keep the original dst for the return value.  */
  33         std     31,-8(1)      /* Save r31; it becomes the byte counter below.  */
  34         cfi_offset(31,-8)
  35         ble     cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
  36                                        code.  */
  37
  38         andi.   11,3,7        /* Check alignment of DST (cr0.eq = already 8-byte aligned).  */
  39
  40
  41         clrldi  10,4,61       /* Check alignment of SRC.  */
  42         cmpld   cr6,10,11     /* SRC and DST alignments match?  */
  43         mr      12,4          /* r12 = working SRC pointer.  */
  44         mr      31,5          /* r31 = bytes left to copy.  */
  45         bne     cr6,L(copy_GE_32_unaligned)
  46
  47         srdi    9,5,3         /* Number of full doublewords remaining.  */
  48
  49         beq     L(copy_GE_32_aligned_cont)   /* cr0 from the andi. above: nothing to align.  */
  50
  51         clrldi  0,0,61        /* r0 = (-dst) & 7: bytes needed to align DST.  */
  52         mtcrf   0x01,0        /* Low 4 bits of r0 -> CR bits 28-31, tested by the bf's below.  */
  53         subf    31,0,5        /* r31 = len - alignment bytes.  */
  54
  55         /* Align DST to 8 bytes (1, 2, 4 byte steps picked by CR bits 31,
  56            30, 29); SRC shares DST's low 3 bits here, so it aligns too.  */
  57 1:      bf      31,2f
  58         lbz     6,0(12)
  59         addi    12,12,1
  60         stb     6,0(3)
  61         addi    3,3,1
  62 2:      bf      30,4f
  63         lhz     6,0(12)
  64         addi    12,12,2
  65         sth     6,0(3)
  66         addi    3,3,2
  67 4:      bf      29,0f
  68         lwz     6,0(12)
  69         addi    12,12,4
  70         stw     6,0(3)
  71         addi    3,3,4
  72 0:
  73         clrldi  10,12,61      /* Check alignment of SRC again.  NOTE(review): r10 is overwritten below before it is read.  */
  74         srdi    9,31,3        /* Number of full doublewords remaining.  */
  75
  76 L(copy_GE_32_aligned_cont):
  77         /* Here r3 = DST, r12 = SRC, r31 = bytes left, r9 = r31 / 8.  */
  78         clrldi  11,31,61      /* r11 = bytes left over after the doublewords.  */
  79         mtcrf   0x01,9        /* Low 4 bits of the doubleword count -> CR bits 28-31.  */
  80
  81         srdi    8,31,5        /* r8 = number of 32-byte loop iterations.  */
  82         cmpldi  cr1,9,4       /* cr1: fewer than 4 doublewords in total?  */
  83         cmpldi  cr6,11,0      /* cr6.eq = no tail bytes.  */
  84         mr      11,12         /* r11 = SRC cursor for the loop.  */
  85
  86         /* Copy 0~3 doublewords (doubleword count mod 4, CR bits 30/31) so
  87         that what is left for the main loop is a multiple of 32 bytes.  */
  88
  89         bf      30,1f         /* Bit 30 clear: at most one odd doubleword.  */
  90         ld      6,0(12)
  91         ld      7,8(12)
  92         addi    11,12,16
  93         mtctr   8             /* CTR = 32-byte iteration count.  */
  94         std     6,0(3)
  95         std     7,8(3)
  96         addi    10,3,16       /* r10 = DST cursor for the loop.  */
  97         bf      31,4f
  98         ld      0,16(12)
  99         std     0,16(3)
 100         blt     cr1,3f        /* r9 == 3 here: CTR is 0, so skip the loop.  */
 101         addi    11,12,24
 102         addi    10,3,24
 103         b       4f
 104
 105         .align  4
 106 1:      /* Copy 1 doubleword and set the counter.  */
 107         mr      10,3          /* r10 = DST cursor for the loop.  */
 108         mtctr   8             /* CTR = 32-byte iteration count.  */
 109         bf      31,4f
 110         ld      6,0(12)
 111         addi    11,12,8
 112         std     6,0(3)
 113         addi    10,3,8
 114
 115         /* Main aligned copy loop. Copies 32-bytes at a time.  */
 116         .align  4
 117 4:
 118         ld      6,0(11)
 119         ld      7,8(11)
 120         ld      8,16(11)
 121         ld      0,24(11)
 122         addi    11,11,32
 123
 124         std     6,0(10)
 125         std     7,8(10)
 126         std     8,16(10)
 127         std     0,24(10)
 128         addi    10,10,32
 129         bdnz    4b
 130 3:
 131
 132         /* Check for tail bytes.  */
 133         rldicr  0,31,0,60     /* r0 = r31 with its low 3 bits cleared: bytes copied as doublewords.  */
 134         mtcrf   0x01,31       /* Low 4 bits of r31 -> CR bits 28-31 for the tail copies.  */
 135         beq     cr6,0f        /* No tail bytes: return.  */
 136
 137 .L9:                          /* NOTE(review): no branch to .L9 is visible in this file.  */
 138         add     3,3,0         /* Move DST past the doublewords (they were stored via r10).  */
 139         add     12,12,0       /* Move SRC past the doublewords (they were loaded via r11).  */
 140
 141         /*  At this point we have a tail of 1-7 bytes and we know that the
 142         destination is doubleword-aligned.  */
 143 4:      /* Copy 4 bytes.  */
 144         bf      29,2f
 145
 146         lwz     6,0(12)
 147         addi    12,12,4
 148         stw     6,0(3)
 149         addi    3,3,4
 150 2:      /* Copy 2 bytes.  */
 151         bf      30,1f
 152
 153         lhz     6,0(12)
 154         addi    12,12,2
 155         sth     6,0(3)
 156         addi    3,3,2
 157 1:      /* Copy 1 byte.  */
 158         bf      31,0f
 159
 160         lbz     6,0(12)
 161         stb     6,0(3)
 162 0:      /* Return DST + LEN pointer.  */
 163         ld      31,-8(1)      /* Restore r31.  */
 164         ld      3,-16(1)      /* Reload the original dst saved at entry.  */
 165         add     3,3,5
 166         blr
 167
 168         /* Handle copies of 0~31 bytes.  r31 is not written on the short-copy
             paths, so their exits only reload the saved dst from -16(r1).  */
 169         .align  4
 170 L(copy_LT_32):
 171         cmpldi  cr6,5,8       /* cr6 = len compared with 8 (re-tested at L(copy_LE_8)).  */
 172         mr      12,4          /* r12 = working SRC pointer.  */
 173         mtcrf   0x01,5        /* Low 4 bits of len -> CR bits 28-31.  */
 174         ble     cr6,L(copy_LE_8)
 175
 176         /* At least 9 bytes to go.  */
 177         neg     8,4           /* r8 = -src.  */
 178         clrrdi  11,4,2        /* NOTE(review): r11 is not read on the short-copy paths.  */
 179         andi.   0,8,3         /* r0 = (-src) & 3: bytes needed to word-align SRC.  */
 180         cmpldi  cr1,5,16      /* cr1 = len compared with 16.  */
 181         mr      10,5          /* r10 = bytes left.  */
 182         beq     L(copy_LT_32_aligned)   /* cr0 from the andi. above: SRC already word-aligned.  */
 183
 184         /* Force 4-bytes alignment for SRC.  */
 185         mtocrf  0x01,0        /* Low bits of r0 -> CR bits 28-31 for the two bf's below.  */
 186         subf    10,0,5        /* r10 = len - alignment bytes.  */
 187 2:      bf      30,1f
 188
 189         lhz     6,0(12)
 190         addi    12,12,2
 191         sth     6,0(3)
 192         addi    3,3,2
 193 1:      bf      31,L(end_4bytes_alignment)
 194
 195         lbz     6,0(12)
 196         addi    12,12,1
 197         stb     6,0(3)
 198         addi    3,3,1
 199
 200         .align  4
 201 L(end_4bytes_alignment):
 202         cmpldi  cr1,10,16     /* cr1 = bytes left compared with 16.  */
 203         mtcrf   0x01,10       /* Low 4 bits of bytes left -> CR bits 28-31.  */
 204
 205 L(copy_LT_32_aligned):
 206         /* At least 6 bytes to go, and SRC is word-aligned.  */
 207         blt     cr1,8f        /* Fewer than 16 bytes left: skip the 16-byte copy.  */
 208
 209         /* Copy 16 bytes.  */
 210         lwz     6,0(12)
 211         lwz     7,4(12)
 212         stw     6,0(3)
 213         lwz     8,8(12)
 214         stw     7,4(3)
 215         lwz     6,12(12)
 216         addi    12,12,16
 217         stw     8,8(3)
 218         stw     6,12(3)
 219         addi    3,3,16
 220 8:      /* Copy 8 bytes.  */
 221         bf      28,4f
 222
 223         lwz     6,0(12)
 224         lwz     7,4(12)
 225         addi    12,12,8
 226         stw     6,0(3)
 227         stw     7,4(3)
 228         addi    3,3,8
 229 4:      /* Copy 4 bytes.  */
 230         bf      29,2f
 231
 232         lwz     6,0(12)
 233         addi    12,12,4
 234         stw     6,0(3)
 235         addi    3,3,4
 236 2:      /* Copy 2-3 bytes.  Also entered from L(copy_LE_8) via 'bf 29,2b'.  */
 237         bf      30,1f
 238
 239         lhz     6,0(12)
 240         sth     6,0(3)
 241         bf      31,0f         /* No third byte: return.  */
 242         lbz     7,2(12)
 243         stb     7,2(3)
 244         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 245         add     3,3,5
 246         blr
 247
 248         .align  4
 249 1:      /* Copy 1 byte.  */
 250         bf      31,0f
 251
 252         lbz     6,0(12)
 253         stb     6,0(3)
 254 0:      /* Return DST + LEN pointer.  */
 255         ld      3,-16(1)      /* Reload the original dst saved at entry.  */
 256         add     3,3,5
 257         blr
 258
 259         /* Handles copies of 0~8 bytes.  Entered from L(copy_LT_32) with
             cr6 = len compared with 8, CR bits 28-31 = low 4 bits of len,
             and r12 = r4 = src.  */
 260         .align  4
 261 L(copy_LE_8):
 262         bne     cr6,4f        /* len != 8: handle 0~7 bytes at 4: below.  */
 263
 264         /* Though we could've used ld/std here, they are still
 265         slow for unaligned cases.  */
 266
 267         lwz     6,0(4)
 268         lwz     7,4(4)
 269         stw     6,0(3)
 270         stw     7,4(3)
 271         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 272         add     3,3,5
 273         blr
 274
 275         .align  4
 276 4:      /* Copies 4~7 bytes.  */
 277         bf      29,2b         /* len < 4: branch back to the preceding '2:' label, the 2-3 byte tail code that works through r12 and r3.  */
 278
 279         lwz     6,0(4)
 280         stw     6,0(3)
 281         bf      30,5f         /* No halfword at offset 4: maybe a single byte.  */
 282         lhz     7,4(4)
 283         sth     7,4(3)
 284         bf      31,0f         /* len == 6: done.  */
 285         lbz     8,6(4)
 286         stb     8,6(3)
 287         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 288         add     3,3,5
 289         blr
 290
 291         .align  4
 292 5:      /* Copy 1 byte (the fifth byte, at offset 4).  */
 293         bf      31,0f
 294
 295         lbz     6,4(4)
 296         stb     6,4(3)
 297
 298 0:      /* Return DST + LEN pointer.  */
 299         ld      3,-16(1)      /* Reload the original dst saved at entry.  */
 300         add     3,3,5
 301         blr
 302
 303         /* Handle copies of 32+ bytes where SRC and DST are not mutually
 304         aligned.  DST is first brought to quadword alignment; SRC is read
 305         with aligned quadword loads, shifted to realign the data.  */
 306         .align  4
 307 L(copy_GE_32_unaligned):
 308         clrldi  0,0,60        /* Number of bytes until the 1st
 309                                  quadword.  */
 310         andi.   11,3,15       /* Check alignment of DST (against
 311                                  quadwords).  */
 312         srdi    9,5,4         /* Number of full quadwords remaining.  */
 313
 314         beq     L(copy_GE_32_unaligned_cont)
 315
 316         /* DST is not quadword aligned, get it aligned.  */
 317
 318         mtcrf   0x01,0        /* Low 4 bits of r0 -> CR bits 28-31, tested by the bf's below.  */
 319         subf    31,0,5        /* r31 = len - alignment bytes.  */
 320
 321         /* Vector instructions work best when proper alignment (16-bytes)
 322         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 323 1:      /* Copy 1 byte.  */
 324         bf      31,2f
 325
 326         lbz     6,0(12)
 327         addi    12,12,1
 328         stb     6,0(3)
 329         addi    3,3,1
 330 2:      /* Copy 2 bytes.  */
 331         bf      30,4f
 332
 333         lhz     6,0(12)
 334         addi    12,12,2
 335         sth     6,0(3)
 336         addi    3,3,2
 337 4:      /* Copy 4 bytes.  */
 338         bf      29,8f
 339
 340         lwz     6,0(12)
 341         addi    12,12,4
 342         stw     6,0(3)
 343         addi    3,3,4
 344 8:      /* Copy 8 bytes.  */
 345         bf      28,0f
 346
 347         ld      6,0(12)
 348         addi    12,12,8
 349         std     6,0(3)
 350         addi    3,3,8
 351 0:
 352         clrldi  10,12,60      /* Check alignment of SRC.  NOTE(review): r10 is overwritten below before it is read.  */
 353         srdi    9,31,4        /* Number of full quadwords remaining.  */
 354
 355         /* The proper alignment is present, it is OK to copy the bytes now.  */
 356 L(copy_GE_32_unaligned_cont):
 357         /* Here r3 = quadword-aligned DST, r12 = SRC, r31 = bytes left, r9 = r31 / 16.  */
 358         /* Setup two indexes to speed up the indexed vector operations.  */
 359         clrldi  11,31,60      /* r11 = bytes left over after the quadwords.  */
 360         li      6,16          /* Index for 16-bytes offsets.  */
 361         li      7,32          /* Index for 32-bytes offsets.  */
 362         cmpldi  cr1,11,0      /* cr1.eq = no tail bytes.  */
 363         srdi    8,31,5        /* Setup the loop counter.  */
 364         mr      10,3          /* r10 = DST cursor.  */
 365         mr      11,12         /* r11 = SRC cursor.  */
 366         mtcrf   0x01,9        /* CR bit 31 = quadword count is odd.  */
 367         cmpldi  cr6,9,1       /* cr6: at most one quadword in total?  */
 368 #ifdef __LITTLE_ENDIAN__
 369         lvsr    5,0,12        /* v5 = permute control built from SRC's address.  */
 370 #else
 371         lvsl    5,0,12        /* v5 = permute control built from SRC's address.  */
 372 #endif
 373         lvx     3,0,12        /* v3 = quadword loaded at r12 (first SRC bytes).  */
 374         bf      31,L(setup_unaligned_loop)
 375
 376         /* Odd quadword count: copy one quadword now so that the 32-byte
             loop below is left with an even number of them.  */
 377         lvx     4,12,6        /* v4 = quadword loaded at r12+16.  */
 378 #ifdef __LITTLE_ENDIAN__
 379         vperm   6,4,3,5
 380 #else
 381         vperm   6,3,4,5
 382 #endif
 383         addi    11,12,16
 384         addi    10,3,16
 385         stvx    6,0,3
 386         vor     3,4,4         /* v3 = v4: carry the last loaded quadword into the loop.  */
 387
 388 L(setup_unaligned_loop):
 389         mtctr   8
 390         ble     cr6,L(end_unaligned_loop)   /* r9 <= 1: no 32-byte chunk to loop over.  */
 391
 392         /* Copy 32 bytes at a time using vector instructions.  */
 393         .align  4
 394 L(unaligned_loop):
 395
 396         /* Note: vr6/vr10 may contain data that was already copied,
 397         but in order to get proper alignment, we may have to copy
 398         some portions again. This is faster than having unaligned
 399         vector instructions though.  */
 400
 401         lvx     4,11,6        /* vr4 = r11+16.  */
 402 #ifdef __LITTLE_ENDIAN__
 403         vperm   6,4,3,5
 404 #else
 405         vperm   6,3,4,5
 406 #endif
 407         lvx     3,11,7        /* vr3 = r11+32.  */
 408 #ifdef __LITTLE_ENDIAN__
 409         vperm   10,3,4,5
 410 #else
 411         vperm   10,4,3,5
 412 #endif
 413         addi    11,11,32
 414         stvx    6,0,10
 415         stvx    10,10,6
 416         addi    10,10,32
 417
 418         bdnz    L(unaligned_loop)
 419
 420         .align  4
 421 L(end_unaligned_loop):
 422
 423         /* Check for tail bytes.  */
 424         rldicr  0,31,0,59     /* r0 = r31 with its low 4 bits cleared: bytes copied as quadwords.  */
 425         mtcrf   0x01,31       /* Low 4 bits of r31 -> CR bits 28-31 for the tail copies.  */
 426         beq     cr1,0f        /* No tail bytes: return.  */
 427
 428         add     3,3,0         /* Move DST past the quadwords (they were stored via r10).  */
 429         add     12,12,0       /* Move SRC past the quadwords (they were loaded via r11).  */
 430
 431         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 432 8:      /* Copy 8 bytes.  */
 433         bf      28,4f
 434
 435         lwz     6,0(12)
 436         lwz     7,4(12)
 437         addi    12,12,8
 438         stw     6,0(3)
 439         stw     7,4(3)
 440         addi    3,3,8
 441 4:      /* Copy 4 bytes.  */
 442         bf      29,2f
 443
 444         lwz     6,0(12)
 445         addi    12,12,4
 446         stw     6,0(3)
 447         addi    3,3,4
 448 2:      /* Copy 2 bytes.  */
 449         bf      30,1f
 450
 451         lhz     6,0(12)
 452         addi    12,12,2
 453         sth     6,0(3)
 454         addi    3,3,2
 455 1:      /* Copy 1 byte.  */
 456         bf      31,0f
 457
 458         lbz     6,0(12)
 459         stb     6,0(3)
 460 0:      /* Return DST + LEN pointer.  */
 461         ld      31,-8(1)      /* Restore r31.  */
 462         ld      3,-16(1)      /* Reload the original dst saved at entry.  */
 463         add     3,3,5
 464         blr
 465
 466 END_GEN_TB (__mempcpy,TB_TOCLESS)   /* Ends the function opened by EALIGN (__mempcpy, 5, 0).  */
 467 libc_hidden_def (__mempcpy)         /* Macro defined outside this file; presumably marks the internal hidden definition -- confirm.  */
 468 weak_alias (__mempcpy, mempcpy)     /* Macro defined outside this file; by its name it makes 'mempcpy' a weak alias of '__mempcpy' -- confirm.  */
 469 libc_hidden_builtin_def (mempcpy)   /* Macro defined outside this file; presumably the hidden definition for the 'mempcpy' name -- confirm.  */