sysdeps/powerpc/powerpc64/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64/POWER7.
   2    Copyright (C) 2010 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  19    02110-1301 USA.  */
  20
  21 #include <sysdep.h>
  22 #include <bp-sym.h>
  23 #include <bp-asm.h>
  24
  25
  26 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  27    Returns 'dst'.  The incoming DST is saved at -16(r1) and reloaded into r3
  28    before every blr; r31 is saved at -8(r1) and reloaded on the 32+ byte exits.  */
  29         .machine power7
  30 EALIGN (BP_SYM (memcpy), 5, 0)
  31         CALL_MCOUNT 3
  32
  33         cmpldi  cr1,5,31
  34         neg     0,3           /* r0 = -DST, used to count bytes to alignment.  */
  35         std     3,-16(1)      /* Save DST for the return value.  */
  36         std     31,-8(1)      /* Save r31; it carries the byte count below.  */
  37         cfi_offset(31,-8)
  38         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  39                                     code.  */
  40
  41         andi.   11,3,7        /* Check alignment of DST.  */
  42         /* cr0 from the andi. above is left intact until the beq below.  */
  43
  44         clrldi  10,4,61       /* Check alignment of SRC.  */
  45         cmpld   cr6,10,11     /* SRC and DST alignments match?  */
  46         mr      12,4          /* r12 = SRC cursor.  */
  47         mr      31,5          /* r31 = bytes to copy.  */
  48         bne     cr6,L(copy_GE_32_unaligned)
  49
  50         srdi    9,5,3         /* Number of full doublewords remaining.  */
  51
  52         beq    L(copy_GE_32_aligned_cont)  /* DST is 8-byte aligned already.  */
  53
  54         clrldi  0,0,61        /* r0 = bytes until DST is 8-byte aligned.  */
  55         mtcrf   0x01,0        /* cr7 = low 4 bits of r0 for the bf tests.  */
  56         subf    31,0,5        /* r31 = len minus the alignment bytes.  */
  57
  58         /* Copy 1, 2 and/or 4 bytes so DST becomes 8-byte aligned; SRC has
  59         the same low 3 address bits on this path, so it becomes aligned too.  */
  60 1:      bf      31,2f         /* Copy 1 byte if r0 & 1.  */
  61         lbz     6,0(12)
  62         addi    12,12,1
  63         stb     6,0(3)
  64         addi    3,3,1
  65 2:      bf      30,4f         /* Copy 2 bytes if r0 & 2.  */
  66         lhz     6,0(12)
  67         addi    12,12,2
  68         sth     6,0(3)
  69         addi    3,3,2
  70 4:      bf      29,0f         /* Copy 4 bytes if r0 & 4.  */
  71         lwz     6,0(12)
  72         addi    12,12,4
  73         stw     6,0(3)
  74         addi    3,3,4
  75 0:
  76         clrldi  10,12,61      /* Check alignment of SRC again.  */
  77         srdi    9,31,3        /* Number of full doublewords remaining.  */
  78         /* NOTE(review): the r10 computed above is rewritten below before any read.  */
  79 L(copy_GE_32_aligned_cont):
  80
  81         clrldi  11,31,61      /* r11 = tail bytes (r31 & 7).  */
  82         mtcrf   0x01,9        /* cr7 = low 4 bits of the doubleword count.  */
  83
  84         srdi    8,31,5        /* r8 = number of 32-byte loop iterations.  */
  85         cmpldi  cr1,9,4       /* cr1.lt = fewer than 4 doublewords.  */
  86         cmpldi  cr6,11,0      /* cr6.eq = no tail bytes.  */
  87         mr      11,12         /* r11 = SRC cursor for the loop.  */
  88
  89         /* Copy 1~3 doublewords so the main loop starts
  90         at a multiple of 32 bytes.  */
  91
  92         bf      30,1f         /* Skip unless doubleword count & 2.  */
  93         ld      6,0(12)
  94         ld      7,8(12)
  95         addi    11,12,16
  96         mtctr   8
  97         std     6,0(3)
  98         std     7,8(3)
  99         addi    10,3,16
 100         bf      31,4f         /* No odd doubleword: enter the loop.  */
 101         ld      0,16(12)
 102         std     0,16(3)
 103         blt     cr1,3f        /* Only 3 doublewords in total: skip the loop.  */
 104         addi    11,12,24
 105         addi    10,3,24
 106         b       4f
 107
 108         .align  4
 109 1:      /* Copy 1 doubleword and set the counter.  */
 110         mr      10,3          /* r10 = DST cursor for the loop.  */
 111         mtctr   8
 112         bf      31,4f         /* No odd doubleword: enter the loop.  */
 113         ld      6,0(12)
 114         addi    11,12,8
 115         std     6,0(3)
 116         addi    10,3,8
 117
 118         /* Main aligned copy loop. Copies 32-bytes at a time.  */
 119         .align  4
 120 4:
 121         ld      6,0(11)
 122         ld      7,8(11)
 123         ld      8,16(11)
 124         ld      0,24(11)
 125         addi    11,11,32
 126
 127         std     6,0(10)
 128         std     7,8(10)
 129         std     8,16(10)
 130         std     0,24(10)
 131         addi    10,10,32
 132         bdnz    4b
 133 3:
 134
 135         /* Check for tail bytes.  */
 136         rldicr  0,31,0,60     /* r0 = r31 & ~7: bytes copied as doublewords.  */
 137         mtcrf   0x01,31       /* cr7 = low 4 bits of r31 for the tail tests.  */
 138         beq     cr6,0f        /* No tail bytes: done.  */
 139
 140 .L9:
 141         add     3,3,0         /* Advance DST past the doublewords.  */
 142         add     12,12,0       /* Advance SRC past the doublewords.  */
 143
 144         /*  At this point we have a tail of 0-7 bytes and we know that the
 145         destination is doubleword-aligned.  */
 146 4:      /* Copy 4 bytes.  */
 147         bf      29,2f
 148
 149         lwz     6,0(12)
 150         addi    12,12,4
 151         stw     6,0(3)
 152         addi    3,3,4
 153 2:      /* Copy 2 bytes.  */
 154         bf      30,1f
 155
 156         lhz     6,0(12)
 157         addi    12,12,2
 158         sth     6,0(3)
 159         addi    3,3,2
 160 1:      /* Copy 1 byte.  */
 161         bf      31,0f
 162
 163         lbz     6,0(12)
 164         stb     6,0(3)
 165 0:      /* Return original DST pointer.  */
 166         ld      31,-8(1)
 167         ld      3,-16(1)
 168         blr
 169
 170         /* Handle copies of 0~31 bytes.  */
 171         .align  4
 172 L(copy_LT_32):
 173         cmpldi  cr6,5,8
 174         mr      12,4          /* r12 = SRC cursor.  */
 175         mtcrf   0x01,5        /* cr7 = low 4 bits of len.  */
 176         ble     cr6,L(copy_LE_8)
 177
 178         /* At least 9 bytes to go.  */
 179         neg     8,4
 180         clrrdi  11,4,2        /* NOTE(review): r11 is not read again on this path.  */
 181         andi.   0,8,3         /* r0 = bytes until SRC is word-aligned.  */
 182         cmpldi  cr1,5,16
 183         mr      10,5          /* r10 = bytes remaining.  */
 184         beq     L(copy_LT_32_aligned)  /* SRC is word-aligned already.  */
 185
 186         /* Force 4-bytes alignment for SRC.  */
 187         mtocrf  0x01,0        /* cr7 = low 4 bits of r0 for the bf tests.  */
 188         subf    10,0,5        /* r10 = len minus the alignment bytes.  */
 189 2:      bf      30,1f
 190
 191         lhz     6,0(12)
 192         addi    12,12,2
 193         sth     6,0(3)
 194         addi    3,3,2
 195 1:      bf      31,L(end_4bytes_alignment)
 196
 197         lbz     6,0(12)
 198         addi    12,12,1
 199         stb     6,0(3)
 200         addi    3,3,1
 201
 202         .align  4
 203 L(end_4bytes_alignment):
 204         cmpldi  cr1,10,16     /* Redo the 16-byte test on the reduced count.  */
 205         mtcrf   0x01,10       /* cr7 = low 4 bits of the reduced count.  */
 206
 207 L(copy_LT_32_aligned):
 208         /* At least 6 bytes to go, and SRC is word-aligned.  */
 209         blt     cr1,8f        /* Fewer than 16 bytes left.  */
 210
 211         /* Copy 16 bytes.  */
 212         lwz     6,0(12)
 213         lwz     7,4(12)
 214         stw     6,0(3)
 215         lwz     8,8(12)
 216         stw     7,4(3)
 217         lwz     6,12(12)
 218         addi    12,12,16
 219         stw     8,8(3)
 220         stw     6,12(3)
 221         addi    3,3,16
 222 8:      /* Copy 8 bytes.  */
 223         bf      28,4f
 224
 225         lwz     6,0(12)
 226         lwz     7,4(12)
 227         addi    12,12,8
 228         stw     6,0(3)
 229         stw     7,4(3)
 230         addi    3,3,8
 231 4:      /* Copy 4 bytes.  */
 232         bf      29,2f
 233
 234         lwz     6,0(12)
 235         addi    12,12,4
 236         stw     6,0(3)
 237         addi    3,3,4
 238 2:      /* Copy 2-3 bytes.  Also entered from copy_LE_8 (bf 29,2b) when len < 4.  */
 239         bf      30,1f
 240
 241         lhz     6,0(12)
 242         sth     6,0(3)
 243         bf      31,0f
 244         lbz     7,2(12)
 245         stb     7,2(3)
 246         ld      3,-16(1)
 247         blr
 248
 249         .align  4
 250 1:      /* Copy 1 byte.  */
 251         bf      31,0f
 252
 253         lbz     6,0(12)
 254         stb     6,0(3)
 255 0:      /* Return original DST pointer.  */
 256         ld      3,-16(1)
 257         blr
 258
 259         /* Handles copies of 0~8 bytes.  */
 260         .align  4
 261 L(copy_LE_8):
 262         bne     cr6,4f        /* len != 8.  */
 263
 264         /* Though we could've used ld/std here, they are still
 265         slow for unaligned cases.  */
 266
 267         lwz     6,0(4)
 268         lwz     7,4(4)
 269         stw     6,0(3)
 270         stw     7,4(3)
 271         ld      3,-16(1)      /* Return original DST pointers.  */
 272         blr
 273
 274         .align  4
 275 4:      /* Copies 4~7 bytes.  */
 276         bf      29,2b         /* len < 4: reuse the 2-3/1 byte code above (r12 = SRC).  */
 277
 278         lwz     6,0(4)
 279         stw     6,0(3)
 280         bf      30,5f
 281         lhz     7,4(4)
 282         sth     7,4(3)
 283         bf      31,0f
 284         lbz     8,6(4)
 285         stb     8,6(3)
 286         ld      3,-16(1)
 287         blr
 288
 289         .align  4
 290 5:      /* Copy 1 byte.  */
 291         bf      31,0f
 292
 293         lbz     6,4(4)
 294         stb     6,4(3)
 295
 296 0:      /* Return original DST pointer.  */
 297         ld      3,-16(1)
 298         blr
 299
 300         /* Handle copies of 32+ bytes where SRC and DST differ in their low 3
 301         address bits.  DST is first brought to quadword alignment; SRC is then read
 302         with aligned quadword loads and permuted to realign the data for aligned DST stores.  */
 303         .align  4
 304 L(copy_GE_32_unaligned):
 305         clrldi  0,0,60        /* Number of bytes until the 1st
 306                               quadword of DST.  */
 307         andi.   11,3,15       /* Check alignment of DST (against
 308                               quadwords).  */
 309         srdi    9,5,4         /* Number of full quadwords remaining.  */
 310
 311         beq     L(copy_GE_32_unaligned_cont)  /* DST is quadword aligned already.  */
 312
 313         /* DST is not quadword aligned, get it aligned.  */
 314
 315         mtcrf   0x01,0        /* cr7 = low 4 bits of r0 for the bf tests.  */
 316         subf    31,0,5        /* r31 = len minus the alignment bytes.  */
 317
 318         /* Vector instructions work best when proper alignment (16-bytes)
 319         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 320 1:      /* Copy 1 byte.  */
 321         bf      31,2f
 322
 323         lbz     6,0(12)
 324         addi    12,12,1
 325         stb     6,0(3)
 326         addi    3,3,1
 327 2:      /* Copy 2 bytes.  */
 328         bf      30,4f
 329
 330         lhz     6,0(12)
 331         addi    12,12,2
 332         sth     6,0(3)
 333         addi    3,3,2
 334 4:      /* Copy 4 bytes.  */
 335         bf      29,8f
 336
 337         lwz     6,0(12)
 338         addi    12,12,4
 339         stw     6,0(3)
 340         addi    3,3,4
 341 8:      /* Copy 8 bytes.  */
 342         bf      28,0f
 343
 344         ld      6,0(12)
 345         addi    12,12,8
 346         std     6,0(3)
 347         addi    3,3,8
 348 0:
 349         clrldi  10,12,60      /* Check alignment of SRC.  */
 350         srdi    9,31,4        /* Number of full quadwords remaining.  */
 351         /* NOTE(review): the r10 computed above is rewritten below (mr 10,3) before any read.  */
 352         /* The proper alignment is present, it is OK to copy the bytes now.  */
 353 L(copy_GE_32_unaligned_cont):
 354
 355         /* Setup two indexes to speed up the indexed vector operations.  */
 356         clrldi  11,31,60      /* r11 = tail bytes (r31 & 15).  */
 357         li      6,16          /* Index for 16-bytes offsets.  */
 358         li      7,32          /* Index for 32-bytes offsets.  */
 359         cmpldi  cr1,11,0      /* cr1.eq = no tail bytes.  */
 360         srdi    8,31,5        /* Setup the loop counter.  */
 361         mr      10,3          /* r10 = DST cursor.  */
 362         mr      11,12         /* r11 = SRC cursor.  */
 363         mtcrf   0x01,9        /* cr7 = low 4 bits of the quadword count.  */
 364         cmpldi  cr6,9,1
 365         lvsl    5,0,12        /* vr5 = permute control derived from the SRC address.  */
 366         lvx     3,0,12        /* vr3 = aligned quadword containing the SRC cursor.  */
 367         bf      31,L(setup_unaligned_loop)  /* Even quadword count: no extra copy.  */
 368
 369         /* The quadword count is odd: copy one quadword now so that the 32-byte loop handles the rest.  */
 370         lvx     4,12,6
 371         vperm   6,3,4,5
 372         addi    11,12,16
 373         addi    10,3,16
 374         stvx    6,0,3
 375         vor     3,4,4         /* vr3 = vr4, carried into the loop.  */
 376
 377 L(setup_unaligned_loop):
 378         mtctr   8
 379         ble     cr6,L(end_unaligned_loop)  /* At most 1 quadword: skip the loop.  */
 380
 381         /* Copy 32 bytes at a time using vector instructions.  */
 382         .align  4
 383 L(unaligned_loop):
 384
 385         /* Note: vr6/vr10 may contain data that was already copied,
 386         but in order to get proper alignment, we may have to copy
 387         some portions again. This is faster than having unaligned
 388         vector instructions though.  */
 389
 390         lvx     4,11,6        /* vr4 = r11+16.  */
 391         vperm   6,3,4,5       /* Merge the correctly-aligned portions
 392                               of vr3/vr4 into vr6.  */
 393         lvx     3,11,7        /* vr3 = r11+32.  */
 394         vperm   10,4,3,5      /* Merge the correctly-aligned portions
 395                               of vr3/vr4 into vr10.  */
 396         addi    11,11,32
 397         stvx    6,0,10
 398         stvx    10,10,6
 399         addi    10,10,32
 400
 401         bdnz    L(unaligned_loop)
 402
 403         .align  4
 404 L(end_unaligned_loop):
 405
 406         /* Check for tail bytes.  */
 407         rldicr  0,31,0,59     /* r0 = r31 & ~15: bytes copied as quadwords.  */
 408         mtcrf   0x01,31       /* cr7 = low 4 bits of r31 for the tail tests.  */
 409         beq     cr1,0f        /* No tail bytes: done.  */
 410
 411         add     3,3,0         /* Advance DST past the quadwords.  */
 412         add     12,12,0       /* Advance SRC past the quadwords.  */
 413
 414         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 415 8:      /* Copy 8 bytes.  */
 416         bf      28,4f
 417
 418         lwz     6,0(12)
 419         lwz     7,4(12)
 420         addi    12,12,8
 421         stw     6,0(3)
 422         stw     7,4(3)
 423         addi    3,3,8
 424 4:      /* Copy 4 bytes.  */
 425         bf      29,2f
 426
 427         lwz     6,0(12)
 428         addi    12,12,4
 429         stw     6,0(3)
 430         addi    3,3,4
 431 2:      /* Copy 2 bytes.  */
 432         bf      30,1f
 433
 434         lhz     6,0(12)
 435         addi    12,12,2
 436         sth     6,0(3)
 437         addi    3,3,2
 438 1:      /* Copy 1 byte.  */
 439         bf      31,0f
 440
 441         lbz     6,0(12)
 442         stb     6,0(3)
 443 0:      /* Return original DST pointer.  */
 444         ld      31,-8(1)
 445         ld      3,-16(1)
 446         blr
 447
 448 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
 449 libc_hidden_builtin_def (memcpy)