sysdeps/powerpc/powerpc64/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64/POWER7.
   2    Copyright (C) 2010-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21
  22 /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  23    Returns 'dst'.  */
  24
  25 #ifndef MEMCPY
  26 # define MEMCPY memcpy
  27 #endif
  28 /* Register numbers used for the working pointers and the byte count.  */
  29 #define dst 11          /* Use r11 so r3 kept unchanged.  */
  30 #define src 4           /* Source pointer (r4), advanced in place.  */
  31 #define cnt 5           /* Bytes still to copy (r5).  */
  32
  33         .machine power7
  34 ENTRY_TOCLESS (MEMCPY, 5)
  35         CALL_MCOUNT 3
  36         /* Dispatch on length first, then on the relative alignment of SRC and DST.  */
  37         cmpldi  cr1,cnt,31
  38         neg     0,3             /* r0 = -DST; low 4 bits = bytes to next quadword.  */
  39         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  40                                     code.  */
  41
  42 /* Align the vector copies to a quadword.  This avoids alignment traps
  43    when memcpy is used on non-cacheable memory (for instance, memory
  44    mapped I/O).  */
  45         andi.   10,3,15         /* r10 = DST & 15; cr0.eq if DST is quadword aligned.  */
  46         clrldi  11,4,60         /* r11 = SRC & 15 (r11 is reloaded as dst below).  */
  47         cmpld   cr6,10,11       /* SRC and DST alignments match?  */
  48
  49         mr      dst,3           /* Work on a copy so r3 can be returned untouched.  */
  50         bne     cr6,L(copy_GE_32_unaligned)
  51         beq     L(aligned_copy) /* cr0 from andi.: both are already quadword aligned.  */
  52         /* Same non-zero alignment: copy the 1~15 bytes flagged in the low bits of r0.  */
  53         mtocrf  0x01,0          /* cr7 = low 4 bits of r0 (8/4/2/1 byte flags).  */
  54         clrldi  0,0,60          /* r0 = number of bytes copied by this prologue.  */
  55
  56 /* Get the DST and SRC aligned to 16 bytes.  */
  57 1:
  58         bf      31,2f           /* Skip unless a single byte is needed.  */
  59         lbz     6,0(src)
  60         addi    src,src,1
  61         stb     6,0(dst)
  62         addi    dst,dst,1
  63 2:
  64         bf      30,4f           /* Skip unless a halfword is needed.  */
  65         lhz     6,0(src)
  66         addi    src,src,2
  67         sth     6,0(dst)
  68         addi    dst,dst,2
  69 4:
  70         bf      29,8f           /* Skip unless a word is needed.  */
  71         lwz     6,0(src)
  72         addi    src,src,4
  73         stw     6,0(dst)
  74         addi    dst,dst,4
  75 8:
  76         bf      28,16f          /* Skip unless a doubleword is needed.  */
  77         ld      6,0(src)
  78         addi    src,src,8
  79         std     6,0(dst)
  80         addi    dst,dst,8
  81 16:
  82         subf    cnt,0,cnt       /* cnt -= bytes consumed to reach alignment.  */
  83
  84 /* Main aligned copy: 128 bytes per loop iteration, then a 0~127 byte tail.  */
  85 L(aligned_copy):
  86         li      6,16            /* r6/r7/r8 = index offsets 16/32/48 for lvx/stvx.  */
  87         li      7,32
  88         li      8,48
  89         mtocrf  0x02,cnt        /* cr6 = cnt bits worth 128/64/32/16 (used in the tail).  */
  90         srdi    12,cnt,7        /* r12 = number of 128-byte blocks.  */
  91         cmpdi   12,0
  92         beq     L(aligned_tail) /* Fewer than 128 bytes: go straight to the tail.  */
  93         lvx     6,0,src         /* Preload vr6/vr7 for the first iteration.  */
  94         lvx     7,src,6
  95         mtctr   12
  96         b       L(aligned_128loop)
  97
  98         .align  4
  99 L(aligned_128head):
 100         /* Reload vr6/vr7 for the 2nd and later iterations of this loop.  */
 101         lvx     6,0,src
 102         lvx     7,src,6
 103 L(aligned_128loop):
 104         lvx     8,src,7         /* First 64 bytes: vr6..vr9.  */
 105         lvx     9,src,8
 106         stvx    6,0,dst
 107         addi    src,src,64
 108         stvx    7,dst,6
 109         stvx    8,dst,7
 110         stvx    9,dst,8
 111         lvx     6,0,src         /* Second 64 bytes, loads issued before dst is bumped.  */
 112         lvx     7,src,6
 113         addi    dst,dst,64
 114         lvx     8,src,7
 115         lvx     9,src,8
 116         addi    src,src,64
 117         stvx    6,0,dst
 118         stvx    7,dst,6
 119         stvx    8,dst,7
 120         stvx    9,dst,8
 121         addi    dst,dst,64
 122         bdnz    L(aligned_128head)
 123
 124 L(aligned_tail):
 125         mtocrf  0x01,cnt        /* cr7 = cnt bits worth 8/4/2/1; cr6 is left intact.  */
 126         bf      25,32f          /* Skip unless 64 bytes remain (cr6).  */
 127         lvx     6,0,src
 128         lvx     7,src,6
 129         lvx     8,src,7
 130         lvx     9,src,8
 131         addi    src,src,64
 132         stvx    6,0,dst
 133         stvx    7,dst,6
 134         stvx    8,dst,7
 135         stvx    9,dst,8
 136         addi    dst,dst,64
 137 32:
 138         bf      26,16f          /* Skip unless 32 bytes remain (cr6).  */
 139         lvx     6,0,src
 140         lvx     7,src,6
 141         addi    src,src,32
 142         stvx    6,0,dst
 143         stvx    7,dst,6
 144         addi    dst,dst,32
 145 16:
 146         bf      27,8f           /* Skip unless 16 bytes remain (cr6).  */
 147         lvx     6,0,src
 148         addi    src,src,16
 149         stvx    6,0,dst
 150         addi    dst,dst,16
 151 8:
 152         bf      28,4f           /* Skip unless 8 bytes remain (cr7).  */
 153         ld      6,0(src)
 154         addi    src,src,8
 155         std     6,0(dst)
 156         addi    dst,dst,8
 157 4:      /* Copies 4~7 bytes.  */
 158         bf      29,L(tail2)     /* No word left: 0~3 bytes remain.  */
 159         lwz     6,0(src)
 160         stw     6,0(dst)
 161         bf      30,L(tail5)     /* No halfword: at most one byte at offset 4.  */
 162         lhz     7,4(src)
 163         sth     7,4(dst)
 164         bflr    31              /* Return unless a final odd byte remains.  */
 165         lbz     8,6(src)
 166         stb     8,6(dst)
 167         /* Return original DST pointer.  */
 168         blr
 169
 170
 171 /* Handle copies of 0~31 bytes.  */
 172         .align  4
 173 L(copy_LT_32):
 174         mr      dst,3
 175         cmpldi  cr6,cnt,8
 176         mtocrf  0x01,cnt        /* cr7 = low 4 bits of cnt (8/4/2/1 flags).  */
 177         ble     cr6,L(copy_LE_8)
 178
 179         /* At least 9 bytes to go.  */
 180         neg     8,4
 181         andi.   0,8,3           /* r0 = bytes needed to word-align SRC (0~3).  */
 182         cmpldi  cr1,cnt,16
 183         beq     L(copy_LT_32_aligned)   /* SRC is already word aligned.  */
 184
 185         /* Force 4-byte alignment for SRC.  */
 186         mtocrf  0x01,0          /* cr7 bits 30/31 = halfword/byte to copy.  */
 187         subf    cnt,0,cnt
 188 2:
 189         bf      30,1f
 190         lhz     6,0(src)
 191         addi    src,src,2
 192         sth     6,0(dst)
 193         addi    dst,dst,2
 194 1:
 195         bf      31,L(end_4bytes_alignment)
 196         lbz     6,0(src)
 197         addi    src,src,1
 198         stb     6,0(dst)
 199         addi    dst,dst,1
 200
 201         .align  4
 202 L(end_4bytes_alignment):
 203         cmpldi  cr1,cnt,16      /* Redo both tests using the reduced cnt.  */
 204         mtocrf  0x01,cnt
 205
 206 L(copy_LT_32_aligned):
 207         /* At least 6 bytes to go, and SRC is word-aligned.  */
 208         blt     cr1,8f          /* Fewer than 16 bytes left.  */
 209
 210         /* Copy 16 bytes.  */
 211         lwz     6,0(src)
 212         lwz     7,4(src)
 213         stw     6,0(dst)
 214         lwz     8,8(src)
 215         stw     7,4(dst)
 216         lwz     6,12(src)
 217         addi    src,src,16
 218         stw     8,8(dst)
 219         stw     6,12(dst)
 220         addi    dst,dst,16
 221 8:      /* Copy 8 bytes.  */
 222         bf      28,L(tail4)
 223         lwz     6,0(src)
 224         lwz     7,4(src)
 225         addi    src,src,8
 226         stw     6,0(dst)
 227         stw     7,4(dst)
 228         addi    dst,dst,8
 229
 230         .align  4
 231 /* Copies 4~7 bytes.  */
 232 L(tail4):
 233         bf      29,L(tail2)     /* No word left: 0~3 bytes remain.  */
 234         lwz     6,0(src)
 235         stw     6,0(dst)
 236         bf      30,L(tail5)     /* No halfword: at most one byte at offset 4.  */
 237         lhz     7,4(src)
 238         sth     7,4(dst)
 239         bflr    31              /* Return unless a final odd byte remains.  */
 240         lbz     8,6(src)
 241         stb     8,6(dst)
 242         /* Return original DST pointer.  */
 243         blr
 244
 245         .align  4
 246 /* Copies 2~3 bytes.  */
 247 L(tail2):
 248         bf      30,1f           /* No halfword: 0 or 1 byte, handled at 1: below.  */
 249         lhz     6,0(src)
 250         sth     6,0(dst)
 251         bflr    31
 252         lbz     7,2(src)
 253         stb     7,2(dst)
 254         blr
 255
 256         .align  4
 257 L(tail5):                       /* A word was copied; optional byte at offset 4.  */
 258         bflr    31
 259         lbz     6,4(src)
 260         stb     6,4(dst)
 261         blr
 262
 263         .align  4
 264 1:                              /* 0 or 1 byte left.  */
 265         bflr    31
 266         lbz     6,0(src)
 267         stb     6,0(dst)
 268         /* Return original DST pointer.  */
 269         blr
 270
 271
 272 /* Handles copies of 0~8 bytes.  */
 273         .align  4
 274 L(copy_LE_8):
 275         bne     cr6,L(tail4)    /* cr6 from cmpldi cnt,8: fewer than 8 bytes.  */
 276
 277         /* Though we could've used ld/std here, they are still
 278         slow for unaligned cases.  */
 279         /* Exactly 8 bytes: copy them as two words.  */
 280         lwz     6,0(src)
 281         lwz     7,4(src)
 282         stw     6,0(dst)
 283         stw     7,4(dst)
 284         blr
 285
 286
 287 /* Handle copies of 32+ bytes where the SRC and DST alignments differ.
 288    DST is first brought to a quadword boundary; SRC is then read with aligned
 289    quadword loads that are permuted to realign the data for aligned DST stores.  */
 290         .align  4
 291 L(copy_GE_32_unaligned):
 292         clrldi  0,0,60        /* Number of bytes until the 1st dst quadword.  */
 293         srdi    9,cnt,4       /* Number of full quadwords remaining.  */
 294
 295         beq     L(copy_GE_32_unaligned_cont)  /* cr0 from the entry andi.: DST aligned.  */
 296
 297         /* DST is not quadword aligned, get it aligned.  */
 298
 299         mtocrf  0x01,0        /* cr7 = low 4 bits of r0 (8/4/2/1 byte flags).  */
 300         subf    cnt,0,cnt
 301
 302         /* Vector instructions work best when proper alignment (16-bytes)
 303         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 304 1:
 305         bf      31,2f
 306         lbz     6,0(src)
 307         addi    src,src,1
 308         stb     6,0(dst)
 309         addi    dst,dst,1
 310 2:
 311         bf      30,4f
 312         lhz     6,0(src)
 313         addi    src,src,2
 314         sth     6,0(dst)
 315         addi    dst,dst,2
 316 4:
 317         bf      29,8f
 318         lwz     6,0(src)
 319         addi    src,src,4
 320         stw     6,0(dst)
 321         addi    dst,dst,4
 322 8:
 323         bf      28,0f
 324         ld      6,0(src)
 325         addi    src,src,8
 326         std     6,0(dst)
 327         addi    dst,dst,8
 328 0:
 329         srdi    9,cnt,4       /* Number of full quadwords remaining.  */
 330
 331         /* The proper alignment is present, it is OK to copy the bytes now.  */
 332 L(copy_GE_32_unaligned_cont):
 333
 334         /* Setup two indexes to speed up the indexed vector operations.  */
 335         clrldi  10,cnt,60     /* r10 = tail bytes (cnt mod 16).  */
 336         li      6,16          /* Index for 16-bytes offsets.  */
 337         li      7,32          /* Index for 32-bytes offsets.  */
 338         cmpldi  cr1,10,0      /* cr1.eq: there are no tail bytes.  */
 339         srdi    8,cnt,5       /* Setup the loop counter.  */
 340         mtocrf  0x01,9        /* cr7 bit 31 set: odd number of quadwords.  */
 341         cmpldi  cr6,9,1       /* cr6: at most one quadword, so skip the loop.  */
 342 #ifdef __LITTLE_ENDIAN__
 343         lvsr    5,0,src
 344 #else
 345         lvsl    5,0,src
 346 #endif
 347         lvx     3,0,src
 348         /* vr5 = permute control for SRC's offset, vr3 = first SRC quadword.  */
 349         bf      31,L(setup_unaligned_loop)
 350
 351         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 352         lvx     4,src,6
 353 #ifdef __LITTLE_ENDIAN__
 354         vperm   6,4,3,5
 355 #else
 356         vperm   6,3,4,5
 357 #endif
 358         addi    src,src,16
 359         stvx    6,0,dst
 360         addi    dst,dst,16
 361         vor     3,4,4         /* vr3 = vr4: carry the last loaded quadword forward.  */
 362
 363
 364 L(setup_unaligned_loop):
 365         mtctr   8
 366         ble     cr6,L(end_unaligned_loop)
 367
 368         /* Copy 32 bytes at a time using vector instructions.  */
 369         .align  4
 370 L(unaligned_loop):
 371
 372         /* Note: vr6/vr10 may contain data that was already copied,
 373         but in order to get proper alignment, we may have to copy
 374         some portions again. This is faster than having unaligned
 375         vector instructions though.  */
 376
 377         lvx     4,src,6
 378 #ifdef __LITTLE_ENDIAN__
 379         vperm   6,4,3,5
 380 #else
 381         vperm   6,3,4,5
 382 #endif
 383         lvx     3,src,7
 384 #ifdef __LITTLE_ENDIAN__
 385         vperm   10,3,4,5
 386 #else
 387         vperm   10,4,3,5
 388 #endif
 389         addi    src,src,32
 390         stvx    6,0,dst
 391         stvx    10,dst,6
 392         addi    dst,dst,32
 393         bdnz    L(unaligned_loop)
 394
 395         /* Fall through to the tail; SRC and DST are both up to date.  */
 396
 397         .align  4
 398 L(end_unaligned_loop):
 399
 400         /* Check for tail bytes.  */
 401         mtocrf  0x01,cnt      /* cr7 = low 4 bits of cnt (8/4/2/1 flags).  */
 402         beqlr   cr1
 403         /* SRC was advanced in step with DST above, so it already points
 404         at the first tail byte; no pointer fix-up is needed here.  */
 405
 406         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 407         /* Copy 8 bytes.  */
 408         bf      28,4f
 409         lwz     6,0(src)
 410         lwz     7,4(src)
 411         addi    src,src,8
 412         stw     6,0(dst)
 413         stw     7,4(dst)
 414         addi    dst,dst,8
 415 4:      /* Copy 4~7 bytes.  */
 416         bf      29,L(tail2)   /* No word left: 0~3 bytes remain.  */
 417         lwz     6,0(src)
 418         stw     6,0(dst)
 419         bf      30,L(tail5)   /* No halfword: at most one byte at offset 4.  */
 420         lhz     7,4(src)
 421         sth     7,4(dst)
 422         bflr    31            /* Return unless a final odd byte remains.  */
 423         lbz     8,6(src)
 424         stb     8,6(dst)
 425         /* Return original DST pointer.  */
 426         blr
 427
 428 END_GEN_TB (MEMCPY,TB_TOCLESS)
 429 libc_hidden_builtin_def (memcpy)