/* glibc: sysdeps/powerpc/powerpc64/power8/memset.S
   blob 459692670995e20acb688b9f5b40135eec35547e
   (captured at commit "Update copyright dates with scripts/update-copyrights.")  */
/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* mtvsrd v1,r4 encoded as a raw opcode — presumably so the file assembles
   with binutils that lack POWER8 mnemonics; confirm against the minimum
   binutils version glibc requires.  */
#define MTVSRD_V1_R4  .long 0x7c240166     /* mtvsrd  v1,r4  */

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
   Returns 's'.

   Register usage:
     r3  - original DST; never modified, so it is the return value.
     r4  - set byte, replicated up to a full doubleword.
     r5  - remaining length in bytes.
     r10 - working DST pointer.
     r0, r11 - scratch (alignment amounts / tail byte counts).

   Tail handling pattern used throughout: a byte count is moved into a CR
   field with mtocrf, then individual CR bits are tested with bf/bt so that
   each set bit emits exactly one store of the matching power-of-two size.  */

	.machine power8
EALIGN (memset, 5, 0)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31	/* n <= 31 takes the fully unrolled path.  */
	neg	r0,r3		/* -DST: low bits give distance to alignment.  */
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */

	beq	L(big_aligned)

	mtocrf	0x01,r0		/* CR7 = low nibble of (-DST).  */
	clrldi	r0,r0,60	/* r0 = bytes needed to reach 16-byte alignment.  */

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5	/* Account for the alignment bytes written.  */

	.align	4
L(big_aligned):
	/* For sizes larger than 255 two possible paths:
	   - if constant is '0', zero full cache lines with dcbz
	   - otherwise uses vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10		/* Prefetch DST line for store.  */
	cmpldi	cr6,r4,0
	crand	27,26,21	/* cr6.eq (c == 0) AND cr5.gt (n > 255).  */
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)

	/* Size between 32 and 255 bytes with constant different than 0, use
	   doubleword store instruction to achieve best throughput.  */
	srdi	r8,r5,5		/* r8 = number of 32-byte chunks.  */
	clrldi	r11,r5,59	/* r11 = n % 32.  */
	cmpldi	cr6,r11,0
	cmpdi	r8,0
	beq	L(tail_bytes)
	mtctr	r8

	/* Main aligned write loop, writes 32-bytes at a time.  */
	.align	4
L(big_loop):
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdz	L(tail_bytes)	/* CTR counts 32-byte chunks; body is 2x unrolled.  */

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdnz	L(big_loop)

	b	L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align	4
L(tail_bytes):
	beqlr	cr6		/* No remainder: done.  */

	srdi	r7,r11,4	/* r7 = number of 16-byte chunks (0 or 1).  */
	clrldi	r8,r11,60	/* r8 = remainder % 16.  */
	mtocrf	0x01,r7

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8		/* CR7 bits now select 8/4/2/1-byte stores.  */
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Size larger than 255 bytes with constant different than 0, use
	   vector instruction to achieve best throughput.  */
L(huge_vector):
	/* Replicate set byte to quadword in VMX register.  */
	MTVSRD_V1_R4		/* v1 (high doubleword) = r4.  */
	xxpermdi 32,v0,v1,0	/* vs32 (= v0): place the set doubleword for splat.  */
	vspltb	 v2,v0,15	/* v2 = set byte splatted into all 16 lanes.  */

	/* Main aligned write loop: 128 bytes at a time.  */
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5		/* CR6 bits <- n's 64/32/16-byte count bits.  */
	srdi	r12,r5,7	/* r12 = number of 128-byte chunks.  */
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)

	.align	4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5		/* CR7 bits <- n's 8/4/2/1-byte count bits.  */
	bf	25,32f		/* CR bit 25 <-> 64 remaining bytes.  */
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f		/* CR bit 26 <-> 32 remaining bytes.  */
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f		/* CR bit 27 <-> 16 remaining bytes.  */
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57	/* r0 = bytes to reach 128-byte alignment.  */
	subf	r5,r0,r5
	srdi	r0,r0,3		/* r0 = that distance in doublewords.  */
	mtocrf	0x01,r0		/* CR7 bits select 64/32/16/8-byte stores.  */

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9		/* r8 = number of 512-byte chunks.  */
	clrldi	r11,r5,55	/* r11 = n % 512.  */
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6		/* No remainder after the 512-byte loop: done.  */

L(huge_tail):
	srdi	r6,r11,8	/* r6 = number of 256-byte chunks (0 or 1).  */
	srdi	r7,r11,4	/* r7 = remainder in 16-byte units.  */
	clrldi	r8,r11,4	/* Only r8's low nibble feeds mtocrf below.  */
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f		/* r6 bit 0 set: 256 bytes via two dcbz.  */
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7		/* CR7 bits select 128/.../16-byte stores.  */
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
L(tail):
	mtocrf	0x01,r8		/* CR7 bits select 8/4/2/1-byte stores.  */

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(write_LE_8)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	/* NOTE(review): r0 above is derived from r4 (the replicated set
	   byte), not from the DST address, and the comment below is
	   inherited from memcpy (memset has no SRC).  The byte accounting
	   still balances — exactly r0 bytes are stored and subtracted from
	   n — so the result is correct, but DST is not necessarily 4-byte
	   aligned afterwards.  Confirm intent upstream.  */
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(tail4)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */

/* void [r3] __bzero (void *s [r3], size_t n [r4])
   Implemented as a tail branch into memset's body with c = 0.  */
ENTRY (__bzero)
	CALL_MCOUNT 3
	mr	r5,r4		/* Length moves to memset's n register.  */
	li	r4,0		/* Fill value is zero.  */
	b	L(_memset)	/* Enter past memset's CALL_MCOUNT (done above).  */
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif