sysdeps/powerpc/powerpc64/power8/memset.S

/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif
        .machine  power8
ENTRY_TOCLESS (MEMSET, 5)
        CALL_MCOUNT 3

L(_memset):
        cmpldi  cr7,r5,31
        neg     r0,r3
        mr      r10,r3

        insrdi  r4,r4,8,48
        insrdi  r4,r4,16,32     /* Replicate byte to word.  */
        ble     cr7,L(write_LT_32)

        andi.   r11,r10,15      /* Check alignment of DST.  */
        insrdi  r4,r4,32,0      /* Replicate word to doubleword.  */
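
        /* After the three insrdi above every byte of r4 equals 'c'.
           In C terms (illustration only):
             w = c & 0xff;  w |= w << 8;  w |= w << 16;  w |= w << 32;  */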
        beq     L(big_aligned)

        mtocrf  0x01,r0
        clrldi  r0,r0,60

        /* Get DST aligned to 16 bytes.  */
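        /* mtocrf 0x01 copied the low four bits of the fixup count into
           CR7; each bf below falls through to its store only when the
           matching 1/2/4/8 bit is set, so exactly r0 bytes are written.  */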
1:      bf      31,2f
        stb     r4,0(r10)
        addi    r10,r10,1

2:      bf      30,4f
        sth     r4,0(r10)
        addi    r10,r10,2

4:      bf      29,8f
        stw     r4,0(r10)
        addi    r10,r10,4

8:      bf      28,16f
        std     r4,0(r10)
        addi    r10,r10,8

16:     subf    r5,r0,r5
        .align  4
L(big_aligned):
        /* For sizes larger than 255 there are two possible paths:
           - if the constant is '0', zero full cache lines with dcbz;
           - otherwise, use vector instructions.  */
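        /* cr5 holds the (n <=> 255) comparison and cr6 holds (c == 0);
           crand folds "c == 0 && n > 255" into CR bit 27 so a single
           branch (bt 27) selects the dcbz path.  */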
        cmpldi  cr5,r5,255
        dcbtst  0,r10
        cmpldi  cr6,r4,0
        crand   27,26,21
        bt      27,L(huge_dcbz)
        bge     cr5,L(huge_vector)

        /* For a size between 32 and 255 bytes with a constant different
           from 0, use doubleword stores to achieve best throughput.  */
        srdi    r8,r5,5
        clrldi  r11,r5,59
        cmpldi  cr6,r11,0
        cmpdi   r8,0
        beq     L(tail_bytes)
        mtctr   r8
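
        /* r8 = n / 32 is the loop count and r11 = n % 32 the tail;
           cr6 records whether the tail is empty for L(tail_bytes).  */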
        /* Main aligned write loop, writes 32 bytes at a time.  */
        .align  4
L(big_loop):
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32
        bdz     L(tail_bytes)

        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32
        bdnz    L(big_loop)

        b       L(tail_bytes)

        /* Write remaining 1~31 bytes.  */
        .align  4
L(tail_bytes):
        beqlr   cr6

        srdi    r7,r11,4
        clrldi  r8,r11,60
        mtocrf  0x01,r7
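
        /* CR7 got the 16-byte bit from r7; r8 keeps the low four bits
           for the 8/4/2/1 byte stores below.  */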
        .align  4
        bf      31,8f
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        .align  4
8:      mtocrf  0x01,r8
        bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        .align  4
4:      bf      29,2f
        stw     r4,0(r10)
        addi    r10,r10,4

        .align  4
2:      bf      30,1f
        sth     r4,0(r10)
        addi    r10,r10,2

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* For sizes larger than 255 bytes with a constant different
           from 0, use vector instructions to achieve best throughput.  */
L(huge_vector):
        /* Replicate the set byte to a quadword in a VMX register.  */
        mtvsrd   v1,r4
        xxpermdi 32,v0,v1,0
        vspltb   v2,v0,15
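
        /* mtvsrd moves the already-replicated doubleword in r4 into v1;
           xxpermdi copies it into the second doubleword of vs32 (v0);
           vspltb then broadcasts one of those bytes so that all 16
           lanes of v2 hold 'c'.  */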

        /* Main aligned write loop: 128 bytes at a time.  */
        li      r6,16
        li      r7,32
        li      r8,48
        mtocrf  0x02,r5
        srdi    r12,r5,7
        cmpdi   r12,0
        beq     L(aligned_tail)
        mtctr   r12
        b       L(aligned_128loop)

        .align  4
L(aligned_128loop):
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64
        bdnz    L(aligned_128loop)
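
        /* mtocrf 0x02,r5 above copied the 64/32/16 weight bits of the
           length into CR6, so the bf 25/26/27 tests below peel a 64-,
           32- and 16-byte chunk off the remainder without recomputing
           it.  */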
        /* Write remaining 1~127 bytes.  */
L(aligned_tail):
        mtocrf  0x01,r5
        bf      25,32f
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64

32:     bf      26,16f
        stvx    v2,0,r10
        stvx    v2,r10,r6
        addi    r10,r10,32

16:     bf      27,8f
        stvx    v2,0,r10
        addi    r10,r10,16

8:      bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        /* Write 4~7 bytes.  */
4:      bf      29,L(tail2)
        stw     r4,0(r10)
        bf      30,L(tail5)
        sth     r4,4(r10)
        bflr    31
        stb     r4,6(r10)
        /* Return original DST pointer.  */
        blr

        /* Special case when value is 0 and we have a long length to
           deal with.  Use dcbz to zero out a full cache line of 128
           bytes at a time.  Before using dcbz though, we need to get
           the destination 128-byte aligned.  */
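        /* Illustrative C-level shape of this path (not part of the
           build):

             while ((uintptr_t) dst & 127)   // align to a cache line
               ...                           // 8~64 byte stores below
             for (; blocks != 0; blocks--, dst += 512)
               dcbz dst+0, dst+128, dst+256, dst+384;

           dcbz establishes a zeroed 128-byte line in the cache without
           first reading it from memory, which is why this path wins for
           large zeroing sizes.  */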
        .align  4
L(huge_dcbz):
        andi.   r11,r10,127
        neg     r0,r10
        beq     L(huge_dcbz_aligned)

        clrldi  r0,r0,57
        subf    r5,r0,r5
        srdi    r0,r0,3
        mtocrf  0x01,r0

        /* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:      bf      28,4f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        std     r4,32(r10)
        std     r4,40(r10)
        std     r4,48(r10)
        std     r4,56(r10)
        addi    r10,r10,64

        .align  4
4:      bf      29,2f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32

        .align  4
2:      bf      30,1f
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        .align  4
1:      bf      31,L(huge_dcbz_aligned)
        std     r4,0(r10)
        addi    r10,r10,8

L(huge_dcbz_aligned):
        /* Set up the dcbz unroll offsets and loop count.  */
        srdi    r8,r5,9
        clrldi  r11,r5,55
        cmpldi  cr6,r11,0
        li      r9,128
        cmpdi   r8,0
        beq     L(huge_tail)
        li      r7,256
        li      r6,384
        mtctr   r8
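
        /* r8 = n / 512 full blocks; r11 = n % 512 is left for
           L(huge_tail).  r9, r7 and r6 hold the 128-, 256- and 384-byte
           offsets for the four dcbz issued per iteration below.  */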
        .align  4
L(huge_loop):
        /* Set 512 bytes to zero in each iteration; the loop unrolling
           shows a throughput boost for large sizes (2048 bytes or
           higher).  */
        dcbz    0,r10
        dcbz    r9,r10
        dcbz    r7,r10
        dcbz    r6,r10
        addi    r10,r10,512
        bdnz    L(huge_loop)

        beqlr   cr6

L(huge_tail):
        srdi    r6,r11,8
        srdi    r7,r11,4
        clrldi  r8,r11,4
        cmpldi  cr6,r8,0
        mtocrf  0x01,r6

        beq     cr6,L(tail)
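
        /* CR7 now holds the 256-byte bit of the remainder (from r6);
           r7 and r8 feed the 128~16 and 8~1 byte tests below.  */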
        /* We have 1~511 bytes remaining.  */
        .align  4
32:     bf      31,16f
        dcbz    0,r10
        dcbz    r9,r10
        addi    r10,r10,256

        .align  4
16:     mtocrf  0x01,r7
        bf      28,8f
        dcbz    0,r10
        addi    r10,r10,128

        .align  4
8:      bf      29,4f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        std     r4,32(r10)
        std     r4,40(r10)
        std     r4,48(r10)
        std     r4,56(r10)
        addi    r10,r10,64

        .align  4
4:      bf      30,2f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32

        .align  4
2:      bf      31,L(tail)
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        /* Remaining 1~15 bytes.  */
        .align  4
L(tail):
        mtocrf  0x01,r8

        .align  4
8:      bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        .align  4
4:      bf      29,2f
        stw     r4,0(r10)
        addi    r10,r10,4

        .align  4
2:      bf      30,1f
        sth     r4,0(r10)
        addi    r10,r10,2

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* Handle short writes of 0~31 bytes.  Best throughput is
           achieved by just unrolling all operations.  */
        .align  4
L(write_LT_32):
        cmpldi  cr6,r5,8
        mtocrf  0x01,r5
        ble     cr6,L(write_LE_8)

        /* At least 9 bytes to go.  */
        neg     r8,r4
        andi.   r0,r8,3
        cmpldi  cr1,r5,16
        beq     L(write_LT_32_aligned)

        /* Force 4-byte alignment for DST.  */
        mtocrf  0x01,r0
        subf    r5,r0,r5

2:      bf      30,1f
        /* Use stb instead of sth because it doesn't generate
           alignment interrupts on cache-inhibited storage.  */
        stb     r4,0(r10)
        stb     r4,1(r10)
        addi    r10,r10,2

1:      bf      31,L(end_4bytes_alignment)
        stb     r4,0(r10)
        addi    r10,r10,1

        .align  4
L(end_4bytes_alignment):
        cmpldi  cr1,r5,16
        mtocrf  0x01,r5

L(write_LT_32_aligned):
        blt     cr1,8f

        stw     r4,0(r10)
        stw     r4,4(r10)
        stw     r4,8(r10)
        stw     r4,12(r10)
        addi    r10,r10,16

8:      bf      28,L(tail4)
        stw     r4,0(r10)
        stw     r4,4(r10)
        addi    r10,r10,8

        .align  4
        /* Write 4~7 bytes.  */
L(tail4):
        bf      29,L(tail2)
        stw     r4,0(r10)
        bf      30,L(tail5)
        sth     r4,4(r10)
        bflr    31
        stb     r4,6(r10)
        blr

        .align  4
        /* Write 2~3 bytes.  */
L(tail2):
        bf      30,1f
        sth     r4,0(r10)
        bflr    31
        stb     r4,2(r10)
        blr

        .align  4
L(tail5):
        bflr    31
        stb     r4,4(r10)
        blr

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* Handle writes of 0~8 bytes.  */
        .align  4
L(write_LE_8):
        bne     cr6,L(LE7_tail4)

        /* If DST is word aligned, use stw, else use stb.  */
        andi.   r0,r10,3
        bne     L(8_unalign)

        stw     r4,0(r10)
        stw     r4,4(r10)
        blr

        /* Unaligned DST and size is 8.  */
        .align  4
L(8_unalign):
        andi.   r0,r10,1
        beq     L(8_hwalign)
        stb     r4,0(r10)
        sth     r4,1(r10)
        sth     r4,3(r10)
        sth     r4,5(r10)
        stb     r4,7(r10)
        blr

        /* Halfword-aligned DST and size is 8.  */
        .align  4
L(8_hwalign):
        sth     r4,0(r10)
        sth     r4,2(r10)
        sth     r4,4(r10)
        sth     r4,6(r10)
        blr

        .align  4
        /* Write 4~7 bytes.  */
L(LE7_tail4):
        /* Use stb instead of sth because it doesn't generate
           alignment interrupts on cache-inhibited storage.  */
        bf      29,L(LE7_tail2)
        stb     r4,0(r10)
        stb     r4,1(r10)
        stb     r4,2(r10)
        stb     r4,3(r10)
        bf      30,L(LE7_tail5)
        stb     r4,4(r10)
        stb     r4,5(r10)
        bflr    31
        stb     r4,6(r10)
        blr

        .align  4
        /* Write 2~3 bytes.  */
L(LE7_tail2):
        bf      30,1f
        stb     r4,0(r10)
        stb     r4,1(r10)
        bflr    31
        stb     r4,2(r10)
        blr

        .align  4
L(LE7_tail5):
        bflr    31
        stb     r4,4(r10)
        blr

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)