sysdeps/powerpc/powerpc64/power6/memset.S

   1 /* Optimized 64-bit memset implementation for POWER6.
   2    Copyright (C) 1997-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
  22    Returns 's'.
  23
  24    The memset is done with byte to doubleword stores for short or unaligned
  25    parts, 32-byte (256-bit) sectors, and 128-byte cache lines.  There is a
  26    special case for setting cache lines to 0, using the dcbz instruction.  */
  27
  28 #ifndef MEMSET
  29 # define MEMSET memset
  30 #endif
  31         .machine power6
  32 ENTRY_TOCLESS (MEMSET, 7)  /* NOTE(review): the 7 is presumably log2 of the entry alignment -- confirm in sysdep.h.  */
  33         CALL_MCOUNT 3      /* NOTE(review): presumably the profiling hook for a 3-argument function -- confirm in sysdep.h.  */
  34
  35 #define rTMP    r0      /* Scratch; only receives discarded andi. results.  */
  36 #define rRTN    r3      /* Initial value of 1st argument.  */
  37 #define rMEMP0  r3      /* Original value of 1st arg.  */
  38 #define rCHR    r4      /* Char to set in each byte.  */
  39 #define rLEN    r5      /* Length of region to set.  */
  40 #define rMEMP   r6      /* Address at which we are storing.  */
  41 #define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
  42 #define rMEMP2  r8      /* Scratch pointer while aligning to 32 bytes; later holds 128 as the dcbz index.  */
  43 #define rMEMP3  r9      /* Alt mem pointer.  */
  44 L(_memset):
  45 /* Take care of case for size <= 8.  */
  46         cmpldi  cr1, rLEN, 8            /* cr1 = n compared with 8 (unsigned).  */
  47         andi.   rALIGN, rMEMP0, 7       /* rALIGN = s & 7; cr0.eq is set when s is doubleword aligned.  */
  48         mr      rMEMP, rMEMP0           /* Work on a copy; no instruction below writes r3, so 's' is returned intact.  */
  49         ble     cr1, L(small)
  50
  51 /* Align to doubleword boundary.  */
  52         cmpldi  cr5, rLEN, 31           /* cr5 is tested at L(aligned), after rLEN may have been reduced.  */
  53         insrdi  rCHR, rCHR, 8, 48       /* Replicate byte to halfword.  */
  54         beq+    L(aligned2)             /* cr0 from the andi. at function entry: already doubleword aligned.  */
  55         mtcrf   0x01, rMEMP0            /* CR bits 28-31 = low four address bits (values 8, 4, 2, 1).  */
  56         subfic  rALIGN, rALIGN, 8       /* rALIGN = 8 - (s & 7) = bytes up to the boundary.  */
  57         cror    28,30,31                /* CR28 = CR30 | CR31, i.e. s is not word aligned.  */
  58         add     rMEMP, rMEMP, rALIGN    /* rMEMP = next doubleword boundary.  */
  59         sub     rLEN, rLEN, rALIGN      /* Bytes left once the head is filled.  */
  60         insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
  61         bt      29, L(g4)               /* s & 4 set: s lies in the odd (second) word.  */
  62 /* Process the even word of doubleword.  */
  63         bf+     31, L(g2)               /* s & 1 clear here means s & 7 == 2.  */
  64         stb     rCHR, 0(rMEMP0)
  65         bt      30, L(g4x)              /* s & 7 == 3: the byte store reached word alignment.  */
  66 L(g2):
  67         sth     rCHR, -6(rMEMP)
  68 L(g4x):
  69         stw     rCHR, -4(rMEMP)         /* Last word before the boundary.  */
  70         b       L(aligned)
  71 /* Process the odd word of doubleword.  */
  72 L(g4):
  73         bf      28, L(g4x) /* If false, word aligned on odd word.  */
  74         bf+     31, L(g0)               /* s & 1 clear here means s & 7 == 6.  */
  75         stb     rCHR, 0(rMEMP0)
  76         bt      30, L(aligned)          /* s & 7 == 7: that single byte was all that was needed.  */
  77 L(g0):
  78         sth     rCHR, -2(rMEMP)
  79
  80 /* Handle the case of size <= 31.  */
  81 L(aligned2):
  82         insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
  83 L(aligned):
  84         mtcrf   0x01, rLEN              /* CR bits 28-31 = low four bits of rLEN, consumed by L(medium).  */
  85         ble     cr5, L(medium)          /* cr5 compared the length with 31 before any alignment bytes were subtracted.  */
  86 /* Align to 32-byte boundary.  */
  87         andi.   rALIGN, rMEMP, 0x18     /* Byte offset of rMEMP within its 32-byte sector (0, 8, 16 or 24).  */
  88         subfic  rALIGN, rALIGN, 0x20    /* Bytes up to the next 32-byte boundary.  */
  89         insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word. */
  90         beq     L(caligned)             /* cr0 from the andi.: already 32-byte aligned.  */
  91         mtcrf   0x01, rALIGN            /* CR bit 28 = the 8s bit of rALIGN (which is 8, 16 or 24 here).  */
  92         add     rMEMP, rMEMP, rALIGN
  93         sub     rLEN, rLEN, rALIGN
  94         cmplwi  cr1, rALIGN, 0x10
  95         mr      rMEMP2, rMEMP           /* rMEMP2 walks backwards from the boundary.  */
  96         bf      28, L(a1)
  97         stdu    rCHR, -8(rMEMP2)        /* One doubleword when rALIGN is 8 or 24.  */
  98 L(a1):  blt     cr1, L(a2)
  99         std     rCHR, -8(rMEMP2)        /* Two more doublewords when rALIGN is 16 or 24.  */
 100         stdu    rCHR, -16(rMEMP2)
 101 L(a2):
 102
 103 /* Now aligned to a 32 byte boundary.  */
 104         .align 4
 105 L(caligned):
 106         cmpldi  cr1, rCHR, 0            /* Is the replicated fill pattern all zero?  */
 107         clrrdi. rALIGN, rLEN, 5         /* rLEN rounded down to a multiple of 32; cr0.eq is set when rLEN < 32.  */
 108         mtcrf   0x01, rLEN              /* Reload CR bits 28-31 from the low bits of rLEN for L(medium).  */
 109         beq     cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
 110         beq     L(medium)       /* We may not actually get to do a full line.  */
 111         .align 4
 112 /* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
 113    boundary but may not be at cache line (128-byte) boundary.  */
 114 L(nzloopstart):
 115 /* memset in 32-byte chunks until we get to a cache line boundary.
 116    If rLEN is less than 128 (one cache line) use the
 117    cacheAligned1 code to finish the tail.  */
 118         cmpldi  cr1,rLEN,128
 119
 120         andi.   rTMP,rMEMP,127          /* cr0.eq is set when rMEMP is on a 128-byte boundary.  */
 121         blt     cr1,L(cacheAligned1)
 122         addi    rMEMP3,rMEMP,32
 123         beq     L(nzCacheAligned)
 124         addi    rLEN,rLEN,-32           /* First 32-byte chunk.  */
 125         std     rCHR,0(rMEMP)
 126         std     rCHR,8(rMEMP)
 127         std     rCHR,16(rMEMP)
 128         addi    rMEMP,rMEMP,32
 129         andi.   rTMP,rMEMP3,127         /* rMEMP3 equals the advanced rMEMP here.  */
 130         std     rCHR,-8(rMEMP3)         /* Fourth doubleword of the first chunk.  */
 131
 132         beq     L(nzCacheAligned)
 133         addi    rLEN,rLEN,-32           /* Second 32-byte chunk.  */
 134         std     rCHR,0(rMEMP3)
 135         addi    rMEMP,rMEMP,32
 136         std     rCHR,8(rMEMP3)
 137         andi.   rTMP,rMEMP,127
 138         std     rCHR,16(rMEMP3)
 139         std     rCHR,24(rMEMP3)
 140
 141         beq     L(nzCacheAligned)
 142         addi    rLEN,rLEN,-32           /* Third chunk; a sector-aligned pointer is cache-line aligned after it.  */
 143         std     rCHR,32(rMEMP3)
 144         addi    rMEMP,rMEMP,32
 145         cmpldi  cr1,rLEN,128
 146         std     rCHR,40(rMEMP3)
 147         cmpldi  cr6,rLEN,256            /* NOTE(review): cr6 is not read on the non-zero path visible here.  */
 148         li      rMEMP2,128              /* NOTE(review): rMEMP2 is likewise not read on the non-zero path.  */
 149         std     rCHR,48(rMEMP3)
 150         std     rCHR,56(rMEMP3)
 151         blt     cr1,L(cacheAligned1)
 152         b       L(nzCacheAligned128)
 153
 154 /* Now we are aligned to the cache line and can use dcbtst.  */
 155         .align 4
 156 L(nzCacheAligned):
 157         cmpldi  cr1,rLEN,128
 158         blt     cr1,L(cacheAligned1)    /* Less than one full line: use the 32-byte tail code.  */
 159         b       L(nzCacheAligned128)
 160         .align 5
 161 L(nzCacheAligned128):       /* Loop: store one 128-byte line per iteration with 16 doubleword stores.  */
 162         cmpldi  cr1,rLEN,256            /* Compared before the decrement: bge means another full line remains afterwards.  */
 163         addi    rMEMP3,rMEMP,64         /* rMEMP3 addresses the second half of the line.  */
 164         std     rCHR,0(rMEMP)
 165         std     rCHR,8(rMEMP)
 166         std     rCHR,16(rMEMP)
 167         std     rCHR,24(rMEMP)
 168         std     rCHR,32(rMEMP)
 169         std     rCHR,40(rMEMP)
 170         std     rCHR,48(rMEMP)
 171         std     rCHR,56(rMEMP)
 172         addi    rMEMP,rMEMP3,64         /* rMEMP advances by 128 in total.  */
 173         addi    rLEN,rLEN,-128
 174         std     rCHR,0(rMEMP3)
 175         std     rCHR,8(rMEMP3)
 176         std     rCHR,16(rMEMP3)
 177         std     rCHR,24(rMEMP3)
 178         std     rCHR,32(rMEMP3)
 179         std     rCHR,40(rMEMP3)
 180         std     rCHR,48(rMEMP3)
 181         std     rCHR,56(rMEMP3)
 182         bge     cr1,L(nzCacheAligned128)
 183         dcbtst  0,rMEMP                 /* Touch-for-store hint on the line at rMEMP, where the tail begins.  */
 184         b       L(cacheAligned1)
 185         .align 5
 186 /* Storing a zero "c" value. We are aligned at a sector (32-byte)
 187    boundary but may not be at cache line (128-byte) boundary.  If the
 188    remaining length spans a full cache line we can use the Data cache
 189    block zero instruction. */
 190 L(zloopstart):
 191 /* memset in 32-byte chunks until we get to a cache line boundary.
 192    If rLEN is less than 128 (one cache line) use the
 193    cacheAligned1 code to finish the tail.  */
 194         cmpldi  cr1,rLEN,128
 195         beq     L(medium)               /* cr0 is still from the clrrdi. at L(caligned): rLEN < 32.  */
 196 L(getCacheAligned):
 197         andi.   rTMP,rMEMP,127          /* cr0.eq is set when rMEMP is on a 128-byte boundary.  */
 198         nop                             /* NOTE(review): presumably scheduling/alignment padding -- confirm.  */
 199         blt     cr1,L(cacheAligned1)
 200         addi    rMEMP3,rMEMP,32
 201         beq     L(cacheAligned)
 202         addi    rLEN,rLEN,-32           /* First 32-byte chunk.  */
 203         std     rCHR,0(rMEMP)
 204         std     rCHR,8(rMEMP)
 205         std     rCHR,16(rMEMP)
 206         addi    rMEMP,rMEMP,32
 207         andi.   rTMP,rMEMP3,127         /* rMEMP3 equals the advanced rMEMP here.  */
 208         std     rCHR,-8(rMEMP3)         /* Fourth doubleword of the first chunk.  */
 209 L(getCacheAligned2):
 210         beq     L(cacheAligned)
 211         addi    rLEN,rLEN,-32           /* Second 32-byte chunk.  */
 212         std     rCHR,0(rMEMP3)
 213         std     rCHR,8(rMEMP3)
 214         addi    rMEMP,rMEMP,32
 215         andi.   rTMP,rMEMP,127
 216         std     rCHR,16(rMEMP3)
 217         std     rCHR,24(rMEMP3)
 218 L(getCacheAligned3):
 219         beq     L(cacheAligned)
 220         addi    rLEN,rLEN,-32           /* Third chunk; a sector-aligned pointer is cache-line aligned after it.  */
 221         std     rCHR,32(rMEMP3)
 222         addi    rMEMP,rMEMP,32
 223         cmpldi  cr1,rLEN,128
 224         std     rCHR,40(rMEMP3)
 225         cmpldi  cr6,rLEN,256            /* cr6 is consumed below and at L(cacheAlignedx).  */
 226         li      rMEMP2,128              /* Index register for the paired dcbz in the cache-line code.  */
 227         std     rCHR,48(rMEMP3)
 228         std     rCHR,56(rMEMP3)
 229         blt     cr1,L(cacheAligned1)    /* Fewer than 128 bytes remain.  */
 230         blt     cr6,L(cacheAligned128)  /* 128..255 bytes remain.  */
 231         b       L(cacheAlignedx)        /* 256 or more remain.  */
 232
 233 /* Now we are aligned to the cache line and can use dcbz.  */
 234         .align 5
 235 L(cacheAligned):
 236         cmpldi  cr1,rLEN,128
 237         cmpldi  cr6,rLEN,256
 238         blt     cr1,L(cacheAligned1)    /* Less than one full line: use the 32-byte tail code.  */
 239         li      rMEMP2,128              /* Index register for the second dcbz of a pair.  */
 240 L(cacheAlignedx):                       /* Entered with rLEN >= 128, rMEMP2 = 128 and cr6 = rLEN compared with 256.  */
 241         cmpldi  cr5,rLEN,640
 242         blt     cr6,L(cacheAligned128)  /* 128..255 bytes: one line, then the tail.  */
 243         bgt     cr5,L(cacheAligned512)  /* More than 640 bytes: one-line-per-iteration loop.  */
 244         cmpldi  cr6,rLEN,512            /* These compares use rLEN before 256 is subtracted.  */
 245         dcbz    0,rMEMP
 246         cmpldi  cr1,rLEN,384
 247         dcbz    rMEMP2,rMEMP            /* Zero the line at rMEMP + 128.  */
 248         addi    rMEMP,rMEMP,256
 249         addi    rLEN,rLEN,-256
 250         blt     cr1,L(cacheAligned1)    /* Fewer than 128 bytes remain.  */
 251         blt     cr6,L(cacheAligned128)  /* 128..255 bytes remain.  */
 252         b       L(cacheAligned256)      /* 256 or more remain.  */
 253         .align 5
 254 /* A simple loop for the longer (>640 bytes) lengths.  This form limits
 255    branch mispredictions to exactly 1, at loop exit.  */
 256 L(cacheAligned512):
 257         cmpldi  cr1,rLEN,128
 258         blt     cr1,L(cacheAligned1)    /* Loop exit: less than one full line left.  */
 259         dcbz    0,rMEMP                 /* Zero the 128-byte line at rMEMP.  */
 260         addi    rLEN,rLEN,-128
 261         addi    rMEMP,rMEMP,128
 262         b       L(cacheAligned512)
 263         .align 5
 264 L(cacheAligned256):         /* Loop: zero two cache lines (256 bytes) per iteration.  */
 265
 266         cmpldi  cr6,rLEN,512            /* Before the decrement: bge means 256 or more will remain.  */
 267
 268         dcbz    0,rMEMP
 269         cmpldi  cr1,rLEN,384            /* Before the decrement: blt means fewer than 128 will remain.  */
 270         dcbz    rMEMP2,rMEMP            /* rMEMP2 holds 128: zero the line at rMEMP + 128.  */
 271         addi    rMEMP,rMEMP,256
 272         addi    rLEN,rLEN,-256
 273
 274         bge     cr6,L(cacheAligned256)
 275
 276         blt     cr1,L(cacheAligned1)
 277         .align 4
 278 L(cacheAligned128):         /* Zero one cache line, then fall into the 32-byte tail code.  */
 279         dcbz    0,rMEMP
 280         addi    rMEMP,rMEMP,128
 281         addi    rLEN,rLEN,-128
 282         nop
 283 L(cacheAligned1):           /* Tail of fewer than 128 bytes: up to three 32-byte chunks of doubleword stores.  */
 284         cmpldi  cr1,rLEN,32
 285         blt     cr1,L(handletail32)
 286         addi    rMEMP3,rMEMP,32
 287         addi    rLEN,rLEN,-32           /* First 32-byte chunk.  */
 288         std     rCHR,0(rMEMP)
 289         std     rCHR,8(rMEMP)
 290         std     rCHR,16(rMEMP)
 291         addi    rMEMP,rMEMP,32
 292         cmpldi  cr1,rLEN,32
 293         std     rCHR,-8(rMEMP3)         /* Fourth doubleword of the first chunk.  */
 294 L(cacheAligned2):
 295         blt     cr1,L(handletail32)
 296         addi    rLEN,rLEN,-32           /* Second 32-byte chunk.  */
 297         std     rCHR,0(rMEMP3)
 298         std     rCHR,8(rMEMP3)
 299         addi    rMEMP,rMEMP,32
 300         cmpldi  cr1,rLEN,32
 301         std     rCHR,16(rMEMP3)
 302         std     rCHR,24(rMEMP3)
 303         nop
 304 L(cacheAligned3):
 305         blt     cr1,L(handletail32)
 306         addi    rMEMP,rMEMP,32
 307         addi    rLEN,rLEN,-32           /* Third 32-byte chunk.  */
 308         std     rCHR,32(rMEMP3)
 309         std     rCHR,40(rMEMP3)
 310         std     rCHR,48(rMEMP3)
 311         std     rCHR,56(rMEMP3)         /* Execution continues into L(handletail32).  */
 312
 313 /* We are here because the length or remainder (rLEN) is less than the
 314    cache line/sector size and does not justify aggressive loop unrolling.
 315    So set up the preconditions for L(medium) and go there.  */
 316         .align 3
 317 L(handletail32):
 318         cmpldi  cr1,rLEN,0
 319         beqlr   cr1                     /* Nothing left: return.  */
 320         b       L(medium)               /* CR bits 28-31 were loaded from rLEN at L(caligned); rLEN has since changed only by multiples of 32.  */
 321
 322         .align 5
 323 L(small):
 324 /* Memset of 8 bytes or less, using byte stores only.  */
 325         cmpldi  cr6, rLEN, 4
 326         cmpldi  cr5, rLEN, 1
 327         ble     cr6,L(le4)
 328         subi    rLEN, rLEN, 4           /* 5..8 bytes: store four now, leaving 1..4.  */
 329         stb     rCHR,0(rMEMP)
 330         stb     rCHR,1(rMEMP)
 331         stb     rCHR,2(rMEMP)
 332         stb     rCHR,3(rMEMP)
 333         addi    rMEMP,rMEMP, 4
 334         cmpldi  cr5, rLEN, 1            /* Redo the compare for the reduced length.  */
 335 L(le4):
 336         cmpldi  cr1, rLEN, 3
 337         bltlr   cr5                     /* rLEN == 0: done.  */
 338         stb     rCHR, 0(rMEMP)
 339         beqlr   cr5                     /* rLEN == 1: done.  */
 340         stb     rCHR, 1(rMEMP)
 341         bltlr   cr1                     /* rLEN == 2: done.  */
 342         stb     rCHR, 2(rMEMP)
 343         beqlr   cr1                     /* rLEN == 3: done.  */
 344         stb     rCHR, 3(rMEMP)
 345         blr
 346
 347 /* Memset of 0-31 bytes.  Expects CR bits 28-31 to hold the low four bits of rLEN.  */
 348         .align 5
 349 L(medium):
 350         insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word.  */
 351         cmpldi  cr1, rLEN, 16
 352 L(medium_tail2):
 353         add     rMEMP, rMEMP, rLEN      /* Point past the end; the stores below work backwards.  */
 354 L(medium_tail):
 355         bt-     31, L(medium_31t)       /* rLEN & 1: one byte.  */
 356         bt-     30, L(medium_30t)       /* rLEN & 2: one halfword.  */
 357 L(medium_30f):
 358         bt      29, L(medium_29t)       /* rLEN & 4: one word.  */
 359 L(medium_29f):
 360         bge     cr1, L(medium_27t)      /* rLEN >= 16: two doublewords.  */
 361         bflr    28                      /* Return unless rLEN & 8.  */
 362         std     rCHR, -8(rMEMP)
 363         blr
 364
 365 L(medium_31t):
 366         stbu    rCHR, -1(rMEMP)         /* Update forms move rMEMP down as each piece is stored.  */
 367         bf-     30, L(medium_30f)
 368 L(medium_30t):
 369         sthu    rCHR, -2(rMEMP)
 370         bf-     29, L(medium_29f)
 371 L(medium_29t):
 372         stwu    rCHR, -4(rMEMP)
 373         blt     cr1, L(medium_27f)      /* rLEN < 16: skip the 16-byte store.  */
 374 L(medium_27t):
 375         std     rCHR, -8(rMEMP)
 376         stdu    rCHR, -16(rMEMP)
 377 L(medium_27f):
 378         bflr    28                      /* Return unless rLEN & 8.  */
 379 L(medium_28t):
 380         std     rCHR, -8(rMEMP)
 381         blr
 382 END_GEN_TB (MEMSET,TB_TOCLESS)
 383 libc_hidden_builtin_def (memset)