sysdeps/powerpc/powerpc32/power6/memset.S

   1 /* Optimized 32-bit memset implementation for POWER6.
   2    Copyright (C) 1997,99, 2000,02,03,06,2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
  18    02110-1301 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
  25    Returns 's'.
  26
  27    The memset is done in three sizes: byte (8 bits), word (32 bits),
  28    cache line (1024 bits). There is a special case for setting cache lines
  29    to 0, to take advantage of the dcbz instruction.  */
  30
  31         .machine power6
  32 EALIGN (BP_SYM (memset), 7, 0)
  33         CALL_MCOUNT
  34
  35 #define rTMP    r0
  36 #define rRTN    r3      /* Initial value of 1st argument.  */
  37 #define rMEMP0  r3      /* Original value of 1st arg.  */
  38 #define rCHR    r4      /* Char to set in each byte.  */
  39 #define rLEN    r5      /* Length of region to set.  */
  40 #define rMEMP   r6      /* Address at which we are storing.  */
  41 #define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
  42 #define rMEMP2  r8
  43
  44 #define rNEG64  r8      /* Constant -64 for clearing with dcbz.  */
  45 #define rMEMP3  r9      /* Alt mem pointer.  */
  46 L(_memset):
  47 /* Take care of case for size <= 4.  */
  48         cmplwi  cr1, rLEN, 4
  49         andi.   rALIGN, rMEMP0, 3
  50         mr      rMEMP, rMEMP0
  51         ble-    cr1, L(small)
  52 /* Align to word boundary.  */
  53         cmplwi  cr5, rLEN, 31
  54         rlwimi  rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
  55         beq+    L(aligned)
  56         mtcrf   0x01, rMEMP0
  57         subfic  rALIGN, rALIGN, 4
  58         add     rMEMP, rMEMP, rALIGN
  59         sub     rLEN, rLEN, rALIGN
  60         bf+     31, L(g0)
  61         stb     rCHR, 0(rMEMP0)
  62         bt      30, L(aligned)
  63 L(g0):
  64         sth     rCHR, -2(rMEMP)
  65
  66         .align 4
  67 /* Handle the case of size < 31.  */
  68 L(aligned):
  69         mtcrf   0x01, rLEN
  70         rlwimi  rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
  71         ble     cr5, L(medium)
  72 /* Align to 32-byte boundary.  */
  73         andi.   rALIGN, rMEMP, 0x1C
  74         subfic  rALIGN, rALIGN, 0x20
  75         beq     L(caligned)
  76         mtcrf   0x01, rALIGN
  77         add     rMEMP, rMEMP, rALIGN
  78         sub     rLEN, rLEN, rALIGN
  79         cmplwi  cr1, rALIGN, 0x10
  80         mr      rMEMP2, rMEMP
  81         bf      28, L(a1)
  82         stw     rCHR, -4(rMEMP2)
  83         stwu    rCHR, -8(rMEMP2)
  84         nop
  85 L(a1):  blt     cr1, L(a2)
  86         stw     rCHR, -4(rMEMP2)
  87         stw     rCHR, -8(rMEMP2)
  88         stw     rCHR, -12(rMEMP2)
  89         stwu    rCHR, -16(rMEMP2)
  90 L(a2):  bf      29, L(caligned)
  91         stw     rCHR, -4(rMEMP2)
  92
  93         .align 3
  94 /* Now aligned to a 32 byte boundary.  */
  95 L(caligned):
  96         cmplwi  cr1, rCHR, 0
  97         clrrwi. rALIGN, rLEN, 5
  98         mtcrf   0x01, rLEN
  99         beq     cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
 100 L(nondcbz):
 101         beq     L(medium)       /* We may not actually get to do a full line.  */
 102         nop
 103 /* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
 104    boundary may not be at cache line (128-byte) boundary.  */
 105 L(nzloopstart):
 106 /* memset in 32-byte chunks until we get to a cache line boundary.
 107    If rLEN is less then the distance to the next cache-line boundary use
 108    cacheAligned1 code to finish the tail.  */
 109         cmplwi  cr1,rLEN,128
 110
 111         andi.   rTMP,rMEMP,127
 112         blt     cr1,L(cacheAligned1)
 113         addi    rMEMP3,rMEMP,32
 114         beq     L(nzCacheAligned)
 115         addi    rLEN,rLEN,-32
 116         stw     rCHR,0(rMEMP)
 117         stw     rCHR,4(rMEMP)
 118         stw     rCHR,8(rMEMP)
 119         stw     rCHR,12(rMEMP)
 120         stw     rCHR,16(rMEMP)
 121         stw     rCHR,20(rMEMP)
 122         addi    rMEMP,rMEMP,32
 123         andi.   rTMP,rMEMP3,127
 124         stw     rCHR,-8(rMEMP3)
 125         stw     rCHR,-4(rMEMP3)
 126
 127         beq     L(nzCacheAligned)
 128         addi    rLEN,rLEN,-32
 129         stw     rCHR,0(rMEMP3)
 130         stw     rCHR,4(rMEMP3)
 131         addi    rMEMP,rMEMP,32
 132         stw     rCHR,8(rMEMP3)
 133         stw     rCHR,12(rMEMP3)
 134         andi.   rTMP,rMEMP,127
 135         stw     rCHR,16(rMEMP3)
 136         stw     rCHR,20(rMEMP3)
 137         stw     rCHR,24(rMEMP3)
 138         stw     rCHR,28(rMEMP3)
 139
 140         beq     L(nzCacheAligned)
 141         addi    rLEN,rLEN,-32
 142 /* At this point we can overrun the store queue (pipe reject) so it is
 143    time to slow things down. The store queue can merge two adjacent
 144    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 145    So we add "group ending nops" to guarantee that we dispatch only two
 146    stores every other cycle. */
 147         ori     r1,r1,0
 148         ori     r1,r1,0
 149         stw     rCHR,32(rMEMP3)
 150         stw     rCHR,36(rMEMP3)
 151         addi    rMEMP,rMEMP,32
 152         cmplwi  cr1,rLEN,128
 153         ori     r1,r1,0
 154         ori     r1,r1,0
 155         stw     rCHR,40(rMEMP3)
 156         stw     rCHR,44(rMEMP3)
 157         ori     r1,r1,0
 158         ori     r1,r1,0
 159         stw     rCHR,48(rMEMP3)
 160         stw     rCHR,52(rMEMP3)
 161         ori     r1,r1,0
 162         ori     r1,r1,0
 163         stw     rCHR,56(rMEMP3)
 164         stw     rCHR,60(rMEMP3)
 165         blt     cr1,L(cacheAligned1)
 166         b       L(nzCacheAligned)
 167
 168 /* Now we are aligned to the cache line and can use dcbtst.  */
 169         .align 5
 170 L(nzCacheAligned):
 171         cmplwi  cr1,rLEN,128
 172         cmplwi  cr6,rLEN,256
 173         blt     cr1,L(cacheAligned1)
 174         blt     cr6,L(nzCacheAligned128)
 175         .align 4
 176 L(nzCacheAligned128):
 177         nop
 178         addi    rMEMP3,rMEMP,64
 179         stw     rCHR,0(rMEMP)
 180         stw     rCHR,4(rMEMP)
 181         stw     rCHR,8(rMEMP)
 182         stw     rCHR,12(rMEMP)
 183         stw     rCHR,16(rMEMP)
 184         stw     rCHR,20(rMEMP)
 185         stw     rCHR,24(rMEMP)
 186         stw     rCHR,28(rMEMP)
 187         stw     rCHR,32(rMEMP)
 188         stw     rCHR,36(rMEMP)
 189         stw     rCHR,40(rMEMP)
 190         stw     rCHR,44(rMEMP)
 191         stw     rCHR,48(rMEMP)
 192         stw     rCHR,52(rMEMP)
 193         stw     rCHR,56(rMEMP)
 194         stw     rCHR,60(rMEMP)
 195         addi    rMEMP,rMEMP3,64
 196         addi    rLEN,rLEN,-128
 197 /* At this point we can overrun the store queue (pipe reject) so it is
 198    time to slow things down. The store queue can merge two adjacent
 199    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 200    So we add "group ending nops" to guarantee that we dispatch only one
 201    store per cycle. */
 202         stw     rCHR,0(rMEMP3)
 203         ori     r1,r1,0
 204         stw     rCHR,4(rMEMP3)
 205         ori     r1,r1,0
 206         stw     rCHR,8(rMEMP3)
 207         ori     r1,r1,0
 208         stw     rCHR,12(rMEMP3)
 209         ori     r1,r1,0
 210         stw     rCHR,16(rMEMP3)
 211         ori     r1,r1,0
 212         stw     rCHR,20(rMEMP3)
 213         ori     r1,r1,0
 214         stw     rCHR,24(rMEMP3)
 215         ori     r1,r1,0
 216         stw     rCHR,28(rMEMP3)
 217         ori     r1,r1,0
 218         stw     rCHR,32(rMEMP3)
 219         ori     r1,r1,0
 220         stw     rCHR,36(rMEMP3)
 221         ori     r1,r1,0
 222         stw     rCHR,40(rMEMP3)
 223         ori     r1,r1,0
 224         stw     rCHR,44(rMEMP3)
 225         ori     r1,r1,0
 226         stw     rCHR,48(rMEMP3)
 227         ori     r1,r1,0
 228         stw     rCHR,52(rMEMP3)
 229         ori     r1,r1,0
 230         stw     rCHR,56(rMEMP3)
 231         ori     r1,r1,0
 232         stw     rCHR,60(rMEMP3)
 233         blt     cr6,L(cacheAligned1)
 234 #ifndef NOT_IN_libc
 235         lfd     0,-128(rMEMP)
 236 #endif
 237         b       L(nzCacheAligned256)
 238         .align 5
 239 L(nzCacheAligned256):
 240         cmplwi  cr1,rLEN,256
 241         addi    rMEMP3,rMEMP,64
 242 #ifdef NOT_IN_libc
 243 /* When we are not in libc we should use only GPRs to avoid the FPU lock
 244    interrupt.  */
 245         stw     rCHR,0(rMEMP)
 246         stw     rCHR,4(rMEMP)
 247         stw     rCHR,8(rMEMP)
 248         stw     rCHR,12(rMEMP)
 249         stw     rCHR,16(rMEMP)
 250         stw     rCHR,20(rMEMP)
 251         stw     rCHR,24(rMEMP)
 252         stw     rCHR,28(rMEMP)
 253         stw     rCHR,32(rMEMP)
 254         stw     rCHR,36(rMEMP)
 255         stw     rCHR,40(rMEMP)
 256         stw     rCHR,44(rMEMP)
 257         stw     rCHR,48(rMEMP)
 258         stw     rCHR,52(rMEMP)
 259         stw     rCHR,56(rMEMP)
 260         stw     rCHR,60(rMEMP)
 261         addi    rMEMP,rMEMP3,64
 262         addi    rLEN,rLEN,-128
 263         stw     rCHR,0(rMEMP3)
 264         stw     rCHR,4(rMEMP3)
 265         stw     rCHR,8(rMEMP3)
 266         stw     rCHR,12(rMEMP3)
 267         stw     rCHR,16(rMEMP3)
 268         stw     rCHR,20(rMEMP3)
 269         stw     rCHR,24(rMEMP3)
 270         stw     rCHR,28(rMEMP3)
 271         stw     rCHR,32(rMEMP3)
 272         stw     rCHR,36(rMEMP3)
 273         stw     rCHR,40(rMEMP3)
 274         stw     rCHR,44(rMEMP3)
 275         stw     rCHR,48(rMEMP3)
 276         stw     rCHR,52(rMEMP3)
 277         stw     rCHR,56(rMEMP3)
 278         stw     rCHR,60(rMEMP3)
 279 #else
 280 /* We are in libc and this is a long memset so we can use FPRs and can afford
 281    occasional FPU locked interrupts.  */
 282         stfd    0,0(rMEMP)
 283         stfd    0,8(rMEMP)
 284         stfd    0,16(rMEMP)
 285         stfd    0,24(rMEMP)
 286         stfd    0,32(rMEMP)
 287         stfd    0,40(rMEMP)
 288         stfd    0,48(rMEMP)
 289         stfd    0,56(rMEMP)
 290         addi    rMEMP,rMEMP3,64
 291         addi    rLEN,rLEN,-128
 292         stfd    0,0(rMEMP3)
 293         stfd    0,8(rMEMP3)
 294         stfd    0,16(rMEMP3)
 295         stfd    0,24(rMEMP3)
 296         stfd    0,32(rMEMP3)
 297         stfd    0,40(rMEMP3)
 298         stfd    0,48(rMEMP3)
 299         stfd    0,56(rMEMP3)
 300 #endif
 301         bge     cr1,L(nzCacheAligned256)
 302         dcbtst  0,rMEMP
 303         b       L(cacheAligned1)
 304
 305         .align 4
 306 /* Storing a zero "c" value. We are aligned at a sector (32-byte)
 307    boundary but may not be at cache line (128-byte) boundary.  If the
 308    remaining length spans a full cache line we can use the Data cache
 309    block zero instruction. */
 310 L(zloopstart):
 311 /* memset in 32-byte chunks until we get to a cache line boundary.
 312    If rLEN is less then the distance to the next cache-line boundary use
 313    cacheAligned1 code to finish the tail.  */
 314         cmplwi  cr1,rLEN,128
 315         beq     L(medium)
 316 L(getCacheAligned):
 317         andi.   rTMP,rMEMP,127
 318         blt     cr1,L(cacheAligned1)
 319         addi    rMEMP3,rMEMP,32
 320         beq     L(cacheAligned)
 321         addi    rLEN,rLEN,-32
 322         stw     rCHR,0(rMEMP)
 323         stw     rCHR,4(rMEMP)
 324         stw     rCHR,8(rMEMP)
 325         stw     rCHR,12(rMEMP)
 326         stw     rCHR,16(rMEMP)
 327         stw     rCHR,20(rMEMP)
 328         addi    rMEMP,rMEMP,32
 329         andi.   rTMP,rMEMP3,127
 330         stw     rCHR,-8(rMEMP3)
 331         stw     rCHR,-4(rMEMP3)
 332 L(getCacheAligned2):
 333         beq     L(cacheAligned)
 334         addi    rLEN,rLEN,-32
 335         addi    rMEMP,rMEMP,32
 336         stw     rCHR,0(rMEMP3)
 337         stw     rCHR,4(rMEMP3)
 338         stw     rCHR,8(rMEMP3)
 339         stw     rCHR,12(rMEMP3)
 340         andi.   rTMP,rMEMP,127
 341         nop
 342         stw     rCHR,16(rMEMP3)
 343         stw     rCHR,20(rMEMP3)
 344         stw     rCHR,24(rMEMP3)
 345         stw     rCHR,28(rMEMP3)
 346 L(getCacheAligned3):
 347         beq     L(cacheAligned)
 348 /* At this point we can overrun the store queue (pipe reject) so it is
 349    time to slow things down. The store queue can merge two adjacent
 350    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 351    So we add "group ending nops" to guarantee that we dispatch only two
 352    stores every other cycle. */
 353         addi    rLEN,rLEN,-32
 354         ori     r1,r1,0
 355         ori     r1,r1,0
 356         stw     rCHR,32(rMEMP3)
 357         stw     rCHR,36(rMEMP3)
 358         addi    rMEMP,rMEMP,32
 359         cmplwi  cr1,rLEN,128
 360         ori     r1,r1,0
 361         stw     rCHR,40(rMEMP3)
 362         stw     rCHR,44(rMEMP3)
 363         cmplwi  cr6,rLEN,256
 364         li      rMEMP2,128
 365         ori     r1,r1,0
 366         stw     rCHR,48(rMEMP3)
 367         stw     rCHR,52(rMEMP3)
 368         ori     r1,r1,0
 369         ori     r1,r1,0
 370         stw     rCHR,56(rMEMP3)
 371         stw     rCHR,60(rMEMP3)
 372         blt     cr1,L(cacheAligned1)
 373         blt     cr6,L(cacheAligned128)
 374         b       L(cacheAlignedx)
 375
 376 /* Now we are aligned to the cache line and can use dcbz.  */
 377         .align 4
 378 L(cacheAligned):
 379         cmplwi  cr1,rLEN,128
 380         cmplwi  cr6,rLEN,256
 381         blt     cr1,L(cacheAligned1)
 382         li      rMEMP2,128
 383 L(cacheAlignedx):
 384         cmpldi  cr5,rLEN,640
 385         blt     cr6,L(cacheAligned128)
 386         bgt     cr5,L(cacheAligned512)
 387         cmplwi  cr6,rLEN,512
 388         dcbz    0,rMEMP
 389         cmplwi  cr1,rLEN,384
 390         dcbz    rMEMP2,rMEMP
 391         addi    rMEMP,rMEMP,256
 392         addi    rLEN,rLEN,-256
 393         blt     cr1,L(cacheAligned1)
 394         blt     cr6,L(cacheAligned128)
 395         b       L(cacheAligned256)
 396         .align 5
 397 /* A simple loop for the longer (>640 bytes) lengths.  This form limits
 398    the branch miss-predicted to exactly 1 at loop exit.*/
 399 L(cacheAligned512):
 400         cmpli   cr1,rLEN,128
 401         blt     cr1,L(cacheAligned1)
 402         dcbz    0,rMEMP
 403         addi    rLEN,rLEN,-128
 404         addi    rMEMP,rMEMP,128
 405         b       L(cacheAligned512)
 406         .align 5
 407 L(cacheAligned256):
 408         cmplwi  cr6,rLEN,512
 409         dcbz    0,rMEMP
 410         cmplwi  cr1,rLEN,384
 411         dcbz    rMEMP2,rMEMP
 412         addi    rMEMP,rMEMP,256
 413         addi    rLEN,rLEN,-256
 414         bge     cr6,L(cacheAligned256)
 415         blt     cr1,L(cacheAligned1)
 416         .align 4
 417 L(cacheAligned128):
 418         dcbz    0,rMEMP
 419         addi    rMEMP,rMEMP,128
 420         addi    rLEN,rLEN,-128
 421         .align 4
 422 L(cacheAligned1):
 423         cmplwi  cr1,rLEN,32
 424         blt     cr1,L(handletail32)
 425         addi    rMEMP3,rMEMP,32
 426         addi    rLEN,rLEN,-32
 427         stw     rCHR,0(rMEMP)
 428         stw     rCHR,4(rMEMP)
 429         stw     rCHR,8(rMEMP)
 430         stw     rCHR,12(rMEMP)
 431         stw     rCHR,16(rMEMP)
 432         stw     rCHR,20(rMEMP)
 433         addi    rMEMP,rMEMP,32
 434         cmplwi  cr1,rLEN,32
 435         stw     rCHR,-8(rMEMP3)
 436         stw     rCHR,-4(rMEMP3)
 437 L(cacheAligned2):
 438         blt     cr1,L(handletail32)
 439         addi    rLEN,rLEN,-32
 440         stw     rCHR,0(rMEMP3)
 441         stw     rCHR,4(rMEMP3)
 442         stw     rCHR,8(rMEMP3)
 443         stw     rCHR,12(rMEMP3)
 444         addi    rMEMP,rMEMP,32
 445         cmplwi  cr1,rLEN,32
 446         stw     rCHR,16(rMEMP3)
 447         stw     rCHR,20(rMEMP3)
 448         stw     rCHR,24(rMEMP3)
 449         stw     rCHR,28(rMEMP3)
 450         nop
 451 L(cacheAligned3):
 452         blt     cr1,L(handletail32)
 453 /* At this point we can overrun the store queue (pipe reject) so it is
 454    time to slow things down. The store queue can merge two adjacent
 455    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 456    So we add "group ending nops" to guarantee that we dispatch only two
 457    stores every other cycle. */
 458         ori     r1,r1,0
 459         ori     r1,r1,0
 460         addi    rMEMP,rMEMP,32
 461         addi    rLEN,rLEN,-32
 462         ori     r1,r1,0
 463         ori     r1,r1,0
 464         stw     rCHR,32(rMEMP3)
 465         stw     rCHR,36(rMEMP3)
 466         ori     r1,r1,0
 467         ori     r1,r1,0
 468         stw     rCHR,40(rMEMP3)
 469         stw     rCHR,44(rMEMP3)
 470         ori     r1,r1,0
 471         ori     r1,r1,0
 472         stw     rCHR,48(rMEMP3)
 473         stw     rCHR,52(rMEMP3)
 474         ori     r1,r1,0
 475         ori     r1,r1,0
 476         stw     rCHR,56(rMEMP3)
 477         stw     rCHR,60(rMEMP3)
 478
 479 /* We are here because the length or remainder (rLEN) is less than the
 480    cache line/sector size and does not justify aggressive loop unrolling.
 481    So set up the preconditions for L(medium) and go there.  */
 482         .align 3
 483 L(handletail32):
 484         cmplwi  cr1,rLEN,0
 485         beqlr   cr1
 486         b       L(medium)
 487
 488         .align 4
 489 L(small):
 490 /* Memset of 4 bytes or less.  */
 491         cmplwi  cr5, rLEN, 1
 492         cmplwi  cr1, rLEN, 3
 493         bltlr   cr5
 494         stb     rCHR, 0(rMEMP)
 495         beqlr   cr5
 496         stb     rCHR, 1(rMEMP)
 497         bltlr   cr1
 498         stb     rCHR, 2(rMEMP)
 499         beqlr   cr1
 500         stb     rCHR, 3(rMEMP)
 501         blr
 502
 503 /* Memset of 0-31 bytes.  */
 504         .align 5
 505 L(medium):
 506         cmplwi  cr1, rLEN, 16
 507 L(medium_tail2):
 508         add     rMEMP, rMEMP, rLEN
 509 L(medium_tail):
 510         bt-     31, L(medium_31t)
 511         bt-     30, L(medium_30t)
 512 L(medium_30f):
 513         bt      29, L(medium_29t)
 514 L(medium_29f):
 515         bge     cr1, L(medium_27t)
 516         bflr    28
 517         stw     rCHR, -4(rMEMP)
 518         stw     rCHR, -8(rMEMP)
 519         blr
 520
 521 L(medium_31t):
 522         stbu    rCHR, -1(rMEMP)
 523         bf-     30, L(medium_30f)
 524 L(medium_30t):
 525         sthu    rCHR, -2(rMEMP)
 526         bf-     29, L(medium_29f)
 527 L(medium_29t):
 528         stwu    rCHR, -4(rMEMP)
 529         blt     cr1, L(medium_27f)
 530 L(medium_27t):
 531         stw     rCHR, -4(rMEMP)
 532         stw     rCHR, -8(rMEMP)
 533         stw     rCHR, -12(rMEMP)
 534         stwu    rCHR, -16(rMEMP)
 535 L(medium_27f):
 536         bflr    28
 537 L(medium_28t):
 538         stw     rCHR, -4(rMEMP)
 539         stw     rCHR, -8(rMEMP)
 540         blr
 541 END (BP_SYM (memset))
 542 libc_hidden_builtin_def (memset)