sysdeps/powerpc/powerpc32/power6/memset.S

   1 /* Optimized 32-bit memset implementation for POWER6.
   2    Copyright (C) 1997,99,2000,02,03,06,2007,2009 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include <bp-sym.h>
  21 #include <bp-asm.h>
  22
  23 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
  24    Returns 's'.
  25
  26    The memset is done in three sizes: byte (8 bits), word (32 bits),
  27    cache line (1024 bits). There is a special case for setting cache lines
  28    to 0, to take advantage of the dcbz instruction.  */
  29
  30         .machine power6
  31 EALIGN (BP_SYM (memset), 7, 0)
  32         CALL_MCOUNT
  33
  34 #define rTMP    r0      /* Scratch; receives the andi. cache-line offset tests.  */
  35 #define rRTN    r3      /* Initial value of 1st argument.  */
  36 #define rMEMP0  r3      /* Original value of 1st arg.  */
  37 #define rCHR    r4      /* Char to set in each byte.  */
  38 #define rLEN    r5      /* Length of region to set.  */
  39 #define rMEMP   r6      /* Address at which we are storing.  */
  40 #define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
  41 #define rMEMP2  r8      /* Scratch pointer while aligning; later the dcbz index (128).  */
  42
  43 #define rNEG64  r8      /* Constant -64 for clearing with dcbz.  Same register as rMEMP2; not referenced in this routine.  */
  44 #define rMEMP3  r9      /* Alt mem pointer.  */
  45 L(_memset):
  46 /* Take care of case for size <= 4.  */
  47         cmplwi  cr1, rLEN, 4            /* cr1: n compared with 4.  */
  48         andi.   rALIGN, rMEMP0, 3       /* rALIGN = s & 3; cr0 EQ when s is word aligned.  */
  49         mr      rMEMP, rMEMP0           /* Work on a copy; r3 is never written, so 's' is returned as is.  */
  50         ble-    cr1, L(small)
  51 /* Align to word boundary.  */
  52         cmplwi  cr5, rLEN, 31           /* cr5 is tested at L(aligned); note it sees n before the alignment bytes are subtracted.  */
  53         rlwimi  rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
  54         beq+    L(aligned)              /* cr0 from the andi. above: already word aligned.  */
  55         mtcrf   0x01, rMEMP0            /* cr7 = low 4 bits of the address.  */
  56         subfic  rALIGN, rALIGN, 4       /* rALIGN = 4 - (s & 3): bytes up to the next word.  */
  57         add     rMEMP, rMEMP, rALIGN    /* rMEMP = first word-aligned address.  */
  58         sub     rLEN, rLEN, rALIGN
  59         bf+     31, L(g0)               /* Address even (so s & 3 == 2): a halfword suffices.  */
  60         stb     rCHR, 0(rMEMP0)         /* Odd address: store the leading byte.  */
  61         bt      30, L(aligned)          /* s & 3 == 3: that single byte reached the boundary.  */
  62 L(g0):
  63         sth     rCHR, -2(rMEMP)         /* Halfword immediately below the word boundary.  */
  64
  65         .align 4
  66 /* Handle the case of size <= 31.  */
  67 L(aligned):
  68         mtcrf   0x01, rLEN              /* cr7 = low 4 bits of rLEN, consumed by L(medium).  */
  69         rlwimi  rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
  70         ble     cr5, L(medium)          /* cr5 was set from the compare with 31 at entry.  */
  71 /* Align to 32-byte boundary.  */
  72         andi.   rALIGN, rMEMP, 0x1C     /* Word offset inside the 32-byte sector; cr0 EQ if already aligned.  */
  73         subfic  rALIGN, rALIGN, 0x20    /* rALIGN = 32 - offset: bytes up to the next sector.  */
  74         beq     L(caligned)
  75         mtcrf   0x01, rALIGN            /* cr7 bit 28 = 8-byte piece, bit 29 = 4-byte piece of rALIGN.  */
  76         add     rMEMP, rMEMP, rALIGN    /* rMEMP = 32-byte aligned address; the gap is filled backwards from it.  */
  77         sub     rLEN, rLEN, rALIGN
  78         cmplwi  cr1, rALIGN, 0x10       /* cr1: is there a 16-byte piece?  */
  79         mr      rMEMP2, rMEMP           /* rMEMP2 walks down through the gap.  */
  80         bf      28, L(a1)               /* No 8-byte piece.  */
  81         stw     rCHR, -4(rMEMP2)
  82         stwu    rCHR, -8(rMEMP2)        /* Store and move rMEMP2 down by 8.  */
  83         nop
  84 L(a1):  blt     cr1, L(a2)              /* rALIGN < 16: no 16-byte piece.  */
  85         stw     rCHR, -4(rMEMP2)
  86         stw     rCHR, -8(rMEMP2)
  87         stw     rCHR, -12(rMEMP2)
  88         stwu    rCHR, -16(rMEMP2)       /* Store and move rMEMP2 down by 16.  */
  89 L(a2):  bf      29, L(caligned)         /* No 4-byte piece.  */
  90         stw     rCHR, -4(rMEMP2)
  91
  92         .align 3
  93 /* Now aligned to a 32 byte boundary.  */
  94 L(caligned):
  95         cmplwi  cr1, rCHR, 0            /* cr1 EQ when the fill word is zero.  */
  96         clrrwi. rALIGN, rLEN, 5         /* rLEN rounded down to a multiple of 32; cr0 EQ if under 32 bytes remain.  */
  97         mtcrf   0x01, rLEN              /* Reload cr7 with the low 4 bits of rLEN for the tail code in L(medium).  */
  98         beq     cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
  99 L(nondcbz):
 100         beq     L(medium)       /* We may not actually get to do a full line.  */
 101         nop
 102 /* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
 103    boundary but may not be at cache line (128-byte) boundary.  */
 104 L(nzloopstart):
 105 /* memset in 32-byte chunks until we get to a cache line boundary.
 106    If rLEN is less than the distance to the next cache-line boundary use
 107    cacheAligned1 code to finish the tail.  */
 108         cmplwi  cr1,rLEN,128            /* cr1: fewer than 128 bytes left?  */
 109
 110         andi.   rTMP,rMEMP,127          /* cr0 EQ when rMEMP is on a 128-byte boundary.  */
 111         blt     cr1,L(cacheAligned1)
 112         addi    rMEMP3,rMEMP,32         /* rMEMP3 = start of the second 32-byte chunk.  */
 113         beq     L(nzCacheAligned)
 114         addi    rLEN,rLEN,-32           /* First chunk: bytes 0..31.  */
 115         stw     rCHR,0(rMEMP)
 116         stw     rCHR,4(rMEMP)
 117         stw     rCHR,8(rMEMP)
 118         stw     rCHR,12(rMEMP)
 119         stw     rCHR,16(rMEMP)
 120         stw     rCHR,20(rMEMP)
 121         addi    rMEMP,rMEMP,32
 122         andi.   rTMP,rMEMP3,127         /* Is the advanced position cache-line aligned?  */
 123         stw     rCHR,-8(rMEMP3)         /* Bytes 24..31 of the first chunk, addressed from rMEMP3.  */
 124         stw     rCHR,-4(rMEMP3)
 125
 126         beq     L(nzCacheAligned)
 127         addi    rLEN,rLEN,-32           /* Second chunk: 0..28(rMEMP3).  */
 128         stw     rCHR,0(rMEMP3)
 129         stw     rCHR,4(rMEMP3)
 130         addi    rMEMP,rMEMP,32
 131         stw     rCHR,8(rMEMP3)
 132         stw     rCHR,12(rMEMP3)
 133         andi.   rTMP,rMEMP,127          /* Re-test alignment after the second chunk.  */
 134         stw     rCHR,16(rMEMP3)
 135         stw     rCHR,20(rMEMP3)
 136         stw     rCHR,24(rMEMP3)
 137         stw     rCHR,28(rMEMP3)
 138
 139         beq     L(nzCacheAligned)
 140         addi    rLEN,rLEN,-32           /* Third chunk: 32..60(rMEMP3).  */
 141 /* At this point we can overrun the store queue (pipe reject) so it is
 142    time to slow things down. The store queue can merge two adjacent
 143    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 144    So we add "group ending nops" to guarantee that we dispatch only two
 145    stores every other cycle. */
 146         ori     r1,r1,0
 147         ori     r1,r1,0
 148         stw     rCHR,32(rMEMP3)
 149         stw     rCHR,36(rMEMP3)
 150         addi    rMEMP,rMEMP,32          /* Three chunks from a 32-byte aligned start: rMEMP is now 128-byte aligned.  */
 151         cmplwi  cr1,rLEN,128            /* Is a full cache line still left?  */
 152         ori     r1,r1,0
 153         ori     r1,r1,0
 154         stw     rCHR,40(rMEMP3)
 155         stw     rCHR,44(rMEMP3)
 156         ori     r1,r1,0
 157         ori     r1,r1,0
 158         stw     rCHR,48(rMEMP3)
 159         stw     rCHR,52(rMEMP3)
 160         ori     r1,r1,0
 161         ori     r1,r1,0
 162         stw     rCHR,56(rMEMP3)
 163         stw     rCHR,60(rMEMP3)
 164         blt     cr1,L(cacheAligned1)
 165         b       L(nzCacheAligned)       /* Jump over the alignment padding that follows.  */
 166
 167 /* Now we are aligned to the cache line and can use dcbtst.  */
 168         .align 5
 169 L(nzCacheAligned):
 170         cmplwi  cr1,rLEN,128            /* cr1: less than one cache line left?  */
 171         cmplwi  cr6,rLEN,256            /* cr6: less than two cache lines left?  */
 172         blt     cr1,L(cacheAligned1)
 173         blt     cr6,L(nzCacheAligned128) /* Target is the next label; both outcomes continue there.  */
 174         .align 4
 175 L(nzCacheAligned128):
 176         nop
 177         addi    rMEMP3,rMEMP,64         /* rMEMP3 = second half of this 128-byte line.  */
 178         stw     rCHR,0(rMEMP)
 179         stw     rCHR,4(rMEMP)
 180         stw     rCHR,8(rMEMP)
 181         stw     rCHR,12(rMEMP)
 182         stw     rCHR,16(rMEMP)
 183         stw     rCHR,20(rMEMP)
 184         stw     rCHR,24(rMEMP)
 185         stw     rCHR,28(rMEMP)
 186         stw     rCHR,32(rMEMP)
 187         stw     rCHR,36(rMEMP)
 188         stw     rCHR,40(rMEMP)
 189         stw     rCHR,44(rMEMP)
 190         stw     rCHR,48(rMEMP)
 191         stw     rCHR,52(rMEMP)
 192         stw     rCHR,56(rMEMP)
 193         stw     rCHR,60(rMEMP)
 194         addi    rMEMP,rMEMP3,64         /* rMEMP = start of the next cache line.  */
 195         addi    rLEN,rLEN,-128
 196 /* At this point we can overrun the store queue (pipe reject) so it is
 197    time to slow things down. The store queue can merge two adjacent
 198    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 199    So we add "group ending nops" to guarantee that we dispatch only one
 200    store per cycle. */
 201         stw     rCHR,0(rMEMP3)
 202         ori     r1,r1,0
 203         stw     rCHR,4(rMEMP3)
 204         ori     r1,r1,0
 205         stw     rCHR,8(rMEMP3)
 206         ori     r1,r1,0
 207         stw     rCHR,12(rMEMP3)
 208         ori     r1,r1,0
 209         stw     rCHR,16(rMEMP3)
 210         ori     r1,r1,0
 211         stw     rCHR,20(rMEMP3)
 212         ori     r1,r1,0
 213         stw     rCHR,24(rMEMP3)
 214         ori     r1,r1,0
 215         stw     rCHR,28(rMEMP3)
 216         ori     r1,r1,0
 217         stw     rCHR,32(rMEMP3)
 218         ori     r1,r1,0
 219         stw     rCHR,36(rMEMP3)
 220         ori     r1,r1,0
 221         stw     rCHR,40(rMEMP3)
 222         ori     r1,r1,0
 223         stw     rCHR,44(rMEMP3)
 224         ori     r1,r1,0
 225         stw     rCHR,48(rMEMP3)
 226         ori     r1,r1,0
 227         stw     rCHR,52(rMEMP3)
 228         ori     r1,r1,0
 229         stw     rCHR,56(rMEMP3)
 230         ori     r1,r1,0
 231         stw     rCHR,60(rMEMP3)
 232         blt     cr6,L(cacheAligned1)    /* cr6 predates the decrement: under 256 then, so under 128 now.  */
 233 #ifndef NOT_IN_libc
 234         lfd     0,-128(rMEMP)           /* f0 = first 8 bytes of the line just stored, i.e. the fill pattern.  */
 235 #endif
 236         b       L(nzCacheAligned256)    /* Jump over the alignment padding that follows.  */
 237         .align 5
 238 L(nzCacheAligned256):
 239         cmplwi  cr1,rLEN,256            /* cr1 (pre-decrement): will a full line remain after this pass?  */
 240         addi    rMEMP3,rMEMP,64         /* rMEMP3 = second half of this 128-byte line.  */
 241 #ifdef NOT_IN_libc
 242 /* When we are not in libc we should use only GPRs to avoid the FPU lock
 243    interrupt.  */
 244         stw     rCHR,0(rMEMP)
 245         stw     rCHR,4(rMEMP)
 246         stw     rCHR,8(rMEMP)
 247         stw     rCHR,12(rMEMP)
 248         stw     rCHR,16(rMEMP)
 249         stw     rCHR,20(rMEMP)
 250         stw     rCHR,24(rMEMP)
 251         stw     rCHR,28(rMEMP)
 252         stw     rCHR,32(rMEMP)
 253         stw     rCHR,36(rMEMP)
 254         stw     rCHR,40(rMEMP)
 255         stw     rCHR,44(rMEMP)
 256         stw     rCHR,48(rMEMP)
 257         stw     rCHR,52(rMEMP)
 258         stw     rCHR,56(rMEMP)
 259         stw     rCHR,60(rMEMP)
 260         addi    rMEMP,rMEMP3,64         /* rMEMP = start of the next cache line.  */
 261         addi    rLEN,rLEN,-128
 262         stw     rCHR,0(rMEMP3)
 263         stw     rCHR,4(rMEMP3)
 264         stw     rCHR,8(rMEMP3)
 265         stw     rCHR,12(rMEMP3)
 266         stw     rCHR,16(rMEMP3)
 267         stw     rCHR,20(rMEMP3)
 268         stw     rCHR,24(rMEMP3)
 269         stw     rCHR,28(rMEMP3)
 270         stw     rCHR,32(rMEMP3)
 271         stw     rCHR,36(rMEMP3)
 272         stw     rCHR,40(rMEMP3)
 273         stw     rCHR,44(rMEMP3)
 274         stw     rCHR,48(rMEMP3)
 275         stw     rCHR,52(rMEMP3)
 276         stw     rCHR,56(rMEMP3)
 277         stw     rCHR,60(rMEMP3)
 278 #else
 279 /* We are in libc and this is a long memset so we can use FPRs and can afford
 280    occasional FPU locked interrupts.  */
 281         stfd    0,0(rMEMP)              /* f0 carries 8 bytes of fill pattern, loaded by the lfd before the loop.  */
 282         stfd    0,8(rMEMP)
 283         stfd    0,16(rMEMP)
 284         stfd    0,24(rMEMP)
 285         stfd    0,32(rMEMP)
 286         stfd    0,40(rMEMP)
 287         stfd    0,48(rMEMP)
 288         stfd    0,56(rMEMP)
 289         addi    rMEMP,rMEMP3,64         /* rMEMP = start of the next cache line.  */
 290         addi    rLEN,rLEN,-128
 291         stfd    0,0(rMEMP3)
 292         stfd    0,8(rMEMP3)
 293         stfd    0,16(rMEMP3)
 294         stfd    0,24(rMEMP3)
 295         stfd    0,32(rMEMP3)
 296         stfd    0,40(rMEMP3)
 297         stfd    0,48(rMEMP3)
 298         stfd    0,56(rMEMP3)
 299 #endif
 300         bge     cr1,L(nzCacheAligned256) /* At least 128 bytes still to go: another full line.  */
 301         dcbtst  0,rMEMP                 /* Touch-for-store hint on the line holding the tail.  */
 302         b       L(cacheAligned1)
 303
 304         .align 4
 305 /* Storing a zero "c" value. We are aligned at a sector (32-byte)
 306    boundary but may not be at cache line (128-byte) boundary.  If the
 307    remaining length spans a full cache line we can use the Data cache
 308    block zero instruction. */
 309 L(zloopstart):
 310 /* memset in 32-byte chunks until we get to a cache line boundary.
 311    If rLEN is less than the distance to the next cache-line boundary use
 312    cacheAligned1 code to finish the tail.  */
 313         cmplwi  cr1,rLEN,128            /* cr1: fewer than 128 bytes left?  */
 314         beq     L(medium)               /* cr0 still comes from the clrrwi. in L(caligned): under 32 bytes left.  */
 315 L(getCacheAligned):
 316         andi.   rTMP,rMEMP,127          /* cr0 EQ when rMEMP is on a 128-byte boundary.  */
 317         blt     cr1,L(cacheAligned1)
 318         addi    rMEMP3,rMEMP,32         /* rMEMP3 = start of the second 32-byte chunk.  */
 319         beq     L(cacheAligned)
 320         addi    rLEN,rLEN,-32           /* First chunk: bytes 0..31.  */
 321         stw     rCHR,0(rMEMP)
 322         stw     rCHR,4(rMEMP)
 323         stw     rCHR,8(rMEMP)
 324         stw     rCHR,12(rMEMP)
 325         stw     rCHR,16(rMEMP)
 326         stw     rCHR,20(rMEMP)
 327         addi    rMEMP,rMEMP,32
 328         andi.   rTMP,rMEMP3,127         /* Is the advanced position cache-line aligned?  */
 329         stw     rCHR,-8(rMEMP3)         /* Bytes 24..31 of the first chunk, addressed from rMEMP3.  */
 330         stw     rCHR,-4(rMEMP3)
 331 L(getCacheAligned2):
 332         beq     L(cacheAligned)
 333         addi    rLEN,rLEN,-32           /* Second chunk: 0..28(rMEMP3).  */
 334         addi    rMEMP,rMEMP,32
 335         stw     rCHR,0(rMEMP3)
 336         stw     rCHR,4(rMEMP3)
 337         stw     rCHR,8(rMEMP3)
 338         stw     rCHR,12(rMEMP3)
 339         andi.   rTMP,rMEMP,127          /* Re-test alignment after the second chunk.  */
 340         nop
 341         stw     rCHR,16(rMEMP3)
 342         stw     rCHR,20(rMEMP3)
 343         stw     rCHR,24(rMEMP3)
 344         stw     rCHR,28(rMEMP3)
 345 L(getCacheAligned3):
 346         beq     L(cacheAligned)
 347 /* At this point we can overrun the store queue (pipe reject) so it is
 348    time to slow things down. The store queue can merge two adjacent
 349    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 350    So we add "group ending nops" to guarantee that we dispatch only two
 351    stores every other cycle. */
 352         addi    rLEN,rLEN,-32           /* Third chunk: 32..60(rMEMP3).  */
 353         ori     r1,r1,0
 354         ori     r1,r1,0
 355         stw     rCHR,32(rMEMP3)
 356         stw     rCHR,36(rMEMP3)
 357         addi    rMEMP,rMEMP,32          /* Three chunks from a 32-byte aligned start: rMEMP is now 128-byte aligned.  */
 358         cmplwi  cr1,rLEN,128            /* cr1/cr6 are set up as L(cacheAlignedx) expects.  */
 359         ori     r1,r1,0
 360         stw     rCHR,40(rMEMP3)
 361         stw     rCHR,44(rMEMP3)
 362         cmplwi  cr6,rLEN,256
 363         li      rMEMP2,128              /* Index register for the second dcbz of a pair.  */
 364         ori     r1,r1,0
 365         stw     rCHR,48(rMEMP3)
 366         stw     rCHR,52(rMEMP3)
 367         ori     r1,r1,0
 368         ori     r1,r1,0
 369         stw     rCHR,56(rMEMP3)
 370         stw     rCHR,60(rMEMP3)
 371         blt     cr1,L(cacheAligned1)
 372         blt     cr6,L(cacheAligned128)
 373         b       L(cacheAlignedx)
 374
 375 /* Now we are aligned to the cache line and can use dcbz.  */
 376         .align 4
 377 L(cacheAligned):
 378         cmplwi  cr1,rLEN,128            /* cr1: less than one cache line left?  */
 379         cmplwi  cr6,rLEN,256            /* cr6: less than two cache lines left?  */
 380         blt     cr1,L(cacheAligned1)
 381         li      rMEMP2,128              /* Index register for the second dcbz of a pair.  */
 382 L(cacheAlignedx):
 383         cmplwi  cr5,rLEN,640            /* cr5: long enough for the simple one-line loop?  */
 384         blt     cr6,L(cacheAligned128)  /* 128..255 bytes: exactly one line to zero.  */
 385         bgt     cr5,L(cacheAligned512)  /* More than 640 bytes.  */
 386         cmplwi  cr6,rLEN,512            /* 256..640 bytes: zero two lines, then dispatch on what is left.  */
 387         dcbz    0,rMEMP
 388         cmplwi  cr1,rLEN,384
 389         dcbz    rMEMP2,rMEMP            /* Line at rMEMP + 128.  */
 390         addi    rMEMP,rMEMP,256
 391         addi    rLEN,rLEN,-256
 392         blt     cr1,L(cacheAligned1)    /* Was under 384: less than one line remains.  */
 393         blt     cr6,L(cacheAligned128)  /* Was under 512: exactly one full line remains.  */
 394         b       L(cacheAligned256)      /* Was 512..640: at least two lines remain.  */
 395         .align 5
 396 /* A simple loop for the longer (>640 bytes) lengths.  This form limits
 397    the branch mispredictions to exactly 1, at loop exit.  Each pass zeroes one 128-byte cache line with dcbz.  */
 398 L(cacheAligned512):
 399         cmplwi  cr1,rLEN,128            /* Word compare via the extended mnemonic used everywhere else in this file; bare cmpli needs an explicit L operand.  */
 400         blt     cr1,L(cacheAligned1)    /* Less than a full line left: finish with word stores.  */
 401         dcbz    0,rMEMP                 /* Zero the cache line at rMEMP.  */
 402         addi    rLEN,rLEN,-128
 403         addi    rMEMP,rMEMP,128
 404         b       L(cacheAligned512)
 405         .align 5
 406 L(cacheAligned256):
 407         cmplwi  cr6,rLEN,512            /* cr6 (pre-decrement): will two more lines remain after this pass?  */
 408         dcbz    0,rMEMP
 409         cmplwi  cr1,rLEN,384            /* cr1 (pre-decrement): will one more line remain after this pass?  */
 410         dcbz    rMEMP2,rMEMP            /* Line at rMEMP + rMEMP2; rMEMP2 was loaded with 128 earlier.  */
 411         addi    rMEMP,rMEMP,256
 412         addi    rLEN,rLEN,-256
 413         bge     cr6,L(cacheAligned256)  /* At least 256 bytes left: zero another pair of lines.  */
 414         blt     cr1,L(cacheAligned1)    /* Under 128 left: go finish the tail.  */
 415         .align 4
 416 L(cacheAligned128):
 417         dcbz    0,rMEMP                 /* Exactly one full line left to zero.  */
 418         addi    rMEMP,rMEMP,128
 419         addi    rLEN,rLEN,-128          /* Falls through into L(cacheAligned1).  */
 420         .align 4
 421 L(cacheAligned1):
 422         cmplwi  cr1,rLEN,32             /* Up to three 32-byte chunks are stored here, then the sub-32 tail.  */
 423         blt     cr1,L(handletail32)
 424         addi    rMEMP3,rMEMP,32         /* rMEMP3 = start of the second 32-byte chunk.  */
 425         addi    rLEN,rLEN,-32           /* First chunk: bytes 0..31.  */
 426         stw     rCHR,0(rMEMP)
 427         stw     rCHR,4(rMEMP)
 428         stw     rCHR,8(rMEMP)
 429         stw     rCHR,12(rMEMP)
 430         stw     rCHR,16(rMEMP)
 431         stw     rCHR,20(rMEMP)
 432         addi    rMEMP,rMEMP,32
 433         cmplwi  cr1,rLEN,32             /* Another full chunk left?  */
 434         stw     rCHR,-8(rMEMP3)         /* Bytes 24..31 of the first chunk, addressed from rMEMP3.  */
 435         stw     rCHR,-4(rMEMP3)
 436 L(cacheAligned2):
 437         blt     cr1,L(handletail32)
 438         addi    rLEN,rLEN,-32           /* Second chunk: 0..28(rMEMP3).  */
 439         stw     rCHR,0(rMEMP3)
 440         stw     rCHR,4(rMEMP3)
 441         stw     rCHR,8(rMEMP3)
 442         stw     rCHR,12(rMEMP3)
 443         addi    rMEMP,rMEMP,32
 444         cmplwi  cr1,rLEN,32             /* Another full chunk left?  */
 445         stw     rCHR,16(rMEMP3)
 446         stw     rCHR,20(rMEMP3)
 447         stw     rCHR,24(rMEMP3)
 448         stw     rCHR,28(rMEMP3)
 449         nop
 450 L(cacheAligned3):
 451         blt     cr1,L(handletail32)
 452 /* At this point we can overrun the store queue (pipe reject) so it is
 453    time to slow things down. The store queue can merge two adjacent
 454    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 455    So we add "group ending nops" to guarantee that we dispatch only two
 456    stores every other cycle. */
 457         ori     r1,r1,0
 458         ori     r1,r1,0
 459         addi    rMEMP,rMEMP,32
 460         addi    rLEN,rLEN,-32           /* Third chunk: 32..60(rMEMP3).  */
 461         ori     r1,r1,0
 462         ori     r1,r1,0
 463         stw     rCHR,32(rMEMP3)
 464         stw     rCHR,36(rMEMP3)
 465         ori     r1,r1,0
 466         ori     r1,r1,0
 467         stw     rCHR,40(rMEMP3)
 468         stw     rCHR,44(rMEMP3)
 469         ori     r1,r1,0
 470         ori     r1,r1,0
 471         stw     rCHR,48(rMEMP3)
 472         stw     rCHR,52(rMEMP3)
 473         ori     r1,r1,0
 474         ori     r1,r1,0
 475         stw     rCHR,56(rMEMP3)
 476         stw     rCHR,60(rMEMP3)         /* Falls through into L(handletail32).  */
 477
 478 /* We are here because the length or remainder (rLEN) is less than the
 479    cache line/sector size and does not justify aggressive loop unrolling.
 480    So set up the preconditions for L(medium) and go there.  */
 481         .align 3
 482 L(handletail32):
 483         cmplwi  cr1,rLEN,0              /* Nothing left at all?  */
 484         beqlr   cr1                     /* Then return; r3 still holds 's'.  */
 485         b       L(medium)               /* L(medium) recomputes cr1 and relies on cr7 = low 4 bits of rLEN, set earlier by mtcrf.  */
 486
 487         .align 4
 488 L(small):
 489 /* Memset of 4 bytes or less.  */
 490         cmplwi  cr5, rLEN, 1            /* cr5: n compared with 1.  */
 491         cmplwi  cr1, rLEN, 3            /* cr1: n compared with 3.  */
 492         bltlr   cr5                     /* n == 0: nothing to store.  */
 493         stb     rCHR, 0(rMEMP)
 494         beqlr   cr5                     /* n == 1.  */
 495         stb     rCHR, 1(rMEMP)
 496         bltlr   cr1                     /* n == 2.  */
 497         stb     rCHR, 2(rMEMP)
 498         beqlr   cr1                     /* n == 3.  */
 499         stb     rCHR, 3(rMEMP)          /* n == 4.  */
 500         blr
 501
 502 /* Memset of 0-31 bytes.  */
 503         .align 5
 504 L(medium):
 505         cmplwi  cr1, rLEN, 16           /* cr1: is a 16-byte piece needed?  */
 506 L(medium_tail2):
 507         add     rMEMP, rMEMP, rLEN      /* rMEMP = one past the end; every store below works backwards from it.  */
 508 L(medium_tail):
 509         bt-     31, L(medium_31t)       /* cr7 bit 31 (rLEN & 1): 1-byte piece.  */
 510         bt-     30, L(medium_30t)       /* cr7 bit 30 (rLEN & 2): 2-byte piece.  */
 511 L(medium_30f):
 512         bt      29, L(medium_29t)       /* cr7 bit 29 (rLEN & 4): 4-byte piece.  */
 513 L(medium_29f):
 514         bge     cr1, L(medium_27t)      /* rLEN >= 16: 16-byte piece.  */
 515         bflr    28                      /* cr7 bit 28 (rLEN & 8) clear: done.  */
 516         stw     rCHR, -4(rMEMP)         /* Final 8-byte piece.  */
 517         stw     rCHR, -8(rMEMP)
 518         blr
 519
 520 L(medium_31t):
 521         stbu    rCHR, -1(rMEMP)         /* Store a byte and move rMEMP down by 1.  */
 522         bf-     30, L(medium_30f)
 523 L(medium_30t):
 524         sthu    rCHR, -2(rMEMP)         /* Store a halfword and move rMEMP down by 2.  */
 525         bf-     29, L(medium_29f)
 526 L(medium_29t):
 527         stwu    rCHR, -4(rMEMP)         /* Store a word and move rMEMP down by 4.  */
 528         blt     cr1, L(medium_27f)      /* rLEN < 16: skip the 16-byte piece.  */
 529 L(medium_27t):
 530         stw     rCHR, -4(rMEMP)
 531         stw     rCHR, -8(rMEMP)
 532         stw     rCHR, -12(rMEMP)
 533         stwu    rCHR, -16(rMEMP)        /* Store and move rMEMP down by 16.  */
 534 L(medium_27f):
 535         bflr    28                      /* No 8-byte piece: done.  */
 536 L(medium_28t):
 537         stw     rCHR, -4(rMEMP)         /* Final 8-byte piece.  */
 538         stw     rCHR, -8(rMEMP)
 539         blr
 540 END (BP_SYM (memset))
 541 libc_hidden_builtin_def (memset)