sysdeps/powerpc/powerpc32/memset.S

   1 /* Optimized memset implementation for PowerPC.
   2    Copyright (C) 1997, 1999, 2000, 2003, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
  25    Returns 's'.
  26
  27    The memset is done in four sizes: byte (8 bits), word (32 bits),
  28    32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
  29    There is a special case for setting whole cache lines to 0, which
  30    takes advantage of the dcbz instruction.  */
  31
  32         .section        ".text"
  33 EALIGN (BP_SYM (memset), 5, 1)   /* entry point; presumably aligned to 2^5 bytes, the base the "Nth instruction from .align" notes count from -- confirm in sysdep.h */
  34
  35 #define rTMP    r0      /* scratch; holds the saved LR in L(checklinesize) when SHARED */
  36 #define rRTN    r3      /* initial value of 1st argument */
  37 #if __BOUNDED_POINTERS__
  38 # define rMEMP0 r4      /* original value of 1st arg */
  39 # define rCHR   r5      /* char to set in each byte */
  40 # define rLEN   r6      /* length of region to set */
  41 # define rMEMP  r10     /* address at which we are storing */
  42 #else
  43 # define rMEMP0 r3      /* original value of 1st arg */
  44 # define rCHR   r4      /* char to set in each byte */
  45 # define rLEN   r5      /* length of region to set */
  46 # define rMEMP  r6      /* address at which we are storing */
  47 #endif
  48 #define rALIGN  r7      /* number of bytes we are setting now (when aligning) */
  49 #define rMEMP2  r8      /* store pointer used while aligning to a 32-byte boundary */
  50
  51 #define rPOS32  r7      /* constant +32 for clearing with dcbz (same register as rALIGN) */
  52 #define rNEG64  r8      /* constant -64 for clearing with dcbz (same register as rMEMP2 and rCLS) */
  53 #define rNEG32  r9      /* constant -32 for clearing with dcbz (same register as rGOT and rCLM) */
  54
  55 #define rGOT    r9      /* Address of the Global Offset Table.  */
  56 #define rCLS    r8      /* Cache line size obtained from static.  */
  57 #define rCLM    r9      /* Cache line size mask to check for cache alignment.  */
  58
  59 #if __BOUNDED_POINTERS__
  60         cmplwi  cr1, rRTN, 0    /* cr1.eq if r3 is zero: the return value/bounds stores are skipped */
  61         CHECK_BOUNDS_BOTH_WIDE (rMEMP0, rTMP, rTMP2, rLEN)      /* NOTE(review): rTMP2 has no #define among the aliases above -- confirm where it comes from */
  62         beq     cr1, L(b0)
  63         STORE_RETURN_VALUE (rMEMP0)
  64         STORE_RETURN_BOUNDS (rTMP, rTMP2)
  65 L(b0):
  66 #endif
  67
  68 /* take care of case for size <= 4  */
  69         cmplwi  cr1, rLEN, 4
  70         andi.   rALIGN, rMEMP0, 3       /* rALIGN = address & 3; cr0.eq if already word aligned */
  71         mr      rMEMP, rMEMP0
  72         ble-    cr1, L(small)
  73 /* align to word boundary  */
  74         cmplwi  cr5, rLEN, 31   /* cr5 is tested at L(aligned), after rLEN has been adjusted */
  75         rlwimi  rCHR, rCHR, 8, 16, 23   /* copy the fill byte into bits 16-23: low halfword = cc */
  76         beq+    L(aligned)      /* 8th instruction from .align */
  77         mtcrf   0x01, rMEMP0    /* cr7 = low 4 bits of the start address */
  78         subfic  rALIGN, rALIGN, 4       /* rALIGN = bytes up to the next word boundary (1..3) */
  79         add     rMEMP, rMEMP, rALIGN
  80         sub     rLEN, rLEN, rALIGN
  81         bf+     31, L(g0)       /* even address (2 mod 4): a single halfword store suffices */
  82         stb     rCHR, 0(rMEMP0) /* odd address: store one byte first */
  83         bt      30, L(aligned)  /* address was 3 mod 4: that byte reached the boundary */
  84 L(g0):  sth     rCHR, -2(rMEMP) /* 16th instruction from .align */
  85 /* take care of case for size <= 31 (cr5 was set from the length before word alignment) */
  86 L(aligned):
  87         mtcrf   0x01, rLEN      /* cr7 = low 4 bits of rLEN, tested by L(medium_tail) */
  88         rlwimi  rCHR, rCHR, 16, 0, 15   /* copy low halfword into high halfword: rCHR = cccccccc */
  89         ble     cr5, L(medium)
  90 /* align to a 32-byte boundary (the cache line size is not yet known here)...  */
  91         andi.   rALIGN, rMEMP, 0x1C     /* word-granular offset inside the 32-byte block; cr0.eq if zero */
  92         subfic  rALIGN, rALIGN, 0x20    /* rALIGN = bytes up to the next 32-byte boundary */
  93         beq     L(caligned)     /* cr0 from andi.: already 32-byte aligned */
  94         mtcrf   0x01, rALIGN    /* cr7 bits 28 and 29 = the 8 and 4 bits of rALIGN */
  95         add     rMEMP, rMEMP, rALIGN
  96         sub     rLEN, rLEN, rALIGN
  97         cmplwi  cr1, rALIGN, 0x10       /* cr1: does rALIGN include 16 bytes? */
  98         mr      rMEMP2, rMEMP   /* rMEMP2 walks downwards from the aligned address */
  99         bf      28, L(a1)       /* skip unless the 8 bit of rALIGN is set */
 100         stw     rCHR, -4(rMEMP2)
 101         stwu    rCHR, -8(rMEMP2)        /* update form: rMEMP2 -= 8 */
 102 L(a1):  blt     cr1, L(a2)      /* skip unless rALIGN >= 16 */
 103         stw     rCHR, -4(rMEMP2) /* 32nd instruction from .align */
 104         stw     rCHR, -8(rMEMP2)
 105         stw     rCHR, -12(rMEMP2)
 106         stwu    rCHR, -16(rMEMP2)       /* update form: rMEMP2 -= 16 */
 107 L(a2):  bf      29, L(caligned) /* skip unless the 4 bit of rALIGN is set */
 108         stw     rCHR, -4(rMEMP2)
 109 /* now aligned to a 32-byte boundary.  */
 110 L(caligned):
 111         cmplwi  cr1, rCHR, 0    /* cr1.eq if the replicated fill word is zero */
 112         clrrwi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32; cr0.eq if no full block */
 113         mtcrf   0x01, rLEN      /* 40th instruction from .align */
 114
 115 /* Check if we can use the special case for clearing memory using dcbz.
 116    This requires that we know the correct cache line size for this
 117    processor.  Getting the __cache_line_size may require establishing GOT
 118    addressability, so branch out of line to set this up.  */
 119         beq     cr1, L(checklinesize)
 120
 121 /* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
 122    Can't assume that rCHR is zero or that the cache line size is either
 123    32-bytes or even known.  Expects rALIGN = byte count of the full
 124    32-byte blocks and cr0 = result of comparing that count with zero.
 125    The blocks are filled from the highest address downwards.  */
 124 L(nondcbz):
 125         srwi    rTMP, rALIGN, 5 /* number of 32-byte blocks */
 126         mtctr   rTMP
 127         beq     L(medium)       /* we may not actually get to do a full line */
 128         clrlwi. rLEN, rLEN, 27  /* rLEN = tail bytes (rLEN mod 32); cr0.eq is tested by beqlr below */
 129         add     rMEMP, rMEMP, rALIGN    /* point just past the last full block */
 130         li      rNEG64, -0x40   /* offset for the dcbtst in the loop */
 131         bdz     L(cloopdone)    /* 48th instruction from .align */
 132
 133 /* We can't use dcbz here as we don't know the cache line size.  We can
 134    use "data cache block touch for store", which is safe.  */
 135 L(c3):  dcbtst  rNEG64, rMEMP   /* hint for address rMEMP-64, below the block stored in this pass */
 136         stw     rCHR, -4(rMEMP)
 137         stw     rCHR, -8(rMEMP)
 138         stw     rCHR, -12(rMEMP)
 139         stw     rCHR, -16(rMEMP)
 140         nop                     /* let 601 fetch last 4 instructions of loop */
 141         stw     rCHR, -20(rMEMP)
 142         stw     rCHR, -24(rMEMP) /* 56th instruction from .align */
 143         nop                     /* let 601 fetch first 8 instructions of loop */
 144         stw     rCHR, -28(rMEMP)
 145         stwu    rCHR, -32(rMEMP)        /* update form: rMEMP -= 32 */
 146         bdnz    L(c3)
 147 L(cloopdone):
 148         stw     rCHR, -4(rMEMP)
 149         stw     rCHR, -8(rMEMP)
 150         stw     rCHR, -12(rMEMP)
 151         stw     rCHR, -16(rMEMP) /* 64th instruction from .align */
 152         stw     rCHR, -20(rMEMP)
 153         cmplwi  cr1, rLEN, 16   /* cr1 is consumed by L(medium_tail2) */
 154         stw     rCHR, -24(rMEMP)
 155         stw     rCHR, -28(rMEMP)
 156         stwu    rCHR, -32(rMEMP)        /* rMEMP is now back at the first block */
 157         beqlr                   /* cr0 from clrlwi. above: no tail bytes, done */
 158         add     rMEMP, rMEMP, rALIGN    /* move to the end of the blocks again for the tail */
 159         b       L(medium_tail2) /* 72nd instruction from .align */
 160
 161         .align  5
 162         nop
 163 /* Clear cache lines of memory in 128-byte chunks.
 164    This code is optimized for processors with 32-byte cache lines.
 165    It is further optimized for the 601 processor, which requires
 166    some care in how the code is aligned in the i-cache.  */
 167 L(zloopstart):
 168         clrlwi  rLEN, rLEN, 27  /* rLEN = tail bytes (rLEN mod 32) */
 169         mtcrf   0x02, rALIGN    /* cr6 = bits 24-27 of rALIGN; bits 25 and 26 are its 64 and 32 bits */
 170         srwi.   rTMP, rALIGN, 7 /* number of 128-byte chunks; cr0.eq if there are none */
 171         mtctr   rTMP
 172         li      rPOS32, 0x20    /* rPOS32 is r7, so rALIGN is no longer available from here on */
 173         li      rNEG64, -0x40
 174         cmplwi  cr1, rLEN, 16   /* 8 */
 175         bf      26, L(z0)       /* skip unless the 32 bit of rALIGN is set */
 176         dcbz    0, rMEMP
 177         addi    rMEMP, rMEMP, 0x20
 178 L(z0):  li      rNEG32, -0x20
 179         bf      25, L(z1)       /* skip unless the 64 bit of rALIGN is set */
 180         dcbz    0, rMEMP
 181         dcbz    rPOS32, rMEMP
 182         addi    rMEMP, rMEMP, 0x40 /* 16 */
 183 L(z1):  cmplwi  cr5, rLEN, 0    /* cr5.eq if there are no tail bytes */
 184         beq     L(medium)       /* cr0 from srwi. above: no 128-byte chunks to clear */
 185 L(zloop):
 186         dcbz    0, rMEMP
 187         dcbz    rPOS32, rMEMP
 188         addi    rMEMP, rMEMP, 0x80
 189         dcbz    rNEG64, rMEMP   /* third 32-byte line of the chunk (new rMEMP - 64) */
 190         dcbz    rNEG32, rMEMP   /* fourth 32-byte line of the chunk (new rMEMP - 32) */
 191         bdnz    L(zloop)
 192         beqlr   cr5             /* no tail bytes: done */
 193         b       L(medium_tail2) /* cr1 was already set from rLEN above */
 194
 195         .align  5
 196 L(small):
 197 /* Memset of 4 bytes or less.  Only byte stores are used.  */
 198         cmplwi  cr5, rLEN, 1
 199         cmplwi  cr1, rLEN, 3
 200         bltlr   cr5             /* length 0: nothing to store */
 201         stb     rCHR, 0(rMEMP)
 202         beqlr   cr5             /* length 1 */
 203         nop
 204         stb     rCHR, 1(rMEMP)
 205         bltlr   cr1             /* length 2 */
 206         stb     rCHR, 2(rMEMP)
 207         beqlr   cr1             /* length 3 */
 208         nop
 209         stb     rCHR, 3(rMEMP)
 210         blr                     /* length 4 */
 211
 212 /* Memset of 0-31 bytes.  Stores run backwards from rMEMP + rLEN: CR bits 31, 30, 29 and 28 select the 1, 2, 4 and 8 byte stores and cr1 (count compared with 16) selects the 16 byte store.  */
 213         .align  5
 214 L(medium):
 215         cmplwi  cr1, rLEN, 16
 216 L(medium_tail2):                /* entered here when cr1 has already been set */
 217         add     rMEMP, rMEMP, rLEN      /* rMEMP = one past the last byte to set */
 218 L(medium_tail):
 219         bt-     31, L(medium_31t)
 220         bt-     30, L(medium_30t)
 221 L(medium_30f):
 222         bt-     29, L(medium_29t)
 223 L(medium_29f):
 224         bge-    cr1, L(medium_27t)
 225         bflr-   28              /* return if no 8-byte piece is needed */
 226         stw     rCHR, -4(rMEMP) /* 8th instruction from .align */
 227         stw     rCHR, -8(rMEMP)
 228         blr
 229
 230 L(medium_31t):
 231         stbu    rCHR, -1(rMEMP) /* 1 byte; update form moves rMEMP down */
 232         bf-     30, L(medium_30f)
 233 L(medium_30t):
 234         sthu    rCHR, -2(rMEMP) /* 2 bytes */
 235         bf-     29, L(medium_29f)
 236 L(medium_29t):
 237         stwu    rCHR, -4(rMEMP) /* 4 bytes */
 238         blt-    cr1, L(medium_27f) /* 16th instruction from .align */
 239 L(medium_27t):
 240         stw     rCHR, -4(rMEMP)
 241         stw     rCHR, -8(rMEMP)
 242         stw     rCHR, -12(rMEMP)
 243         stwu    rCHR, -16(rMEMP)        /* 16 bytes in total; rMEMP -= 16 */
 244 L(medium_27f):
 245         bflr-   28              /* return if no 8-byte piece is needed */
 246 L(medium_28t):
 247         stw     rCHR, -4(rMEMP)
 248         stw     rCHR, -8(rMEMP)
 249         blr
 250
 251 L(checklinesize):
 252 #ifdef SHARED
 253         mflr    rTMP            /* save LR: the bcl/bl below overwrites it */
 254 /* If the remaining length is less than 32 bytes then don't bother getting
 255    the cache line size.  */
 256         beq     L(medium)       /* cr0 still holds the result of clrrwi. at L(caligned) */
 257 /* Establishes GOT addressability so we can load __cache_line_size
 258    from static. This value was set from the aux vector during startup.  */
 259 # ifdef HAVE_ASM_PPC_REL16
 260         bcl     20,31,1f        /* branch-and-link to the next instruction to get its address into LR */
 261 1:      mflr    rGOT            /* rGOT = address of label 1 */
 262         addis   rGOT,rGOT,__cache_line_size-1b@ha
 263         lwz     rCLS,__cache_line_size-1b@l(rGOT)
 264 # else
 265         bl      _GLOBAL_OFFSET_TABLE_@local-4   /* presumably calls the word just before the GOT so that LR ends up holding the GOT address -- confirm against the ABI */
 266         mflr    rGOT
 267         lwz     rGOT,__cache_line_size@got(rGOT)
 268         lwz     rCLS,0(rGOT)
 269 # endif
 270         mtlr    rTMP            /* restore the saved LR */
 271 #else
 272 /* Load __cache_line_size from static. This value was set from the
 273    aux vector during startup.  */
 274         lis     rCLS,__cache_line_size@ha
 275 /* If the remaining length is less than 32 bytes then don't bother getting
 276    the cache line size.  */
 277         beq     L(medium)       /* cr0 still holds the result of clrrwi. at L(caligned) */
 278         lwz     rCLS,__cache_line_size@l(rCLS)
 279 #endif
 280
 281 /* If the cache line size was not set then go to L(nondcbz), which is
 282    safe for any cache line size.  */
 283         cmplwi  cr1,rCLS,0
 284         beq     cr1,L(nondcbz)
 285
 286 /* If the cache line size is 32 bytes then go to L(zloopstart),
 287    which is coded specifically for 32-byte lines (and 601).  */
 288         cmplwi  cr1,rCLS,32
 289         beq     cr1,L(zloopstart)
 290
 291 /* Now we know the cache line size and it is not 32-bytes.  However
 292    we may not yet be aligned to the cache line and may have a partial
 293    line to fill.  Touch it first to fetch the cache line.  */
 294         dcbtst  0,rMEMP
 295
 296         addi    rCLM,rCLS,-1    /* offset mask; presumably relies on rCLS being a power of two */
 297 L(getCacheAligned):
 298         cmplwi  cr1,rLEN,32
 299         and.    rTMP,rCLM,rMEMP /* cr0.eq if rMEMP is on a cache line boundary */
 300         blt     cr1,L(handletail32)
 301         beq     L(cacheAligned)
 302 /* We are not aligned to start of a cache line yet.  Store 32-byte
 303    of data and test again.  */
 304         addi    rMEMP,rMEMP,32
 305         addi    rLEN,rLEN,-32
 306         stw     rCHR,-32(rMEMP)
 307         stw     rCHR,-28(rMEMP)
 308         stw     rCHR,-24(rMEMP)
 309         stw     rCHR,-20(rMEMP)
 310         stw     rCHR,-16(rMEMP)
 311         stw     rCHR,-12(rMEMP)
 312         stw     rCHR,-8(rMEMP)
 313         stw     rCHR,-4(rMEMP)
 314         b       L(getCacheAligned)
 315
 316 /* Now we are aligned to the cache line and can use dcbz.  */
 317 L(cacheAligned):
 318         cmplw   cr1,rLEN,rCLS
 319         blt     cr1,L(handletail32)     /* less than one whole line left */
 320         dcbz    0,rMEMP
 321         subf    rLEN,rCLS,rLEN  /* rLEN -= rCLS */
 322         add     rMEMP,rMEMP,rCLS
 323         b       L(cacheAligned)
 324
 325 /* We are here because the cache line size was set, it was not
 326    32-bytes, and the remainder (rLEN) is now less than the actual cache
 327    line size.  Set up the preconditions for L(nondcbz) and go there to
 328    store the remaining bytes.  */
 329 L(handletail32):
 330         clrrwi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32, and cr0, as L(nondcbz) tests them */
 331         b       L(nondcbz)
 332
 333 END (BP_SYM (memset))
 334 libc_hidden_builtin_def (memset)