sysdeps/ia64/bzero.S

   1 /* Optimized version of the standard bzero() function.
   2    This file is part of the GNU C Library.
   3    Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
   4    Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   5    Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 /* Return: dest
  23
  24    Inputs:
  25         in0:    dest
  26         in1:    count
  27
  28    The algorithm is fairly straightforward: set byte by byte until we
  29    get to a 16B-aligned address, then loop on 128 B chunks using an
  30    early store as prefetching, then loop on 32B chunks, then clear remaining
  31    words, finally clear remaining bytes.
  32    Since a stf.spill f0 can store 16B in one go, we use this instruction
  33    to get peak speed.  */
  34
  35 #include <sysdep.h>
  36 #undef ret
  37
  38 #define dest            in0     // first argument: start of the area to clear
  39 #define cnt             in1     // second argument: bytes still to clear (counted down)
  40
  41 #define tmp             r31     // scratch: alignment bits, compare limits, loop counts
  42 #define save_lc         r30     // copy of the caller's ar.lc, written back before returning
  43 #define ptr0            r29     // second store pointer of the 128B line loop (.l1bx)
  44 #define ptr1            r28     // main store pointer
  45 #define ptr2            r27     // secondary store pointer (alignment head, loops, short path)
  46 #define ptr3            r26     // address of the last byte in .move_bytes_unaligned
  47 #define ptr9            r24     // store-ahead ("prefetch") pointer, advanced by 128 per store
  48 #define loopcnt         r23     // value loaded into ar.lc for the counted loops
  49 #define linecnt         r22     // cnt >> LSIZE_SH
  50 #define bytecnt         r21     // (MIN1+1) - (dest & MIN1); its bits 3..0 pick st8/st4/st2/st1
  51
  52 // This routine uses only scratch predicate registers (p6 - p15)
  53 #define p_scr           p6      // default register for same-cycle branches
  54 #define p_unalgn        p9      // set when (dest & MIN1) != 0
  55 #define p_y             p11     // p_y / p_n: "yes"/"no" results of a test
  56 #define p_n             p12
  57 #define p_yy            p13     // p_yy / p_nn: second "yes"/"no" pair, so two tests can be live
  58 #define p_nn            p14
  59
  60 #define movi0           mov     // alias for mov; every use in this file moves to/from ar.lc
  61
  62 #define MIN1            15      // low-bit mask for the 16B alignment check
  63 #define MIN1P1HALF      8       // (MIN1+1)/2: offset of the st8 slot below a 16B boundary
  64 #define LINE_SIZE       128     // bytes cleared per iteration of the line loop
  65 #define LSIZE_SH        7                       // shift amount
  66 #define PREF_AHEAD      8       // upper bound on lines touched by the initial store-ahead loop
  67
  68 #define USE_FLP                 // selects the stf8/f0 pair below for `store`/`myval`
  69 #if defined(USE_INT)
  70 #define store           st8     // 8-byte integer store
  71 #define myval           r0      // value stored by `store` in the USE_INT variant
  72 #elif defined(USE_FLP)
  73 #define store           stf8    // 8-byte store from a floating-point register
  74 #define myval           f0      // value stored by `store` in the USE_FLP variant
  75 #endif
  76
  77 .align  64
     // bzero: clear cnt (in1) bytes starting at dest (in0); dest is also
     // copied to ret0.
     //   cnt == 0 : return immediately.
     //   cnt < 16 : .move_bytes_unaligned, using only st1/st2.
     //   else     : clear up to the next 16B boundary (st8/st4/st2/st1), then
     //              whole 128B lines with stf.spill (.pref_l1b / .l1bx), then
     //              32B chunks (.l2), 8B words (.store_words) and finally the
     //              last <8 bytes (.move_bytes_from_alignment).
     // ar.lc is saved in save_lc on entry and written back before the
     // returns at .restore_and_exit and at the end of .move_bytes_unaligned.
     // The size tests use signed compares (cmp.gt / cmp.le) on cnt.
  78 ENTRY(bzero)
  79 { .mmi
  80         .prologue
  81         alloc   tmp = ar.pfs, 2, 0, 0, 0        // frame with 2 inputs only; tmp is overwritten below
  82         lfetch.nt1 [dest]                       // prefetch hint for the start of the area
  83         .save   ar.lc, save_lc
  84         movi0   save_lc = ar.lc                 // keep caller's ar.lc; the loops below overwrite it
  85 } { .mmi
  86         .body
  87         mov     ret0 = dest             // return value
  88         nop.m   0
  89         cmp.eq  p_scr, p0 = cnt, r0             // p_scr = (cnt == 0)
  90 ;; }
  91 { .mmi
  92         and     ptr2 = -(MIN1+1), dest  // aligned address
  93         and     tmp = MIN1, dest        // prepare to check for alignment
  94         tbit.nz p_y, p_n = dest, 0      // Do we have an odd address? (M_B_U)
  95 } { .mib
  96         mov     ptr1 = dest                     // ptr1 = running store pointer
  97         nop.i   0
  98 (p_scr) br.ret.dpnt.many rp             // return immediately if count = 0
  99 ;; }
     // bytecnt = 16 - (dest & 15): distance from dest to the next 16B boundary
     // (16 when dest is already aligned; it is only consulted when p_unalgn).
 100 { .mib
 101         cmp.ne  p_unalgn, p0 = tmp, r0          // p_unalgn = dest is not 16B-aligned
 102 } { .mib                                        // NB: # of bytes to move is 1
 103         sub     bytecnt = (MIN1+1), tmp         //     higher than loopcnt
 104         cmp.gt  p_scr, p0 = 16, cnt             // is it a minimalistic task?
 105 (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
 106 ;; }
     // Alignment head (only when p_unalgn): the bytes between dest and the
     // next 16B boundary are cleared from the boundary downwards.  Bits 3..0
     // of bytecnt select an st8, st4, st2 and st1; after each step ptr2 is
     // moved down (store done) or up (store skipped) by half the previous
     // step, so the next store lands just below what has been cleared so far.
     // When p_unalgn is clear, the .unc compares clear both of their target
     // predicates, so none of the stores or cnt adjustments below execute.
 107 { .mmi
 108 (p_unalgn) add  ptr1 = (MIN1+1), ptr2           // after alignment: ptr1 = next 16B boundary
 109 (p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         // after alignment: ptr2 = boundary - 8
 110 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    // should we do a st8 ?
 111 ;; }
 112 { .mib
 113 (p_y)   add     cnt = -8, cnt
 114 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  // should we do a st4 ?
 115 } { .mib
 116 (p_y)   st8     [ptr2] = r0,-4                  // clear 8 bytes, step down to the st4 slot
 117 (p_n)   add     ptr2 = 4, ptr2                  // no st8: st4 slot is 4 bytes higher instead
 118 ;; }
 119 { .mib
 120 (p_yy)  add     cnt = -4, cnt
 121 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    // should we do a st2 ?
 122 } { .mib
 123 (p_yy)  st4     [ptr2] = r0,-2                  // clear 4 bytes, step down to the st2 slot
 124 (p_nn)  add     ptr2 = 2, ptr2                  // no st4: st2 slot is 2 bytes higher instead
 125 ;; }
 126 { .mmi
 127         mov     tmp = LINE_SIZE+1               // for compare
 128 (p_y)   add     cnt = -2, cnt
 129 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  // should we do a st1 ?
 130 } { .mmi
 131         nop.m   0
 132 (p_y)   st2     [ptr2] = r0,-1                  // clear 2 bytes, step down to the st1 slot
 133 (p_n)   add     ptr2 = 1, ptr2                  // no st2: st1 slot is 1 byte higher instead
 134 ;; }
 135
     // The compare below is in the same instruction group as the final
     // "cnt = cnt - 1", so it sees cnt before that adjustment; this is why
     // the limit is LINE_SIZE+1 rather than LINE_SIZE.
 136 { .mmi
 137 (p_yy)  st1     [ptr2] = r0
 138         cmp.gt  p_scr, p0 = tmp, cnt            // is it a minimalistic task?
 139 } { .mbb
 140 (p_yy)  add     cnt = -1, cnt
 141 (p_scr) br.cond.dpnt.many .fraction_of_line     // go move just a few
 142 ;; }
 143 { .mib
 144         nop.m   0
 145         shr.u   linecnt = cnt, LSIZE_SH         // linecnt = number of whole 128B lines
 146         nop.b   0
 147 ;; }
 148
 149         .align 32
 150 .l1b:   // ------------------//  L1B: store ahead into cache lines; fill later
 151 { .mmi
 152         and     tmp = -(LINE_SIZE), cnt         // compute end of range
 153         mov     ptr9 = ptr1                     // used for prefetching
 154         and     cnt = (LINE_SIZE-1), cnt        // remainder
 155 } { .mmi
 156         mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
 157         cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
 158 ;; }
 159 { .mmi
 160 (p_scr) add     loopcnt = -1, linecnt           // fewer than PREF_AHEAD lines: touch them all
 161         add     ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
 162         add     ptr1 = tmp, ptr1        // first address beyond total range
 163 ;; }
 164 { .mmi
 165         add     tmp = -1, linecnt       // next loop count
 166         movi0   ar.lc = loopcnt
 167 ;; }
     // Store-ahead loop: clears the first 16 bytes of each of the first
     // loopcnt+1 lines, one store per line.
 168 .pref_l1b:
 169 { .mib
 170         stf.spill [ptr9] = f0, 128      // Do stores one cache line apart
 171         nop.i   0
 172         br.cloop.dptk.few .pref_l1b
 173 ;; }
 174 { .mmi
 175         add     ptr0 = 16, ptr2         // Two stores in parallel
 176         movi0   ar.lc = tmp
 177 ;; }
     // Line loop: one 128B line per iteration, linecnt iterations.  ptr2
     // clears offsets 16, 48, 80 and 112 of the line, ptr0 clears 32, 64 and
     // 96 (its last post-increment is 64 so that it lands on offset 32 of
     // the next line).  Offset 0 was, or is, cleared through ptr9.
 178 .l1bx:
 179  { .mmi
 180         stf.spill [ptr2] = f0, 32
 181         stf.spill [ptr0] = f0, 32
 182  ;; }
 183  { .mmi
 184         stf.spill [ptr2] = f0, 32
 185         stf.spill [ptr0] = f0, 32
 186  ;; }
 187  { .mmi
 188         stf.spill [ptr2] = f0, 32
 189         stf.spill [ptr0] = f0, 64
 190         cmp.lt  p_scr, p0 = ptr9, ptr1  // do we need more prefetching?
 191  ;; }
 192 { .mmb
 193         stf.spill [ptr2] = f0, 32
 194 (p_scr) stf.spill [ptr9] = f0, 128      // keep storing ahead while ptr9 is below the end of the lines
 195         br.cloop.dptk.few .l1bx
 196 ;; }
 197 { .mib
 198         cmp.gt  p_scr, p0 = 8, cnt      // just a few bytes left ?
 199 (p_scr) br.cond.dpnt.many  .move_bytes_from_alignment
 200 ;; }
 201
     // Here ptr1 is 16B-aligned and cnt holds what is still to be cleared:
     // at most LINE_SIZE when branched to from the alignment code, less than
     // LINE_SIZE when falling through from the line loop.
 202 .fraction_of_line:
 203 { .mib
 204         add     ptr2 = 16, ptr1         // second pointer, 16 bytes ahead of ptr1
 205         shr.u   loopcnt = cnt, 5        // loopcnt = cnt / 32
 206 ;; }
 207 { .mib
 208         cmp.eq  p_scr, p0 = loopcnt, r0 // tested before the decrement in this same group
 209         add     loopcnt = -1, loopcnt   // ar.lc wants iterations - 1
 210 (p_scr) br.cond.dpnt.many .store_words  // fewer than 32 bytes: skip the 32B loop
 211 ;; }
 212 { .mib
 213         and     cnt = 0x1f, cnt         // compute the remaining cnt
 214         movi0   ar.lc = loopcnt
 215 ;; }
 216         .align 32
     // Each iteration clears 32 bytes: ptr1 covers bytes 0-15 and ptr2 bytes
     // 16-31 of the chunk, 8 bytes per store; the +8/+24 post-increments
     // advance each pointer by 32 in total.
 217 .l2:    // -----------------------------//  L2A:  store 32B in 2 cycles
 218 { .mmb
 219         store   [ptr1] = myval, 8
 220         store   [ptr2] = myval, 8
 221 ;; } { .mmb
 222         store   [ptr1] = myval, 24
 223         store   [ptr2] = myval, 24
 224         br.cloop.dptk.many .l2
 225 ;; }
     // Fewer than 32 bytes left: clear up to three 8-byte words through ptr1.
 226 .store_words:
 227 { .mib
 228         cmp.gt  p_scr, p0 = 8, cnt      // just a few bytes left ?
 229 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment    // Branch
 230 ;; }
 231
 232 { .mmi
 233         store   [ptr1] = myval, 8       // store
 234         cmp.le  p_y, p_n = 16, cnt      // p_y: a second word is needed (cnt read before the subtract)
 235         add     cnt = -8, cnt           // subtract
 236 ;; }
 237 { .mmi
 238 (p_y)   store   [ptr1] = myval, 8       // store
 239 (p_y)   cmp.le.unc p_yy, p_nn = 16, cnt // p_yy: a third word is needed
 240 (p_y)   add     cnt = -8, cnt           // subtract
 241 ;; }
 242 { .mmi                                  // store
 243 (p_yy)  store   [ptr1] = myval, 8
 244 (p_yy)  add     cnt = -8, cnt           // subtract
 245 ;; }
 246
     // Fewer than 8 bytes left at ptr1: bits 2, 1 and 0 of cnt select a
     // trailing st4, st2 and st1.
 247 .move_bytes_from_alignment:
 248 { .mib
 249         cmp.eq  p_scr, p0 = cnt, r0
 250         tbit.nz.unc p_y, p0 = cnt, 2    // should we terminate with a st4 ?
 251 (p_scr) br.cond.dpnt.few .restore_and_exit
 252 ;; }
 253 { .mib
 254 (p_y)   st4     [ptr1] = r0,4
 255         tbit.nz.unc p_yy, p0 = cnt, 1   // should we terminate with a st2 ?
 256 ;; }
 257 { .mib
 258 (p_yy)  st2     [ptr1] = r0,2
 259         tbit.nz.unc p_y, p0 = cnt, 0    // should we terminate with a st1 ?
 260 ;; }
 261
 262 { .mib
 263 (p_y)   st1     [ptr1] = r0
 264 ;; }
 265 .restore_and_exit:
 266 { .mib
 267         nop.m   0
 268         movi0   ar.lc = save_lc         // put the caller's loop count back
 269         br.ret.sptk.many rp
 270 ;; }
 271
     // Short path, reached when cnt is non-zero and below 16.  On entry
     // p_y/p_n still say whether dest is odd (tbit on bit 0 of dest) and
     // ptr1 = dest.  An odd address first gets one st1; after that ptr1 is
     // even, ptr2 = ptr1 + 2, and up to three rounds of two st2 each clear 4
     // bytes per round.  A final st2 and/or st1 finishes the job.
 272 .move_bytes_unaligned:
 273 { .mmi
 274        .pred.rel "mutex",p_y, p_n
 275        .pred.rel "mutex",p_yy, p_nn
 276 (p_n)   cmp.le  p_yy, p_nn = 4, cnt     // even start: are there at least 4 bytes ?
 277 (p_y)   cmp.le  p_yy, p_nn = 5, cnt     // odd start: at least 4 bytes after the st1 ?
 278 (p_n)   add     ptr2 = 2, ptr1
 279 } { .mmi
 280 (p_y)   add     ptr2 = 3, ptr1          // 2 bytes past ptr1 once ptr1 has stepped over the odd byte
 281 (p_y)   st1     [ptr1] = r0, 1          // fill 1 (odd-aligned) byte
 282 (p_y)   add     cnt = -1, cnt           // [15, 14 (or less) left]
 283 ;; }
 284 { .mmi
 285 (p_yy)  cmp.le.unc p_y, p0 = 8, cnt     // p_y: a second 4-byte round is needed
 286         add     ptr3 = ptr1, cnt        // prepare last store
 287         movi0   ar.lc = save_lc         // this path returns directly, so write ar.lc back here
 288 } { .mmi
 289 (p_yy)  st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 290 (p_yy)  st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 291 (p_yy)  add     cnt = -4, cnt           // [11, 10 (or less) left]
 292 ;; }
 293 { .mmi
 294 (p_y)   cmp.le.unc p_yy, p0 = 8, cnt    // p_yy: a third 4-byte round is needed
 295         add     ptr3 = -1, ptr3         // last store
 296         tbit.nz p_scr, p0 = cnt, 1      // will there be a st2 at the end ?
 297 } { .mmi
 298 (p_y)   st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 299 (p_y)   st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 300 (p_y)   add     cnt = -4, cnt           // [7, 6 (or less) left]
 301 ;; }
 302 { .mmi
 303 (p_yy)  st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 304 (p_yy)  st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 305                                         // [3, 2 (or less) left]
 306         tbit.nz p_y, p0 = cnt, 0        // will there be a st1 at the end ?
 307 } { .mmi
 308 (p_yy)  add     cnt = -4, cnt
 309 ;; }
 310 { .mmb
 311 (p_scr) st2     [ptr1] = r0             // fill 2 (aligned) bytes
 312 (p_y)   st1     [ptr3] = r0             // fill last byte (using ptr3)
 313         br.ret.sptk.many rp
 314 ;; }
 315 END(bzero)