sysdeps/ia64/memset.S

   1 /* Optimized version of the standard memset() function.
   2    This file is part of the GNU C Library.
   3    Copyright (C) 2000-2018 Free Software Foundation, Inc.
   4    Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   5    Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 /* Return: dest
  22
  23    Inputs:
  24         in0:    dest
  25         in1:    value
  26         in2:    count
  27
  28    The algorithm is fairly straightforward: set byte by byte until we
  29    get to a 16B-aligned address, then loop on 128 B chunks using an
  30    early store as prefetching, then loop on 32B chunks, then clear remaining
  31    words, finally clear remaining bytes.
  32    Since a stf.spill f0 can store 16B in one go, we use this instruction
  33    to get peak speed when value = 0.  */
  34
  35 #include <sysdep.h>
  36 #undef ret
  37
  38 #define dest            in0
  39 #define value           in1
  40 #define cnt             in2
  41
  42 #define tmp             r31
  43 #define save_lc         r30
  44 #define ptr0            r29
  45 #define ptr1            r28
  46 #define ptr2            r27
  47 #define ptr3            r26
  48 #define ptr9            r24
  49 #define loopcnt         r23
  50 #define linecnt         r22
  51 #define bytecnt         r21
  52
  53 #define fvalue          f6
  54
  55 // This routine uses only scratch predicate registers (p6 - p15)
  56 #define p_scr           p6                      // default register for same-cycle branches
  57 #define p_nz            p7
  58 #define p_zr            p8
  59 #define p_unalgn        p9
  60 #define p_y             p11
  61 #define p_n             p12
  62 #define p_yy            p13
  63 #define p_nn            p14
  64
  65 #define movi0           mov
  66
  67 #define MIN1            15
  68 #define MIN1P1HALF      8
  69 #define LINE_SIZE       128
  70 #define LSIZE_SH        7                       // shift amount
  71 #define PREF_AHEAD      8
  72
  73 #define USE_FLP
  74 #if defined(USE_INT)
  75 #define store           st8
  76 #define myval           value
  77 #elif defined(USE_FLP)
  78 #define store           stf8
  79 #define myval           fvalue
  80 #endif
  81
  82 .align  64
  83 ENTRY(memset)
  84 { .mmi
  85         .prologue
  86         alloc   tmp = ar.pfs, 3, 0, 0, 0
  87         lfetch.nt1 [dest]
  88         .save   ar.lc, save_lc
  89         movi0   save_lc = ar.lc
  90 } { .mmi
  91         .body
  92         mov     ret0 = dest             // return value
  93         cmp.ne  p_nz, p_zr = value, r0  // use stf.spill if value is zero
  94         cmp.eq  p_scr, p0 = cnt, r0
  95 ;; }
  96 { .mmi
  97         and     ptr2 = -(MIN1+1), dest  // aligned address
  98         and     tmp = MIN1, dest        // prepare to check for alignment
  99         tbit.nz p_y, p_n = dest, 0      // Do we have an odd address? (M_B_U)
 100 } { .mib
 101         mov     ptr1 = dest
 102         mux1    value = value, @brcst   // create 8 identical bytes in word
 103 (p_scr) br.ret.dpnt.many rp             // return immediately if count = 0
 104 ;; }
 105 { .mib
 106         cmp.ne  p_unalgn, p0 = tmp, r0
 107 } { .mib                                // NB: # of bytes to move is 1 higher
 108         sub     bytecnt = (MIN1+1), tmp //     than loopcnt
 109         cmp.gt  p_scr, p0 = 16, cnt             // is it a minimalistic task?
 110 (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
 111 ;; }
 112 { .mmi
 113 (p_unalgn) add  ptr1 = (MIN1+1), ptr2           // after alignment
 114 (p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         // after alignment
 115 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    // should we do a st8 ?
 116 ;; }
 117 { .mib
 118 (p_y)   add     cnt = -8, cnt
 119 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  // should we do a st4 ?
 120 } { .mib
 121 (p_y)   st8     [ptr2] = value, -4
 122 (p_n)   add     ptr2 = 4, ptr2
 123 ;; }
 124 { .mib
 125 (p_yy)  add     cnt = -4, cnt
 126 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    // should we do a st2 ?
 127 } { .mib
 128 (p_yy)  st4     [ptr2] = value, -2
 129 (p_nn)  add     ptr2 = 2, ptr2
 130 ;; }
 131 { .mmi
 132         mov     tmp = LINE_SIZE+1               // for compare
 133 (p_y)   add     cnt = -2, cnt
 134 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  // should we do a st1 ?
 135 } { .mmi
 136         setf.sig fvalue=value                   // transfer value to FLP side
 137 (p_y)   st2     [ptr2] = value, -1
 138 (p_n)   add     ptr2 = 1, ptr2
 139 ;; }
 140
 141 { .mmi
 142 (p_yy)  st1     [ptr2] = value
 143         cmp.gt  p_scr, p0 = tmp, cnt            // is it a minimalistic task?
 144 } { .mbb
 145 (p_yy)  add     cnt = -1, cnt
 146 (p_scr) br.cond.dpnt.many .fraction_of_line     // go move just a few
 147 ;; }
 148
 149 { .mib
 150         nop.m 0
 151         shr.u   linecnt = cnt, LSIZE_SH
 152 (p_zr)  br.cond.dptk.many .l1b                  // Jump to use stf.spill
 153 ;; }
 154
 155 #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
 156         .align 32 // -------- //  L1A: store ahead into cache lines; fill later
 157 #endif
 158 { .mmi
 159         and     tmp = -(LINE_SIZE), cnt         // compute end of range
 160         mov     ptr9 = ptr1                     // used for prefetching
 161         and     cnt = (LINE_SIZE-1), cnt        // remainder
 162 } { .mmi
 163         mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
 164         cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
 165 ;; }
 166 { .mmi
 167 (p_scr) add     loopcnt = -1, linecnt           // start of stores
 168         add     ptr2 = 8, ptr1                  // (beyond prefetch stores)
 169         add     ptr1 = tmp, ptr1                // first address beyond total
 170 ;; }                                            // range
 171 { .mmi
 172         add     tmp = -1, linecnt               // next loop count
 173         movi0   ar.lc = loopcnt
 174 ;; }
 175 .pref_l1a:
 176 { .mib
 177         store [ptr9] = myval, 128       // Do stores one cache line apart
 178         nop.i   0
 179         br.cloop.dptk.few .pref_l1a
 180 ;; }
 181 { .mmi
 182         add     ptr0 = 16, ptr2         // Two stores in parallel
 183         movi0   ar.lc = tmp
 184 ;; }
 185 .l1ax:
 186  { .mmi
 187         store [ptr2] = myval, 8
 188         store [ptr0] = myval, 8
 189  ;; }
 190  { .mmi
 191         store [ptr2] = myval, 24
 192         store [ptr0] = myval, 24
 193  ;; }
 194  { .mmi
 195         store [ptr2] = myval, 8
 196         store [ptr0] = myval, 8
 197  ;; }
 198  { .mmi
 199         store [ptr2] = myval, 24
 200         store [ptr0] = myval, 24
 201  ;; }
 202  { .mmi
 203         store [ptr2] = myval, 8
 204         store [ptr0] = myval, 8
 205  ;; }
 206  { .mmi
 207         store [ptr2] = myval, 24
 208         store [ptr0] = myval, 24
 209  ;; }
 210  { .mmi
 211         store [ptr2] = myval, 8
 212         store [ptr0] = myval, 32
 213         cmp.lt  p_scr, p0 = ptr9, ptr1          // do we need more prefetching?
 214  ;; }
 215 { .mmb
 216         store [ptr2] = myval, 24
 217 (p_scr) store [ptr9] = myval, 128
 218         br.cloop.dptk.few .l1ax
 219 ;; }
 220 { .mbb
 221         cmp.le  p_scr, p0 = 8, cnt              // just a few bytes left ?
 222 (p_scr) br.cond.dpnt.many  .fraction_of_line    // Branch no. 2
 223         br.cond.dpnt.many  .move_bytes_from_alignment   // Branch no. 3
 224 ;; }
 225
 226 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
 227         { nop 0 }
 228 #else
 229         .align 32
 230 #endif
 231 .l1b:   // ------------------ //  L1B: store ahead into cache lines; fill later
 232 { .mmi
 233         and     tmp = -(LINE_SIZE), cnt         // compute end of range
 234         mov     ptr9 = ptr1                     // used for prefetching
 235         and     cnt = (LINE_SIZE-1), cnt        // remainder
 236 } { .mmi
 237         mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
 238         cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
 239 ;; }
 240 { .mmi
 241 (p_scr) add     loopcnt = -1, linecnt
 242         add     ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
 243         add     ptr1 = tmp, ptr1        // first address beyond total range
 244 ;; }
 245 { .mmi
 246         add     tmp = -1, linecnt       // next loop count
 247         movi0   ar.lc = loopcnt
 248 ;; }
 249 .pref_l1b:
 250 { .mib
 251         stf.spill [ptr9] = f0, 128      // Do stores one cache line apart
 252         nop.i   0
 253         br.cloop.dptk.few .pref_l1b
 254 ;; }
 255 { .mmi
 256         add     ptr0 = 16, ptr2         // Two stores in parallel
 257         movi0   ar.lc = tmp
 258 ;; }
 259 .l1bx:
 260  { .mmi
 261         stf.spill [ptr2] = f0, 32
 262         stf.spill [ptr0] = f0, 32
 263  ;; }
 264  { .mmi
 265         stf.spill [ptr2] = f0, 32
 266         stf.spill [ptr0] = f0, 32
 267  ;; }
 268  { .mmi
 269         stf.spill [ptr2] = f0, 32
 270         stf.spill [ptr0] = f0, 64
 271         cmp.lt  p_scr, p0 = ptr9, ptr1  // do we need more prefetching?
 272  ;; }
 273 { .mmb
 274         stf.spill [ptr2] = f0, 32
 275 (p_scr) stf.spill [ptr9] = f0, 128
 276         br.cloop.dptk.few .l1bx
 277 ;; }
 278 { .mib
 279         cmp.gt  p_scr, p0 = 8, cnt      // just a few bytes left ?
 280 (p_scr) br.cond.dpnt.many  .move_bytes_from_alignment
 281 ;; }
 282
 283 .fraction_of_line:
 284 { .mib
 285         add     ptr2 = 16, ptr1
 286         shr.u   loopcnt = cnt, 5        // loopcnt = cnt / 32
 287 ;; }
 288 { .mib
 289         cmp.eq  p_scr, p0 = loopcnt, r0
 290         add     loopcnt = -1, loopcnt
 291 (p_scr) br.cond.dpnt.many store_words
 292 ;; }
 293 { .mib
 294         and     cnt = 0x1f, cnt         // compute the remaining cnt
 295         movi0   ar.lc = loopcnt
 296 ;; }
 297 #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
 298         .align 32
 299 #endif
 300 .l2:    // ---------------------------- //  L2A:  store 32B in 2 cycles
 301 { .mmb
 302         store   [ptr1] = myval, 8
 303         store   [ptr2] = myval, 8
 304 ;; } { .mmb
 305         store   [ptr1] = myval, 24
 306         store   [ptr2] = myval, 24
 307         br.cloop.dptk.many .l2
 308 ;; }
 309 store_words:
 310 { .mib
 311         cmp.gt  p_scr, p0 = 8, cnt              // just a few bytes left ?
 312 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment    // Branch
 313 ;; }
 314
 315 { .mmi
 316         store   [ptr1] = myval, 8               // store
 317         cmp.le  p_y, p_n = 16, cnt              //
 318         add     cnt = -8, cnt                   // subtract
 319 ;; }
 320 { .mmi
 321 (p_y)   store   [ptr1] = myval, 8               // store
 322 (p_y)   cmp.le.unc p_yy, p_nn = 16, cnt         //
 323 (p_y)   add     cnt = -8, cnt                   // subtract
 324 ;; }
 325 { .mmi                                          // store
 326 (p_yy)  store   [ptr1] = myval, 8               //
 327 (p_yy)  add     cnt = -8, cnt                   // subtract
 328 ;; }
 329
 330 .move_bytes_from_alignment:
 331 { .mib
 332         cmp.eq  p_scr, p0 = cnt, r0
 333         tbit.nz.unc p_y, p0 = cnt, 2    // should we terminate with a st4 ?
 334 (p_scr) br.cond.dpnt.few .restore_and_exit
 335 ;; }
 336 { .mib
 337 (p_y)   st4     [ptr1] = value, 4
 338         tbit.nz.unc p_yy, p0 = cnt, 1   // should we terminate with a st2 ?
 339 ;; }
 340 { .mib
 341 (p_yy)  st2     [ptr1] = value, 2
 342         tbit.nz.unc p_y, p0 = cnt, 0
 343 ;; }
 344
 345 { .mib
 346 (p_y)   st1     [ptr1] = value
 347 ;; }
 348 .restore_and_exit:
 349 { .mib
 350         nop.m   0
 351         movi0   ar.lc = save_lc
 352         br.ret.sptk.many rp
 353 ;; }
 354
 355 .move_bytes_unaligned:
 356 { .mmi
 357        .pred.rel "mutex",p_y, p_n
 358        .pred.rel "mutex",p_yy, p_nn
 359 (p_n)   cmp.le  p_yy, p_nn = 4, cnt
 360 (p_y)   cmp.le  p_yy, p_nn = 5, cnt
 361 (p_n)   add     ptr2 = 2, ptr1
 362 } { .mmi
 363 (p_y)   add     ptr2 = 3, ptr1
 364 (p_y)   st1     [ptr1] = value, 1       // fill 1 (odd-aligned) byte
 365 (p_y)   add     cnt = -1, cnt           // [15, 14 (or less) left]
 366 ;; }
 367 { .mmi
 368 (p_yy)  cmp.le.unc p_y, p0 = 8, cnt
 369         add     ptr3 = ptr1, cnt        // prepare last store
 370         movi0   ar.lc = save_lc
 371 } { .mmi
 372 (p_yy)  st2     [ptr1] = value, 4       // fill 2 (aligned) bytes
 373 (p_yy)  st2     [ptr2] = value, 4       // fill 2 (aligned) bytes
 374 (p_yy)  add     cnt = -4, cnt           // [11, 10 (o less) left]
 375 ;; }
 376 { .mmi
 377 (p_y)   cmp.le.unc p_yy, p0 = 8, cnt
 378         add     ptr3 = -1, ptr3         // last store
 379         tbit.nz p_scr, p0 = cnt, 1      // will there be a st2 at the end ?
 380 } { .mmi
 381 (p_y)   st2     [ptr1] = value, 4       // fill 2 (aligned) bytes
 382 (p_y)   st2     [ptr2] = value, 4       // fill 2 (aligned) bytes
 383 (p_y)   add     cnt = -4, cnt           // [7, 6 (or less) left]
 384 ;; }
 385 { .mmi
 386 (p_yy)  st2     [ptr1] = value, 4       // fill 2 (aligned) bytes
 387 (p_yy)  st2     [ptr2] = value, 4       // fill 2 (aligned) bytes
 388                                         // [3, 2 (or less) left]
 389         tbit.nz p_y, p0 = cnt, 0        // will there be a st1 at the end ?
 390 } { .mmi
 391 (p_yy)  add     cnt = -4, cnt
 392 ;; }
 393 { .mmb
 394 (p_scr) st2     [ptr1] = value          // fill 2 (aligned) bytes
 395 (p_y)   st1     [ptr3] = value          // fill last byte (using ptr3)
 396         br.ret.sptk.many rp
 397 ;; }
 398 END(memset)
 399 libc_hidden_builtin_def (memset)