sysdeps/aarch64/memset.S

   1 /* Copyright (C) 2012-2016 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 /* Assumptions:
  20  *
  21  * ARMv8-a, AArch64
  22  * Unaligned accesses
  23  *
  24  */
  25
  26 #include <sysdep.h>
  27
  28 /* By default we assume that the DC ZVA instruction can be used to zero
  29    data blocks more efficiently.  In some circumstances this might be
  30    unsafe, for example in an asymmetric multiprocessor environment with
  31    different DC clear lengths (neither the upper nor lower lengths are
  32    safe to use).  The feature can be disabled by defining DONT_USE_DC.
  33
  34    If code may be run in a virtualized environment, then define
  35    MAYBE_VIRT.  This will cause the code to cache the system register
  36    values rather than re-reading them each call.  */
  37
  38 #define dstin           x0      /* Destination argument; never written, so it is still intact at every RET.  */
  39 #define val             w1      /* Fill value argument; only its low 8 bits are used.  */
  40 #define count           x2      /* Byte count argument; decremented as bytes are written.  */
  41 #define tmp1            x3      /* Scratch (64-bit view).  */
  42 #define tmp1w           w3      /* Scratch (32-bit view of tmp1).  */
  43 #define tmp2            x4      /* Scratch, mostly "bytes needed to reach alignment".  */
  44 #define tmp2w           w4      /* 32-bit view of tmp2; not referenced by the routine below.  */
  45 #define zva_len_x       x5      /* DC ZVA block length in bytes (64-bit view).  */
  46 #define zva_len         w5      /* DC ZVA block length in bytes (32-bit view).  */
  47 #define zva_bits_x      x6      /* zva_len_x - 1, used as an alignment/residue mask.  */
  48
  49 #define A_l             x7      /* Fill byte replicated across all 64 bits.  */
  50 #define A_lw            w7      /* 32-bit view of A_l.  */
  51 #define dst             x8      /* Moving store pointer (working copy of dstin).  */
  52 #define tmp3w           w9      /* Holds the constant 4 that is shifted to form zva_len.  */
  53
  54 ENTRY_ALIGN (__memset, 6)
     /* Fill COUNT bytes starting at DSTIN with the low 8 bits of VAL.

        DSTIN (x0) is never written, so it still holds the original pointer
        at every RET.  DST is the moving store pointer and A_l holds the fill
        byte replicated eight times.  Tails are finished with 16-byte stores
        that may overlap bytes already written; only L(tail15tiny) avoids
        touching anything outside the requested range on its own.

        Unless DONT_USE_DC is defined, a zero fill with at least 128 bytes
        left after 16-byte alignment tries to clear whole blocks with DC ZVA.

        All length comparisons are signed (b.ge / b.le / b.lt), so COUNT is
        assumed to be below 2^63.  */
  55
  56         mov     dst, dstin              /* Preserve return value.  */
  57         ands    A_lw, val, #255         /* Keep only the fill byte; Z is set when it is zero.  */
  58 #ifndef DONT_USE_DC
  59         b.eq    L(zero_mem)             /* Zero fill: consider the DC ZVA path.  */
  60 #endif
  61         orr     A_lw, A_lw, A_lw, lsl #8        /* Byte  -> 16 bits.  */
  62         orr     A_lw, A_lw, A_lw, lsl #16       /* 16    -> 32 bits.  */
  63         orr     A_l, A_l, A_l, lsl #32          /* 32    -> 64 bits.  */
  64 L(tail_maybe_long):
  65         cmp     count, #64
  66         b.ge    L(not_short)            /* 64 bytes or more: use the bulk loop.  */
  67 L(tail_maybe_tiny):
  68         cmp     count, #15
  69         b.le    L(tail15tiny)           /* 15 bytes or fewer: exact-size stores only.  */
  70 L(tail63):
     /* Only the low 6 bits of COUNT are meaningful from here on; some
        callers arrive with a negative, loop-biased COUNT.  */
  71         ands    tmp1, count, #0x30      /* tmp1 = whole 16-byte chunks, in bytes (0/16/32/48).  */
  72         b.eq    L(tail15)
  73         add     dst, dst, tmp1          /* Step past the chunks, then store backwards from DST.  */
  74         cmp     tmp1w, #0x20
  75         b.eq    1f                      /* 32 bytes: two stores.  */
  76         b.lt    2f                      /* 16 bytes: one store.  */
  77         stp     A_l, A_l, [dst, #-48]   /* 48 bytes: three stores.  */
  78 1:
  79         stp     A_l, A_l, [dst, #-32]
  80 2:
  81         stp     A_l, A_l, [dst, #-16]
  82
  83 L(tail15):
     /* One 16-byte store that ends exactly at the last byte to set.  It
        starts up to 16 bytes below DST, so this path relies on those
        bytes belonging to the region (and already having been written).  */
  84         and     count, count, #15
  85         add     dst, dst, count
  86         stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
  87         RET
  88
  89 L(tail15tiny):
  90         /* Set up to 15 bytes.  Does not assume earlier memory
  91            being set.  */
  92         tbz     count, #3, 1f           /* Bit 3 of COUNT set: store 8 bytes.  */
  93         str     A_l, [dst], #8
  94 1:
  95         tbz     count, #2, 1f           /* Bit 2 set: store 4 bytes.  */
  96         str     A_lw, [dst], #4
  97 1:
  98         tbz     count, #1, 1f           /* Bit 1 set: store 2 bytes.  */
  99         strh    A_lw, [dst], #2
 100 1:
 101         tbz     count, #0, 1f           /* Bit 0 set: store the final byte.  */
 102         strb    A_lw, [dst]
 103 1:
 104         RET
 105
 106         /* Critical loop.  Start at a new cache line boundary.  Assuming
 107          * 64 bytes per line, this ensures the entire loop is in one line.  */
 108         .p2align 6
 109 L(not_short):
 110         neg     tmp2, dst
 111         ands    tmp2, tmp2, #15         /* tmp2 = bytes needed to reach 16-byte alignment.  */
 112         b.eq    2f                      /* Already aligned.  */
 113         /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
 114          * more than that to set, so we simply store 16 bytes and advance by
 115          * the amount required to reach alignment.  */
 116         sub     count, count, tmp2
 117         stp     A_l, A_l, [dst]
 118         add     dst, dst, tmp2
 119         /* There may be fewer than 64 bytes to go now.  */
 120         cmp     count, #63
 121         b.le    L(tail63)
 122 2:
 123         sub     dst, dst, #16           /* Pre-bias.  */
 124         sub     count, count, #64       /* Bias COUNT so the loop ends when it turns negative.  */
 125 1:
 126         stp     A_l, A_l, [dst, #16]
 127         stp     A_l, A_l, [dst, #32]
 128         stp     A_l, A_l, [dst, #48]
 129         stp     A_l, A_l, [dst, #64]!   /* Pre-index with writeback: DST += 64.  */
 130         subs    count, count, #64
 131         b.ge    1b
 132         tst     count, #0x3f            /* Any residue left in the low 6 bits of COUNT?  */
 133         add     dst, dst, #16           /* Undo the pre-bias; ADD leaves the TST flags intact.  */
 134         b.ne    L(tail63)
 135         RET
 136
 137 #ifndef DONT_USE_DC
 138         /* For zeroing memory, check to see if we can use the ZVA feature to
 139          * zero entire 'cache' lines.  */
 140 L(zero_mem):
 141         mov     A_l, #0                 /* A_lw is already zero here (the ANDS set Z).  */
 142         cmp     count, #63
 143         b.le    L(tail_maybe_tiny)
 144         neg     tmp2, dst
 145         ands    tmp2, tmp2, #15         /* tmp2 = bytes needed to reach 16-byte alignment.  */
 146         b.eq    1f
 147         sub     count, count, tmp2
 148         stp     A_l, A_l, [dst]         /* Covers the unaligned head; DST then moves up by tmp2.  */
 149         add     dst, dst, tmp2
 150         cmp     count, #63
 151         b.le    L(tail63)
 152 1:
 153         /* For zeroing small amounts of memory, it's not worth setting up
 154          * the line-clear code.  */
 155         cmp     count, #128
 156         b.lt    L(not_short)
 157 #ifdef MAYBE_VIRT
 158         /* For efficiency when virtualized, we cache the ZVA capability.  */
     /* Encoding of the cached word at L(cache_clear):
          0          - not probed yet (the word lives in .bss)
          bit 31 set - DC ZVA unavailable (stored as all ones)
          otherwise  - DC ZVA block length in bytes.  */
 159         adrp    tmp2, L(cache_clear)
 160         ldr     zva_len, [tmp2, #:lo12:L(cache_clear)]
 161         tbnz    zva_len, #31, L(not_short)      /* Cached: unavailable.  */
 162         cbnz    zva_len, L(zero_by_line)        /* Cached: length known.  */
 163         mrs     tmp1, dczid_el0         /* Nothing cached: read the ID register.  */
 164         tbz     tmp1, #4, 1f            /* Bit 4 clear: DC ZVA may be used.  */
 165         /* ZVA not available.  Remember this for next time.  */
 166         mov     zva_len, #~0
 167         str     zva_len, [tmp2, #:lo12:L(cache_clear)]
 168         b       L(not_short)
 169 1:
 170         mov     tmp3w, #4
 171         and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
 172         lsl     zva_len, tmp3w, zva_len /* zva_len = 4 << field: block length in bytes.  */
 173         str     zva_len, [tmp2, #:lo12:L(cache_clear)]
 174 #else
 175         mrs     tmp1, dczid_el0
 176         tbnz    tmp1, #4, L(not_short)  /* Bit 4 set: DC ZVA must not be used.  */
 177         mov     tmp3w, #4
 178         and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
 179         lsl     zva_len, tmp3w, zva_len /* zva_len = 4 << field: block length in bytes.  */
 180 #endif
 181
 182 L(zero_by_line):
 183         /* Compute how far we need to go to become suitably aligned.  We're
 184          * already at quad-word alignment.  */
 185         cmp     count, zva_len_x
 186         b.lt    L(not_short)            /* Not enough to reach alignment.  */
 187         sub     zva_bits_x, zva_len_x, #1       /* Mask; the length is a power of two.  */
 188         neg     tmp2, dst
 189         ands    tmp2, tmp2, zva_bits_x  /* tmp2 = bytes up to the next block boundary.  */
 190         b.eq    1f                      /* Already aligned.  */
 191         /* Not aligned, check that there's enough to zero after alignment.  */
 192         sub     tmp1, count, tmp2
     /* If tmp1 < 64 the CCMP forces N=1,V=0 so the b.lt is taken; otherwise
        it compares tmp1 with the block length.  */
 193         cmp     tmp1, #64
 194         ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
 195         b.lt    L(not_short)            /* tmp1 < 64 or tmp1 < block length.  */
 196         /* We know that there's at least 64 bytes to zero and that it's safe
 197          * to overrun by 64 bytes.  */
 198         mov     count, tmp1
 199 2:
 200         stp     A_l, A_l, [dst]
 201         stp     A_l, A_l, [dst, #16]
 202         stp     A_l, A_l, [dst, #32]
 203         subs    tmp2, tmp2, #64
 204         stp     A_l, A_l, [dst, #48]
 205         add     dst, dst, #64
 206         b.ge    2b                      /* Flags are from the SUBS; STP/ADD do not change them.  */
 207         /* We've overrun a bit, so adjust dst downwards.  */
 208         add     dst, dst, tmp2          /* tmp2 is negative here.  */
 209 1:
 210         sub     count, count, zva_len_x /* Bias COUNT so the loop ends when it turns negative.  */
 211 3:
 212         dc      zva, dst                /* Zero one whole block at DST.  */
 213         add     dst, dst, zva_len_x
 214         subs    count, count, zva_len_x
 215         b.ge    3b
 216         ands    count, count, zva_bits_x        /* Residue = low bits of the biased COUNT.  */
 217         b.ne    L(tail_maybe_long)      /* A_l is still zero for the tail stores.  */
 218         RET
 219 #ifdef MAYBE_VIRT
 220         .bss
 221         .p2align 2
 222 L(cache_clear):
 223         .space 4
     /* Switch back to the section that was current before .bss.  Without
        this, END and the alias directives below are assembled with the
        location counter in .bss.  NOTE(review): END comes from <sysdep.h>
        (not visible here) and is assumed to derive the symbol size from
        the current location, which must lie in the function's section.  */
             .previous
 224 #endif
 225 #endif /* DONT_USE_DC */
 226
 227 END (__memset)
     /* NOTE(review): both macros below are defined outside this file; by
        their names they publish the routine as `memset'.  */
 228 weak_alias (__memset, memset)
 229 libc_hidden_builtin_def (memset)