sysdeps/aarch64/memmove.S

   1 /* Copyright (C) 2012-2015 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64
  24  * Unaligned accesses
  25  */
  26
  27 /* Parameters and result.  */
  28 #define dstin   x0
  29 #define src     x1
  30 #define count   x2
  31 #define tmp1    x3
  32 #define tmp1w   w3
  33 #define tmp2    x4
  34 #define tmp2w   w4
  35 #define tmp3    x5
  36 #define tmp3w   w5
  37 #define dst     x6
  38
  39 #define A_l     x7
  40 #define A_h     x8
  41 #define B_l     x9
  42 #define B_h     x10
  43 #define C_l     x11
  44 #define C_h     x12
  45 #define D_l     x13
  46 #define D_h     x14
  47
  48 ENTRY_ALIGN (memmove, 6)
  49
  50         cmp     dstin, src
  51         b.lo    L(downwards)
  52         add     tmp1, src, count
  53         cmp     dstin, tmp1
  54         b.hs    memcpy          /* No overlap.  */
  55
  56         /* Upwards move with potential overlap.
  57          * Need to move from the tail backwards.  SRC and DST point one
  58          * byte beyond the remaining data to move.  */
  59         add     dst, dstin, count
  60         add     src, src, count
  61         cmp     count, #64
  62         b.ge    L(mov_not_short_up)
  63
  64         /* Deal with small moves quickly by dropping straight into the
  65          * exit block.  */
  66 L(tail63up):
  67         /* Move up to 48 bytes of data.  At this point we only need the
  68          * bottom 6 bits of count to be accurate.  */
  69         ands    tmp1, count, #0x30
  70         b.eq    L(tail15up)
  71         sub     dst, dst, tmp1
  72         sub     src, src, tmp1
  73         cmp     tmp1w, #0x20
  74         b.eq    1f
  75         b.lt    2f
  76         ldp     A_l, A_h, [src, #32]
  77         stp     A_l, A_h, [dst, #32]
  78 1:
  79         ldp     A_l, A_h, [src, #16]
  80         stp     A_l, A_h, [dst, #16]
  81 2:
  82         ldp     A_l, A_h, [src]
  83         stp     A_l, A_h, [dst]
  84 L(tail15up):
  85         /* Move up to 15 bytes of data.  Does not assume additional data
  86          * being moved.  */
  87         tbz     count, #3, 1f
  88         ldr     tmp1, [src, #-8]!
  89         str     tmp1, [dst, #-8]!
  90 1:
  91         tbz     count, #2, 1f
  92         ldr     tmp1w, [src, #-4]!
  93         str     tmp1w, [dst, #-4]!
  94 1:
  95         tbz     count, #1, 1f
  96         ldrh    tmp1w, [src, #-2]!
  97         strh    tmp1w, [dst, #-2]!
  98 1:
  99         tbz     count, #0, 1f
 100         ldrb    tmp1w, [src, #-1]
 101         strb    tmp1w, [dst, #-1]
 102 1:
 103         RET
 104
 105 L(mov_not_short_up):
 106         /* We don't much care about the alignment of DST, but we want SRC
 107          * to be 128-bit (16 byte) aligned so that we don't cross cache line
 108          * boundaries on both loads and stores.  */
 109         ands    tmp2, src, #15          /* Bytes to reach alignment.  */
 110         b.eq    2f
 111         sub     count, count, tmp2
 112         /* Move enough data to reach alignment; unlike memcpy, we have to
 113          * be aware of the overlap, which means we can't move data twice.  */
 114         tbz     tmp2, #3, 1f
 115         ldr     tmp1, [src, #-8]!
 116         str     tmp1, [dst, #-8]!
 117 1:
 118         tbz     tmp2, #2, 1f
 119         ldr     tmp1w, [src, #-4]!
 120         str     tmp1w, [dst, #-4]!
 121 1:
 122         tbz     tmp2, #1, 1f
 123         ldrh    tmp1w, [src, #-2]!
 124         strh    tmp1w, [dst, #-2]!
 125 1:
 126         tbz     tmp2, #0, 1f
 127         ldrb    tmp1w, [src, #-1]!
 128         strb    tmp1w, [dst, #-1]!
 129 1:
 130
 131         /* There may be less than 63 bytes to go now.  */
 132         cmp     count, #63
 133         b.le    L(tail63up)
 134 2:
 135         subs    count, count, #128
 136         b.ge    L(mov_body_large_up)
 137         /* Less than 128 bytes to move, so handle 64 here and then jump
 138          * to the tail.  */
 139         ldp     A_l, A_h, [src, #-64]!
 140         ldp     B_l, B_h, [src, #16]
 141         ldp     C_l, C_h, [src, #32]
 142         ldp     D_l, D_h, [src, #48]
 143         stp     A_l, A_h, [dst, #-64]!
 144         stp     B_l, B_h, [dst, #16]
 145         stp     C_l, C_h, [dst, #32]
 146         stp     D_l, D_h, [dst, #48]
 147         tst     count, #0x3f
 148         b.ne    L(tail63up)
 149         RET
 150
 151         /* Critical loop.  Start at a new Icache line boundary.  Assuming
 152          * 64 bytes per line this ensures the entire loop is in one line.  */
 153         .p2align 6
 154 L(mov_body_large_up):
 155         /* There are at least 128 bytes to move.  */
 156         ldp     A_l, A_h, [src, #-16]
 157         ldp     B_l, B_h, [src, #-32]
 158         ldp     C_l, C_h, [src, #-48]
 159         ldp     D_l, D_h, [src, #-64]!
 160 1:
 161         stp     A_l, A_h, [dst, #-16]
 162         ldp     A_l, A_h, [src, #-16]
 163         stp     B_l, B_h, [dst, #-32]
 164         ldp     B_l, B_h, [src, #-32]
 165         stp     C_l, C_h, [dst, #-48]
 166         ldp     C_l, C_h, [src, #-48]
 167         stp     D_l, D_h, [dst, #-64]!
 168         ldp     D_l, D_h, [src, #-64]!
 169         subs    count, count, #64
 170         b.ge    1b
 171         stp     A_l, A_h, [dst, #-16]
 172         stp     B_l, B_h, [dst, #-32]
 173         stp     C_l, C_h, [dst, #-48]
 174         stp     D_l, D_h, [dst, #-64]!
 175         tst     count, #0x3f
 176         b.ne    L(tail63up)
 177         RET
 178
 179 L(downwards):
 180         /* For a downwards move we can safely use memcpy provided that
 181          * DST is more than 16 bytes away from SRC.  */
 182         sub     tmp1, src, #16
 183         cmp     dstin, tmp1
 184         b.ls    memcpy          /* May overlap, but not critically.  */
 185
 186         mov     dst, dstin      /* Preserve DSTIN for return value.  */
 187         cmp     count, #64
 188         b.ge    L(mov_not_short_down)
 189
 190         /* Deal with small moves quickly by dropping straight into the
 191          * exit block.  */
 192 L(tail63down):
 193         /* Move up to 48 bytes of data.  At this point we only need the
 194          * bottom 6 bits of count to be accurate.  */
 195         ands    tmp1, count, #0x30
 196         b.eq    L(tail15down)
 197         add     dst, dst, tmp1
 198         add     src, src, tmp1
 199         cmp     tmp1w, #0x20
 200         b.eq    1f
 201         b.lt    2f
 202         ldp     A_l, A_h, [src, #-48]
 203         stp     A_l, A_h, [dst, #-48]
 204 1:
 205         ldp     A_l, A_h, [src, #-32]
 206         stp     A_l, A_h, [dst, #-32]
 207 2:
 208         ldp     A_l, A_h, [src, #-16]
 209         stp     A_l, A_h, [dst, #-16]
 210 L(tail15down):
 211         /* Move up to 15 bytes of data.  Does not assume additional data
 212            being moved.  */
 213         tbz     count, #3, 1f
 214         ldr     tmp1, [src], #8
 215         str     tmp1, [dst], #8
 216 1:
 217         tbz     count, #2, 1f
 218         ldr     tmp1w, [src], #4
 219         str     tmp1w, [dst], #4
 220 1:
 221         tbz     count, #1, 1f
 222         ldrh    tmp1w, [src], #2
 223         strh    tmp1w, [dst], #2
 224 1:
 225         tbz     count, #0, 1f
 226         ldrb    tmp1w, [src]
 227         strb    tmp1w, [dst]
 228 1:
 229         RET
 230
 231 L(mov_not_short_down):
 232         /* We don't much care about the alignment of DST, but we want SRC
 233          * to be 128-bit (16 byte) aligned so that we don't cross cache line
 234          * boundaries on both loads and stores.  */
 235         neg     tmp2, src
 236         ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
 237         b.eq    2f
 238         sub     count, count, tmp2
 239         /* Move enough data to reach alignment; unlike memcpy, we have to
 240          * be aware of the overlap, which means we can't move data twice.  */
 241         tbz     tmp2, #3, 1f
 242         ldr     tmp1, [src], #8
 243         str     tmp1, [dst], #8
 244 1:
 245         tbz     tmp2, #2, 1f
 246         ldr     tmp1w, [src], #4
 247         str     tmp1w, [dst], #4
 248 1:
 249         tbz     tmp2, #1, 1f
 250         ldrh    tmp1w, [src], #2
 251         strh    tmp1w, [dst], #2
 252 1:
 253         tbz     tmp2, #0, 1f
 254         ldrb    tmp1w, [src], #1
 255         strb    tmp1w, [dst], #1
 256 1:
 257
 258         /* There may be less than 63 bytes to go now.  */
 259         cmp     count, #63
 260         b.le    L(tail63down)
 261 2:
 262         subs    count, count, #128
 263         b.ge    L(mov_body_large_down)
 264         /* Less than 128 bytes to move, so handle 64 here and then jump
 265          * to the tail.  */
 266         ldp     A_l, A_h, [src]
 267         ldp     B_l, B_h, [src, #16]
 268         ldp     C_l, C_h, [src, #32]
 269         ldp     D_l, D_h, [src, #48]
 270         stp     A_l, A_h, [dst]
 271         stp     B_l, B_h, [dst, #16]
 272         stp     C_l, C_h, [dst, #32]
 273         stp     D_l, D_h, [dst, #48]
 274         tst     count, #0x3f
 275         add     src, src, #64
 276         add     dst, dst, #64
 277         b.ne    L(tail63down)
 278         RET
 279
 280         /* Critical loop.  Start at a new cache line boundary.  Assuming
 281          * 64 bytes per line this ensures the entire loop is in one line.  */
 282         .p2align 6
 283 L(mov_body_large_down):
 284         /* There are at least 128 bytes to move.  */
 285         ldp     A_l, A_h, [src, #0]
 286         sub     dst, dst, #16           /* Pre-bias.  */
 287         ldp     B_l, B_h, [src, #16]
 288         ldp     C_l, C_h, [src, #32]
 289         ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
 290 1:
 291         stp     A_l, A_h, [dst, #16]
 292         ldp     A_l, A_h, [src, #16]
 293         stp     B_l, B_h, [dst, #32]
 294         ldp     B_l, B_h, [src, #32]
 295         stp     C_l, C_h, [dst, #48]
 296         ldp     C_l, C_h, [src, #48]
 297         stp     D_l, D_h, [dst, #64]!
 298         ldp     D_l, D_h, [src, #64]!
 299         subs    count, count, #64
 300         b.ge    1b
 301         stp     A_l, A_h, [dst, #16]
 302         stp     B_l, B_h, [dst, #32]
 303         stp     C_l, C_h, [dst, #48]
 304         stp     D_l, D_h, [dst, #64]
 305         add     src, src, #16
 306         add     dst, dst, #64 + 16
 307         tst     count, #0x3f
 308         b.ne    L(tail63down)
 309         RET
 310 END (memmove)
 311
 312 libc_hidden_builtin_def (memmove)