sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2022 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
  27 #define dstin   x0      /* Destination argument; no instruction in this file writes it.  */
  28 #define src     x1      /* Source argument; also used as a moving load pointer.  */
  29 #define count   x2      /* Byte count argument; also used as the loop counter.  */
  30 #define dst     x3      /* Working destination pointer for the forward loop.  */
  31 #define srcend  x4      /* src + count, i.e. one past the last source byte.  */
  32 #define dstend  x5      /* dstin + count, i.e. one past the last destination byte.  */
  33 #define A_l     x6      /* A..H are 16-byte data temporaries held as register pairs.  */
  34 #define A_lw    w6      /* 32-bit view of A_l, used for 4-byte and 1-byte accesses.  */
  35 #define A_h     x7
  36 #define B_l     x8
  37 #define B_lw    w8      /* 32-bit view of B_l.  */
  38 #define B_h     x9
  39 #define C_l     x10
  40 #define C_lw    w10     /* 32-bit view of C_l.  */
  41 #define C_h     x11
  42 #define D_l     x12
  43 #define D_h     x13
  44 #define E_l     x14     /* Same register as tmp1 below.  */
  45 #define E_h     x15
  46 #define F_l     x16
  47 #define F_h     x17
  48 #define G_l     count   /* G and H reuse x2, x3, x1 and x4: loading them  */
  49 #define G_h     dst     /* destroys count, dst, src and srcend, so they may  */
  50 #define H_l     src     /* only be used once those values are no longer  */
  51 #define H_h     srcend  /* needed.  */
  52 #define tmp1    x14     /* Scratch register; same register as E_l above.  */
  53
  54 #ifndef MEMMOVE          /* Keep a MEMMOVE name that was defined before this point.  */
  55 # define MEMMOVE memmove /* Otherwise the second entry point below is named memmove.  */
  56 #endif
  57 #ifndef MEMCPY           /* Keep a MEMCPY name that was defined before this point.  */
  58 # define MEMCPY memcpy   /* Otherwise the first entry point below is named memcpy.  */
  59 #endif
  60
  61 /* This implementation supports both memcpy and memmove and shares most code.
  62    It uses unaligned accesses and branchless sequences to keep the code small
  63    and simple and to improve performance.
  64
  65    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  66    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  67    check in memmove is negligible since it is only required for large copies.
  68
  69    Large copies use a software pipelined loop processing 64 bytes per
  70    iteration.  The destination pointer is 16-byte aligned to minimize
  71    unaligned accesses.  The loop tail is handled by always copying 64 bytes
  72    from the end.
  73 */
  74
  75 ENTRY_ALIGN (MEMCPY, 6)              /* In: dstin (x0), src (x1), count (x2).  x0 is never written, so it still holds dstin at every ret.  */
  76         PTR_ARG (0)                  /* PTR_ARG and SIZE_ARG are macros that are not defined in this file;  */
  77         PTR_ARG (1)                  /* presumably they normalise the three argument registers -- confirm  */
  78         SIZE_ARG (2)                 /* against their definition.  */
  79
  80         add     srcend, src, count   /* srcend = one past the last source byte.  */
  81         add     dstend, dstin, count /* dstend = one past the last destination byte.  */
  82         cmp     count, 128
  83         b.hi    L(copy_long)         /* More than 128 bytes (unsigned): pipelined loop.  */
  84         cmp     count, 32
  85         b.hi    L(copy32_128)        /* 33..128 bytes.  */
  86
  87         /* Small copies: 0..32 bytes.  */
  88         cmp     count, 16
  89         b.lo    L(copy16)            /* Fewer than 16 bytes.  */
  90         ldp     A_l, A_h, [src]      /* 16..32 bytes: first 16 bytes ...  */
  91         ldp     D_l, D_h, [srcend, -16]  /* ... and last 16 bytes; the two ranges overlap when count is below 32.  */
  92         stp     A_l, A_h, [dstin]    /* Both loads are done before either store.  */
  93         stp     D_l, D_h, [dstend, -16]
  94         ret
  95
  96         /* Copy 8-15 bytes.  */
  97 L(copy16):
  98         tbz     count, 3, L(copy8)   /* count is below 16 here, so bit 3 clear means count is below 8.  */
  99         ldr     A_l, [src]           /* First 8 bytes.  */
 100         ldr     A_h, [srcend, -8]    /* Last 8 bytes; overlaps the first 8 when count is below 16.  */
 101         str     A_l, [dstin]
 102         str     A_h, [dstend, -8]
 103         ret
 104
 105         .p2align 3                   /* Align the next label to an 8-byte boundary.  */
 106         /* Copy 4-7 bytes.  */
 107 L(copy8):
 108         tbz     count, 2, L(copy4)   /* count is below 8 here, so bit 2 clear means count is below 4.  */
 109         ldr     A_lw, [src]          /* First 4 bytes.  */
 110         ldr     B_lw, [srcend, -4]   /* Last 4 bytes; overlaps the first 4 when count is below 8.  */
 111         str     A_lw, [dstin]
 112         str     B_lw, [dstend, -4]
 113         ret
 114
 115         /* Copy 0..3 bytes using a branchless sequence.  */
 116 L(copy4):
 117         cbz     count, L(copy0)      /* Nothing to do for a zero count.  */
 118         lsr     tmp1, count, 1       /* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3.  */
 119         ldrb    A_lw, [src]          /* First byte.  */
 120         ldrb    C_lw, [srcend, -1]   /* Last byte.  */
 121         ldrb    B_lw, [src, tmp1]    /* Byte at index count / 2; together the three cover counts 1, 2 and 3.  */
 122         strb    A_lw, [dstin]
 123         strb    B_lw, [dstin, tmp1]
 124         strb    C_lw, [dstend, -1]
 125 L(copy0):
 126         ret
 127
 128         .p2align 4                   /* Align the next label to a 16-byte boundary.  */
 129         /* Medium copies: 33..128 bytes.  */
 130 L(copy32_128):
 131         ldp     A_l, A_h, [src]      /* A, B = first 32 bytes.  */
 132         ldp     B_l, B_h, [src, 16]
 133         ldp     C_l, C_h, [srcend, -32]  /* C, D = last 32 bytes.  */
 134         ldp     D_l, D_h, [srcend, -16]
 135         cmp     count, 64
 136         b.hi    L(copy128)           /* More than 64 bytes needs further loads.  */
 137         stp     A_l, A_h, [dstin]    /* 33..64 bytes: the two 32-byte halves overlap when count is below 64.  */
 138         stp     B_l, B_h, [dstin, 16]
 139         stp     C_l, C_h, [dstend, -32]
 140         stp     D_l, D_h, [dstend, -16]
 141         ret
 142
 143         .p2align 4                   /* Align the next label to a 16-byte boundary.  */
 144         /* Copy 65..128 bytes.  */
 145 L(copy128):
 146         ldp     E_l, E_h, [src, 32]  /* E, F = source bytes 32..63.  */
 147         ldp     F_l, F_h, [src, 48]
 148         cmp     count, 96
 149         b.ls    L(copy96)            /* 96 bytes or fewer: A, B, E, F plus C, D already cover everything.  */
 150         ldp     G_l, G_h, [srcend, -64]  /* G overwrites count and dst; count was last read by the cmp above.  */
 151         ldp     H_l, H_h, [srcend, -48]  /* H overwrites src and srcend; neither is read after this load.  */
 152         stp     G_l, G_h, [dstend, -64]
 153         stp     H_l, H_h, [dstend, -48]
 154 L(copy96):
 155         stp     A_l, A_h, [dstin]    /* First 64 bytes.  */
 156         stp     B_l, B_h, [dstin, 16]
 157         stp     E_l, E_h, [dstin, 32]
 158         stp     F_l, F_h, [dstin, 48]
 159         stp     C_l, C_h, [dstend, -32]  /* Last 32 bytes.  */
 160         stp     D_l, D_h, [dstend, -16]
 161         ret
 162
 163         .p2align 4                   /* Align the next label to a 16-byte boundary.  */
 164         /* Copy more than 128 bytes.  */
 165 L(copy_long):
 166         /* Copy 16 bytes and then align dst to 16-byte alignment.  */
 167         ldp     D_l, D_h, [src]      /* D = first 16 source bytes.  */
 168         and     tmp1, dstin, 15      /* tmp1 = offset of dstin within its 16-byte block.  */
 169         bic     dst, dstin, 15       /* dst = dstin rounded down to a 16-byte boundary.  */
 170         sub     src, src, tmp1       /* Move src back by the same amount so src and dst stay in step.  */
 171         add     count, count, tmp1      /* Count is now 16 too large.  */
 172         ldp     A_l, A_h, [src, 16]  /* A..D are primed with the 64 bytes destined for dst + 16 .. dst + 79.  */
 173         stp     D_l, D_h, [dstin]    /* First 16 bytes go to the original, possibly unaligned, dstin.  */
 174         ldp     B_l, B_h, [src, 32]
 175         ldp     C_l, C_h, [src, 48]
 176         ldp     D_l, D_h, [src, 64]! /* Pre-index with writeback: src += 64, then load.  */
 177         subs    count, count, 128 + 16  /* Test and readjust count.  */
 178         b.ls    L(copy64_from_end)   /* Taken when the adjusted count was 144 or less: skip the loop.  */
 179
 180 L(loop64):                           /* Each pass stores the 64 bytes loaded previously and loads the next 64.  */
 181         stp     A_l, A_h, [dst, 16]
 182         ldp     A_l, A_h, [src, 16]
 183         stp     B_l, B_h, [dst, 32]
 184         ldp     B_l, B_h, [src, 32]
 185         stp     C_l, C_h, [dst, 48]
 186         ldp     C_l, C_h, [src, 48]
 187         stp     D_l, D_h, [dst, 64]! /* Writeback: dst += 64.  */
 188         ldp     D_l, D_h, [src, 64]! /* Writeback: src += 64.  */
 189         subs    count, count, 64
 190         b.hi    L(loop64)            /* Repeat while count was above 64 before the subtraction.  */
 191
 192         /* Write the last iteration and copy 64 bytes from the end.  */
 193 L(copy64_from_end):
 194         ldp     E_l, E_h, [srcend, -64]  /* E_l is the same register as tmp1, which is not read again on this path.  */
 195         stp     A_l, A_h, [dst, 16]  /* Each of A, B, C is stored before it is reloaded with tail data.  */
 196         ldp     A_l, A_h, [srcend, -48]
 197         stp     B_l, B_h, [dst, 32]
 198         ldp     B_l, B_h, [srcend, -32]
 199         stp     C_l, C_h, [dst, 48]
 200         ldp     C_l, C_h, [srcend, -16]
 201         stp     D_l, D_h, [dst, 64]
 202         stp     E_l, E_h, [dstend, -64]  /* Final 64 bytes, addressed from the end of the destination.  */
 203         stp     A_l, A_h, [dstend, -48]
 204         stp     B_l, B_h, [dstend, -32]
 205         stp     C_l, C_h, [dstend, -16]
 206         ret
 207
 208 END (MEMCPY)
 209 libc_hidden_builtin_def (MEMCPY)
 210
 211 ENTRY_ALIGN (MEMMOVE, 4)             /* In: dstin (x0), src (x1), count (x2).  x0 is never written, so it still holds dstin at every ret.  */
 212         PTR_ARG (0)                  /* PTR_ARG and SIZE_ARG are macros that are not defined in this file;  */
 213         PTR_ARG (1)                  /* presumably they normalise the three argument registers -- confirm  */
 214         SIZE_ARG (2)                 /* against their definition.  */
 215
 216         add     srcend, src, count   /* srcend = one past the last source byte.  */
 217         add     dstend, dstin, count /* dstend = one past the last destination byte.  */
 218         cmp     count, 128
 219         b.hi    L(move_long)         /* Only copies above 128 bytes need the overlap check.  */
 220         cmp     count, 32
 221         b.hi    L(copy32_128)        /* Label in the MEMCPY body; that path does all its loads before any store.  */
 222
 223         /* Small copies: 0..32 bytes.  */
 224         cmp     count, 16
 225         b.lo    L(copy16)            /* Label in the MEMCPY body; each sub-case there loads everything before storing.  */
 226         ldp     A_l, A_h, [src]      /* 16..32 bytes: first 16 bytes ...  */
 227         ldp     D_l, D_h, [srcend, -16]  /* ... and last 16 bytes, both loaded before either store.  */
 228         stp     A_l, A_h, [dstin]
 229         stp     D_l, D_h, [dstend, -16]
 230         ret
 231
 232         .p2align 4                   /* Align the next label to a 16-byte boundary.  */
 233 L(move_long):
 234         /* Only use backward copy if there is an overlap.  */
 235         sub     tmp1, dstin, src     /* tmp1 = dstin - src; wraps to a large unsigned value when dstin is below src.  */
 236         cbz     tmp1, L(copy0)       /* dstin equals src: nothing to move; L(copy0) is a bare ret in the MEMCPY body.  */
 237         cmp     tmp1, count
 238         b.hs    L(copy_long)         /* Unsigned tmp1 at or above count: dstin is outside [src, srcend), use the forward copy.  */
 239
 240         /* Large backwards copy for overlapping copies.
 241            Copy 16 bytes and then align dst to 16-byte alignment.  */
 242         ldp     D_l, D_h, [srcend, -16]  /* D = last 16 source bytes.  */
 243         and     tmp1, dstend, 15     /* tmp1 = offset of dstend within its 16-byte block.  */
 244         sub     srcend, srcend, tmp1 /* Lower srcend by that offset ...  */
 245         sub     count, count, tmp1   /* ... and shrink count to match.  */
 246         ldp     A_l, A_h, [srcend, -16]  /* A..D are primed with the 64 bytes that end at the adjusted srcend.  */
 247         stp     D_l, D_h, [dstend, -16]  /* dstend is still unadjusted here, so this writes the true last 16 bytes.  */
 248         ldp     B_l, B_h, [srcend, -32]
 249         ldp     C_l, C_h, [srcend, -48]
 250         ldp     D_l, D_h, [srcend, -64]! /* Pre-index with writeback: srcend -= 64, then load.  */
 251         sub     dstend, dstend, tmp1 /* dstend is now a multiple of 16.  */
 252         subs    count, count, 128
 253         b.ls    L(copy64_from_start) /* Taken when the adjusted count was 128 or less: skip the loop.  */
 254
 255 L(loop64_backwards):                 /* Each pass stores the 64 bytes loaded previously and loads the 64 below them.  */
 256         stp     A_l, A_h, [dstend, -16]
 257         ldp     A_l, A_h, [srcend, -16]
 258         stp     B_l, B_h, [dstend, -32]
 259         ldp     B_l, B_h, [srcend, -32]
 260         stp     C_l, C_h, [dstend, -48]
 261         ldp     C_l, C_h, [srcend, -48]
 262         stp     D_l, D_h, [dstend, -64]! /* Writeback: dstend -= 64.  */
 263         ldp     D_l, D_h, [srcend, -64]! /* Writeback: srcend -= 64.  */
 264         subs    count, count, 64
 265         b.hi    L(loop64_backwards)  /* Repeat while count was above 64 before the subtraction.  */
 266
 267         /* Write the last iteration and copy 64 bytes from the start.  */
 268 L(copy64_from_start):
 269         ldp     G_l, G_h, [src, 48]  /* G overwrites count and dst; neither is read again.  src is unchanged on this path.  */
 270         stp     A_l, A_h, [dstend, -16]  /* Each of A, B, C is stored before it is reloaded with head data.  */
 271         ldp     A_l, A_h, [src, 32]
 272         stp     B_l, B_h, [dstend, -32]
 273         ldp     B_l, B_h, [src, 16]
 274         stp     C_l, C_h, [dstend, -48]
 275         ldp     C_l, C_h, [src]
 276         stp     D_l, D_h, [dstend, -64]
 277         stp     G_l, G_h, [dstin, 48]    /* First 64 bytes, addressed from the start of the destination.  */
 278         stp     A_l, A_h, [dstin, 32]
 279         stp     B_l, B_h, [dstin, 16]
 280         stp     C_l, C_h, [dstin]
 281         ret
 282
 283 END (MEMMOVE)
 284 libc_hidden_builtin_def (MEMMOVE)