sysdeps/aarch64/memcpy.S

   1 /* Generic optimized memcpy using SIMD.
   2    Copyright (C) 2012-2024 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Assumptions:
  23  *
  24  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  25  *
  26  */
  27
  28 #define dstin   x0      /* Destination pointer argument; never written.  */
  29 #define src     x1      /* Source pointer argument.  */
  30 #define count   x2      /* Byte count argument; also bytes-left counter.  */
  31 #define dst     x3      /* Working destination in the forward long copy.  */
  32 #define srcend  x4      /* src + count.  */
  33 #define dstend  x5      /* dstin + count.  */
  34 #define A_l     x6      /* x6-x10: integer data for copies below 16 bytes.  */
  35 #define A_lw    w6
  36 #define A_h     x7
  37 #define B_l     x8      /* B_l is not referenced in the code visible here.  */
  38 #define B_lw    w8
  39 #define B_h     x9      /* B_h is not referenced in the code visible here.  */
  40 #define C_lw    w10
  41 #define tmp1    x14     /* Scratch: offsets, alignment remainders, deltas.  */
  42
  43 #define A_q     q0      /* q0-q7: 128-bit SIMD registers carrying the data.  */
  44 #define B_q     q1
  45 #define C_q     q2
  46 #define D_q     q3
  47 #define E_q     q4
  48 #define F_q     q5
  49 #define G_q     q6
  50 #define H_q     q7
  51
  52 #ifndef MEMMOVE         /* Entry-point names may be predefined to override.  */
  53 # define MEMMOVE memmove
  54 #endif
  55 #ifndef MEMCPY
  56 # define MEMCPY memcpy
  57 #endif
  58
  59 /* This implementation supports both memcpy and memmove and shares most code.
  60    It uses unaligned accesses and branchless sequences to keep the code small,
  61    simple and improve performance.
  62
  63    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  64    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  65    check in memmove is negligible since it is only required for large copies.
  66
  67    Large copies use a software pipelined loop processing 64 bytes per
  68    iteration.  The source pointer is 16-byte aligned to minimize
  69    unaligned accesses.  The loop tail is handled by always copying 64 bytes
  70    from the end.  */
  71
  72 ENTRY (MEMCPY)
             /* Copy count bytes from src to dstin.  dstin (x0) is never
                written, so it still holds the destination at every ret.
                L(copy16), L(copy32_128) and L(copy_long) below are also
                branch targets of MEMMOVE further down in this file.
                PTR_ARG and SIZE_ARG are macros from sysdep.h; presumably
                they normalize pointer and size arguments (e.g. for ILP32)
                -- confirm against that header.  */
  73         PTR_ARG (0)
  74         PTR_ARG (1)
  75         SIZE_ARG (2)
  76
  77         add     srcend, src, count      /* One past the last source byte.  */
  78         add     dstend, dstin, count    /* One past the last dest byte.  */
  79         cmp     count, 128
  80         b.hi    L(copy_long)
  81         cmp     count, 32
  82         b.hi    L(copy32_128)
  83
  84         /* Small copies: 0..32 bytes.  */
  85         cmp     count, 16
  86         b.lo    L(copy16)
             /* 16..32 bytes: copy the first and the last 16 bytes.  The two
                halves overlap when count < 32; both loads precede both
                stores.  */
  87         ldr     A_q, [src]
  88         ldr     B_q, [srcend, -16]
  89         str     A_q, [dstin]
  90         str     B_q, [dstend, -16]
  91         ret
  92
  93         /* Copy 8-15 bytes.  */
  94 L(copy16):
             /* count < 16 here.  Bit 3 set means 8..15 bytes: copy the first
                and the last 8 bytes, which may overlap.  */
  95         tbz     count, 3, L(copy8)
  96         ldr     A_l, [src]
  97         ldr     A_h, [srcend, -8]
  98         str     A_l, [dstin]
  99         str     A_h, [dstend, -8]
 100         ret
 101
 102         /* Copy 4-7 bytes.  */
 103 L(copy8):
             /* count < 8 here.  Bit 2 set means 4..7 bytes: copy the first
                and the last 4 bytes, which may overlap.  */
 104         tbz     count, 2, L(copy4)
 105         ldr     A_lw, [src]
 106         ldr     B_lw, [srcend, -4]
 107         str     A_lw, [dstin]
 108         str     B_lw, [dstend, -4]
 109         ret
 110
 111         /* Copy 0..3 bytes using a branchless sequence.  */
 112 L(copy4):
             /* After the zero check, three byte copies cover every case:
                the first byte, the byte at offset count / 2 and the last
                byte.  That is offsets {0, 0, 0} for count 1, {0, 1, 1} for
                count 2 and {0, 1, 2} for count 3.  */
 113         cbz     count, L(copy0)
 114         lsr     tmp1, count, 1
 115         ldrb    A_lw, [src]
 116         ldrb    C_lw, [srcend, -1]
 117         ldrb    B_lw, [src, tmp1]
 118         strb    A_lw, [dstin]
 119         strb    B_lw, [dstin, tmp1]
 120         strb    C_lw, [dstend, -1]
 121 L(copy0):
 122         ret
 123
 124         .p2align 4
 125         /* Medium copies: 33..128 bytes.  */
 126 L(copy32_128):
             /* Load the first and the last 32 bytes up front.  For
                count <= 64 these two blocks (which may overlap) already
                cover the whole copy.  */
 127         ldp     A_q, B_q, [src]
 128         ldp     C_q, D_q, [srcend, -32]
 129         cmp     count, 64
 130         b.hi    L(copy128)
 131         stp     A_q, B_q, [dstin]
 132         stp     C_q, D_q, [dstend, -32]
 133         ret
 134
 135         .p2align 4
 136         /* Copy 65..128 bytes.  */
 137 L(copy128):
             /* E_q/F_q take bytes 32..63.  Above 96 bytes the 32 bytes at
                [srcend - 64] are copied through G_q/H_q as well.  Every
                load on this path is issued before the first store.  */
 138         ldp     E_q, F_q, [src, 32]
 139         cmp     count, 96
 140         b.ls    L(copy96)
 141         ldp     G_q, H_q, [srcend, -64]
 142         stp     G_q, H_q, [dstend, -64]
 143 L(copy96):
 144         stp     A_q, B_q, [dstin]
 145         stp     E_q, F_q, [dstin, 32]
 146         stp     C_q, D_q, [dstend, -32]
 147         ret
 148
 149         /* Align loop64 below to 16 bytes.  */
 150         nop
 151
 152         /* Copy more than 128 bytes.  */
 153 L(copy_long):
 154         /* Copy 16 bytes and then align src to 16-byte alignment.  */
 155         ldr     D_q, [src]
 156         and     tmp1, src, 15           /* tmp1 = misalignment of src.  */
 157         bic     src, src, 15            /* Round src down to 16 bytes.  */
 158         sub     dst, dstin, tmp1        /* Keep dst in step with src.  */
 159         add     count, count, tmp1      /* Count is now 16 too large.  */
 160         ldp     A_q, B_q, [src, 16]
 161         str     D_q, [dstin]
 162         ldp     C_q, D_q, [src, 48]
 163         subs    count, count, 128 + 16  /* Test and readjust count.  */
 164         b.ls    L(copy64_from_end)
             /* Loop invariant: A_q..D_q hold the 64 bytes loaded from
                [src + 16] that are still to be stored at [dst + 16].  Each
                iteration stores them, loads the next 64 bytes and advances
                both pointers by 64.  */
 165 L(loop64):
 166         stp     A_q, B_q, [dst, 16]
 167         ldp     A_q, B_q, [src, 80]
 168         stp     C_q, D_q, [dst, 48]
 169         ldp     C_q, D_q, [src, 112]
 170         add     src, src, 64
 171         add     dst, dst, 64
 172         subs    count, count, 64
 173         b.hi    L(loop64)
 174
 175         /* Write the last iteration and copy 64 bytes from the end.
                The final 64 bytes are addressed from srcend/dstend and may
                overlap bytes already written.  A_q/B_q are reused for the
                last 32 bytes once their pending contents are stored.  */
 176 L(copy64_from_end):
 177         ldp     E_q, F_q, [srcend, -64]
 178         stp     A_q, B_q, [dst, 16]
 179         ldp     A_q, B_q, [srcend, -32]
 180         stp     C_q, D_q, [dst, 48]
 181         stp     E_q, F_q, [dstend, -64]
 182         stp     A_q, B_q, [dstend, -32]
 183         ret
 184
 185 END (MEMCPY)
 186 libc_hidden_builtin_def (MEMCPY)
 187
 188
 189 ENTRY (MEMMOVE)
             /* Copy count bytes from src to dstin where the two regions may
                overlap.  dstin (x0) is never written, so it still holds the
                destination at every ret.  Copies of up to 128 bytes reuse
                the MEMCPY paths L(copy16) and L(copy32_128), which load all
                data before storing any of it.  Larger copies go through
                L(move_long) to choose a copy direction.
                PTR_ARG and SIZE_ARG are macros from sysdep.h; presumably
                they normalize pointer and size arguments (e.g. for ILP32)
                -- confirm against that header.  */
 190         PTR_ARG (0)
 191         PTR_ARG (1)
 192         SIZE_ARG (2)
 193
 194         add     srcend, src, count      /* One past the last source byte.  */
 195         add     dstend, dstin, count    /* One past the last dest byte.  */
 196         cmp     count, 128
 197         b.hi    L(move_long)
 198         cmp     count, 32
 199         b.hi    L(copy32_128)
 200
 201         /* Small moves: 0..32 bytes.  */
 202         cmp     count, 16
 203         b.lo    L(copy16)
             /* 16..32 bytes: both 16-byte loads precede both stores, so
                overlapping buffers are handled.  */
 204         ldr     A_q, [src]
 205         ldr     B_q, [srcend, -16]
 206         str     A_q, [dstin]
 207         str     B_q, [dstend, -16]
 208         ret
 209
 210 L(move_long):
 211         /* Only use backward copy if there is an overlap.
                tmp1 = dstin - src.  Zero means the buffers coincide and
                there is nothing to do.  An unsigned tmp1 >= count means
                dstin does not lie inside (src, src + count), so the forward
                copy at L(copy_long) is used.  */
 212         sub     tmp1, dstin, src
 213         cbz     tmp1, L(move0)
 214         cmp     tmp1, count
 215         b.hs    L(copy_long)
 216
 217         /* Large backwards copy for overlapping copies.
 218            Copy 16 bytes and then align srcend to 16-byte alignment.  */
 219 L(copy_long_backwards):
 220         ldr     D_q, [srcend, -16]
 221         and     tmp1, srcend, 15        /* tmp1 = misalignment of srcend.  */
 222         bic     srcend, srcend, 15      /* Round srcend down to 16 bytes.  */
 223         sub     count, count, tmp1      /* Bytes below the aligned srcend.  */
 224         ldp     A_q, B_q, [srcend, -32]
 225         str     D_q, [dstend, -16]      /* Uses the unadjusted dstend.  */
 226         ldp     C_q, D_q, [srcend, -64]
 227         sub     dstend, dstend, tmp1    /* Keep dstend in step with srcend.  */
 228         subs    count, count, 128
 229         b.ls    L(copy64_from_start)
 230
             /* Loop invariant: A_q..D_q hold the 64 bytes loaded from just
                below srcend that are still to be stored just below dstend.
                Each iteration stores them, loads the next 64 bytes further
                down and moves both pointers down by 64.  */
 231 L(loop64_backwards):
 232         str     B_q, [dstend, -16]
 233         str     A_q, [dstend, -32]
 234         ldp     A_q, B_q, [srcend, -96]
 235         str     D_q, [dstend, -48]
 236         str     C_q, [dstend, -64]!     /* Pre-index: dstend -= 64 as well.  */
 237         ldp     C_q, D_q, [srcend, -128]
 238         sub     srcend, srcend, 64
 239         subs    count, count, 64
 240         b.hi    L(loop64_backwards)
 241
 242         /* Write the last iteration and copy 64 bytes from the start.
                The first 64 bytes are addressed from src/dstin and may
                overlap bytes already written.  A_q/B_q are reused for the
                first 32 bytes once their pending contents are stored.  */
 243 L(copy64_from_start):
 244         ldp     E_q, F_q, [src, 32]
 245         stp     A_q, B_q, [dstend, -32]
 246         ldp     A_q, B_q, [src]
 247         stp     C_q, D_q, [dstend, -64]
 248         stp     E_q, F_q, [dstin, 32]
 249         stp     A_q, B_q, [dstin]
 250 L(move0):
 251         ret
 252
 253 END (MEMMOVE)
 254 libc_hidden_builtin_def (MEMMOVE)