sysdeps/aarch64/multiarch/memcpy_advsimd.S

   1 /* Generic optimized memcpy using SIMD.
   2    Copyright (C) 2020-2022 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Assumptions:
  23  *
  24  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  25  *
  26  */
  27
  28 #define dstin   x0      /* Destination start; only read, never written.  */
  29 #define src     x1      /* Source start; rounded down to 16 in L(copy_long).  */
  30 #define count   x2      /* Byte count; reused as the long-copy loop counter.  */
  31 #define dst     x3      /* Working destination pointer in L(copy_long).  */
  32 #define srcend  x4      /* Set to src + count on entry.  */
  33 #define dstend  x5      /* Set to dstin + count on entry.  */
  34 #define A_l     x6      /* 64-bit temporary (8..15 byte copy).  */
  35 #define A_lw    w6      /* 32-bit view of x6 (0..7 byte copies).  */
  36 #define A_h     x7      /* 64-bit temporary (8..15 byte copy).  */
  37 #define B_l     x8      /* Not referenced in the code visible here.  */
  38 #define B_lw    w8      /* 32-bit view of x8 (0..7 byte copies).  */
  39 #define B_h     x9      /* Not referenced in the code visible here.  */
  40 #define C_lw    w10     /* 32-bit temporary (0..3 byte copy).  */
  41 #define tmp1    x14     /* Scratch: offsets, misalignment, dstin - src.  */
  42
  43 #define A_q     q0      /* A_q..H_q: 128-bit SIMD data registers.  */
  44 #define B_q     q1
  45 #define C_q     q2
  46 #define D_q     q3
  47 #define E_q     q4
  48 #define F_q     q5
  49 #define G_q     q6      /* G_q/H_q are only used for 97..128 byte copies.  */
  50 #define H_q     q7
  51
  52
  53 /* This implementation supports both memcpy and memmove and shares most code.
  54    It uses unaligned accesses and branchless sequences to keep the code small,
  55    simple and improve performance.
  56
  57    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  58    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  59    check in memmove is negligible since it is only required for large copies.
  60
  61    Large copies use a software pipelined loop processing 64 bytes per
  62    iteration.  The source pointer is 16-byte aligned to minimize
  63    unaligned accesses.  The loop tail is handled by always copying 64 bytes
  64    from the end.  */
  65
  66 ENTRY (__memcpy_simd)
             /* Forward copy of count bytes from src to dstin.  dstin (x0) is
                never written below, so it still holds the destination start
                at every ret.  L(copy16), L(copy32_128) and L(copy_long) are
                also branch targets of __memmove_simd.

                PTR_ARG and SIZE_ARG are macros defined outside this file
                (presumably in <sysdep.h>); their expansion is not visible
                here.  */
  67         PTR_ARG (0)
  68         PTR_ARG (1)
  69         SIZE_ARG (2)
  70
  71         add     srcend, src, count      /* srcend = src + count.  */
  72         add     dstend, dstin, count    /* dstend = dstin + count.  */
  73         cmp     count, 128
  74         b.hi    L(copy_long)            /* count > 128.  */
  75         cmp     count, 32
  76         b.hi    L(copy32_128)           /* 33 <= count <= 128.  */
  77
  78         /* Small copies: 0..32 bytes.  */
  79         cmp     count, 16
  80         b.lo    L(copy16)               /* count < 16.  */
             /* 16..32 bytes: copy the first and the last 16 bytes; the two
                accesses overlap when count < 32.  Both loads are issued
                before either store.  */
  81         ldr     A_q, [src]
  82         ldr     B_q, [srcend, -16]
  83         str     A_q, [dstin]
  84         str     B_q, [dstend, -16]
  85         ret
  86
  87         /* Copy 8-15 bytes.  */
  88 L(copy16):
             /* count < 16 on every path here, so bit 3 clear means count < 8.  */
  89         tbz     count, 3, L(copy8)
  90         ldr     A_l, [src]              /* First 8 bytes.  */
  91         ldr     A_h, [srcend, -8]       /* Last 8 bytes (may overlap).  */
  92         str     A_l, [dstin]
  93         str     A_h, [dstend, -8]
  94         ret
  95
  96         /* Copy 4-7 bytes.  */
  97 L(copy8):
             /* count < 8 here, so bit 2 clear means count < 4.  */
  98         tbz     count, 2, L(copy4)
  99         ldr     A_lw, [src]             /* First 4 bytes.  */
 100         ldr     B_lw, [srcend, -4]      /* Last 4 bytes (may overlap).  */
 101         str     A_lw, [dstin]
 102         str     B_lw, [dstend, -4]
 103         ret
 104
 105         /* Copy 0..3 bytes using a branchless sequence.  */
 106 L(copy4):
 107         cbz     count, L(copy0)         /* Nothing to copy.  */
             /* Bytes at offsets 0, count / 2 and count - 1 cover every byte
                for count = 1, 2 and 3; some of them coincide for 1 and 2.  */
 108         lsr     tmp1, count, 1          /* tmp1 = count / 2.  */
 109         ldrb    A_lw, [src]
 110         ldrb    C_lw, [srcend, -1]
 111         ldrb    B_lw, [src, tmp1]
 112         strb    A_lw, [dstin]
 113         strb    B_lw, [dstin, tmp1]
 114         strb    C_lw, [dstend, -1]
 115 L(copy0):
 116         ret
 117
 118         .p2align 4
 119         /* Medium copies: 33..128 bytes.  */
 120 L(copy32_128):
 121         ldp     A_q, B_q, [src]         /* First 32 bytes.  */
 122         ldp     C_q, D_q, [srcend, -32] /* Last 32 bytes.  */
 123         cmp     count, 64
 124         b.hi    L(copy128)
             /* 33..64 bytes: first 32 plus last 32 bytes cover the range.  */
 125         stp     A_q, B_q, [dstin]
 126         stp     C_q, D_q, [dstend, -32]
 127         ret
 128
 129         .p2align 4
 130         /* Copy 65..128 bytes.  */
 131 L(copy128):
 132         ldp     E_q, F_q, [src, 32]     /* Source bytes 32..63.  */
 133         cmp     count, 96
 134         b.ls    L(copy96)               /* 65..96: first 64 + last 32.  */
             /* 97..128 bytes: also copy the 32 bytes ending at srcend - 32.
                Every load of this path has been issued before this first
                store.  */
 135         ldp     G_q, H_q, [srcend, -64]
 136         stp     G_q, H_q, [dstend, -64]
 137 L(copy96):
 138         stp     A_q, B_q, [dstin]
 139         stp     E_q, F_q, [dstin, 32]
 140         stp     C_q, D_q, [dstend, -32]
 141         ret
 142
 143         /* Align loop64 below to 16 bytes.  */
 144         nop
 145
 146         /* Copy more than 128 bytes.  */
 147 L(copy_long):
 148         /* Copy 16 bytes and then align src to 16-byte alignment.  */
 149         ldr     D_q, [src]              /* First 16 source bytes.  */
 150         and     tmp1, src, 15           /* tmp1 = src misalignment, 0..15.  */
 151         bic     src, src, 15            /* Round src down to 16 bytes.  */
 152         sub     dst, dstin, tmp1        /* Move dst back by the same amount.  */
 153         add     count, count, tmp1      /* Count is now 16 too large.  */
 154         ldp     A_q, B_q, [src, 16]
 155         str     D_q, [dstin]            /* Covers [dstin, dst + 16).  */
 156         ldp     C_q, D_q, [src, 48]     /* A_q..D_q = src + 16 .. src + 79.  */
 157         subs    count, count, 128 + 16  /* Test and readjust count.  */
             /* Taken when at most 128 bytes lie past dst + 16: the 64 bytes
                already loaded plus the last 64 bytes cover them.  */
 158         b.ls    L(copy64_from_end)
 159 L(loop64):
             /* Each iteration stores the 64 bytes loaded previously to
                dst + 16 .. dst + 79 and loads the following 64 bytes, so
                loads run one iteration ahead of stores.  */
 160         stp     A_q, B_q, [dst, 16]
 161         ldp     A_q, B_q, [src, 80]
 162         stp     C_q, D_q, [dst, 48]
 163         ldp     C_q, D_q, [src, 112]
 164         add     src, src, 64
 165         add     dst, dst, 64
 166         subs    count, count, 64
 167         b.hi    L(loop64)               /* Loop while count was above 64.  */
 168
 169         /* Write the last iteration and copy 64 bytes from the end.  */
 170 L(copy64_from_end):
 171         ldp     E_q, F_q, [srcend, -64]
 172         stp     A_q, B_q, [dst, 16]     /* Pending data; frees A_q/B_q.  */
 173         ldp     A_q, B_q, [srcend, -32] /* Reuse A_q/B_q for the tail.  */
 174         stp     C_q, D_q, [dst, 48]
 175         stp     E_q, F_q, [dstend, -64] /* Last 64 bytes; may overlap above.  */
 176         stp     A_q, B_q, [dstend, -32]
 177         ret
 178
 179 END (__memcpy_simd)
 180 libc_hidden_builtin_def (__memcpy_simd)
 181
 182
 183 ENTRY (__memmove_simd)
             /* Copy count bytes from src to dstin; the overlap check in
                L(move_long) selects a forward or backward copy for sizes
                above 128.  dstin (x0) is never written, so it still holds
                the destination start at every ret.

                Sizes up to 128 bytes reuse L(copy16) and L(copy32_128) in
                __memcpy_simd; those sequences issue all their loads before
                their first store.

                PTR_ARG and SIZE_ARG are macros defined outside this file
                (presumably in <sysdep.h>); their expansion is not visible
                here.  */
 184         PTR_ARG (0)
 185         PTR_ARG (1)
 186         SIZE_ARG (2)
 187
 188         add     srcend, src, count      /* srcend = src + count.  */
 189         add     dstend, dstin, count    /* dstend = dstin + count.  */
 190         cmp     count, 128
 191         b.hi    L(move_long)            /* count > 128.  */
 192         cmp     count, 32
 193         b.hi    L(copy32_128)           /* 33 <= count <= 128.  */
 194
 195         /* Small moves: 0..32 bytes.  */
 196         cmp     count, 16
 197         b.lo    L(copy16)               /* count < 16.  */
             /* 16..32 bytes: first and last 16 bytes, both loaded before
                either store.  */
 198         ldr     A_q, [src]
 199         ldr     B_q, [srcend, -16]
 200         str     A_q, [dstin]
 201         str     B_q, [dstend, -16]
 202         ret
 203
 204 L(move_long):
 205         /* Only use backward copy if there is an overlap.  */
 206         sub     tmp1, dstin, src        /* tmp1 = dstin - src.  */
 207         cbz     tmp1, L(move0)          /* dstin == src: return at once.  */
             /* Unsigned compare: dstin - src >= count also holds when
                dstin < src, because the difference wraps to a large value.
                Only 0 < dstin - src < count falls through to the backward
                copy.  */
 208         cmp     tmp1, count
 209         b.hs    L(copy_long)
 210
 211         /* Large backwards copy for overlapping copies.
 212            Copy 16 bytes and then align srcend to 16-byte alignment.  */
 213 L(copy_long_backwards):
 214         ldr     D_q, [srcend, -16]      /* Last 16 source bytes.  */
 215         and     tmp1, srcend, 15        /* tmp1 = srcend misalignment, 0..15.  */
 216         bic     srcend, srcend, 15      /* Round srcend down to 16 bytes.  */
 217         sub     count, count, tmp1      /* Bytes below the aligned srcend.  */
 218         ldp     A_q, B_q, [srcend, -32]
 219         str     D_q, [dstend, -16]      /* dstend is still unadjusted here.  */
 220         ldp     C_q, D_q, [srcend, -64] /* A_q..D_q = srcend - 64 .. srcend - 1.  */
 221         sub     dstend, dstend, tmp1    /* Move dstend back by the same amount.  */
 222         subs    count, count, 128
             /* Taken when at most 128 bytes lie below dstend: the 64 bytes
                already loaded plus the first 64 bytes cover them.  */
 223         b.ls    L(copy64_from_start)
 224
 225 L(loop64_backwards):
             /* Each iteration stores the 64 bytes loaded previously just
                below dstend and loads the preceding 64 bytes.  */
 226         str     B_q, [dstend, -16]
 227         str     A_q, [dstend, -32]
 228         ldp     A_q, B_q, [srcend, -96]
 229         str     D_q, [dstend, -48]
 230         str     C_q, [dstend, -64]!     /* Pre-index: dstend -= 64.  */
 231         ldp     C_q, D_q, [srcend, -128]
 232         sub     srcend, srcend, 64
 233         subs    count, count, 64
 234         b.hi    L(loop64_backwards)     /* Loop while count was above 64.  */
 235
 236         /* Write the last iteration and copy 64 bytes from the start.  */
 237 L(copy64_from_start):
 238         ldp     E_q, F_q, [src, 32]     /* src is unchanged on this path.  */
 239         stp     A_q, B_q, [dstend, -32] /* Pending data; frees A_q/B_q.  */
 240         ldp     A_q, B_q, [src]         /* Reuse A_q/B_q for the head.  */
 241         stp     C_q, D_q, [dstend, -64]
 242         stp     E_q, F_q, [dstin, 32]   /* First 64 bytes; may overlap above.  */
 243         stp     A_q, B_q, [dstin]
 244 L(move0):
 245         ret
 246
 247 END (__memmove_simd)
 248 libc_hidden_builtin_def (__memmove_simd)