sysdeps/aarch64/multiarch/memcpy_sve.S

   1 /* Optimized memcpy for SVE.
   2    Copyright (C) 2021-2022 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Assumptions:
  23  *
  24  * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
  25  *
  26  */
  27
  28 #define dstin   x0      /* Destination pointer argument; only read, never written, in this file.  */
  29 #define src     x1      /* Source pointer argument; rounded down to 16 bytes in L(copy_long).  */
  30 #define count   x2      /* Byte count argument; reused as the remaining-bytes counter in the loops.  */
  31 #define dst     x3      /* Working destination pointer of the forward loop.  */
  32 #define srcend  x4      /* src + count; rounded down to 16 bytes in the backward copy.  */
  33 #define dstend  x5      /* dstin + count; lowered in step with srcend in the backward copy.  */
  34 #define tmp1    x6      /* Scratch for the large-copy paths; shares x6 with vlen.  */
  35 #define vlen    x6      /* Result of cntb on the small-copy paths; shares x6 with tmp1.  */
  36
  37 #define A_q     q0      /* A_q..H_q name q0..q7, the registers that hold data between load and store.  */
  38 #define B_q     q1
  39 #define C_q     q2
  40 #define D_q     q3
  41 #define E_q     q4      /* E_q/F_q: extra pair for the medium copies and the 64-byte head/tail.  */
  42 #define F_q     q5
  43 #define G_q     q6      /* G_q/H_q: only used by the 97..128 byte case in L(copy128).  */
  44 #define H_q     q7
  45
  46 /* This implementation supports both memcpy and memmove and shares most code.
  47    It uses unaligned accesses and branchless sequences to keep the code small
  48    and simple, and to improve performance.
  49
  50    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  51    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  52    check in memmove is negligible since it is only required for large copies.
  53
  54    Large copies use a software pipelined loop processing 64 bytes per iteration.
  55    The source pointer is 16-byte aligned to minimize unaligned accesses.
  56    The loop tail is handled by always copying 64 bytes from the end.
  57 */
  58
  59 #if HAVE_AARCH64_SVE_ASM        /* NOTE(review): presumably set by the build when the assembler accepts SVE - confirm.  */
  60
  61         .arch armv8.2-a+sve     /* Let the assembler accept the SVE instructions used below.  */
  62
  63 ENTRY (__memcpy_sve)            /* memcpy: copy count (x2) bytes from src (x1) to dstin (x0).  */
  64         PTR_ARG (0)             /* NOTE(review): PTR_ARG/SIZE_ARG come from <sysdep.h>, not shown here;  */
  65         PTR_ARG (1)             /* presumably they normalise the pointer/size arguments - confirm.  */
  66         SIZE_ARG (2)
  67         /* Dispatch on size.  x0 is never written, so dstin is still in x0 at every ret.  */
  68         cmp     count, 128
  69         b.hi    L(copy_long)            /* count > 128: large copy.  */
  70         cmp     count, 32
  71         b.hi    L(copy32_128)           /* 33 <= count <= 128: medium copy.  */
  72         /* Small copies: 0..32 bytes using predicated SVE loads and stores.  */
  73         whilelo p0.b, xzr, count        /* p0 = byte lanes whose index is < count.  */
  74         cntb    vlen                    /* vlen = SVE vector length in bytes.  */
  75         tbnz    vlen, 4, L(vlen128)     /* Bit 4 of vlen set (e.g. 16-byte vectors): use two vectors.  */
  76         ld1b    z0.b, p0/z, [src]       /* Bit 4 clear: SVE lengths are multiples of 16 bytes, so vlen >= 32.  */
  77         st1b    z0.b, p0, [dstin]       /* Only the lanes active in p0 are written.  */
  78         ret
  79
  80         /* Medium copies: 33..128 bytes.  Also entered from __memmove_sve.  */
  81 L(copy32_128):
  82         add     srcend, src, count      /* srcend = one past the last source byte.  */
  83         add     dstend, dstin, count    /* dstend = one past the last destination byte.  */
  84         ldp     A_q, B_q, [src]         /* First 32 bytes.  */
  85         ldp     C_q, D_q, [srcend, -32] /* Last 32 bytes.  */
  86         cmp     count, 64
  87         b.hi    L(copy128)
  88         stp     A_q, B_q, [dstin]       /* 33..64 bytes: the two 32-byte stores may overlap each other.  */
  89         stp     C_q, D_q, [dstend, -32]
  90         ret
  91
  92         /* Copy 65..128 bytes.  All loads are issued before the first store.  */
  93 L(copy128):
  94         ldp     E_q, F_q, [src, 32]     /* Source bytes 32..63.  */
  95         cmp     count, 96
  96         b.ls    L(copy96)               /* count <= 96: the first 64 and last 32 bytes cover everything.  */
  97         ldp     G_q, H_q, [srcend, -64] /* 97..128 bytes: also copy the 32 bytes that end at srcend - 32.  */
  98         stp     G_q, H_q, [dstend, -64]
  99 L(copy96):
 100         stp     A_q, B_q, [dstin]
 101         stp     E_q, F_q, [dstin, 32]
 102         stp     C_q, D_q, [dstend, -32]
 103         ret
 104         /* Small copy when bit 4 of vlen is set: p0 covers the first vector, p1 the second.  */
 105 L(vlen128):
 106         whilelo p1.b, vlen, count       /* p1 = lanes i for which vlen + i < count.  */
 107         ld1b    z0.b, p0/z, [src, 0, mul vl]
 108         ld1b    z1.b, p1/z, [src, 1, mul vl]    /* Second vector starts vlen bytes after src.  */
 109         st1b    z0.b, p0, [dstin, 0, mul vl]    /* Both loads are done before either store.  */
 110         st1b    z1.b, p1, [dstin, 1, mul vl]
 111         ret
 112
 113         .p2align 4
 114         /* Copy more than 128 bytes.  Also the forward path of __memmove_sve.  */
 115 L(copy_long):
 116         add     srcend, src, count
 117         add     dstend, dstin, count
 118
 119         /* Copy 16 bytes and then align src to 16-byte alignment.  */
 120         ldr     D_q, [src]              /* First 16 bytes, loaded from the unaligned src.  */
 121         and     tmp1, src, 15           /* tmp1 = misalignment of src (0..15).  */
 122         bic     src, src, 15            /* Round src down to a 16-byte boundary.  */
 123         sub     dst, dstin, tmp1        /* Bias dst by the same amount so equal offsets from src and dst match.  */
 124         add     count, count, tmp1      /* Count is now 16 too large.  */
 125         ldp     A_q, B_q, [src, 16]
 126         str     D_q, [dstin]
 127         ldp     C_q, D_q, [src, 48]
 128         subs    count, count, 128 + 16  /* Test and readjust count.  */
 129         b.ls    L(copy64_from_end)      /* The 64 bytes held in A..D plus the 64-byte tail cover the rest.  */
 130 L(loop64):
 131         stp     A_q, B_q, [dst, 16]     /* Store the 64 bytes loaded previously ...  */
 132         ldp     A_q, B_q, [src, 80]     /* ... while loading the next 64 bytes.  */
 133         stp     C_q, D_q, [dst, 48]
 134         ldp     C_q, D_q, [src, 112]
 135         add     src, src, 64
 136         add     dst, dst, 64
 137         subs    count, count, 64
 138         b.hi    L(loop64)               /* Continue while count was > 64 (unsigned) before the subtract.  */
 139
 140         /* Write the last iteration and copy 64 bytes from the end.  */
 141 L(copy64_from_end):
 142         ldp     E_q, F_q, [srcend, -64]
 143         stp     A_q, B_q, [dst, 16]
 144         ldp     A_q, B_q, [srcend, -32]
 145         stp     C_q, D_q, [dst, 48]
 146         stp     E_q, F_q, [dstend, -64] /* The tail stores may rewrite bytes already stored above.  */
 147         stp     A_q, B_q, [dstend, -32]
 148         ret
 149
 150 END (__memcpy_sve)
 151 libc_hidden_builtin_def (__memcpy_sve)
 152
 153
 154 ENTRY (__memmove_sve)           /* memmove: copy count (x2) bytes from src (x1) to dstin (x0); buffers may overlap.  */
 155         PTR_ARG (0)             /* NOTE(review): PTR_ARG/SIZE_ARG come from <sysdep.h>, not shown here;  */
 156         PTR_ARG (1)             /* presumably they normalise the pointer/size arguments - confirm.  */
 157         SIZE_ARG (2)
 158         /* Up to 128 bytes the __memcpy_sve paths are shared; they load everything before storing.  */
 159         cmp     count, 128
 160         b.hi    L(move_long)            /* count > 128: needs the overlap check.  */
 161         cmp     count, 32
 162         b.hi    L(copy32_128)           /* 33 <= count <= 128: medium copy in __memcpy_sve.  */
 163         /* Small copies: 0..32 bytes, same sequence as the small path of __memcpy_sve.  */
 164         whilelo p0.b, xzr, count        /* p0 = byte lanes whose index is < count.  */
 165         cntb    vlen                    /* vlen = SVE vector length in bytes.  */
 166         tbnz    vlen, 4, L(vlen128)     /* Bit 4 of vlen set (e.g. 16-byte vectors): use two vectors.  */
 167         ld1b    z0.b, p0/z, [src]
 168         st1b    z0.b, p0, [dstin]       /* Only the lanes active in p0 are written.  */
 169         ret
 170
 171         .p2align 4
 172 L(move_long):
 173         add     srcend, src, count      /* srcend = one past the last source byte.  */
 174         add     dstend, dstin, count    /* dstend = one past the last destination byte.  */
 175         /* Only use backward copy if there is an overlap.  */
 176         sub     tmp1, dstin, src
 177         cbz     tmp1, L(return)         /* dstin == src: nothing to copy.  */
 178         cmp     tmp1, count
 179         b.hs    L(copy_long)            /* (dstin - src) >= count as unsigned: use the forward copy.  */
 180
 181         /* Large backwards copy for overlapping copies.
 182            Copy 16 bytes and then align srcend to 16-byte alignment.  */
 183         ldr     D_q, [srcend, -16]      /* Last 16 bytes, loaded from the unaligned srcend.  */
 184         and     tmp1, srcend, 15        /* tmp1 = misalignment of srcend (0..15).  */
 185         bic     srcend, srcend, 15      /* Round srcend down to a 16-byte boundary.  */
 186         sub     count, count, tmp1      /* count = bytes between src and the aligned srcend.  */
 187         ldp     A_q, B_q, [srcend, -32]
 188         str     D_q, [dstend, -16]
 189         ldp     C_q, D_q, [srcend, -64]
 190         sub     dstend, dstend, tmp1    /* Lower dstend by the same amount as srcend.  */
 191         subs    count, count, 128
 192         b.ls    L(copy64_from_start)    /* The 64 bytes held in A..D plus the first 64 bytes cover the rest.  */
 193
 194 L(loop64_backwards):
 195         str     B_q, [dstend, -16]      /* Store the 64 bytes loaded previously ...  */
 196         str     A_q, [dstend, -32]
 197         ldp     A_q, B_q, [srcend, -96] /* ... and load the next 64 bytes further down.  */
 198         str     D_q, [dstend, -48]
 199         str     C_q, [dstend, -64]!     /* Pre-index writeback: dstend -= 64.  */
 200         ldp     C_q, D_q, [srcend, -128]
 201         sub     srcend, srcend, 64
 202         subs    count, count, 64
 203         b.hi    L(loop64_backwards)     /* Continue while count was > 64 (unsigned) before the subtract.  */
 204
 205         /* Write the last iteration and copy 64 bytes from the start.  */
 206 L(copy64_from_start):
 207         ldp     E_q, F_q, [src, 32]
 208         stp     A_q, B_q, [dstend, -32]
 209         ldp     A_q, B_q, [src]
 210         stp     C_q, D_q, [dstend, -64]
 211         stp     E_q, F_q, [dstin, 32]   /* The head stores may rewrite bytes already stored above.  */
 212         stp     A_q, B_q, [dstin]
 213 L(return):
 214         ret                             /* x0 was never written, so it still holds dstin.  */
 215
 216 END (__memmove_sve)
 217 libc_hidden_builtin_def (__memmove_sve)
 218 #endif /* HAVE_AARCH64_SVE_ASM  */