sysdeps/aarch64/multiarch/memcpy_a64fx.S

   1 /* Optimized memcpy for Fujitsu A64FX processor.
   2    Copyright (C) 2021-2022 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 #undef BTI_C
  23 #define BTI_C
  24
  25 /* Assumptions:
  26  *
  27  * ARMv8.2-a, AArch64, unaligned accesses, sve
  28  *
  29  */
  30
  31 #define dstin   x0      /* Destination as passed in; only ever read, so x0 is unchanged at ret.  */
  32 #define src     x1      /* Source pointer; advanced (or rewound) by the large-copy paths.  */
  33 #define n       x2      /* Byte count still to be handled.  */
  34 #define dst     x3      /* Working destination pointer used by the large-copy paths.  */
  35 #define dstend  x4      /* Destination start + n, i.e. one past the last byte to write.  */
  36 #define srcend  x5      /* Source start + n, i.e. one past the last byte to read.  */
  37 #define tmp     x6      /* Scratch: alignment amounts and the dst - src distance.  */
  38 #define vlen    x7      /* SVE vector length in bytes, obtained with cntb.  */
  39 #define vlen8   x8      /* vlen << 3: bytes moved per 8x unrolled iteration.  */
  40
  41 #if HAVE_AARCH64_SVE_ASM        /* Everything down to the matching #endif needs SVE support in the assembler.  */
  42 # if IS_IN (libc)               /* ... and is only emitted when this file is built for libc.  */
  43 #  define MEMCPY __memcpy_a64fx          /* Symbol name given to the memcpy routine below.  */
  44 #  define MEMMOVE __memmove_a64fx        /* Symbol name given to the memmove routine below.  */
  45
  46         .arch armv8.2-a+sve     /* Let the assembler accept the SVE instructions used below.  */
  47
  48         .macro ld1b_unroll8
             /* Load eight consecutive vectors starting at src into z0-z7,
                governed by p0 (inactive lanes are zeroed).  The offsets are
                in units of the vector length; src itself is not modified.  */
  49         ld1b    z0.b, p0/z, [src, 0, mul vl]
  50         ld1b    z1.b, p0/z, [src, 1, mul vl]
  51         ld1b    z2.b, p0/z, [src, 2, mul vl]
  52         ld1b    z3.b, p0/z, [src, 3, mul vl]
  53         ld1b    z4.b, p0/z, [src, 4, mul vl]
  54         ld1b    z5.b, p0/z, [src, 5, mul vl]
  55         ld1b    z6.b, p0/z, [src, 6, mul vl]
  56         ld1b    z7.b, p0/z, [src, 7, mul vl]
  57         .endm
  58
  59         .macro stld1b_unroll4a
             /* Pipeline step for z0-z3: store each register to vector slot
                0-3 at dst, then refill it from the same slot at src.  Each
                register is stored before it is reloaded, so the data written
                is what an earlier load placed there.  Stores and loads are
                issued in alternating pairs; neither pointer is updated.  */
  60         st1b    z0.b, p0,   [dst, 0, mul vl]
  61         st1b    z1.b, p0,   [dst, 1, mul vl]
  62         ld1b    z0.b, p0/z, [src, 0, mul vl]
  63         ld1b    z1.b, p0/z, [src, 1, mul vl]
  64         st1b    z2.b, p0,   [dst, 2, mul vl]
  65         st1b    z3.b, p0,   [dst, 3, mul vl]
  66         ld1b    z2.b, p0/z, [src, 2, mul vl]
  67         ld1b    z3.b, p0/z, [src, 3, mul vl]
  68         .endm
  69
  70         .macro stld1b_unroll4b
             /* Pipeline step for z4-z7: store each register to vector slot
                4-7 at dst, then refill it from the same slot at src.  Same
                store-then-reload pairing as stld1b_unroll4a; neither pointer
                is updated.  */
  71         st1b    z4.b, p0,   [dst, 4, mul vl]
  72         st1b    z5.b, p0,   [dst, 5, mul vl]
  73         ld1b    z4.b, p0/z, [src, 4, mul vl]
  74         ld1b    z5.b, p0/z, [src, 5, mul vl]
  75         st1b    z6.b, p0,   [dst, 6, mul vl]
  76         st1b    z7.b, p0,   [dst, 7, mul vl]
  77         ld1b    z6.b, p0/z, [src, 6, mul vl]
  78         ld1b    z7.b, p0/z, [src, 7, mul vl]
  79         .endm
  80
  81         .macro stld1b_unroll8
             /* One full pipelined step: write the eight vectors held in
                z0-z7 to dst and refill all eight from src, by expanding the
                two four-register halves in order.  */
  82         stld1b_unroll4a
  83         stld1b_unroll4b
  84         .endm
  85
  86         .macro st1b_unroll8
             /* Store z0-z7 to eight consecutive vectors starting at dst,
                governed by p0.  Used to drain the registers once no further
                loads are needed; dst is not modified.  */
  87         st1b    z0.b, p0, [dst, 0, mul vl]
  88         st1b    z1.b, p0, [dst, 1, mul vl]
  89         st1b    z2.b, p0, [dst, 2, mul vl]
  90         st1b    z3.b, p0, [dst, 3, mul vl]
  91         st1b    z4.b, p0, [dst, 4, mul vl]
  92         st1b    z5.b, p0, [dst, 5, mul vl]
  93         st1b    z6.b, p0, [dst, 6, mul vl]
  94         st1b    z7.b, p0, [dst, 7, mul vl]
  95         .endm
  96
  97 #undef BTI_C            /* Drop any earlier definition of BTI_C ...  */
  98 #define BTI_C           /* ... and define it as empty.  */
     /* NOTE(review): this repeats, token for token, the #undef/#define pair
        that directly follows the #include of sysdep.h, and nothing visible in
        between redefines BTI_C, so this second pair looks redundant; confirm
        before removing.  Presumably BTI_C is expanded by the ENTRY and
        ENTRY_ALIGN macros from sysdep.h; verify there.  */
  99
 100 ENTRY (MEMCPY)
             /* MEMCPY (dstin = x0, src = x1, n = x2): copy n bytes from src
                to dstin with SVE byte loads and stores.  x0 is only ever
                read, so it still holds dstin at every ret.

                Dispatch on n, measured in vector lengths (VL, from cntb):
                  n <= 2 VL  two predicated vectors, handled inline below;
                  n <= 8 VL  L(copy_small): full vectors taken from both
                             ends of the buffer (they may overlap);
                  n >  8 VL  L(copy_large): destination-aligned 8x unrolled
                             loop, finished by L(last_bytes).

                MEMMOVE branches into L(copy_small), L(copy_large) and
                L(last_bytes).  The only state those paths take from their
                caller is dstin, src, n and vlen, plus dst and an all-true
                p0 for L(last_bytes).  */
 101
             /* PTR_ARG and SIZE_ARG are not defined in this file; presumably
                they come from sysdep.h and normalise the pointer and size
                argument registers - confirm there.  */
 102         PTR_ARG (0)
 103         PTR_ARG (1)
 104         SIZE_ARG (2)
 105
 106         cntb    vlen                    /* vlen = bytes per SVE vector.  */
 107         cmp     n, vlen, lsl 1
 108         b.hi    L(copy_small)           /* Taken when n > 2 * vlen.  */
             /* n <= 2 * vlen: p0 enables bytes [0, n) of the first vector and
                p1 enables bytes [vlen, n) of the second.  With n == 0 both
                predicates are empty.  Both loads are issued before either
                store.  */
 109         whilelo p1.b, vlen, n
 110         whilelo p0.b, xzr, n
 111         ld1b    z0.b, p0/z, [src, 0, mul vl]
 112         ld1b    z1.b, p1/z, [src, 1, mul vl]
 113         st1b    z0.b, p0, [dstin, 0, mul vl]
 114         st1b    z1.b, p1, [dstin, 1, mul vl]
 115         ret
 116
 117         .p2align 4
 118
             /* Reached with n > 2 * vlen, from above or from MEMMOVE.  In
                both sub-cases below every load is issued before the first
                store.  */
 119 L(copy_small):
 120         cmp     n, vlen, lsl 3
 121         b.hi    L(copy_large)           /* Taken when n > 8 * vlen.  */
 122         add     dstend, dstin, n        /* One past the last destination byte.  */
 123         add     srcend, src, n          /* One past the last source byte.  */
 124         cmp     n, vlen, lsl 2
 125         b.hi    1f                      /* Taken when n > 4 * vlen.  */
 126
 127         /* Copy 2-4 vectors.  */
             /* Two vectors from the start plus two ending exactly at the last
                byte.  Since 2 * vlen < n <= 4 * vlen the two pairs together
                cover every byte; any overlap is simply written twice.  */
 128         ptrue   p0.b
 129         ld1b    z0.b, p0/z, [src, 0, mul vl]
 130         ld1b    z1.b, p0/z, [src, 1, mul vl]
 131         ld1b    z2.b, p0/z, [srcend, -2, mul vl]
 132         ld1b    z3.b, p0/z, [srcend, -1, mul vl]
 133         st1b    z0.b, p0, [dstin, 0, mul vl]
 134         st1b    z1.b, p0, [dstin, 1, mul vl]
 135         st1b    z2.b, p0, [dstend, -2, mul vl]
 136         st1b    z3.b, p0, [dstend, -1, mul vl]
 137         ret
 138
 139         .p2align 4
 140         /* Copy 4-8 vectors.  */
             /* Same scheme with four vectors from each end, for
                4 * vlen < n <= 8 * vlen.  */
 141 1:      ptrue   p0.b
 142         ld1b    z0.b, p0/z, [src, 0, mul vl]
 143         ld1b    z1.b, p0/z, [src, 1, mul vl]
 144         ld1b    z2.b, p0/z, [src, 2, mul vl]
 145         ld1b    z3.b, p0/z, [src, 3, mul vl]
 146         ld1b    z4.b, p0/z, [srcend, -4, mul vl]
 147         ld1b    z5.b, p0/z, [srcend, -3, mul vl]
 148         ld1b    z6.b, p0/z, [srcend, -2, mul vl]
 149         ld1b    z7.b, p0/z, [srcend, -1, mul vl]
 150         st1b    z0.b, p0, [dstin, 0, mul vl]
 151         st1b    z1.b, p0, [dstin, 1, mul vl]
 152         st1b    z2.b, p0, [dstin, 2, mul vl]
 153         st1b    z3.b, p0, [dstin, 3, mul vl]
 154         st1b    z4.b, p0, [dstend, -4, mul vl]
 155         st1b    z5.b, p0, [dstend, -3, mul vl]
 156         st1b    z6.b, p0, [dstend, -2, mul vl]
 157         st1b    z7.b, p0, [dstend, -1, mul vl]
 158         ret
 159
 160         .p2align 4
 161         /* At least 8 vectors - always align to vector length for
 162            higher and consistent write performance.  */
 163 L(copy_large):
             /* Head: copy tmp = vlen - (dstin & (vlen - 1)) bytes with one
                predicated vector.  The mask equals dstin mod vlen when vlen
                is a power of two, which this arithmetic assumes.  tmp is in
                1..vlen, so an already aligned dstin still copies one full
                vector here.  */
 164         sub     tmp, vlen, 1
 165         and     tmp, dstin, tmp
 166         sub     tmp, vlen, tmp
 167         whilelo p1.b, xzr, tmp
 168         ld1b    z1.b, p1/z, [src]
 169         st1b    z1.b, p1, [dstin]
 170         add     dst, dstin, tmp         /* dst = dstin rounded up past the head.  */
 171         add     src, src, tmp
 172         sub     n, n, tmp
 173         ptrue   p0.b                    /* All-true predicate for the loop and for L(last_bytes).  */
 174
             /* From here until label 3, n holds (bytes not yet loaded) minus
                vlen8.  Each b.ls/b.hi tests the flags of the preceding subs,
                i.e. whether the value before the subtraction exceeded
                vlen8.  */
 175         lsl     vlen8, vlen, 3
 176         subs    n, n, vlen8
 177         b.ls    3f                      /* At most 8 vectors left: skip the loop.  */
 178         ld1b_unroll8                    /* Prime z0-z7 with the first block.  */
 179         add     src, src, vlen8
 180         subs    n, n, vlen8
 181         b.ls    2f                      /* Nothing more to pipeline: drain z0-z7.  */
 182
 183         .p2align 4
 184         /* 8x unrolled and software pipelined loop.  */
 185 1:      stld1b_unroll8                  /* Store the previous block, load the next one.  */
 186         add     dst, dst, vlen8
 187         add     src, src, vlen8
 188         subs    n, n, vlen8
 189         b.hi    1b
 190 2:      st1b_unroll8                    /* Drain the block still held in z0-z7.  */
 191         add     dst, dst, vlen8
 192 3:      add     n, n, vlen8             /* Remove the bias: n = bytes left at src/dst.  */
 193
 194         /* Move last 0-8 vectors.  */
             /* Entry conditions, met by both this function and MEMMOVE: src
                and dst point at the remaining n bytes and p0 is all-true.
                Every path below issues all of its loads before its first
                store.  */
 195 L(last_bytes):
 196         cmp     n, vlen, lsl 1
 197         b.hi    1f                      /* Taken when n > 2 * vlen.  */
 198         whilelo p0.b, xzr, n            /* Bytes [0, n) of the first vector.  */
 199         whilelo p1.b, vlen, n           /* Bytes [vlen, n) of the second vector.  */
 200         ld1b    z0.b, p0/z, [src, 0, mul vl]
 201         ld1b    z1.b, p1/z, [src, 1, mul vl]
 202         st1b    z0.b, p0, [dst, 0, mul vl]
 203         st1b    z1.b, p1, [dst, 1, mul vl]
 204         ret
 205
 206         .p2align 4
 207
             /* More than 2 vectors: the first two and the last two vectors
                are loaded before the size is tested again, because both
                remaining cases need them.  */
 208 1:      add     srcend, src, n
 209         add     dstend, dst, n
 210         ld1b    z0.b, p0/z, [src, 0, mul vl]
 211         ld1b    z1.b, p0/z, [src, 1, mul vl]
 212         ld1b    z2.b, p0/z, [srcend, -2, mul vl]
 213         ld1b    z3.b, p0/z, [srcend, -1, mul vl]
 214         cmp     n, vlen, lsl 2
 215         b.hi    1f                      /* Taken when n > 4 * vlen.  */
 216
 217         st1b    z0.b, p0, [dst, 0, mul vl]
 218         st1b    z1.b, p0, [dst, 1, mul vl]
 219         st1b    z2.b, p0, [dstend, -2, mul vl]
 220         st1b    z3.b, p0, [dstend, -1, mul vl]
 221         ret
 222
             /* More than 4 vectors: additionally load vectors 2 and 3 from
                the start (z4, z5) and vectors -4 and -3 from the end
                (z6, z7), then store all eight.  */
 223 1:      ld1b    z4.b, p0/z, [src, 2, mul vl]
 224         ld1b    z5.b, p0/z, [src, 3, mul vl]
 225         ld1b    z6.b, p0/z, [srcend, -4, mul vl]
 226         ld1b    z7.b, p0/z, [srcend, -3, mul vl]
 227         st1b    z0.b, p0, [dst, 0, mul vl]
 228         st1b    z1.b, p0, [dst, 1, mul vl]
 229         st1b    z4.b, p0, [dst, 2, mul vl]
 230         st1b    z5.b, p0, [dst, 3, mul vl]
 231         st1b    z6.b, p0, [dstend, -4, mul vl]
 232         st1b    z7.b, p0, [dstend, -3, mul vl]
 233         st1b    z2.b, p0, [dstend, -2, mul vl]
 234         st1b    z3.b, p0, [dstend, -1, mul vl]
 235         ret
 236
 237 END (MEMCPY)
 238 libc_hidden_builtin_def (MEMCPY)
 239
 240
 241 ENTRY_ALIGN (MEMMOVE, 4)
             /* MEMMOVE (dstin = x0, src = x1, n = x2): copy n bytes from src
                to dstin where the two regions may overlap.  x0 is only ever
                read, so it still holds dstin at every ret.

                Up to 2 vectors are handled inline.  Larger sizes either
                return at once (dstin and src equal once tag bits are
                masked), branch into MEMCPY's L(copy_small) or
                L(copy_large), or run the backward loop below and finish in
                MEMCPY's L(last_bytes).  */
 242
             /* PTR_ARG and SIZE_ARG are not defined in this file; presumably
                they come from sysdep.h and normalise the pointer and size
                argument registers - confirm there.  */
 243         PTR_ARG (0)
 244         PTR_ARG (1)
 245         SIZE_ARG (2)
 246
 247         /* Fast case for up to 2 vectors.  */
             /* Both vectors are loaded before either is stored, so the
                result does not depend on how the regions overlap.  */
 248         cntb    vlen                    /* vlen = bytes per SVE vector.  */
 249         cmp     n, vlen, lsl 1
 250         b.hi    1f                      /* Taken when n > 2 * vlen.  */
 251         whilelo p0.b, xzr, n            /* Bytes [0, n) of the first vector.  */
 252         whilelo p1.b, vlen, n           /* Bytes [vlen, n) of the second vector.  */
 253         ld1b    z0.b, p0/z, [src, 0, mul vl]
 254         ld1b    z1.b, p1/z, [src, 1, mul vl]
 255         st1b    z0.b, p0, [dstin, 0, mul vl]
 256         st1b    z1.b, p1, [dstin, 1, mul vl]
 257 L(full_overlap):
 258         ret
 259
 260         .p2align 4
 261         /* Check for overlapping moves. Return if there is a full overlap.
 262            Small moves up to 8 vectors use the overlap-safe copy_small code.
 263            Non-overlapping or overlapping moves with dst < src use memcpy.
 264            Overlapping moves with dst > src use a backward copy loop.  */
             /* tmp = dstin - src with the high tag bits masked off.  The
                later unsigned test tmp >= n selects the forward copy.  That
                covers dst at least n bytes above src, and also dst below
                src, where the subtraction wraps to a large value.  */
 265 1:      sub     tmp, dstin, src
 266         ands    tmp, tmp, 0xffffffffffffff      /* Clear special tag bits.  */
 267         b.eq    L(full_overlap)
 268         cmp     n, vlen, lsl 3
 269         b.ls    L(copy_small)           /* n <= 8 * vlen.  */
 270         cmp     tmp, n
 271         b.hs    L(copy_large)           /* Forward copy is safe.  */
 272
 273         /* Align to vector length.  */
             /* Backward copy: first move the tail so that the end of the
                remaining destination is a multiple of vlen.  tmp is the
                misalignment of dstin + n, or a full vlen when that address
                is already aligned, so tmp is in 1..vlen.  The mask assumes
                vlen is a power of two.  */
 274         add     dst, dstin, n
 275         sub     tmp, vlen, 1
 276         ands    tmp, dst, tmp
 277         csel    tmp, tmp, vlen, ne
 278         whilelo p1.b, xzr, tmp
 279         sub     n, n, tmp               /* n = bytes remaining below the tail.  */
 280         ld1b    z1.b, p1/z, [src, n]    /* Tail bytes sit at offset n from each start.  */
 281         st1b    z1.b, p1, [dstin, n]
 282         add     src, src, n             /* src = end of the remaining source.  */
 283         add     dst, dstin, n           /* dst = end of the remaining destination.  */
 284
             /* From here until label 3, n holds (bytes not yet loaded) minus
                vlen8.  Each b.ls/b.hi tests the flags of the preceding subs,
                i.e. whether the value before the subtraction exceeded
                vlen8.  */
 285         ptrue   p0.b                    /* All-true predicate for the loop and for L(last_bytes).  */
 286         lsl     vlen8, vlen, 3
 287         subs    n, n, vlen8
 288         b.ls    3f                      /* At most 8 vectors left: skip the loop.  */
 289         sub     src, src, vlen8
 290         ld1b_unroll8                    /* Prime z0-z7 with the highest block.  */
 291         subs    n, n, vlen8
 292         b.ls    2f                      /* Nothing more to pipeline: drain z0-z7.  */
 293
 294         .p2align 4
 295         /* 8x unrolled and software pipelined backward copy loop.  */
             /* Pointers step down before the macro, so each pass stores the
                block loaded on the previous pass at dst and loads the next
                lower block from src.  */
 296 1:      sub     src, src, vlen8
 297         sub     dst, dst, vlen8
 298         stld1b_unroll8
 299         subs    n, n, vlen8
 300         b.hi    1b
 301 2:      sub     dst, dst, vlen8
 302         st1b_unroll8                    /* Drain the block still held in z0-z7.  */
 303 3:      add     n, n, vlen8             /* Remove the bias: n = bytes left at the start.  */
 304
 305         /* Adjust src/dst for last 0-8 vectors.  */
             /* src currently points just past the bytes not yet loaded, so
                src - n is the start of the source; the destination start is
                dstin.  L(last_bytes) lives inside MEMCPY.  */
 306         sub     src, src, n
 307         mov     dst, dstin
 308         b       L(last_bytes)
 309
 310 END (MEMMOVE)
 311 libc_hidden_builtin_def (MEMMOVE)
 312 # endif /* IS_IN (libc) */
 313 #endif /* HAVE_AARCH64_SVE_ASM */