sysdeps/aarch64/multiarch/memset_a64fx.S

   1 /* Optimized memset for Fujitsu A64FX processor.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21 #include <sysdeps/aarch64/memset-reg.h>
  22
  23 /* Assumptions:
  24  *
  25  * ARMv8.2-a, AArch64, unaligned accesses, sve
  26  *
  27  */
  28
  29 #define L1_SIZE         (64*1024)       // 64KB; larger counts branch to L(L1_prefetch)
  30 #define L2_SIZE         (8*1024*1024)   // 8MB; counts at or above this branch to L(L2)
  31 #define CACHE_LINE_SIZE 256             // Bytes; stride of the prefetch and DC ZVA loops
  32 #define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // L1 prefetch distance: 16 cache lines past dst
  33 #define vector_length   x9              // Register loaded by cntb with the SVE vector length in bytes
  34
  35 #if HAVE_AARCH64_SVE_ASM
  36 # if IS_IN (libc)
  37 #  define MEMSET __memset_a64fx         // Symbol name handed to ENTRY, END and libc_hidden_builtin_def
  38
  39         .arch armv8.2-a+sve             // Make the assembler accept the SVE instructions used in this file
  40
  41         .macro st1b_unroll first=0, last=7      // Emit one st1b of z0 per vector offset first..last from dst
  42         st1b    z0.b, p0, [dst, \first, mul vl] // Store at dst + first * VL under predicate p0
  43         .if \last-\first                        // Non-zero while first != last, i.e. offsets remain
  44         st1b_unroll "(\first+1)", \last         // Recurse for the next offset
  45         .endif
  46         .endm
  47
  48
  49 #undef BTI_C                    /* Discard any BTI_C definition inherited from the headers above.  */
  50 #define BTI_C                   /* Redefine it as empty; presumably removes the BTI hint from ENTRY -- confirm in sysdep.h.  */
  51
  52 ENTRY (MEMSET)                          // Fill count bytes at dstin with the low byte of valw using SVE stores
  53         PTR_ARG (0)                     // Argument fix-up macros from sysdep.h
  54         SIZE_ARG (2)                    // (presumably ILP32 pointer/size extension -- confirm there)
  55
  56         cntb    vector_length           // vector_length = SVE vector length in bytes (VL)
  57         dup     z0.b, valw              // z0 = fill byte replicated into every byte lane
  58         whilelo p0.b, vector_length, count      // p0 lane i is active iff VL + i < count
  59         b.last  1f                      // Last lane active => count >= 2 * VL and p0 is all-true
  60         whilelo p1.b, xzr, count        // count < 2 * VL: p1 lane i is active iff i < count
  61         st1b    z0.b, p1, [dstin, 0, mul vl]    // Bytes [0, min (count, VL))
  62         st1b    z0.b, p0, [dstin, 1, mul vl]    // Bytes [VL, count); nothing if count <= VL
  63         ret
  64
  65         // count >= vector_length * 2; up to 4 * VL is covered by 2 vectors from each end
  66 1:      cmp     count, vector_length, lsl 2
  67         add     dstend, dstin, count    // dstend = one past the last byte to set
  68         b.hi    1f                      // count > 4 * VL
  69         st1b    z0.b, p0, [dstin, 0, mul vl]
  70         st1b    z0.b, p0, [dstin, 1, mul vl]
  71         st1b    z0.b, p0, [dstend, -2, mul vl]  // Stores from the end may overlap the ones above
  72         st1b    z0.b, p0, [dstend, -1, mul vl]
  73         ret
  74
  75         // count > vector_length * 4; up to 8 * VL is covered by 4 vectors from each end
  76 1:      lsl     tmp1, vector_length, 3  // tmp1 = 8 * VL, still live in L(unroll8)
  77         cmp     count, tmp1
  78         b.hi    L(vl_agnostic)          // count > 8 * VL
  79         st1b    z0.b, p0, [dstin, 0, mul vl]
  80         st1b    z0.b, p0, [dstin, 1, mul vl]
  81         st1b    z0.b, p0, [dstin, 2, mul vl]
  82         st1b    z0.b, p0, [dstin, 3, mul vl]
  83         st1b    z0.b, p0, [dstend, -4, mul vl]
  84         st1b    z0.b, p0, [dstend, -3, mul vl]
  85         st1b    z0.b, p0, [dstend, -2, mul vl]
  86         st1b    z0.b, p0, [dstend, -1, mul vl]
  87         ret
  88
  89         .p2align 4
  90 L(vl_agnostic): // VL Agnostic, count > 8 * vector_length
  91         mov     dst, dstin              // dst = running store pointer; dstin itself is never written
  92         cmp     count, L1_SIZE
  93         b.hi    L(L1_prefetch)
  94
  95         // count > 8 * vector_length, tmp1 = 8 * vector_length
  96 L(unroll8):
  97         sub     count, count, tmp1      // Bias count so the loop stops with 1..8 * VL bytes left
  98         .p2align 4
  99         // The 2 instructions at the beginning of the following loop,
 100         // cmp and branch, are a workaround that avoids a performance
 101         // drop around the 16KB size.
 102         // It was found heuristically, and the branch condition, b.ne,
 103         // is chosen intentionally so that it never jumps.
 104 1:      cmp     xzr, xzr
 105         b.ne    1b
 106         st1b_unroll 0, 7                // 8 vector stores at dst
 107         add     dst, dst, tmp1
 108         subs    count, count, tmp1
 109         b.hi    1b
 110         add     count, count, tmp1      // Undo the bias: count = bytes still to set
 111
 112 L(last):                                // Tail: finish with 2, 5 or 8 vectors ending at dstend
 113         cmp     count, vector_length, lsl 1
 114         b.ls    2f                      // count <= 2 * VL
 115         add     tmp2, vector_length, vector_length, lsl 2      // tmp2 = 5 * VL
 116         cmp     count, tmp2
 117         b.ls    5f                      // count <= 5 * VL
 118         st1b    z0.b, p0, [dstend, -8, mul vl]
 119         st1b    z0.b, p0, [dstend, -7, mul vl]
 120         st1b    z0.b, p0, [dstend, -6, mul vl]
 121 5:      st1b    z0.b, p0, [dstend, -5, mul vl]
 122         st1b    z0.b, p0, [dstend, -4, mul vl]
 123         st1b    z0.b, p0, [dstend, -3, mul vl]
 124 2:      st1b    z0.b, p0, [dstend, -2, mul vl]
 125         st1b    z0.b, p0, [dstend, -1, mul vl]
 126         ret
 127
 128         // count > L1_SIZE; both paths below assume vector_length == 64, so check it first
 129         .p2align 3
 130 L(L1_prefetch):
 131         cmp     vector_length, 64
 132         b.ne    L(unroll8)              // Any other VL: L(L2) would leave bytes unset, use the generic loop
 133         cmp     count, L2_SIZE
 134         b.hs    L(L2)
 135 1:      st1b_unroll 0, 3                // 4 vectors = CACHE_LINE_SIZE bytes when VL is 64
 136         prfm    pstl1keep, [dst, PF_DIST_L1]
 137         st1b_unroll 4, 7
 138         prfm    pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
 139         add     dst, dst, CACHE_LINE_SIZE * 2
 140         sub     count, count, CACHE_LINE_SIZE * 2
 141         cmp     count, PF_DIST_L1
 142         b.hs    1b
 143         b       L(unroll8)              // Fewer than PF_DIST_L1 bytes left: finish without prefetching
 144
 145         // count >= L2_SIZE and vector_length == 64
 146         .p2align 3
 147 L(L2):
 148         tst     valw, 255
 149         b.ne    L(unroll8)              // Non-zero fill byte: DC ZVA can only write zeros
 150         // align dst to CACHE_LINE_SIZE byte boundary
 151         and     tmp2, dst, CACHE_LINE_SIZE - 1  // tmp2 = offset of dst inside its cache line
 152         st1b    z0.b, p0, [dst, 0, mul vl]      // 4 * VL = 256 bytes, enough to reach the
 153         st1b    z0.b, p0, [dst, 1, mul vl]      // next cache line boundary from any offset
 154         st1b    z0.b, p0, [dst, 2, mul vl]
 155         st1b    z0.b, p0, [dst, 3, mul vl]
 156         sub     dst, dst, tmp2          // Round dst down to the start of its cache line
 157         add     count, count, tmp2      // count is now measured from the rounded-down dst
 158
 159         // clear cachelines using DC ZVA (assumes its block size equals CACHE_LINE_SIZE)
 160         sub     count, count, CACHE_LINE_SIZE * 2       // Bias so the loop leaves 1..256 bytes for L(last)
 161         .p2align 4
 162 1:      add     dst, dst, CACHE_LINE_SIZE
 163         dc      zva, dst
 164         subs    count, count, CACHE_LINE_SIZE
 165         b.hi    1b
 166         add     count, count, CACHE_LINE_SIZE   // count = bytes left after the last zeroed line
 167         b       L(last)
 168
 169 END (MEMSET)
 170 libc_hidden_builtin_def (MEMSET)
 171
 172 #endif /* IS_IN (libc) */
 173 #endif /* HAVE_AARCH64_SVE_ASM */