sysdeps/aarch64/multiarch/memcpy_thunderx.S

   1 /* A Thunderx Optimized memcpy implementation for AARCH64.
   2    Copyright (C) 2017-2022 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 /* The actual code in this memcpy and memmove should be identical to the
  21    generic version except for the code under '#ifdef THUNDERX'.  This is
  22    to make it easier to keep this version and the generic version in sync
  23    for changes that are not specific to thunderx.  */
  24
  25 #include <sysdep.h>
  26
  27 /* Assumptions:
  28  *
  29  * ARMv8-a, AArch64, unaligned accesses.
  30  *
  31  */
  32
  33 #define dstin   x0
     /* dstin (above), src and count are the three incoming arguments; the
        entry points apply PTR_ARG (0), PTR_ARG (1) and SIZE_ARG (2) to
        them.  No instruction in this file writes dstin.  */
  34 #define src     x1
  35 #define count   x2
     /* Working pointers.  dst is the 16-byte-aligned store cursor of the
        forward loops.  srcend/dstend start out as src + count and
        dstin + count; the backward loop also uses them as cursors.  */
  36 #define dst     x3
  37 #define srcend  x4
  38 #define dstend  x5
     /* Data registers: four 16-byte pairs A..D (low and high half).  The
        w-register names are the 32-bit views used by the copies of fewer
        than 8 bytes.  */
  39 #define A_l     x6
  40 #define A_lw    w6
  41 #define A_h     x7
  42 #define A_hw    w7
  43 #define B_l     x8
  44 #define B_lw    w8
  45 #define B_h     x9
  46 #define C_l     x10
  47 #define C_h     x11
  48 #define D_l     x12
  49 #define D_h     x13
     /* Extra data pairs that alias registers already in use (src, count,
        srcend, dst).  Loading into one of them destroys the aliased
        pointer or length, so they appear only in tail code, after the
        last use of the value they overwrite.  */
  50 #define E_l     src
  51 #define E_h     count
  52 #define F_l     srcend
  53 #define F_h     dst
  54 #define G_l     count
  55 #define G_h     dst
     /* Scratch register.  */
  56 #define tmp1    x14
  57
  58 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
  59    medium copies of 17..96 bytes which are fully unrolled. Large copies
  60    of more than 96 bytes align the destination and use an unrolled loop
  61    processing 64 bytes per iteration.
  62    In order to share code with memmove, small and medium copies read all
  63    data before writing, allowing any kind of overlap. So small, medium
  64    and large backwards memmoves are handled by falling through into memcpy.
  65    Overlapping large forward memmoves use a loop that copies backwards.
  66 */
  67
  68 #ifndef MEMMOVE
     /* Fallback entry-point names, used only when MEMMOVE / MEMCPY are not
        already defined.  Under IS_IN (libc) below, both are redefined
        unconditionally.  */
  69 # define MEMMOVE memmove
  70 #endif
  71 #ifndef MEMCPY
  72 # define MEMCPY memcpy
  73 #endif
  74
  75 #if IS_IN (libc)
  76
  77 #  undef MEMCPY
     /* Inside libc the two entry points are always emitted under these
        ThunderX-specific names, whatever MEMCPY / MEMMOVE were before.  */
  78 #  define MEMCPY __memcpy_thunderx
  79 #  undef MEMMOVE
  80 #  define MEMMOVE __memmove_thunderx
  81
  82 ENTRY_ALIGN (MEMMOVE, 6)
             /* memmove entry point.
                In:  dstin (x0) = destination, src (x1) = source,
                     count (x2) = length in bytes.
                Only a copy of more than 96 bytes whose destination starts
                inside [src, src + count) branches to L(move_long); every
                other case falls through into MEMCPY below.  */
  83
             /* PTR_ARG / SIZE_ARG are not defined in this file; presumably
                they come from <sysdep.h> and normalise the pointer and size
                arguments -- confirm there.  */
  84         PTR_ARG (0)
  85         PTR_ARG (1)
  86         SIZE_ARG (2)
  87
             /* tmp1 = dstin - src.  Treated as unsigned it is below count
                exactly when dstin lies in [src, src + count).  tmp1 stays
                live: L(move_long) tests it for zero (dstin == src).  */
  88         sub     tmp1, dstin, src
  89         cmp     count, 96
             /* If count > 96 (hi) compare tmp1 with count; otherwise force
                NZCV = 0b0010 (carry set) so the b.lo below is not taken.  */
  90         ccmp    tmp1, count, 2, hi
  91         b.lo    L(move_long)
  92
  93         /* Common case falls through into memcpy.  */
  94 END (MEMMOVE)
  95 libc_hidden_builtin_def (MEMMOVE)
  96 ENTRY (MEMCPY)
             /* memcpy entry point, also reached by fall-through from MEMMOVE.
                In:  dstin (x0) = destination, src (x1) = source,
                     count (x2) = length in bytes.
                Dispatch on count: 0..16 -> L(copy16), 17..64 handled inline
                below, 65..96 -> L(copy96), above 96 -> L(copy_long).
                No instruction in this file writes dstin, so x0 still holds
                the destination pointer at each ret.  */
  97
  98         PTR_ARG (0)
  99         PTR_ARG (1)
 100         SIZE_ARG (2)
 101
             /* Prefetch hint for the start of the source.  */
 102         prfm    PLDL1KEEP, [src]
             /* One-past-the-end pointers; the tails of every size class
                address the final bytes relative to these.  */
 103         add     srcend, src, count
 104         add     dstend, dstin, count
 105         cmp     count, 16
 106         b.ls    L(copy16)
 107         cmp     count, 96
 108         b.hi    L(copy_long)
 109
 110         /* Medium copies: 17..96 bytes.  */
             /* tmp1 = count - 1 is in 16..95 here: bit 6 is set for count
                65..96, and with bit 6 clear, bit 5 separates 33..64 (set)
                from 17..32 (clear).  Every load on this path is issued
                before the first store.  */
 111         sub     tmp1, count, 1
 112         ldp     A_l, A_h, [src]
 113         tbnz    tmp1, 6, L(copy96)
 114         ldp     D_l, D_h, [srcend, -16]
             /* 17..32 bytes: the first and last 16 bytes (which may overlap)
                already cover the whole range.  */
 115         tbz     tmp1, 5, 1f
             /* 33..64 bytes: also copy the second and second-to-last 16
                bytes.  */
 116         ldp     B_l, B_h, [src, 16]
 117         ldp     C_l, C_h, [srcend, -32]
 118         stp     B_l, B_h, [dstin, 16]
 119         stp     C_l, C_h, [dstend, -32]
 120 1:
 121         stp     A_l, A_h, [dstin]
 122         stp     D_l, D_h, [dstend, -16]
 123         ret
 124
 125         .p2align 4
 126         /* Small copies: 0..16 bytes.  */
             /* Each size class loads one item from the start and one from the
                end of the source (they may overlap) before storing either.  */
 127 L(copy16):
 128         cmp     count, 8
 129         b.lo    1f
             /* 8..16 bytes: first and last 8 bytes.  */
 130         ldr     A_l, [src]
 131         ldr     A_h, [srcend, -8]
 132         str     A_l, [dstin]
 133         str     A_h, [dstend, -8]
 134         ret
 135         .p2align 4
 136 1:
             /* count < 8 here.  Bit 2 set means 4..7 bytes: copy the first
                and last 4 bytes.  Bit 2 clear means 0..3 bytes.  */
 137         tbz     count, 2, 1f
 138         ldr     A_lw, [src]
 139         ldr     A_hw, [srcend, -4]
 140         str     A_lw, [dstin]
 141         str     A_hw, [dstend, -4]
 142         ret
 143
 144         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 145            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 146 1:
             /* Nothing to do for count == 0.  */
 147         cbz     count, 2f
             /* tmp1 = count / 2: 0 when count == 1, 1 when count is 2 or 3.
                The bytes copied are the first, the last and the one at
                index tmp1.  */
 148         lsr     tmp1, count, 1
 149         ldrb    A_lw, [src]
 150         ldrb    A_hw, [srcend, -1]
 151         ldrb    B_lw, [src, tmp1]
 152         strb    A_lw, [dstin]
 153         strb    B_lw, [dstin, tmp1]
 154         strb    A_hw, [dstend, -1]
 155 2:      ret
 156
 157         .p2align 4
 158         /* Copy 65..96 bytes.  Copy 64 bytes from the start and
 159            32 bytes from the end.  */
             /* Reached only when bit 6 of count - 1 is set, i.e. count >= 65.
                A_l/A_h already hold the first 16 source bytes, loaded before
                the branch here.  */
 160 L(copy96):
 161         ldp     B_l, B_h, [src, 16]
 162         ldp     C_l, C_h, [src, 32]
 163         ldp     D_l, D_h, [src, 48]
             /* E_l/E_h are src/count and F_l/F_h are srcend/dst.  The E load
                destroys src, so it comes after every src-relative load; the
                F load overwrites its own base register srcend, so it is the
                last load.  */
 164         ldp     E_l, E_h, [srcend, -32]
 165         ldp     F_l, F_h, [srcend, -16]
             /* All 96 bytes are in registers before the first store.  */
 166         stp     A_l, A_h, [dstin]
 167         stp     B_l, B_h, [dstin, 16]
 168         stp     C_l, C_h, [dstin, 32]
 169         stp     D_l, D_h, [dstin, 48]
 170         stp     E_l, E_h, [dstend, -32]
 171         stp     F_l, F_h, [dstend, -16]
 172         ret
 173
 174         /* Align DST to 16 byte alignment so that we don't cross cache line
 175            boundaries on both loads and stores.  There are at least 96 bytes
 176            to copy, so copy 16 bytes unaligned and then align.  The loop
 177            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 178
 179         .p2align 4
 180 L(copy_long):
 181
 182         /* On thunderx, large memcpy's are helped by software prefetching.
 183            This loop is identical to the one below it but with prefetching
 184            instructions included.  For loops that are less than 32768 bytes,
 185            the prefetching does not help and slows the code down so we only
 186            use the prefetching loop for the largest memcpys.  */
 187
 188         cmp     count, #32768
 189         b.lo    L(copy_long_without_prefetch)
             /* tmp1 = misalignment of dstin (0..15); dst = dstin rounded down
                to 16 bytes.  src is moved back by the same tmp1 so that
                [src, k] and [dst, k] address corresponding bytes.  The first
                16 bytes go out unaligned (D -> [dstin]) and cover the bytes
                in front of the first aligned store at dst + 16.  */
 190         and     tmp1, dstin, 15
 191         bic     dst, dstin, 15
 192         ldp     D_l, D_h, [src]
 193         sub     src, src, tmp1
 194         prfm    pldl1strm, [src, 384]
 195         add     count, count, tmp1      /* Count is now 16 too large.  */
 196         ldp     A_l, A_h, [src, 16]
 197         stp     D_l, D_h, [dstin]
 198         ldp     B_l, B_h, [src, 32]
 199         ldp     C_l, C_h, [src, 48]
 200         ldp     D_l, D_h, [src, 64]!
             /* count is at least 32768 on this path, so the result is
                positive and the loop is entered without the b.ls L(last64)
                test that the non-prefetching path needs.  */
 201         subs    count, count, 128 + 16  /* Test and readjust count.  */
 202
 203 L(prefetch_loop64):
             /* src advances by 64 per iteration, so bit 6 alternates and the
                prefetch is issued on every other iteration, i.e. once per
                128 bytes.  NOTE(review): presumably this matches the cache
                line size of the target -- confirm.  */
 204         tbz     src, #6, 1f
 205         prfm    pldl1strm, [src, 512]
 206 1:
             /* Store the 64 bytes loaded on the previous pass while loading
                the next 64; the last stp/ldp advance dst and src by 64.  */
 207         stp     A_l, A_h, [dst, 16]
 208         ldp     A_l, A_h, [src, 16]
 209         stp     B_l, B_h, [dst, 32]
 210         ldp     B_l, B_h, [src, 32]
 211         stp     C_l, C_h, [dst, 48]
 212         ldp     C_l, C_h, [src, 48]
 213         stp     D_l, D_h, [dst, 64]!
 214         ldp     D_l, D_h, [src, 64]!
 215         subs    count, count, 64
 216         b.hi    L(prefetch_loop64)
             /* A..D still hold 64 bytes not yet stored; L(last64) writes them
                and the final 64 bytes.  */
 217         b       L(last64)
 218
 219 L(copy_long_without_prefetch):
 220
             /* Copies of more than 96 but fewer than 32768 bytes.  Same
                set-up as the prefetching path above, minus the prfm:
                tmp1 = misalignment of dstin, dst = dstin rounded down to 16
                bytes, src moved back by tmp1 so [src, k] and [dst, k]
                correspond, first 16 bytes stored unaligned at dstin, and the
                next 64 bytes (for dst + 16 .. dst + 80) loaded into A..D.  */
 221         and     tmp1, dstin, 15
 222         bic     dst, dstin, 15
 223         ldp     D_l, D_h, [src]
 224         sub     src, src, tmp1
 225         add     count, count, tmp1      /* Count is now 16 too large.  */
 226         ldp     A_l, A_h, [src, 16]
 227         stp     D_l, D_h, [dstin]
 228         ldp     B_l, B_h, [src, 32]
 229         ldp     C_l, C_h, [src, 48]
 230         ldp     D_l, D_h, [src, 64]!
             /* count + tmp1 is the distance from dst to dstend.  After
                subtracting 16 + 128, count is positive only if more than 64
                bytes remain beyond the 64 already held in A..D; otherwise
                skip the loop.  */
 231         subs    count, count, 128 + 16  /* Test and readjust count.  */
 232         b.ls    L(last64)
 233 L(loop64):
             /* Store the 64 bytes loaded on the previous pass while loading
                the next 64; the last stp/ldp advance dst and src by 64.  */
 234         stp     A_l, A_h, [dst, 16]
 235         ldp     A_l, A_h, [src, 16]
 236         stp     B_l, B_h, [dst, 32]
 237         ldp     B_l, B_h, [src, 32]
 238         stp     C_l, C_h, [dst, 48]
 239         ldp     C_l, C_h, [src, 48]
 240         stp     D_l, D_h, [dst, 64]!
 241         ldp     D_l, D_h, [src, 64]!
 242         subs    count, count, 64
 243         b.hi    L(loop64)
             /* Falls through into L(last64) with 64 bytes pending in A..D.  */
 244
 244
 245         /* Write the last full set of 64 bytes.  The remainder is at most 64
 246            bytes, so it is safe to always copy 64 bytes from the end even if
 247            there is just 1 byte left.  */
             /* On entry A..D hold the 64 bytes destined for dst + 16 ..
                dst + 80.  Each pending pair is stored and the register is
                then reused for part of the final 64 source bytes, addressed
                from srcend.  E_l/E_h alias src/count, which are not used
                again after this point.  */
 248 L(last64):
 249         ldp     E_l, E_h, [srcend, -64]
 250         stp     A_l, A_h, [dst, 16]
 251         ldp     A_l, A_h, [srcend, -48]
 252         stp     B_l, B_h, [dst, 32]
 253         ldp     B_l, B_h, [srcend, -32]
 254         stp     C_l, C_h, [dst, 48]
 255         ldp     C_l, C_h, [srcend, -16]
 256         stp     D_l, D_h, [dst, 64]
             /* Final 64 bytes, addressed from dstend; they may overlap the
                stores just made.  */
 257         stp     E_l, E_h, [dstend, -64]
 258         stp     A_l, A_h, [dstend, -48]
 259         stp     B_l, B_h, [dstend, -32]
 260         stp     C_l, C_h, [dstend, -16]
 261         ret
 262
 263         .p2align 4
 264 L(move_long):
             /* Backward copy for memmove, entered only from the MEMMOVE entry
                when count > 96 and dstin lies in [src, src + count).  tmp1
                still holds dstin - src; zero means source and destination
                coincide, so return at once.  */
 265         cbz     tmp1, 3f
 266
 267         add     srcend, src, count
 268         add     dstend, dstin, count
 269
 270         /* Align dstend to 16 byte alignment so that we don't cross cache line
 271            boundaries on both loads and stores.  There are at least 96 bytes
 272            to copy, so copy 16 bytes unaligned and then align.  The loop
 273            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 274
             /* tmp1 = misalignment of dstend (0..15).  The last 16 bytes are
                copied unaligned; srcend, dstend and count are then all
                reduced by tmp1, which leaves dstend 16-byte aligned.  A..D
                are preloaded with the 64 source bytes below the adjusted
                srcend.  */
 275         and     tmp1, dstend, 15
 276         ldp     D_l, D_h, [srcend, -16]
 277         sub     srcend, srcend, tmp1
 278         sub     count, count, tmp1
 279         ldp     A_l, A_h, [srcend, -16]
 280         stp     D_l, D_h, [dstend, -16]
 281         ldp     B_l, B_h, [srcend, -32]
 282         ldp     C_l, C_h, [srcend, -48]
 283         ldp     D_l, D_h, [srcend, -64]!
 284         sub     dstend, dstend, tmp1
             /* count is positive afterwards only if more than 64 bytes remain
                below the 64 already held in A..D; otherwise skip the loop.  */
 285         subs    count, count, 128
 286         b.ls    2f
 287
             /* NOTE(review): presumably padding so that the loop head at 1:
                is aligned -- confirm.  */
 288         nop
 289 1:
             /* Store the 64 bytes loaded on the previous pass while loading
                the next 64 below them; the last stp/ldp move dstend and
                srcend down by 64.  */
 290         stp     A_l, A_h, [dstend, -16]
 291         ldp     A_l, A_h, [srcend, -16]
 292         stp     B_l, B_h, [dstend, -32]
 293         ldp     B_l, B_h, [srcend, -32]
 294         stp     C_l, C_h, [dstend, -48]
 295         ldp     C_l, C_h, [srcend, -48]
 296         stp     D_l, D_h, [dstend, -64]!
 297         ldp     D_l, D_h, [srcend, -64]!
 298         subs    count, count, 64
 299         b.hi    1b
 300
 301         /* Write the last full set of 64 bytes.  The remainder is at most 64
 302            bytes, so it is safe to always copy 64 bytes from the start even if
 303            there is just 1 byte left.  */
 304 2:
             /* Each pending pair is stored and its register is then reused
                for part of the first 64 source bytes.  G_l/G_h alias
                count/dst, which are not used again after this point.
                NOTE(review): source and destination overlap on this path, so
                the interleaving of these loads and stores looks deliberate;
                do not reorder without re-checking.  */
 305         ldp     G_l, G_h, [src, 48]
 306         stp     A_l, A_h, [dstend, -16]
 307         ldp     A_l, A_h, [src, 32]
 308         stp     B_l, B_h, [dstend, -32]
 309         ldp     B_l, B_h, [src, 16]
 310         stp     C_l, C_h, [dstend, -48]
 311         ldp     C_l, C_h, [src]
 312         stp     D_l, D_h, [dstend, -64]
             /* First 64 bytes of the destination, addressed from dstin.  */
 313         stp     G_l, G_h, [dstin, 48]
 314         stp     A_l, A_h, [dstin, 32]
 315         stp     B_l, B_h, [dstin, 16]
 316         stp     C_l, C_h, [dstin]
 317 3:      ret
 318
 319 END (MEMCPY)
 320 libc_hidden_builtin_def (MEMCPY)
 321
 322 #endif