sysdeps/aarch64/multiarch/memcpy_falkor.S

   1 /* Optimized memcpy for Qualcomm Falkor processor.
   2    Copyright (C) 2017-2023 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library.  If not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Assumptions:
  23
  24    ARMv8-a, AArch64, falkor, unaligned accesses.  */
  25
  26 #define dstin   x0       /* Destination start; only read, never written, below.  */
  27 #define src     x1       /* Source pointer; aligned down and advanced by the forward loops.  */
  28 #define count   x2       /* Byte count; reused as the loop counter for large copies.  */
  29 #define dst     x3       /* Working destination pointer for the forward loops.  */
  30 #define srcend  x4       /* src + count; walked downwards by L(move_long).  */
  31 #define dstend  x5       /* dstin + count; walked downwards by L(move_long).  */
  32 #define tmp1    x14      /* Scratch: count - 1, misalignment, or dstin - src.  */
  33 #define A_x     x6       /* 64-bit temporaries for the 8..15 byte case.  */
  34 #define B_x     x7
  35 #define A_w     w6       /* Low 32 bits of x6/x7, for the 1..7 byte cases.  */
  36 #define B_w     w7
  37
  38 #define A_q     q0       /* 128-bit temporaries; A_q is the only one used in L(loop64).  */
  39 #define B_q     q1
  40 #define C_q     q2
  41 #define D_q     q3
  42 #define E_q     q4
  43 #define F_q     q5
  44 #define G_q     q6
  45 #define H_q     q7
  46 #define Q_q     q6       /* Same register as G_q; used by the __memmove_falkor loops.  */
  47 #define S_q     q22      /* Second register used by the __memmove_falkor loops.  */
  48
  49 /* Copies are split into 3 main cases:
  50
  51    1. Small copies of up to 32 bytes
  52    2. Medium copies of 33..128 bytes which are fully unrolled
  53    3. Large copies of more than 128 bytes.
  54
  55    Large copies align the source to a quad word and use an unrolled loop
  56    processing 64 bytes per iteration.
  57
  58    FALKOR-SPECIFIC DESIGN:
  59
  60    The smallest copies (32 bytes or less) focus on optimal pipeline usage,
  61    which is why the redundant copies of 0-3 bytes have been replaced with
  62    conditionals, since the former would unnecessarily break across multiple
  63    issue groups.  The medium copy group has been enlarged to 128 bytes since
  64    bumping up the small copies up to 32 bytes allows us to do that without
  65    cost and also allows us to reduce the size of the prep code before loop64.
  66
  67    The copy loop uses only one register q0.  This is to ensure that all loads
  68    hit a single hardware prefetcher which can get correctly trained to prefetch
  69    a single stream.
  70
  71    All stores in this file are regular str/strh/strb instructions; no
        non-temporal stores are used.  */
  72
  73 #if IS_IN (libc)
  74 ENTRY_ALIGN (__memcpy_falkor, 6)
     /* Copy count (x2) bytes from src (x1) to dstin (x0).

        No instruction below writes dstin (x0), so it still holds the
        destination start address at every ret.  The code writes x1-x7, x14,
        q0-q7 and the condition flags.  PTR_ARG and SIZE_ARG are macros
        defined outside this file (presumably in sysdep.h); what they do to
        the argument registers is not visible here.

        L(copy32) and L(copy128) are also branched to from __memmove_falkor.
        Each of those paths issues all of its loads before its first store,
        which is what keeps them correct when the buffers overlap.  */
  75
  76         PTR_ARG (0)
  77         PTR_ARG (1)
  78         SIZE_ARG (2)
  79
  80         cmp     count, 32
  81         add     srcend, src, count       /* One past the last source byte.  */
  82         add     dstend, dstin, count     /* One past the last destination byte.  */
  83         b.ls    L(copy32)                /* count <= 32 (unsigned).  */
  84         cmp     count, 128
  85         b.hi    L(copy_long)             /* count > 128 (unsigned).  */
  86
  87         /* Medium copies: 33..128 bytes.  */
             /* The first and last 32 bytes are always copied.  For 65..128 bytes
                a further 32 bytes from each end are copied as well.  Accesses
                from the start and from the end may overlap each other.  */
  88 L(copy128):
  89         sub     tmp1, count, 1           /* Bit 6 of count - 1 is set iff count >= 65 here.  */
  90         ldr     A_q, [src]
  91         ldr     B_q, [src, 16]
  92         ldr     C_q, [srcend, -32]
  93         ldr     D_q, [srcend, -16]
  94         tbz     tmp1, 6, 1f              /* 33..64 bytes: the four loads above suffice.  */
  95         ldr     E_q, [src, 32]
  96         ldr     F_q, [src, 48]
  97         ldr     G_q, [srcend, -64]
  98         ldr     H_q, [srcend, -48]
  99         str     G_q, [dstend, -64]       /* All eight loads are done; stores start here.  */
 100         str     H_q, [dstend, -48]
 101         str     E_q, [dstin, 32]
 102         str     F_q, [dstin, 48]
 103 1:
 104         str     A_q, [dstin]
 105         str     B_q, [dstin, 16]
 106         str     C_q, [dstend, -32]
 107         str     D_q, [dstend, -16]
 108         ret
 109
 110         .p2align 4
 111         /* Small copies: 0..32 bytes.  */
             /* Each size class copies one element from the start and one from
                the end of the buffer; the two may overlap.  Because the larger
                classes have already been ruled out, a single bit of count
                identifies each remaining class.  */
 112 L(copy32):
 113         /* 16-32 */
 114         cmp     count, 16
 115         b.lo    1f
 116         ldr     A_q, [src]
 117         ldr     B_q, [srcend, -16]
 118         str     A_q, [dstin]
 119         str     B_q, [dstend, -16]
 120         ret
 121         .p2align 4
 122 1:
 123         /* 8-15 */
 124         tbz     count, 3, 1f             /* count < 16 here; bit 3 clear means count < 8.  */
 125         ldr     A_x, [src]
 126         ldr     B_x, [srcend, -8]
 127         str     A_x, [dstin]
 128         str     B_x, [dstend, -8]
 129         ret
 130         .p2align 4
 131 1:
 132         /* 4-7 */
 133         tbz     count, 2, 1f             /* count < 8 here; bit 2 clear means count < 4.  */
 134         ldr     A_w, [src]
 135         ldr     B_w, [srcend, -4]
 136         str     A_w, [dstin]
 137         str     B_w, [dstend, -4]
 138         ret
 139         .p2align 4
 140 1:
 141         /* 2-3 */
 142         tbz     count, 1, 1f             /* count < 4 here; bit 1 clear means count < 2.  */
 143         ldrh    A_w, [src]
 144         ldrh    B_w, [srcend, -2]
 145         strh    A_w, [dstin]
 146         strh    B_w, [dstend, -2]
 147         ret
 148         .p2align 4
 149 1:
 150         /* 0-1 */
 151         tbz     count, 0, 1f             /* count == 0: nothing to copy.  */
 152         ldrb    A_w, [src]
 153         strb    A_w, [dstin]
 154 1:
 155         ret
 156
 157         /* Align SRC to 16 bytes and copy; that way at least one of the
 158            accesses is aligned throughout the copy sequence.
 159
 160            The count is off by 0 to 15 bytes, but this is OK because we trim
 161            off the last 64 bytes to copy off from the end.  Due to this the
 162            loop never runs out of bounds.  */
 163
 164         .p2align 4
 165         nop             /* Align loop64 below.  */
 166 L(copy_long):
 167         ldr     A_q, [src]               /* First 16 bytes, copied unaligned.  */
 168         sub     count, count, 64 + 16
 169         and     tmp1, src, 15            /* tmp1 = misalignment of src (0..15).  */
 170         str     A_q, [dstin]
 171         bic     src, src, 15             /* Round src down to a 16-byte boundary.  */
 172         sub     dst, dstin, tmp1         /* Move dst back by the same amount as src.  */
 173         add     count, count, tmp1
 174
             /* Each iteration copies 64 bytes through A_q (q0) alone.  The
                pre-indexed loads advance src by 16 each; the final pre-indexed
                store advances dst by 64.  The flags written by subs are the ones
                tested by b.hi: the loads and stores in between do not write
                flags.  */
 175 L(loop64):
 176         ldr     A_q, [src, 16]!
 177         str     A_q, [dst, 16]
 178         ldr     A_q, [src, 16]!
 179         subs    count, count, 64
 180         str     A_q, [dst, 32]
 181         ldr     A_q, [src, 16]!
 182         str     A_q, [dst, 48]
 183         ldr     A_q, [src, 16]!
 184         str     A_q, [dst, 64]!
 185         b.hi    L(loop64)
 186
 187         /* Write the last full set of 64 bytes.  The remainder is at most 64
 188            bytes, so it is safe to always copy 64 bytes from the end even if
 189            there is just 1 byte left.  */
 190         ldr     E_q, [srcend, -64]
 191         str     E_q, [dstend, -64]
 192         ldr     D_q, [srcend, -48]
 193         str     D_q, [dstend, -48]
 194         ldr     C_q, [srcend, -32]
 195         str     C_q, [dstend, -32]
 196         ldr     B_q, [srcend, -16]
 197         str     B_q, [dstend, -16]
 198         ret
 199
 200 END (__memcpy_falkor)
 201 libc_hidden_builtin_def (__memcpy_falkor)
 202
 203
 204 /* RATIONALE:
 205
 206    The move has 4 distinct parts:
 207    * Small moves of 32 bytes and under.
 208    * Medium sized moves of 33-128 bytes (fully unrolled).
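 209    * Large moves where the destination does not start inside the source
 210      buffer, i.e. (dstin - src) >= count as unsigned values (forward copies).
 211    * Large moves where the destination starts inside the source buffer,
 212      i.e. (dstin - src) < count as unsigned values (copy backward, or move).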
 213
 214    We use only two registers q6 and q22 for the moves and move 32 bytes at a
 215    time to correctly train the hardware prefetcher for better throughput.
 216
 217    For small and medium cases the L(copy32) and L(copy128) paths of
        __memcpy_falkor are used.  */
 218
 219 ENTRY_ALIGN (__memmove_falkor, 6)
     /* Move count (x2) bytes from src (x1) to dstin (x0).

        Copies of up to 128 bytes branch into the L(copy32) and L(copy128)
        paths of __memcpy_falkor.  Larger copies run forwards or backwards
        depending on the unsigned value of (dstin - src), see below.

        No instruction in this function writes dstin (x0).  PTR_ARG and
        SIZE_ARG are macros defined outside this file; their effect on the
        argument registers is not visible here.  */
 220
 221         PTR_ARG (0)
 222         PTR_ARG (1)
 223         SIZE_ARG (2)
 224
 225         cmp     count, 32
 226         add     srcend, src, count
 227         add     dstend, dstin, count
 228         b.ls    L(copy32)                /* count <= 32.  */
 229         cmp     count, 128
 230         b.ls    L(copy128)               /* 33..128 bytes.  */
 231         sub     tmp1, dstin, src
             /* The hi condition always holds here (count > 128 and sub does not
                write flags), so ccmp performs the comparison of tmp1 with count
                rather than loading the immediate flag value.  */
 232         ccmp    tmp1, count, 2, hi
 233         b.lo    L(move_long)             /* (dstin - src) < count, unsigned.  */
 234
 235         /* CASE: Copy Forwards
 236
 237            Align src to 16 byte alignment so that we don't cross cache line
 238            boundaries on both loads and stores.  There are at least 128 bytes
 239            to copy, so copy 16 bytes unaligned and then align.  The loop
 240            copies 32 bytes per iteration and prefetches one iteration ahead.  */
 241
 242         ldr     S_q, [src]               /* First 16 bytes, loaded unaligned.  */
 243         and     tmp1, src, 15            /* tmp1 = misalignment of src (0..15).  */
 244         bic     src, src, 15             /* Round src down to a 16-byte boundary.  */
 245         sub     dst, dstin, tmp1         /* Move dst back by the same amount as src.  */
 246         add     count, count, tmp1      /* Count is now 16 too large.  */
 247         ldr     Q_q, [src, 16]!
 248         str     S_q, [dstin]
 249         ldr     S_q, [src, 16]!
 250         sub     count, count, 32 + 32 + 16      /* Test and readjust count.  */
 251
             /* On entry to each iteration Q_q and S_q hold 32 bytes that were
                loaded earlier; they are stored while the next 32 bytes are
                loaded.  The flags written by subs are the ones tested by b.hi.  */
 252         .p2align 4
 253 1:
 254         subs    count, count, 32
 255         str     Q_q, [dst, 16]
 256         ldr     Q_q, [src, 16]!
 257         str     S_q, [dst, 32]!
 258         ldr     S_q, [src, 16]!
 259         b.hi    1b
 260
 261         /* Copy 32 bytes from the end before writing the data prefetched in the
 262            last loop iteration.  */
             /* Label 2 is not the target of any branch in the code shown here.  */
 263 2:
 264         ldr     B_q, [srcend, -32]
 265         ldr     C_q, [srcend, -16]
 266         str     Q_q, [dst, 16]
 267         str     S_q, [dst, 32]
 268         str     B_q, [dstend, -32]
 269         str     C_q, [dstend, -16]
 270         ret
 271
 272         /* CASE: Copy Backwards
 273
 274            Align srcend to 16 byte alignment so that we don't cross cache line
 275            boundaries on both loads and stores.  There are at least 128 bytes
 276            to copy, so copy 16 bytes unaligned and then align.  The loop
 277            copies 32 bytes per iteration and prefetches one iteration ahead.  */
 278
 279         .p2align 4
 280         nop
 281         nop
 282 L(move_long):
 283         cbz     tmp1, 3f  /* Return early if src == dstin */
 284         ldr     S_q, [srcend, -16]       /* Last 16 bytes, loaded unaligned.  */
 285         and     tmp1, srcend, 15         /* tmp1 = misalignment of srcend (0..15).  */
 286         sub     srcend, srcend, tmp1     /* Round srcend down to a 16-byte boundary.  */
 287         ldr     Q_q, [srcend, -16]!
 288         str     S_q, [dstend, -16]       /* Uses dstend before it is adjusted below.  */
 289         sub     count, count, tmp1
 290         ldr     S_q, [srcend, -16]!
 291         sub     dstend, dstend, tmp1     /* Move dstend back by the same amount as srcend.  */
 292         sub     count, count, 32 + 32
 293
             /* Mirror image of the forward loop: srcend and dstend walk downwards
                32 bytes per iteration, storing the previously loaded Q_q/S_q
                while loading the next pair.  */
 294 1:
 295         subs    count, count, 32
 296         str     Q_q, [dstend, -16]
 297         ldr     Q_q, [srcend, -16]!
 298         str     S_q, [dstend, -32]!
 299         ldr     S_q, [srcend, -16]!
 300         b.hi    1b
 301
 302         /* Copy 32 bytes from the start before writing the data prefetched in the
 303            last loop iteration.  */
             /* src and dstin are not modified on this path, so they still point
                at the start of the two buffers.  */
 304
 305         ldr     B_q, [src, 16]
 306         ldr     C_q, [src]
 307         str     Q_q, [dstend, -16]
 308         str     S_q, [dstend, -32]
 309         str     B_q, [dstin, 16]
 310         str     C_q, [dstin]
 311 3:      ret
 312
 313 END (__memmove_falkor)
 314 libc_hidden_builtin_def (__memmove_falkor)
 315 #endif