/* A ThunderX2-optimized memcpy implementation for AArch64.
   Copyright (C) 2018-2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */
# define MEMMOVE memmove
# define MEMCPY memcpy

#undef MEMCPY
#define MEMCPY __memcpy_thunderx2
#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2
/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.  */

ENTRY_ALIGN (MEMMOVE, 6)
        add     srcend, src, count
        ccmp    tmp1, count, 2, hi
libc_hidden_builtin_def (MEMMOVE)
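/* For illustration only (not assembled): a hedged C sketch of the
   dispatch above.  The function name is hypothetical, and the real
   entry also shortcuts small moves to memcopy16, which this sketch
   omits.

   #include <stddef.h>
   #include <stdint.h>

   void *sketch_memmove (void *dstin, const void *src, size_t count)
   {
     // dst - src computed as an unsigned value: if dst is above src
     // and closer than count bytes, a forward copy would clobber
     // not-yet-read source bytes, so copy backwards.
     if ((uintptr_t) dstin - (uintptr_t) src < count)
       {
         unsigned char *d = (unsigned char *) dstin + count;
         const unsigned char *s = (const unsigned char *) src + count;
         while (count--)
           *--d = *--s;
       }
     else
       {
         unsigned char *d = dstin;
         const unsigned char *s = src;
         while (count--)
           *d++ = *s++;
       }
     return dstin;
   }  */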
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and use a load-and-merge approach when the src and dst addresses
   are not equally aligned, so that the actual loads and stores are
   always aligned.  Large copies use loops processing 64 bytes per
   iteration for the unaligned case and 128 bytes per iteration for
   the aligned one.  */
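/* In hedged C terms (names hypothetical), the size dispatch just
   described is:

   #include <stddef.h>

   enum copy_class { COPY_SMALL, COPY_MEDIUM, COPY_LARGE };

   // Classify count exactly as the comment above describes.
   static enum copy_class classify (size_t count)
   {
     if (count <= 16)
       return COPY_SMALL;    // 0..16: a few overlapping accesses
     if (count <= 96)
       return COPY_MEDIUM;   // 17..96: fully unrolled
     return COPY_LARGE;      // > 96: aligned 64/128-byte loops
   }  */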
#define MEMCPY_PREFETCH_LDR 640
        add     srcend, src, count
        add     dstend, dstin, count

        /* Medium copies: 17..96 bytes.  */
        ldr     E_q, [srcend, -16]
        b.gt    L(memcpy_copy96)
        b.le    L(bytes_17_to_48)
        str     E_q, [dstend, -16]
        stp     A_q, B_q, [dstin]
        b.gt    L(bytes_32_to_48)
        str     E_q, [dstend, -16]
        str     E_q, [dstend, -16]
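/* The medium path relies on the "overlapping ends" trick: the first
   and last 16 bytes are loaded up front, so stores may overlap in
   the middle at no correctness cost.  A hedged C model for the
   17..32-byte slice (helper name hypothetical):

   #include <string.h>
   #include <stddef.h>

   static void copy_17_to_32 (unsigned char *dst,
                              const unsigned char *src, size_t n)
   {
     unsigned char head[16], tail[16];
     memcpy (head, src, 16);           // like loading A_q from src
     memcpy (tail, src + n - 16, 16);  // like ldr E_q, [srcend, -16]
     memcpy (dst, head, 16);
     memcpy (dst + n - 16, tail, 16);  // like str E_q, [dstend, -16]
   }  */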
        /* Small copies: 0..16 bytes.  */
        ldr     A_h, [srcend, -8]
        add     dstend, dstin, count
        str     A_h, [dstend, -8]
        tbz     count, 2, L(bytes_0_to_3)
        ldr     A_hw, [srcend, -4]
        add     dstend, dstin, count
        str     A_hw, [dstend, -4]

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the
           same byte 3 times if count==1, or the 2nd byte twice if
           count==2.  */
        ldrb    A_hw, [srcend, -1]
        add     dstend, dstin, count
        ldrb    B_lw, [src, tmp1]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
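/* A hedged C model of the branchless 0..3-byte sequence (the
   first-byte load is elided in the excerpt above): with
   tmp1 = count/2, the indices 0, count/2 and count-1 cover every
   count in [1..3], duplicating writes instead of branching.

   #include <stddef.h>

   static void copy_0_to_3 (unsigned char *dst,
                            const unsigned char *src, size_t count)
   {
     if (count == 0)
       return;
     size_t mid = count >> 1;          // lsr tmp1, count, 1
     unsigned char a = src[0];
     unsigned char b = src[mid];       // ldrb B_lw, [src, tmp1]
     unsigned char c = src[count - 1]; // ldrb A_hw, [srcend, -1]
     dst[0] = a;
     dst[mid] = b;                     // strb B_lw, [dstin, tmp1]
     dst[count - 1] = c;               // strb A_hw, [dstend, -1]
   }  */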
        /* Copying 65..96 bytes.  A_q (first 16 bytes) and
           E_q (last 16 bytes) are already loaded.  The size
           is large enough to benefit from aligned loads.  */
        /* 64 bytes have been loaded; the second 16-byte chunk
           may overlap the first chunk by tmp1 bytes.  */
        add     count, count, tmp1
        /* The range of count, [65..96], becomes [65..111]
           after tmp1 [0..15] is added to it; count is now
           <bytes-left-to-load> + 48.  */
        b.gt    L(copy96_medium)
        stp     B_q, C_q, [dst, 16]
        str     E_q, [dstend, -16]
        ldp     D_q, G_q, [src, 32]
        stp     B_q, C_q, [dst, 16]
        stp     D_q, G_q, [dst, 48]
        str     E_q, [dstend, -16]
        stp     C_q, D_q, [dst, 32]
        stp     G_q, F_q, [dst, 64]
        str     E_q, [dstend, -16]
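/* The bias can be checked mechanically; a throwaway C check of the
   range claimed above:

   #include <assert.h>
   #include <stddef.h>

   int main (void)
   {
     // count in [65..96], tmp1 = src % 16 in [0..15]:
     // the biased count stays within [65..111].
     for (size_t count = 65; count <= 96; count++)
       for (size_t tmp1 = 0; tmp1 < 16; tmp1++)
         assert (count + tmp1 >= 65 && count + tmp1 <= 111);
     return 0;
   }  */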
        ldp     B_q, C_q, [src], #32
        add     count, count, tmp1
        ldp     D_q, E_q, [src], #32
        /* 64+16 bytes are already loaded.  Check whether at
           least 64 more bytes are left.  */
        subs    count, count, 64+64+16
        b.lt    L(loop128_exit0)
        cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
        cbnz    tmp1, L(dst_unaligned)
        sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp     F_q, G_q, [src], #32
        stp     B_q, C_q, [dst], #32
        ldp     H_q, I_q, [src], #32
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp     B_q, C_q, [src], #32
        stp     D_q, E_q, [dst], #32
        ldp     D_q, E_q, [src], #32
        stp     F_q, G_q, [dst], #32
        stp     H_q, I_q, [dst], #32
        subs    count, count, 128
        b.ge    L(loop128_prefetch)
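/* A hedged C analogue of the pipelined loop above, with
   __builtin_prefetch standing in for prfm pldl1strm; the asm
   additionally overlaps this iteration's stores with the next
   iteration's loads, which plain C can only approximate.

   #include <stddef.h>

   static void copy_loop128 (unsigned char *dst,
                             const unsigned char *src, size_t n)
   {
     // n is assumed to be a positive multiple of 128 here.
     while (n >= 128)
       {
         // Prefetch MEMCPY_PREFETCH_LDR = 640 bytes ahead, for
         // reading, with streaming (non-temporal) locality.
         __builtin_prefetch (src + 640, 0, 0);
         __builtin_memcpy (dst, src, 128);  // 8 q-register pairs in asm
         src += 128;
         dst += 128;
         n -= 128;
       }
   }  */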
        add     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
        ldp     F_q, G_q, [src], #32
        ldp     H_q, I_q, [src], #32
        stp     B_q, C_q, [dst], #32
        stp     D_q, E_q, [dst], #32
        subs    count, count, 64
        b.lt    L(loop128_exit1)
        ldp     B_q, C_q, [src], #32
        ldp     D_q, E_q, [src], #32
        stp     F_q, G_q, [dst], #32
        stp     H_q, I_q, [dst], #32
        subs    count, count, 64
        ldp     F_q, G_q, [srcend, -64]
        ldp     H_q, I_q, [srcend, -32]
        stp     B_q, C_q, [dst], #32
        stp     F_q, G_q, [dstend, -64]
        stp     H_q, I_q, [dstend, -32]
        ldp     B_q, C_q, [srcend, -64]
        ldp     D_q, E_q, [srcend, -32]
        stp     F_q, G_q, [dst], #32
        stp     B_q, C_q, [dstend, -64]
        stp     D_q, E_q, [dstend, -32]

L(dst_unaligned_tail):
        ldp     C_q, D_q, [srcend, -64]
        ldp     E_q, F_q, [srcend, -32]
        stp     A_q, B_q, [dst], #32
        stp     H_q, I_q, [dst], #16
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstend, -32]
L(dst_unaligned):
        /* For the unaligned store case the code loads two
           aligned chunks and then merges them using the ext
           instruction.  This can be up to 30% faster than
           the simple unaligned store access.

           Current state: tmp1 = dst % 16; C_q, D_q, E_q
           contain data yet to be stored; src and dst point
           to the next data to be processed; A_q, B_q contain
           data already stored earlier; count = bytes left to
           be loaded, decremented by 64.

           Control is passed here if at least 64 bytes are left
           to be loaded.  The code does two aligned loads and then
           extracts (16-tmp1) bytes from the first register and
           tmp1 bytes from the next register, forming the value
           for the aligned store.

           Since the ext instruction can only have its index encoded
           as an immediate, 15 code chunks process each possible
           index value.  A computed goto is used to reach the
           required chunk.  */
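/* A byte-wise C model of the merge primitive: "ext vd.16b, vn.16b,
   vm.16b, #idx" produces bytes idx..15 of the first operand followed
   by bytes 0..idx-1 of the second, so each aligned store below is
   stitched from the tail of one aligned load and the head of the
   next.  Sketch only; helper name hypothetical:

   #include <stdint.h>

   static void ext16 (uint8_t out[16], const uint8_t lo[16],
                      const uint8_t hi[16], unsigned idx)
   {
     for (unsigned i = 0; i < 16; i++)
       out[i] = (idx + i < 16) ? lo[idx + i] : hi[idx + i - 16];
   }  */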
        /* Store 16 bytes to dst and align dst for further
           operations; several bytes will be stored at this
           address twice.  */
        ldp     F_q, G_q, [src], #32
        stp     B_q, C_q, [dst], #32
        adrp    tmp2, L(ext_table)
        add     tmp2, tmp2, :lo12:L(ext_table)
        add     tmp2, tmp2, tmp1, LSL #2
        ldr     tmp3w, [tmp2]
        add     tmp2, tmp2, tmp3w, SXTW
        br      tmp2
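/* The same dispatch in hedged C, via GCC's computed-goto extension;
   the C table holds absolute label addresses where the asm table
   holds PC-relative .word offsets, but the indexing idea is the
   same (chunk bodies reduced to placeholders):

   static int dispatch (unsigned align)
   {
     static void *const chunk[16] = {
       &&c0, &&c1, &&c2, &&c3, &&c4, &&c5, &&c6, &&c7,
       &&c8, &&c9, &&c10, &&c11, &&c12, &&c13, &&c14, &&c15,
     };
     goto *chunk[align & 15];      // like "br tmp2" above
     // Each label stands in for one EXT_CHUNK body; entry 0 is
     // unused in the real table.
   c0:  return 0;   c1:  return 1;   c2:  return 2;   c3:  return 3;
   c4:  return 4;   c5:  return 5;   c6:  return 6;   c7:  return 7;
   c8:  return 8;   c9:  return 9;   c10: return 10;  c11: return 11;
   c12: return 12;  c13: return 13;  c14: return 14;  c15: return 15;
   }  */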
        /* To make the loop in each chunk 16 bytes aligned.  */

#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
        ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
        ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
        stp     A_q, B_q, [dst], #32;\
        prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
        ldp     C_q, D_q, [src], #32;\
        ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        stp     H_q, I_q, [dst], #32;\
        ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
        ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ldp     F_q, G_q, [src], #32;\
        ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
        subs    count, count, 64;\
        ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        b       L(dst_unaligned_tail);
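/* Reusing the ext16 model sketched earlier, one hedged C iteration
   of EXT_CHUNK: every load and store hits a 16-byte-aligned address,
   and each stored vector is built from two adjacent aligned source
   vectors.

   #include <string.h>
   #include <stdint.h>

   // prev carries the last aligned vector across iterations;
   // shift models the misalignment handled by this chunk.
   static void merge_store32 (uint8_t *dst, const uint8_t *src_aligned,
                              uint8_t prev[16], unsigned shift)
   {
     uint8_t c[16], d[16], a[16], b[16];
     memcpy (c, src_aligned, 16);       // aligned ldp C_q, D_q
     memcpy (d, src_aligned + 16, 16);
     ext16 (a, prev, c, 16 - shift);    // ext A_v, ..., 16-shft
     ext16 (b, c, d, 16 - shift);
     memcpy (dst, a, 16);               // aligned stp A_q, B_q
     memcpy (dst + 16, b, 16);
     memcpy (prev, d, 16);              // carry the tail forward
   }  */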
        add     srcend, src, count
        add     dstend, dstin, count
        ldr     D_q, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128

        subs    count, count, 64
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -64]!
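/* The backward loop in hedged C: srcend/dstend walk down in 64-byte
   blocks, staging each block in registers (a temporary array here)
   so the per-block copy stays safe even when the regions overlap by
   less than 64 bytes.

   #include <string.h>
   #include <stddef.h>

   static void copy_backward64 (unsigned char *dstend,
                                const unsigned char *srcend, size_t n)
   {
     unsigned char block[64];           // models the A_q..D_q staging
     while (n >= 64)
       {
         srcend -= 64;
         dstend -= 64;
         memcpy (block, srcend, 64);    // two ldp pairs
         memcpy (dstend, block, 64);    // two stp pairs
         n -= 64;
       }
     // The remainder is handled by the caller's head stores.
   }  */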
        /* Write the last full set of 64 bytes.  The remainder is at
           most 64 bytes, so it is safe to always copy 64 bytes from
           the start even if there is just 1 byte left.  */
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     G_q, H_q, [dstin]
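/* The remainder trick in hedged C: with at most 64 bytes left, one
   full 64-byte store at dstin always covers them, and any bytes it
   rewrites already held their final values, so the overlap is
   harmless.

   #include <string.h>

   // head holds the first 64 source bytes (E_q..H_q above).
   static void store_head64 (unsigned char *dstin,
                             const unsigned char head[64])
   {
     memcpy (dstin, head, 64);  // may rewrite up to 63 final bytes
   }  */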
L(ext_table):
        /* The first entry is for the alignment of 0 and is never
           actually used (could be any value).  */
        .word   0
        .word   L(ext_size_1) -.
        .word   L(ext_size_2) -.
        .word   L(ext_size_3) -.
        .word   L(ext_size_4) -.
        .word   L(ext_size_5) -.
        .word   L(ext_size_6) -.
        .word   L(ext_size_7) -.
        .word   L(ext_size_8) -.
        .word   L(ext_size_9) -.
        .word   L(ext_size_10) -.
        .word   L(ext_size_11) -.
        .word   L(ext_size_12) -.
        .word   L(ext_size_13) -.
        .word   L(ext_size_14) -.
        .word   L(ext_size_15) -.
libc_hidden_builtin_def (MEMCPY)