sysdeps/aarch64/multiarch/memcpy_thunderx2.S

   1 /* A Thunderx2 Optimized memcpy implementation for AARCH64.
   2    Copyright (C) 2018 Free Software Foundation, Inc.
   3
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Assumptions:
  23  *
  24  * ARMv8-a, AArch64, unaligned accesses.
  25  *
  26  */
  27
     /* Register aliases.  dstin, src and count are x0, x1 and x2, the
        registers carrying the three incoming arguments; dst, srcend and
        dstend are working pointers.  */
  28 #define dstin   x0
  29 #define src     x1
  30 #define count   x2
  31 #define dst     x3
  32 #define srcend  x4
  33 #define dstend  x5
     /* tmp2/tmp3 name the same registers (x6/x7) as A_l/A_h below.  */
  34 #define tmp2    x6
  35 #define tmp3    x7
  36 #define tmp3w   w7
     /* 64-bit data pairs; the *w names are the 32-bit views of the same
        registers.  */
  37 #define A_l     x6
  38 #define A_lw    w6
  39 #define A_h     x7
  40 #define A_hw    w7
  41 #define B_l     x8
  42 #define B_lw    w8
  43 #define B_h     x9
  44 #define C_l     x10
  45 #define C_h     x11
  46 #define D_l     x12
  47 #define D_h     x13
     /* E, F and G reuse the pointer/count registers, so loading one of
        these pairs destroys the aliased pointer or count.  */
  48 #define E_l     src
  49 #define E_h     count
  50 #define F_l     srcend
  51 #define F_h     dst
  52 #define G_l     count
  53 #define G_h     dst
  54 #define tmp1    x14
  55
     /* 128-bit SIMD registers.  The q names are used by the loads and
        stores, the v names (same registers) by the ext/mov instructions.  */
  56 #define A_q     q0
  57 #define B_q     q1
  58 #define C_q     q2
  59 #define D_q     q3
  60 #define E_q     q4
  61 #define F_q     q5
  62 #define G_q     q6
  63 #define H_q     q7
  64 #define I_q     q16
  65 #define J_q     q17
  66
  67 #define A_v     v0
  68 #define B_v     v1
  69 #define C_v     v2
  70 #define D_v     v3
  71 #define E_v     v4
  72 #define F_v     v5
  73 #define G_v     v6
  74 #define H_v     v7
  75 #define I_v     v16
  76 #define J_v     v17
  77
     /* Default entry-point names, used only if the includer has not
        already defined them.  */
  78 #ifndef MEMMOVE
  79 # define MEMMOVE memmove
  80 #endif
  81 #ifndef MEMCPY
  82 # define MEMCPY memcpy
  83 #endif
  84
     /* Everything from here to the final #endif is assembled only when
        IS_IN (libc) holds.  */
  85 #if IS_IN (libc)
  86
     /* Inside libc the names chosen above are discarded unconditionally
        and replaced by the ThunderX2-specific symbols.  */
  87 #undef MEMCPY
  88 #undef MEMMOVE
  89 #define MEMCPY __memcpy_thunderx2
  90 #define MEMMOVE __memmove_thunderx2
  91
  92
  93 /* Moves are split into 3 main cases: small copies of up to 16 bytes,
  94    medium copies of 17..96 bytes which are fully unrolled. Large copies
  95    of more than 96 bytes align the destination and use an unrolled loop
  96    processing 64 bytes per iteration.
  97    Overlapping large forward memmoves use a loop that copies backwards.
  98 */
  99
     /* MEMMOVE (dstin, src, count).  dstin (x0) is never written in this
        function, so it still holds the destination pointer at every ret.  */
 100 ENTRY_ALIGN (MEMMOVE, 6)
 101
             /* DELOUSE is not defined in this file.  NOTE(review): presumably
                it sanitizes argument registers 0..2 (e.g. for ILP32) -
                confirm against sysdep.h.  */
 102         DELOUSE (0)
 103         DELOUSE (1)
 104         DELOUSE (2)
 105
             /* tmp1 = dstin - src.  When count > 96 the ccmp compares tmp1
                with count; otherwise it forces NZCV to 2 (carry set) so that
                b.lo is not taken.  L(move_long) is therefore reached only for
                count > 96 with dstin - src < count (unsigned), i.e. when dstin
                lies in [src, src + count).  */
 106         sub     tmp1, dstin, src
 107         cmp     count, 96
 108         ccmp    tmp1, count, 2, hi
 109         b.lo    L(move_long)
 110
             /* Prefetch hint for the start of the source, then compute the
                end pointers and dispatch on the size.  */
 111         prfm    PLDL1KEEP, [src]
 112         add     srcend, src, count
 113         add     dstend, dstin, count
 114         cmp     count, 16
 115         b.ls    L(copy16)
 116         cmp     count, 96
 117         b.hi    L(copy_long)
 118
 119         /* Medium copies: 17..96 bytes.  */
             /* tmp1 = count - 1 is in 16..95: bit 6 set means count is 65..96,
                bit 5 set (bit 6 clear) means count is 33..64, neither means
                17..32.  On every path below all loads are issued before the
                first store.  */
 120         sub     tmp1, count, 1
 121         ldp     A_l, A_h, [src]
 122         tbnz    tmp1, 6, L(copy96)
 123         ldp     D_l, D_h, [srcend, -16]
 124         tbz     tmp1, 5, 1f
             /* 33..64 bytes: additionally copy bytes 16..31 from the start
                and the 16 bytes before the last 16.  */
 125         ldp     B_l, B_h, [src, 16]
 126         ldp     C_l, C_h, [srcend, -32]
 127         stp     B_l, B_h, [dstin, 16]
 128         stp     C_l, C_h, [dstend, -32]
 129 1:
             /* First and last 16 bytes; the two stores may overlap.  */
 130         stp     A_l, A_h, [dstin]
 131         stp     D_l, D_h, [dstend, -16]
 132         ret
 133
 134         .p2align 4
 135         /* Small copies: 0..16 bytes.  */
 136 L(copy16):
             /* 8..16 bytes: copy the first and the last 8 bytes; the two
                accesses overlap when count < 16.  Both loads precede both
                stores.  */
 137         cmp     count, 8
 138         b.lo    1f
 139         ldr     A_l, [src]
 140         ldr     A_h, [srcend, -8]
 141         str     A_l, [dstin]
 142         str     A_h, [dstend, -8]
 143         ret
 144         .p2align 4
 145 1:
             /* count < 8 here, so bit 2 set means 4..7 bytes: copy the first
                and the last 4 bytes.  */
 146         tbz     count, 2, 1f
 147         ldr     A_lw, [src]
 148         ldr     A_hw, [srcend, -4]
 149         str     A_lw, [dstin]
 150         str     A_hw, [dstend, -4]
 151         ret
 152
 153         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 154            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 155 1:
             /* Nothing to do for count == 0.  Otherwise tmp1 = count / 2
                selects the middle byte; first, middle and last byte are
                loaded before any of them is stored.  */
 156         cbz     count, 2f
 157         lsr     tmp1, count, 1
 158         ldrb    A_lw, [src]
 159         ldrb    A_hw, [srcend, -1]
 160         ldrb    B_lw, [src, tmp1]
 161         strb    A_lw, [dstin]
 162         strb    B_lw, [dstin, tmp1]
 163         strb    A_hw, [dstend, -1]
 164 2:      ret
 165
 166         .p2align 4
 167         /* Copy 65..96 bytes.  Copy 64 bytes from the start and
 168            32 bytes from the end.  */
 169 L(copy96):
             /* A_l/A_h already hold the first 16 bytes (loaded before the
                branch to this label).  */
 170         ldp     B_l, B_h, [src, 16]
 171         ldp     C_l, C_h, [src, 32]
 172         ldp     D_l, D_h, [src, 48]
             /* E and F alias src/count and srcend/dst, so these two loads
                come last: they overwrite the source pointers, which are not
                used again.  */
 173         ldp     E_l, E_h, [srcend, -32]
 174         ldp     F_l, F_h, [srcend, -16]
             /* All loads are complete before the first store.  */
 175         stp     A_l, A_h, [dstin]
 176         stp     B_l, B_h, [dstin, 16]
 177         stp     C_l, C_h, [dstin, 32]
 178         stp     D_l, D_h, [dstin, 48]
 179         stp     E_l, E_h, [dstend, -32]
 180         stp     F_l, F_h, [dstend, -16]
 181         ret
 182
 183         /* Align DST to 16 byte alignment so that we don't cross cache line
 184            boundaries on both loads and stores.  There are more than 96 bytes
 185            to copy, so copy 16 bytes unaligned and then align.  The loop
 186            copies 64 bytes per iteration and loads one iteration ahead.  */
 187
 188         .p2align 4
 189 L(copy_long):
             /* tmp1 = dstin & 15 and dst = dstin rounded down to 16 bytes.
                src is moved back by the same tmp1, so [src, N] and [dst, N]
                address corresponding bytes from here on.  The first 16 bytes
                are copied unaligned via D_l/D_h.  */
 190         and     tmp1, dstin, 15
 191         bic     dst, dstin, 15
 192         ldp     D_l, D_h, [src]
 193         sub     src, src, tmp1
 194         add     count, count, tmp1      /* Count is now 16 too large.  */
 195         ldp     A_l, A_h, [src, 16]
 196         stp     D_l, D_h, [dstin]
 197         ldp     B_l, B_h, [src, 32]
 198         ldp     C_l, C_h, [src, 48]
 199         ldp     D_l, D_h, [src, 64]!
 200         subs    count, count, 128 + 16  /* Test and readjust count.  */
             /* count + 64 is now the number of bytes beyond the 64 held in
                A..D; skip the loop when at most 64 remain.  */
 201         b.ls    L(last64)
 202 L(loop64):
             /* Store the 64 bytes loaded previously while loading the next
                64; dst and src advance by 64 through the writeback forms.  */
 203         stp     A_l, A_h, [dst, 16]
 204         ldp     A_l, A_h, [src, 16]
 205         stp     B_l, B_h, [dst, 32]
 206         ldp     B_l, B_h, [src, 32]
 207         stp     C_l, C_h, [dst, 48]
 208         ldp     C_l, C_h, [src, 48]
 209         stp     D_l, D_h, [dst, 64]!
 210         ldp     D_l, D_h, [src, 64]!
 211         subs    count, count, 64
 212         b.hi    L(loop64)
 213
 214         /* Write the last full set of 64 bytes.  The remainder is at most 64
 215            bytes, so it is safe to always copy 64 bytes from the end even if
 216            there is just 1 byte left.  */
 217 L(last64):
             /* E_l/E_h alias src/count, which are no longer needed here.  */
 218         ldp     E_l, E_h, [srcend, -64]
 219         stp     A_l, A_h, [dst, 16]
 220         ldp     A_l, A_h, [srcend, -48]
 221         stp     B_l, B_h, [dst, 32]
 222         ldp     B_l, B_h, [srcend, -32]
 223         stp     C_l, C_h, [dst, 48]
 224         ldp     C_l, C_h, [srcend, -16]
 225         stp     D_l, D_h, [dst, 64]
 226         stp     E_l, E_h, [dstend, -64]
 227         stp     A_l, A_h, [dstend, -48]
 228         stp     B_l, B_h, [dstend, -32]
 229         stp     C_l, C_h, [dstend, -16]
 230         ret
 231
 232         .p2align 4
 233 L(move_long):
             /* Backward copy, reached for count > 96 when dstin lies in
                [src, src + count).  tmp1 still holds dstin - src from the
                entry sequence; zero means source and destination coincide,
                so there is nothing to copy.  */
 234         cbz     tmp1, 3f
 235
 236         add     srcend, src, count
 237         add     dstend, dstin, count
 238
 239         /* Align dstend to 16 byte alignment so that we don't cross cache line
 240            boundaries on both loads and stores.  There are more than 96 bytes
 241            to copy, so copy 16 bytes unaligned and then align.  The loop
 242            copies 64 bytes per iteration and loads one iteration ahead.  */
 243
             /* tmp1 = dstend & 15.  The last 16 bytes are copied unaligned via
                D_l/D_h; srcend, dstend and count are then reduced by tmp1 so
                that dstend is 16-byte aligned.  */
 244         and     tmp1, dstend, 15
 245         ldp     D_l, D_h, [srcend, -16]
 246         sub     srcend, srcend, tmp1
 247         sub     count, count, tmp1
 248         ldp     A_l, A_h, [srcend, -16]
 249         stp     D_l, D_h, [dstend, -16]
 250         ldp     B_l, B_h, [srcend, -32]
 251         ldp     C_l, C_h, [srcend, -48]
 252         ldp     D_l, D_h, [srcend, -64]!
 253         sub     dstend, dstend, tmp1
 254         subs    count, count, 128
             /* count + 64 is now the number of bytes below the 64 held in
                A..D; skip the loop when at most 64 remain.  */
 255         b.ls    2f
 256
             /* NOTE(review): presumably padding so that the loop at 1: starts
                on an aligned address - confirm.  */
 257         nop
 258 1:
             /* Store the 64 bytes loaded previously while loading the next
                (lower) 64; both end pointers move down by 64 per iteration.  */
 259         stp     A_l, A_h, [dstend, -16]
 260         ldp     A_l, A_h, [srcend, -16]
 261         stp     B_l, B_h, [dstend, -32]
 262         ldp     B_l, B_h, [srcend, -32]
 263         stp     C_l, C_h, [dstend, -48]
 264         ldp     C_l, C_h, [srcend, -48]
 265         stp     D_l, D_h, [dstend, -64]!
 266         ldp     D_l, D_h, [srcend, -64]!
 267         subs    count, count, 64
 268         b.hi    1b
 269
 270         /* Write the last full set of 64 bytes.  The remainder is at most 64
 271            bytes, so it is safe to always copy 64 bytes from the start even if
 272            there is just 1 byte left.  */
 273 2:
             /* G_l/G_h alias count/dst, which are no longer needed here.  */
 274         ldp     G_l, G_h, [src, 48]
 275         stp     A_l, A_h, [dstend, -16]
 276         ldp     A_l, A_h, [src, 32]
 277         stp     B_l, B_h, [dstend, -32]
 278         ldp     B_l, B_h, [src, 16]
 279         stp     C_l, C_h, [dstend, -48]
 280         ldp     C_l, C_h, [src]
 281         stp     D_l, D_h, [dstend, -64]
 282         stp     G_l, G_h, [dstin, 48]
 283         stp     A_l, A_h, [dstin, 32]
 284         stp     B_l, B_h, [dstin, 16]
 285         stp     C_l, C_h, [dstin]
 286 3:      ret
 287
 288 END (MEMMOVE)
 289 libc_hidden_builtin_def (MEMMOVE)
 290
 291
 292 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
 293    medium copies of 17..96 bytes which are fully unrolled. Large copies
 294    of more than 96 bytes align the destination and use a load-and-merge
 295    approach when the src and dst addresses are not equally aligned,
 296    so that loads and stores are always aligned.
 297    Large copies use an unrolled loop processing 64 bytes per iteration.
 298    The current optimized memcpy implementation is not compatible with
 299    memmove and is separated from it completely.
 300
 301    The memcpy implementation below is not compatible with memmove
 302    because of its pipelined loads/stores, which are faster but
 303    cannot be used when the memmove arrays overlap.  */
 304
     /* Distance in bytes ahead of src used by the prfm instructions in the
        large-copy loops; also part of the size threshold that selects the
        prefetching loops.  */
 305 #define MEMCPY_PREFETCH_LDR 640
 306
     /* MEMCPY (dstin, src, count).  dstin (x0) is never written in this
        function, so it still holds the destination pointer at every ret.  */
 307 ENTRY (MEMCPY)
 308         DELOUSE (0)
 309         DELOUSE (1)
 310         DELOUSE (2)
 311
 312         add     srcend, src, count
 313         cmp     count, 16
 314         b.ls    L(memcopy16)
             /* Load the first 16 bytes and post-increment src by 16.
                tmp1 = src & 15 is the source misalignment, which the
                increment by 16 does not change.  */
 315         ldr     A_q, [src], #16
 316         add     dstend, dstin, count
 317         and     tmp1, src, 15
 318         cmp     count, 96
 319         b.hi    L(memcopy_long)
 320
 321         /* Medium copies: 17..96 bytes.  */
             /* E_q holds the last 16 bytes on all medium paths.  */
 322         ldr     E_q, [srcend, -16]
 323         cmp     count, 64
 324         b.gt    L(memcpy_copy96)
 325         cmp     count, 48
 326         b.le    L(bytes_17_to_48)
 327         /* 49..64 bytes */
             /* B_q, C_q are bytes 16..47 (src already points 16 bytes in);
                together with A_q they cover the first 48 bytes and E_q
                covers the tail.  */
 328         ldp     B_q, C_q, [src]
 329         str     E_q, [dstend, -16]
 330         stp     A_q, B_q, [dstin]
 331         str     C_q, [dstin, 32]
 332         ret
 333
 334 L(bytes_17_to_48):
 335         /* 17..48 bytes.  */
 336         cmp     count, 32
 337         b.gt    L(bytes_32_to_48)
 338         /* 17..32 bytes: first and last 16 bytes, possibly overlapping.  */
 339         str     A_q, [dstin]
 340         str     E_q, [dstend, -16]
 341         ret
 342
 343 L(bytes_32_to_48):
 344         /* 33..48 bytes: first 32 bytes plus the last 16.  */
 345         ldr     B_q, [src]
 346         str     A_q, [dstin]
 347         str     E_q, [dstend, -16]
 348         str     B_q, [dstin, 16]
 349         ret
 350
 351         .p2align 4
 352         /* Small copies: 0..16 bytes.  */
 353 L(memcopy16):
             /* dstend has not been computed on this path yet; each case below
                derives it just before the final store.
                8..16 bytes: copy the first and the last 8 bytes, which
                overlap when count < 16.  */
 354         cmp     count, 8
 355         b.lo    L(bytes_0_to_8)
 356         ldr     A_l, [src]
 357         ldr     A_h, [srcend, -8]
 358         add     dstend, dstin, count
 359         str     A_l, [dstin]
 360         str     A_h, [dstend, -8]
 361         ret
 362         .p2align 4
 363
 364 L(bytes_0_to_8):
             /* count < 8 here, so bit 2 set means 4..7 bytes: copy the first
                and the last 4 bytes.  */
 365         tbz     count, 2, L(bytes_0_to_3)
 366         ldr     A_lw, [src]
 367         ldr     A_hw, [srcend, -4]
 368         add     dstend, dstin, count
 369         str     A_lw, [dstin]
 370         str     A_hw, [dstend, -4]
 371         ret
 372
 373         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 374            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 375 L(bytes_0_to_3):
             /* Nothing to do for count == 0.  Otherwise tmp1 = count / 2
                selects the middle byte.  */
 376         cbz     count, L(end)
 377         lsr     tmp1, count, 1
 378         ldrb    A_lw, [src]
 379         ldrb    A_hw, [srcend, -1]
 380         add     dstend, dstin, count
 381         ldrb    B_lw, [src, tmp1]
 382         strb    A_lw, [dstin]
 383         strb    B_lw, [dstin, tmp1]
 384         strb    A_hw, [dstend, -1]
 385 L(end): ret
 386
 387         .p2align 4
 388
 389 L(memcpy_copy96):
 390         /* Copying 65..96 bytes. A_q (first 16 bytes) and
 391            E_q(last 16 bytes) are already loaded.
 392
 393            The size is large enough to benefit from aligned
 394            loads */
             /* src was already advanced by 16 at entry; rounding it down to a
                16-byte boundary moves it back by tmp1 (= src & 15).  */
 395         bic     src, src, 15
 396         ldp     B_q, C_q, [src]
 397         str     A_q, [dstin]
 398         /* Loaded 64 bytes, second 16-bytes chunk can be
 399            overlapping with the first chunk by tmp1 bytes.
 400            Stored 16 bytes. */
             /* dst is moved back by the same tmp1, so [dst, 16] is the
                destination of the chunk at [src].  After the add below
                dst + count equals dstend.  */
 401         sub     dst, dstin, tmp1
 402         add     count, count, tmp1
 403         /* The range of count being [65..96] becomes [65..111]
 404            after tmp1 [0..15] gets added to it,
 405            count now is <bytes-left-to-load>+48 */
 406         cmp     count, 80
 407         b.gt    L(copy96_medium)
             /* count <= 80: three aligned chunks reach dst + 64 and E_q
                (stored at dstend - 16) covers the rest.  */
 408         ldr     D_q, [src, 32]
 409         stp     B_q, C_q, [dst, 16]
 410         str     E_q, [dstend, -16]
 411         str     D_q, [dst, 48]
 412         ret
 413
 414         .p2align 4
 415 L(copy96_medium):
             /* count > 80: load two more aligned chunks (A_q is reused; its
                first-16-bytes value has already been stored).  */
 416         ldp     D_q, A_q, [src, 32]
 417         str     B_q, [dst, 16]
 418         cmp     count, 96
 419         b.gt    L(copy96_large)
             /* count is 81..96: four aligned chunks reach dst + 80 and E_q
                covers the rest.  */
 420         str     E_q, [dstend, -16]
 421         stp     C_q, D_q, [dst, 32]
 422         str     A_q, [dst, 64]
 423         ret
 424
 425 L(copy96_large):
             /* count is 97..111: a fifth aligned chunk reaches dst + 96 and
                E_q covers the rest.  */
 426         ldr     F_q, [src, 64]
 427         stp     C_q, D_q, [dst, 32]
 428         str     E_q, [dstend, -16]
 429         stp     A_q, F_q, [dst, 64]
 430         ret
 431
 432         .p2align 4
 433 L(memcopy_long):
             /* Copies of more than 96 bytes.  A_q holds the first 16 bytes and
                src already points 16 bytes in.  Round src down to a 16-byte
                boundary (moving it back by tmp1 = src & 15) and position dst
                at the byte matching the aligned src, so both advance in
                lockstep.  tmp1 is then reused for dst & 15, the destination
                misalignment while the loads are aligned.  */
 434         bic     src, src, 15
 435         ldp     B_q, C_q, [src], #32
 436         str     A_q, [dstin]
 437         sub     dst, dstin, tmp1
 438         add     count, count, tmp1
 439         add     dst, dst, 16
 440         and     tmp1, dst, 15
 441         ldp     D_q, E_q, [src], #32
 442         str     B_q, [dst], #16
 443
 444         /* Already loaded 64+16 bytes. Check if at
 445            least 64 more bytes left */
             /* Dispatch on the adjusted count:
                - negative: go to L(loop128_exit2);
                - below MEMCPY_PREFETCH_LDR + 64 + 32: L(loop128), which
                  issues no prefetches;
                - otherwise, with a misaligned dst (tmp1 != 0):
                  L(dst_unaligned);
                - otherwise fall into L(loop128_prefetch) with count biased
                  down by MEMCPY_PREFETCH_LDR + 64 + 32.  */
 446         subs    count, count, 64+64+16
 447         b.lt    L(loop128_exit2)
 448         cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
 449         b.lt    L(loop128)
 450         cbnz    tmp1, L(dst_unaligned)
 451         sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
 452
 453         .p2align 4
 454
 455 L(loop128_prefetch):
             /* Each iteration loads 128 bytes, stores 128 bytes and issues two
                prefetches MEMCPY_PREFETCH_LDR bytes ahead of src.  C_q, D_q
                and E_q are loaded but not yet stored at the start of every
                iteration.  */
 456         str     C_q, [dst], #16
 457         prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
 458         str     D_q, [dst], #16
 459         ldp     F_q, G_q, [src], #32
 460         str     E_q, [dst], #16
 461         ldp     H_q, A_q, [src], #32
 462         str     F_q, [dst], #16
 463         prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
 464         str     G_q, [dst], #16
 465         ldp     B_q, C_q, [src], #32
 466         str     H_q, [dst], #16
 467         ldp     D_q, E_q, [src], #32
 468         stp     A_q, B_q, [dst], #32
 469         subs    count, count, 128
 470         b.ge    L(loop128_prefetch)
 471
 472 L(preloop128):
             /* Undo the bias applied before L(loop128_prefetch).  */
 473         add     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
 474         .p2align 4
 475 L(loop128):
             /* First half: load 64 bytes and store the pending C_q, D_q, E_q
                plus F_q, G_q, B_q; A_q is left pending.  */
 476         ldp     F_q, G_q, [src], #32
 477         str     C_q, [dst], #16
 478         ldp     B_q, A_q, [src], #32
 479         str     D_q, [dst], #16
 480         stp     E_q, F_q, [dst], #32
 481         stp     G_q, B_q, [dst], #32
 482         subs    count, count, 64
 483         b.lt    L(loop128_exit1)
 484 L(loop128_proceed):
             /* Second half: load 64 bytes and store A_q and B_q; C_q, D_q and
                E_q are pending again, as at the loop entry.  */
 485         ldp     B_q, C_q, [src], #32
 486         str     A_q, [dst], #16
 487         ldp     D_q, E_q, [src], #32
 488         str     B_q, [dst], #16
 489         subs    count, count, 64
 490         b.ge    L(loop128)
 491
 492         .p2align 4
 493 L(loop128_exit2):
             /* Flush the pending C_q, D_q and E_q.  */
 494         stp     C_q, D_q, [dst], #32
 495         str     E_q, [dst], #16
 496         b       L(copy_long_check32);
 497
 498 L(loop128_exit1):
 499         /* A_q is still not stored and 0..63 bytes left,
 500            so, count is -64..-1.
 501            Check if less than 32 bytes left (count < -32) */
 502         str     A_q, [dst], #16
 503 L(copy_long_check32):
             /* count == -64: nothing is left.  count <= -32: at most 32 bytes
                are left, covered by the final 32 bytes taken from the end.
                Otherwise copy 32 more bytes at src/dst first.  */
 504         cmn     count, 64
 505         b.eq    L(copy_long_done)
 506         cmn     count, 32
 507         b.le    L(copy_long_last32)
 508         ldp     B_q, C_q, [src]
 509         stp     B_q, C_q, [dst]
 510
 511 L(copy_long_last32):
             /* Last 32 bytes, addressed from the end pointers; may overlap
                bytes already written.  */
 512         ldp     F_q, G_q, [srcend, -32]
 513         stp     F_q, G_q, [dstend, -32]
 514
 515 L(copy_long_done):
 516         ret
 517
 518 L(dst_unaligned):
 519         /* For the unaligned store case the code loads two
 520            aligned chunks and then merges them using the ext
 521            instruction. This can be up to 30% faster than
 522            the simple unaligned store access.
 523
 524            Current state: tmp1 = dst % 16; C_q, D_q, E_q
 525            contain data yet to be stored. src and dst point
 526            to next-to-be-processed data. A_q, B_q contain
 527            data already stored before, count = bytes left to
 528            be loaded decremented by 64.
 529
 530            The control is passed here if at least 64 bytes left
 531            to be loaded. The code does two aligned loads and then
 532            extracts the top tmp1 bytes of the first register and
 533            the low (16-tmp1) bytes of the next register forming
 534            the value for the aligned store.
 535
 536            Since the ext instruction can only have its index
 537            encoded as an immediate, 15 code chunks process each
 538            possible index value. Computed goto is used to reach
 539            the required code. */
 540
 541         /* Store the 16 bytes to dst and align dst for further
 542            operations, several bytes will be stored at this
 543            address once more */
             /* After the post-increment and the bic, dst is 16-byte aligned
                and points tmp1 bytes before the end of the C_q store, so the
                first aligned store rewrites those tmp1 bytes.  */
 544         str     C_q, [dst], #16
 545         ldp     F_q, G_q, [src], #32
 546         bic     dst, dst, 15
             /* tmp2 = &ext_table[tmp1] (4-byte entries).  Each entry holds the
                distance from itself to the matching L(ext_size_N) label, so
                adding the sign-extended entry to its own address yields the
                branch target.  */
 547         adrp    tmp2, L(ext_table)
 548         add     tmp2, tmp2, :lo12:L(ext_table)
 549         add     tmp2, tmp2, tmp1, LSL #2
 550         ldr     tmp3w, [tmp2]
 551         add     tmp2, tmp2, tmp3w, SXTW
 552         br      tmp2
 553
     /* EXT_CHUNK (shft): copy loop for a destination misalignment of shft
        bytes (shft equals tmp1 when the chunk is reached via ext_table).
        On entry dst is 16-byte aligned and C, D, E, F, G hold consecutive
        loaded chunks.  Every ext with index 16-shft builds one aligned
        16-byte store value from two neighbouring chunks.
        Label 2 is the main loop: each iteration stores 64 merged bytes
        (A, B, H, I) and loads 64 new bytes (D, J, F, G); the old G and the
        new J are carried over into C and E.
        Label 1 is the exit: it stores the merged A, B, H, I, then adds tmp1
        back to dst so that G_q is stored at its true, unaligned address,
        and joins L(copy_long_check32).
        Statements are separated by ';' because the macro body expands to a
        single line.  */
 554 #define EXT_CHUNK(shft) \
 555 .p2align 4 ;\
 556 L(ext_size_ ## shft):;\
 557         ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 558         ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
 559         subs    count, count, 32;\
 560         b.ge    2f;\
 561 1:;\
 562         stp     A_q, B_q, [dst], #32;\
 563         ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
 564         ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
 565         stp     H_q, I_q, [dst], #16;\
 566         add     dst, dst, tmp1;\
 567         str     G_q, [dst], #16;\
 568         b       L(copy_long_check32);\
 569 2:;\
 570         stp     A_q, B_q, [dst], #32;\
 571         prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
 572         ldp     D_q, J_q, [src], #32;\
 573         ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
 574         ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
 575         mov     C_v.16b, G_v.16b;\
 576         stp     H_q, I_q, [dst], #32;\
 577         ldp     F_q, G_q, [src], #32;\
 578         ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 579         ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
 580         mov     E_v.16b, J_v.16b;\
 581         subs    count, count, 64;\
 582         b.ge    2b;\
 583         b       1b;\
 584
     /* One chunk per possible non-zero value of dst % 16.  */
 585 EXT_CHUNK(1)
 586 EXT_CHUNK(2)
 587 EXT_CHUNK(3)
 588 EXT_CHUNK(4)
 589 EXT_CHUNK(5)
 590 EXT_CHUNK(6)
 591 EXT_CHUNK(7)
 592 EXT_CHUNK(8)
 593 EXT_CHUNK(9)
 594 EXT_CHUNK(10)
 595 EXT_CHUNK(11)
 596 EXT_CHUNK(12)
 597 EXT_CHUNK(13)
 598 EXT_CHUNK(14)
 599 EXT_CHUNK(15)
 600
 601 END (MEMCPY)
 602         .section        .rodata
 603         .p2align        4
 604
     /* Jump table for L(dst_unaligned), indexed by dst % 16.  Each 32-bit
        entry is the offset of the matching L(ext_size_N) label relative to
        the entry's own address ("- ." subtracts the current location).  */
 605 L(ext_table):
 606         /* The first entry is for the alignment of 0 and is never
 607            actually used (could be any value).  */
 608         .word   0
 609         .word   L(ext_size_1) -.
 610         .word   L(ext_size_2) -.
 611         .word   L(ext_size_3) -.
 612         .word   L(ext_size_4) -.
 613         .word   L(ext_size_5) -.
 614         .word   L(ext_size_6) -.
 615         .word   L(ext_size_7) -.
 616         .word   L(ext_size_8) -.
 617         .word   L(ext_size_9) -.
 618         .word   L(ext_size_10) -.
 619         .word   L(ext_size_11) -.
 620         .word   L(ext_size_12) -.
 621         .word   L(ext_size_13) -.
 622         .word   L(ext_size_14) -.
 623         .word   L(ext_size_15) -.
 624
 625 libc_hidden_builtin_def (MEMCPY)
     /* Closes the IS_IN (libc) conditional opened near the top of the
        file.  */
 626 #endif