sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S

   1 /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   2    Copyright (C) 2016-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 /* memmove/memcpy/mempcpy is implemented as:
  20    1. Use overlapping load and store to avoid branch.
  21    2. Load all sources into registers and store them together to avoid
  22       possible address overlap between source and destination.
  23    3. If size is 8 * VEC_SIZE or less, load all sources into registers
  24       and store them together.
  25    4. If address of destination > address of source, backward copy
  26       4 * VEC_SIZE at a time with unaligned load and aligned store.
  27       Load the first 4 * VEC and last VEC before the loop and store
  28       them after the loop to support overlapping addresses.
  29    5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
  30       load and aligned store.  Load the last 4 * VEC and first VEC
  31       before the loop and store them after the loop to support
  32       overlapping addresses.
  33    6. On machines with ERMS feature, if size is greater than
  34       __x86_rep_movsb_threshold and less than
  35       __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
  36    7. If size >= __x86_shared_non_temporal_threshold and there is no
  37       overlap between destination and source, use non-temporal store
  38       instead of aligned store copying from either 2 or 4 pages at
  39       once.
  40    8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
  41       and source and destination do not page alias, copy from 2 pages
  42       at once using non-temporal stores. Page aliasing in this case is
  43       considered true if destination's page alignment - sources' page
  44       alignment is less than 8 * VEC_SIZE.
  45    9. If size >= 16 * __x86_shared_non_temporal_threshold or source
  46       and destination do page alias copy from 4 pages at once using
  47       non-temporal stores.  */
  48
  49 #include <sysdep.h>
  50
  51 #ifndef MEMCPY_SYMBOL
  52 # define MEMCPY_SYMBOL(p,s)             MEMMOVE_SYMBOL(p, s)
  53 #endif
  54
  55 #ifndef MEMPCPY_SYMBOL
  56 # define MEMPCPY_SYMBOL(p,s)            MEMMOVE_SYMBOL(p, s)
  57 #endif
  58
  59 #ifndef MEMMOVE_CHK_SYMBOL
  60 # define MEMMOVE_CHK_SYMBOL(p,s)        MEMMOVE_SYMBOL(p, s)
  61 #endif
  62
  63 #ifndef VZEROUPPER
  64 # if VEC_SIZE > 16
  65 #  define VZEROUPPER vzeroupper
  66 # else
  67 #  define VZEROUPPER
  68 # endif
  69 #endif
  70
  71 /* Whether to align before movsb.  Ultimately we want 64-byte
  72    alignment, and loading 4x VEC is not worth it for VEC_SIZE == 16.  */
  73 #define ALIGN_MOVSB     (VEC_SIZE > 16)
  74 /* Number of bytes to align movsb to.  */
  75 #define MOVSB_ALIGN_TO  64
  76
  77 #define SMALL_MOV_SIZE  (MOV_SIZE <= 4)
  78 #define LARGE_MOV_SIZE  (MOV_SIZE > 4)
  79
  80 #if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
  81 # error MOV_SIZE Unknown
  82 #endif
  83
  84 #if LARGE_MOV_SIZE
  85 # define SMALL_SIZE_OFFSET      (4)
  86 #else
  87 # define SMALL_SIZE_OFFSET      (0)
  88 #endif
  89
  90 #ifndef PAGE_SIZE
  91 # define PAGE_SIZE 4096
  92 #endif
  93
  94 #if PAGE_SIZE != 4096
  95 # error Unsupported PAGE_SIZE
  96 #endif
  97
  98 #ifndef LOG_PAGE_SIZE
  99 # define LOG_PAGE_SIZE 12
 100 #endif
 101
 102 #if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
 103 # error Invalid LOG_PAGE_SIZE
 104 #endif
 105
 106 /* Bytes loaded per page in each iteration of the large_memcpy inner loops.  */
 107 #if VEC_SIZE == 64
 108 # define LARGE_LOAD_SIZE (VEC_SIZE * 2)
 109 #else
 110 # define LARGE_LOAD_SIZE (VEC_SIZE * 4)
 111 #endif
 112
 113 /* Amount to shift __x86_shared_non_temporal_threshold by for
 114    bound for L(large_memcpy_4x). This is essentially used to
 115    indicate that the copy is far beyond the scope of L3
 116    (assuming no user config x86_non_temporal_threshold) and to
 117    use a more aggressively unrolled loop.  NB: before
 118    increasing the value also update initialization of
 119    x86_non_temporal_threshold.  */
 120 #ifndef LOG_4X_MEMCPY_THRESH
 121 # define LOG_4X_MEMCPY_THRESH 4
 122 #endif
 123
 124 /* Avoid short distance rep movsb only with non-SSE vector.  NB: if this macro is already defined on entry, the #else branch below redefines it to 0.  */
 125 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 126 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
 127 #else
 128 # define AVOID_SHORT_DISTANCE_REP_MOVSB 0
 129 #endif
 130
 131 #ifndef PREFETCH
 132 # define PREFETCH(addr) prefetcht0 addr
 133 #endif
 134
 135 /* Assume 64-byte prefetch size.  */
 136 #ifndef PREFETCH_SIZE
 137 # define PREFETCH_SIZE 64
 138 #endif
 139
 140 #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)  /* Bytes covered by one PREFETCH_ONE_SET.  */
 141
 142 #if PREFETCH_SIZE == 64  /* PREFETCH_ONE_SET (dir, base, offset): one PREFETCH per PREFETCH_SIZE bytes of a PREFETCHED_LOAD_SIZE span, first at offset from base, then stepping by dir * PREFETCH_SIZE.  */
 143 # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
 144 #  define PREFETCH_ONE_SET(dir, base, offset) \
 145         PREFETCH ((offset)base)
 146 # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
 147 #  define PREFETCH_ONE_SET(dir, base, offset) \
 148         PREFETCH ((offset)base); \
 149         PREFETCH ((offset + dir * PREFETCH_SIZE)base)
 150 # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
 151 #  define PREFETCH_ONE_SET(dir, base, offset) \
 152         PREFETCH ((offset)base); \
 153         PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
 154         PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
 155         PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
 156 # else
 157 #   error Unsupported PREFETCHED_LOAD_SIZE!
 158 # endif
 159 #else
 160 # error Unsupported PREFETCH_SIZE!
 161 #endif
 162
 163 #if LARGE_LOAD_SIZE == (VEC_SIZE * 2)  /* Two-VEC variant: call sites still pass four registers; the extra ones are absorbed by "..." and unused.  */
 164 # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
 165         VMOVU   (offset)base, vec0; \
 166         VMOVU   ((offset) + VEC_SIZE)base, vec1;
 167 # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
 168         VMOVNT  vec0, (offset)base; \
 169         VMOVNT  vec1, ((offset) + VEC_SIZE)base;
 170 #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)  /* Four-VEC variant: LOAD uses VMOVU from offset(base); STORE uses VMOVNT to offset(base).  */
 171 # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 172         VMOVU   (offset)base, vec0; \
 173         VMOVU   ((offset) + VEC_SIZE)base, vec1; \
 174         VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
 175         VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
 176 # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 177         VMOVNT  vec0, (offset)base; \
 178         VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
 179         VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
 180         VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
 181 #else
 182 # error Invalid LARGE_LOAD_SIZE
 183 #endif
 184
 185 #ifndef SECTION
 186 # error SECTION is not defined!
 187 #endif
 188
 189         .section SECTION(.text),"ax",@progbits
 190 #if defined SHARED && IS_IN (libc)
 191 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 192         cmp     %RDX_LP, %RCX_LP  /* Compare rcx (presumably the destination object size; confirm against callers) with the length in rdx.  */
 193         jb      HIDDEN_JUMPTARGET (__chk_fail)  /* rcx < rdx (unsigned): fail.  Otherwise execution falls through; no return is emitted here.  */
 194 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 195 #endif
 196
 197 ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 198         mov     %RDI_LP, %RAX_LP  /* rax = dst ...  */
 199         add     %RDX_LP, %RAX_LP  /* ... + len, i.e. one past the last byte that will be written.  */
 200         jmp     L(start)  /* Join the memmove body after its own setup of rax.  */
 201 END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 202
 203 #if defined SHARED && IS_IN (libc)
 204 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 205         cmp     %RDX_LP, %RCX_LP  /* Compare rcx (presumably the destination object size; confirm against callers) with the length in rdx.  */
 206         jb      HIDDEN_JUMPTARGET (__chk_fail)  /* rcx < rdx (unsigned): fail.  Otherwise execution falls through; no return is emitted here.  */
 207 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 208 #endif
 209
 210 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 211         movq    %rdi, %rax  /* Return value is dst.  */
 212 L(start):
 213 # ifdef __ILP32__
 214         /* Clear the upper 32 bits.  */
 215         movl    %edx, %edx
 216 # endif
 217         cmp     $VEC_SIZE, %RDX_LP
 218         jb      L(less_vec)  /* size < VEC_SIZE.  */
 219         /* Load the first VEC regardless; the larger-size paths reuse VEC(0).  */
 220         VMOVU   (%rsi), %VMM(0)
 221         cmp     $(VEC_SIZE * 2), %RDX_LP
 222         ja      L(more_2x_vec)  /* size > 2 * VEC_SIZE.  */
 223         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  Both loads are issued before either store, so overlapping buffers are safe.  */
 224         VMOVU   -VEC_SIZE(%rsi,%rdx), %VMM(1)
 225         VMOVU   %VMM(0), (%rdi)
 226         VMOVU   %VMM(1), -VEC_SIZE(%rdi,%rdx)
 227 #if !(defined USE_MULTIARCH && IS_IN (libc))
 228         ZERO_UPPER_VEC_REGISTERS_RETURN
 229 #else
 230         VZEROUPPER_RETURN
 231 #endif
 232 #if defined USE_MULTIARCH && IS_IN (libc)
 233 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 234
 235 # ifdef SHARED
 236 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 237         cmp     %RDX_LP, %RCX_LP  /* Compare rcx (presumably the destination object size; confirm against callers) with the length in rdx.  */
 238         jb      HIDDEN_JUMPTARGET (__chk_fail)  /* rcx < rdx (unsigned): fail.  Otherwise execution falls through; no return is emitted here.  */
 239 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 240 # endif
 241
 242 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 243         mov     %RDI_LP, %RAX_LP  /* rax = dst ...  */
 244         add     %RDX_LP, %RAX_LP  /* ... + len, i.e. one past the last byte that will be written.  */
 245         jmp     L(start_erms)  /* Join the ERMS memmove body after its own setup of rax.  */
 246 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 247
 248 # ifdef SHARED
 249 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 250         cmp     %RDX_LP, %RCX_LP  /* Compare rcx (presumably the destination object size; confirm against callers) with the length in rdx.  */
 251         jb      HIDDEN_JUMPTARGET (__chk_fail)  /* rcx < rdx (unsigned): fail.  Otherwise execution falls through; no return is emitted here.  */
 252 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 253 # endif
 254
 255 ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 256         movq    %rdi, %rax  /* Return value is dst.  */
 257 L(start_erms):
 258 # ifdef __ILP32__
 259         /* Clear the upper 32 bits.  */
 260         movl    %edx, %edx
 261 # endif
 262         cmp     $VEC_SIZE, %RDX_LP
 263         jb      L(less_vec)  /* size < VEC_SIZE.  */
 264         /* Load the first VEC regardless; the larger-size paths reuse VEC(0).  */
 265         VMOVU   (%rsi), %VMM(0)
 266         cmp     $(VEC_SIZE * 2), %RDX_LP
 267         ja      L(movsb_more_2x_vec)  /* size > 2 * VEC_SIZE: first consider REP MOVSB.  */
 268         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  Both loads are issued before either store, so overlapping buffers are safe.
 269          */
 270         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(1)
 271         VMOVU   %VMM(0), (%rdi)
 272         VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rdx)
 273 L(return_vzeroupper):
 274 # if VEC_SIZE > 16
 275         ZERO_UPPER_VEC_REGISTERS_RETURN
 276 # else
 277         ret
 278 # endif
 279 #endif
 280
 281 #if LARGE_MOV_SIZE
 282         /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
 283            ENTRY block and L(less_vec).  */
 284         .p2align 4,, 8
 285 L(between_4_7):
 286         /* From 4 to 7.  No branch when size == 4.  rdx holds size - 4 here (L(less_vec) subtracted 4 before branching).  */
 287         movl    (%rsi), %ecx  /* First 4 bytes.  */
 288         movl    (%rsi, %rdx), %esi  /* Last 4 bytes, at src + (size - 4).  */
 289         movl    %ecx, (%rdi)
 290         movl    %esi, (%rdi, %rdx)
 291         ret
 292 #endif
 293
 294         .p2align 4
 295 L(less_vec):
 296         /* Less than 1 VEC.  Dispatch on size, largest bucket first.  */
 297 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 298 # error Unsupported VEC_SIZE!
 299 #endif
 300 #if VEC_SIZE > 32
 301         cmpl    $32, %edx
 302         jae     L(between_32_63)
 303 #endif
 304 #if VEC_SIZE > 16
 305         cmpl    $16, %edx
 306         jae     L(between_16_31)
 307 #endif
 308         cmpl    $8, %edx
 309         jae     L(between_8_15)
 310 #if SMALL_MOV_SIZE
 311         cmpl    $4, %edx
 312 #else
 313         subq    $4, %rdx  /* rdx = size - 4; everything below is biased by SMALL_SIZE_OFFSET.  */
 314 #endif
 315         jae     L(between_4_7)  /* No borrow, so size >= 4.  */
 316         cmpl    $(1 - SMALL_SIZE_OFFSET), %edx  /* Signed compare of the (possibly biased) size against 1.  */
 317         jl      L(copy_0)  /* size == 0: nothing to copy.  */
 318         movb    (%rsi), %cl  /* First byte.  mov does not modify flags, so je below still tests the cmpl.  */
 319         je      L(copy_1)  /* size == 1.  */
 320         movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi  /* size is 2 or 3: load the last two bytes.  */
 321         movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
 322 L(copy_1):
 323         movb    %cl, (%rdi)
 324 L(copy_0):
 325         ret
 326
 327 #if SMALL_MOV_SIZE
 328         .p2align 4,, 8
 329 L(between_4_7):
 330         /* From 4 to 7.  No branch when size == 4.  rdx is the unbiased size in this variant.  */
 331         movl    -4(%rsi, %rdx), %ecx  /* Last 4 bytes.  */
 332         movl    (%rsi), %esi  /* First 4 bytes; rsi is no longer needed as a pointer.  */
 333         movl    %ecx, -4(%rdi, %rdx)
 334         movl    %esi, (%rdi)
 335         ret
 336 #endif
 337
 338 #if VEC_SIZE > 16
 339         /* From 16 to 31.  No branch when size == 16.  First and last 16 bytes are loaded before either store.  */
 340         .p2align 4,, 8
 341 L(between_16_31):
 342         vmovdqu (%rsi), %xmm0
 343         vmovdqu -16(%rsi, %rdx), %xmm1
 344         vmovdqu %xmm0, (%rdi)
 345         vmovdqu %xmm1, -16(%rdi, %rdx)
 346         /* No ymm registers have been touched.  */
 347         ret
 348 #endif
 349
 350 #if VEC_SIZE > 32
 351         .p2align 4,, 10
 352 L(between_32_63):
 353         /* From 32 to 63.  No branch when size == 32.  First and last 32 bytes are loaded before either store.  */
 354         VMOVU   (%rsi), %VMM_256(0)
 355         VMOVU   -32(%rsi, %rdx), %VMM_256(1)
 356         VMOVU   %VMM_256(0), (%rdi)
 357         VMOVU   %VMM_256(1), -32(%rdi, %rdx)
 358         VZEROUPPER_RETURN
 359 #endif
 360
 361         .p2align 4,, 10
 362 L(between_8_15):
 363         /* From 8 to 15.  No branch when size == 8.  */
 364         movq    -8(%rsi, %rdx), %rcx  /* Last 8 bytes.  */
 365         movq    (%rsi), %rsi  /* First 8 bytes; rsi is no longer needed as a pointer.  */
 366         movq    %rsi, (%rdi)
 367         movq    %rcx, -8(%rdi, %rdx)
 368         ret
 369
 370         .p2align 4,, 10
 371 L(last_4x_vec):
 372         /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 373
 374         /* VEC(0) and VEC(1) have already been loaded.  Load the last 2 * VEC, then store all four.  */
 375         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(2)
 376         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
 377         VMOVU   %VMM(0), (%rdi)
 378         VMOVU   %VMM(1), VEC_SIZE(%rdi)
 379         VMOVU   %VMM(2), -VEC_SIZE(%rdi, %rdx)
 380         VMOVU   %VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 381         VZEROUPPER_RETURN
 382
 383         .p2align 4
 384 #if defined USE_MULTIARCH && IS_IN (libc)
 385 L(movsb_more_2x_vec):
 386         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
 387         ja      L(movsb)  /* size > __x86_rep_movsb_threshold.  */
 388 #endif
 389 L(more_2x_vec):
 390         /* More than 2 * VEC and there may be overlap between
 391            destination and source.  */
 392         cmpq    $(VEC_SIZE * 8), %rdx
 393         ja      L(more_8x_vec)
 394         /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
 395         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 396         cmpq    $(VEC_SIZE * 4), %rdx
 397         jbe     L(last_4x_vec)
 398         /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  All eight loads precede the stores.  */
 399         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
 400         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
 401         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(4)
 402         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
 403         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
 404         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
 405         VMOVU   %VMM(0), (%rdi)
 406         VMOVU   %VMM(1), VEC_SIZE(%rdi)
 407         VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
 408         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 409         VMOVU   %VMM(4), -VEC_SIZE(%rdi, %rdx)
 410         VMOVU   %VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
 411         VMOVU   %VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
 412         VMOVU   %VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 413         VZEROUPPER_RETURN
 414
 415         .p2align 4,, 4
 416 L(more_8x_vec):
 417         movq    %rdi, %rcx
 418         subq    %rsi, %rcx  /* rcx = dst - src.  */
 419         /* Go to backwards temporal copy if overlap no matter what as
 420            backward REP MOVSB is slow and we don't want to use NT stores if
 421            there is overlap.  */
 422         cmpq    %rdx, %rcx
 423         /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
 424         jb      L(more_8x_vec_backward_check_nop)  /* Unsigned (dst - src) < size.  */
 425         /* Check if non-temporal move candidate.  */
 426 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 427         /* Check non-temporal store threshold.  */
 428         cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
 429         ja      L(large_memcpy_2x)
 430 #endif
 431         /* To reach this point there cannot be overlap and dst > src. So
 432            check for overlap and src > dst in which case correctness
 433            requires forward copy. Otherwise decide between backward/forward
 434            copy depending on address aliasing.  */
 435
 436         /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
 437            but less than __x86_shared_non_temporal_threshold.  */
 438 L(more_8x_vec_check):
 439         /* rcx contains dst - src. Add back length (rdx).  */
 440         leaq    (%rcx, %rdx), %r8
 441         /* If r8 has different sign than rcx then there is overlap so we
 442            must do forward copy.  */
 443         xorq    %rcx, %r8
 444         /* Isolate just sign bit of r8.  */
 445         shrq    $63, %r8
 446         /* Get 4k difference dst - src.  */
 447         andl    $(PAGE_SIZE - 256), %ecx
 448         /* If r8 is non-zero must do forward for correctness. Otherwise
 449            if ecx is zero there is 4k False Aliasing so do backward
 450            copy.  */
 451         addl    %r8d, %ecx
 452         jz      L(more_8x_vec_backward)  /* Taken only when both r8 and the masked ecx are zero.  */
 453
 454         /* Entry if rdx is greater than __x86_shared_non_temporal_threshold
 455            but there is overlap, or from short distance movsb.  */
 456 L(more_8x_vec_forward):
 457         /* Load the first VEC and the last 4 * VEC to support overlapping addresses.
 458          */
 459
 460         /* First vec was already loaded into VEC(0).  */
 461         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(5)
 462         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 463         /* Save beginning of dst.  */
 464         movq    %rdi, %rcx
 465         /* Align dst to VEC_SIZE - 1.  */
 466         orq     $(VEC_SIZE - 1), %rdi
 467         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
 468         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 469
 470         /* Subtract dst from src. Add back after dst aligned.  */
 471         subq    %rcx, %rsi
 472         /* Finish aligning dst.  */
 473         incq    %rdi
 474         /* Restore src adjusted with new value for aligned dst.  */
 475         addq    %rdi, %rsi
 476         /* Store end of buffer minus tail in rdx.  */
 477         leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx  /* rdx = original dst + size - 4 * VEC_SIZE.  */
 478
 479         /* Don't use multi-byte nop to align.  */
 480         .p2align 4,, 11
 481 L(loop_4x_vec_forward):
 482         /* Copy 4 * VEC a time forward.  */
 483         VMOVU   (%rsi), %VMM(1)
 484         VMOVU   VEC_SIZE(%rsi), %VMM(2)
 485         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
 486         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
 487         subq    $-(VEC_SIZE * 4), %rsi
 488         VMOVA   %VMM(1), (%rdi)
 489         VMOVA   %VMM(2), VEC_SIZE(%rdi)
 490         VMOVA   %VMM(3), (VEC_SIZE * 2)(%rdi)
 491         VMOVA   %VMM(4), (VEC_SIZE * 3)(%rdi)
 492         subq    $-(VEC_SIZE * 4), %rdi
 493         cmpq    %rdi, %rdx
 494         ja      L(loop_4x_vec_forward)  /* Loop while rdx > rdi (unsigned).  */
 495         /* Store the last 4 * VEC.  */
 496         VMOVU   %VMM(5), (VEC_SIZE * 3)(%rdx)
 497         VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdx)
 498         VMOVU   %VMM(7), VEC_SIZE(%rdx)
 499         VMOVU   %VMM(8), (%rdx)
 500         /* Store the first VEC.  */
 501         VMOVU   %VMM(0), (%rcx)
 502         /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 503          */
 504 L(nop_backward):
 505         VZEROUPPER_RETURN
 506
 507         .p2align 4,, 8
 508 L(more_8x_vec_backward_check_nop):
 509         /* rcx contains dst - src. Test for dst == src to skip all of
 510            memmove.  */
 511         testq   %rcx, %rcx
 512         jz      L(nop_backward)
 513 L(more_8x_vec_backward):
 514         /* Load the first 4 * VEC and last VEC to support overlapping
 515            addresses.  */
 516
 517         /* First vec was also loaded into VEC(0).  */
 518         VMOVU   VEC_SIZE(%rsi), %VMM(5)
 519         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(6)
 520         /* Beginning of region for 4x backward copy stored in rcx.  */
 521         leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 522         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
 523         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(8)
 524         /* Subtract dst from src. Add back after dst aligned.  */
 525         subq    %rdi, %rsi
 526         /* Align dst.  */
 527         andq    $-(VEC_SIZE), %rcx
 528         /* Restore src.  */
 529         addq    %rcx, %rsi
 530
 531         /* Don't use multi-byte nop to align.  */
 532         .p2align 4,, 11
 533 L(loop_4x_vec_backward):
 534         /* Copy 4 * VEC a time backward.  */
 535         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(1)
 536         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
 537         VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
 538         VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(4)
 539         addq    $(VEC_SIZE * -4), %rsi
 540         VMOVA   %VMM(1), (VEC_SIZE * 3)(%rcx)
 541         VMOVA   %VMM(2), (VEC_SIZE * 2)(%rcx)
 542         VMOVA   %VMM(3), (VEC_SIZE * 1)(%rcx)
 543         VMOVA   %VMM(4), (VEC_SIZE * 0)(%rcx)
 544         addq    $(VEC_SIZE * -4), %rcx
 545         cmpq    %rcx, %rdi
 546         jb      L(loop_4x_vec_backward)  /* Loop while rdi < rcx (unsigned).  */
 547         /* Store the first 4 * VEC.  */
 548         VMOVU   %VMM(0), (%rdi)
 549         VMOVU   %VMM(5), VEC_SIZE(%rdi)
 550         VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdi)
 551         VMOVU   %VMM(7), (VEC_SIZE * 3)(%rdi)
 552         /* Store the last VEC.  rdi and rdx still hold the original dst and size.  */
 553         VMOVU   %VMM(8), -VEC_SIZE(%rdx, %rdi)
 554         VZEROUPPER_RETURN
 555
 556 #if defined USE_MULTIARCH && IS_IN (libc)
 557         /* L(skip_short_movsb_check) is only used with ERMS. Not for
 558            FSRM.  */
 559         .p2align 5,, 16
 560 # if ALIGN_MOVSB
 561 L(skip_short_movsb_check):
 562 #  if MOVSB_ALIGN_TO > VEC_SIZE
 563         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 564 #  endif
 565 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 566 #   error Unsupported MOVSB_ALIGN_TO
 567 #  endif
 568         /* If CPU does not have FSRM two options for aligning. Align src
 569            if dst and src 4k alias. Otherwise align dst.  */
 570         testl   $(PAGE_SIZE - 512), %ecx
 571         jnz     L(movsb_align_dst)
 572         /* Fall through. dst and src 4k alias. It's better to align src
 573            here because the bottleneck will be loads due to the false
 574            dependency on dst.  */
 575
 576         /* rcx already has dst - src.  */
 577         movq    %rcx, %r9
 578         /* Add src to len. Subtract back after src aligned. -1 because
 579            src is initially aligned to MOVSB_ALIGN_TO - 1.  */
 580         leaq    -1(%rsi, %rdx), %rcx
 581         /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
 582         orq     $(MOVSB_ALIGN_TO - 1), %rsi
 583         /* Restore dst and len adjusted with new values for aligned src.
 584          */
 585         leaq    1(%rsi, %r9), %rdi
 586         subq    %rsi, %rcx
 587         /* Finish aligning src.  */
 588         incq    %rsi
 589
 590         rep     movsb
 591
 592         VMOVU   %VMM(0), (%r8)  /* r8 is the original dst saved in L(movsb); this fills the head skipped by aligning.  */
 593 #  if MOVSB_ALIGN_TO > VEC_SIZE
 594         VMOVU   %VMM(1), VEC_SIZE(%r8)
 595 #  endif
 596         VZEROUPPER_RETURN
 597 # endif
 598
 599         .p2align 4,, 12
 600 L(movsb):
 601         movq    %rdi, %rcx
 602         subq    %rsi, %rcx  /* rcx = dst - src.  */
 603         /* Go to backwards temporal copy if overlap no matter what as
 604            backward REP MOVSB is slow and we don't want to use NT stores if
 605            there is overlap.  */
 606         cmpq    %rdx, %rcx
 607         /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
 608         jb      L(more_8x_vec_backward_check_nop)  /* Unsigned (dst - src) < size.  */
 609 # if ALIGN_MOVSB
 610         /* Save dest for storing aligning VECs later.  */
 611         movq    %rdi, %r8
 612 # endif
 613         /* If above __x86_rep_movsb_stop_threshold most likely is
 614            candidate for NT moves as well.  */
 615         cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 616         jae     L(large_memcpy_2x_check)  /* size >= __x86_rep_movsb_stop_threshold.  */
 617 # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
 618         /* Only avoid short movsb if CPU has FSRM.  */
 619 #  if X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB < 256
 620         testb   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
 621 #  else
 622         testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
 623 #  endif
 624         jz      L(skip_short_movsb_check)  /* Control bit clear: skip the short-distance check.  */
 625 #  if AVOID_SHORT_DISTANCE_REP_MOVSB
 626         /* Avoid "rep movsb" if RCX, the distance between source and
 627            destination, is N*4GB + [1..63] with N >= 0.  */
 628
 629         /* ecx contains dst - src. Early check for backward copy
 630            conditions means only case of slow movsb with src = dst + [0,
 631            63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
 632            for that case.  */
 633         cmpl    $-64, %ecx
 634         ja      L(more_8x_vec_forward)
 635 #  endif
 636 # endif
 637 # if ALIGN_MOVSB
 638 #  if MOVSB_ALIGN_TO > VEC_SIZE
 639         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 640 #  endif
 641 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 642 #   error Unsupported MOVSB_ALIGN_TO
 643 #  endif
 644         /* Fall through means cpu has FSRM. In that case exclusively
 645            align destination.  */
 646 L(movsb_align_dst):
 647         /* Subtract dst from src. Add back after dst aligned.  */
 648         subq    %rdi, %rsi
 649         /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
 650         addq    $(MOVSB_ALIGN_TO - 1), %rdi
 651         /* Add dst to len. Subtract back after dst aligned.  */
 652         leaq    (%r8, %rdx), %rcx  /* rcx = original dst + size (end of destination).  */
 653         /* Finish aligning dst.  */
 654         andq    $-(MOVSB_ALIGN_TO), %rdi
 655         /* Restore src and len adjusted with new values for aligned dst.
 656          */
 657         addq    %rdi, %rsi
 658         subq    %rdi, %rcx  /* rcx = bytes remaining from the aligned dst to the end.  */
 659
 660         rep     movsb
 661
 662         /* Store VECs loaded for aligning.  They cover the head skipped before the aligned dst.  */
 663         VMOVU   %VMM(0), (%r8)
 664 #  if MOVSB_ALIGN_TO > VEC_SIZE
 665         VMOVU   %VMM(1), VEC_SIZE(%r8)
 666 #  endif
 667         VZEROUPPER_RETURN
 668 # else  /* !ALIGN_MOVSB.  */
 669 L(skip_short_movsb_check):
 670         mov     %RDX_LP, %RCX_LP  /* rep movsb takes its count in rcx.  */
 671         rep     movsb
 672         ret
 673 # endif
 674 #endif
 675
 676         .p2align 4,, 10
 677 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 678 L(large_memcpy_2x_check):
 679         /* Entry from L(large_memcpy_2x) has a redundant load of
 680            __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x)
 681            is only used for the non-erms memmove which is generally less
 682            common.  */
 683 L(large_memcpy_2x):
 684         mov     __x86_shared_non_temporal_threshold(%rip), %R11_LP
 685         cmp     %R11_LP, %RDX_LP
 686         jb      L(more_8x_vec_check)  /* size < non-temporal threshold: use the temporal 4x loops.  */
 687         /* To reach this point it is impossible for dst > src and
 688            overlap. Remaining to check is src > dst and overlap. rcx
 689            already contains dst - src. Negate rcx to get src - dst. If
 690            length > rcx then there is overlap and forward copy is best.  */
 691         negq    %rcx
 692         cmpq    %rcx, %rdx
 693         ja      L(more_8x_vec_forward)
 694
 695         /* Cache align destination. First store the first 64 bytes then
 696            adjust alignments.  */
 697
 698         /* First vec was also loaded into VEC(0).  */
 699 # if VEC_SIZE < 64
 700         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 701 #  if VEC_SIZE < 32
 702         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
 703         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
 704 #  endif
 705 # endif
 706         VMOVU   %VMM(0), (%rdi)
 707 # if VEC_SIZE < 64
 708         VMOVU   %VMM(1), VEC_SIZE(%rdi)
 709 #  if VEC_SIZE < 32
 710         VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
 711         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 712 #  endif
 713 # endif
 714
 715         /* Adjust source, destination, and size.  */
 716         movq    %rdi, %r8
 717         andq    $63, %r8
 718         /* Get the negative of offset for alignment.  */
 719         subq    $64, %r8  /* r8 = (dst & 63) - 64, a value in [-64, -1].  */
 720         /* Adjust source.  */
 721         subq    %r8, %rsi
 722         /* Adjust destination which should be aligned now.  */
 723         subq    %r8, %rdi
 724         /* Adjust length.  */
 725         addq    %r8, %rdx
 726
 727         /* Test if source and destination addresses will alias. If they
 728            do the larger pipeline in large_memcpy_4x alleviates the
 729            performance drop.  */
 730
 731         /* ecx contains -(dst - src). not ecx will return dst - src - 1
 732            which works for testing aliasing.  */
 733         notl    %ecx
 734         movq    %rdx, %r10  /* Keep the adjusted length for the outer loop count.  */
 735         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
 736         jz      L(large_memcpy_4x)
 737
 738         /* r11 has __x86_shared_non_temporal_threshold.  Shift it left
 739            by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.
 740          */
 741         shlq    $LOG_4X_MEMCPY_THRESH, %r11
 742         cmp     %r11, %rdx
 743         jae     L(large_memcpy_4x)
 744
 745         /* edx will store remainder size for copying tail.  */
 746         andl    $(PAGE_SIZE * 2 - 1), %edx
 747         /* r10 stores outer loop counter.  */
 748         shrq    $(LOG_PAGE_SIZE + 1), %r10  /* Number of whole 2-page chunks.  */
 749         /* Copy LARGE_LOAD_SIZE bytes at a time from each of 2 pages.  */
 750         .p2align 4
 751 L(loop_large_memcpy_2x_outer):
 752         /* ecx stores inner loop counter.  */
 753         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 754 L(loop_large_memcpy_2x_inner):
 755         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 756         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
 757         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 758         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 759         /* Load vectors from rsi: one set from the current page, one from the next page.  */
 760         LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
 761         LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 762         subq    $-LARGE_LOAD_SIZE, %rsi
 763         /* Non-temporal store vectors to rdi.  */
 764         STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
 765         STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 766         subq    $-LARGE_LOAD_SIZE, %rdi
 767         decl    %ecx
 768         jnz     L(loop_large_memcpy_2x_inner)
 769         addq    $PAGE_SIZE, %rdi  /* The inner loop advanced one page; skip the second page it also copied.  */
 770         addq    $PAGE_SIZE, %rsi
 771         decq    %r10
 772         jne     L(loop_large_memcpy_2x_outer)
 773         sfence
 774
 775         /* Check if only last 4 loads are needed.  */
 776         cmpl    $(VEC_SIZE * 4), %edx
 777         jbe     L(large_memcpy_2x_end)
 778
 779         /* Handle the last 2 * PAGE_SIZE bytes.  */
 780 L(loop_large_memcpy_2x_tail):
 781         /* Copy 4 * VEC a time forward.  The stores here use VMOVA, not the VMOVNT used by STORE_ONE_SET.  */
 782         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 783         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 784         VMOVU   (%rsi), %VMM(0)
 785         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 786         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
 787         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
 788         subq    $-(VEC_SIZE * 4), %rsi
 789         addl    $-(VEC_SIZE * 4), %edx  /* edx = remaining tail bytes.  */
 790         VMOVA   %VMM(0), (%rdi)
 791         VMOVA   %VMM(1), VEC_SIZE(%rdi)
 792         VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
 793         VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
 794         subq    $-(VEC_SIZE * 4), %rdi
 795         cmpl    $(VEC_SIZE * 4), %edx
 796         ja      L(loop_large_memcpy_2x_tail)
 797
 798 L(large_memcpy_2x_end):
 799         /* Load and store the last 4 * VEC, addressed back from the end (rsi/rdi + rdx).  */
 800         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
 801         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
 802         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
 803         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
 804
 805         VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 806         VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 807         VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 808         VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
 809         VZEROUPPER_RETURN
 810
 811         .p2align 4
 812 L(large_memcpy_4x):
             /* Large-copy path that walks four pages side by side.  Every
                inner-loop iteration loads four vector registers from each of
                the source offsets 0, PAGE_SIZE, PAGE_SIZE * 2 and
                PAGE_SIZE * 3, stores them at the same offsets from rdi, and
                then advances both rsi and rdi by LARGE_LOAD_SIZE.
                NOTE(review): rdx and r10 are set up before this label,
                outside the visible code; presumably both hold the byte
                count still to be copied -- confirm against the caller.  */
 813         /* edx will store remainder size for copying tail.  */
             /* The 32-bit write also clears the upper half of rdx, which the
                end code below uses as a 64-bit index register.  */
 814         andl    $(PAGE_SIZE * 4 - 1), %edx
 815         /* r10 stores outer loop counter.  */
             /* Shifting by LOG_PAGE_SIZE + 2 divides by 4 * PAGE_SIZE,
                assuming LOG_PAGE_SIZE is log2 of PAGE_SIZE (both are defined
                outside this view).
                NOTE(review): the outer loop is entered without testing r10,
                so r10 must be non-zero after the shift -- presumably ensured
                by the size checks before this label; confirm.  */
 816         shrq    $(LOG_PAGE_SIZE + 2), %r10
 817         /* Copy 4x VEC at a time from 4 pages.  */
 818         .p2align 4
 819 L(loop_large_memcpy_4x_outer):
 820         /* ecx stores inner loop counter.  */
 821         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 822 L(loop_large_memcpy_4x_inner):
 823         /* Only one prefetch set per page as doing 4 pages give more
 824            time for prefetcher to keep up.  */
 825         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 826         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 827         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 828         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 829         /* Load vectors from rsi.  */
             /* All sixteen vector registers are loaded before the first
                store of this iteration is issued.  */
 830         LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
 831         LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 832         LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
 833         LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
             /* rsi += LARGE_LOAD_SIZE, written as subtracting the negated
                constant.  */
 834         subq    $-LARGE_LOAD_SIZE, %rsi
 835         /* Non-temporal store vectors to rdi.  */
 836         STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
 837         STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 838         STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
 839         STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
             /* rdi += LARGE_LOAD_SIZE.  */
 840         subq    $-LARGE_LOAD_SIZE, %rdi
 841         decl    %ecx
 842         jnz     L(loop_large_memcpy_4x_inner)
             /* The inner loop moved both pointers forward by PAGE_SIZE in
                total (PAGE_SIZE / LARGE_LOAD_SIZE steps of LARGE_LOAD_SIZE,
                assuming LARGE_LOAD_SIZE divides PAGE_SIZE).  Skip the three
                following pages, which were copied in the same iterations.  */
 843         addq    $(PAGE_SIZE * 3), %rdi
 844         addq    $(PAGE_SIZE * 3), %rsi
 845         decq    %r10
 846         jne     L(loop_large_memcpy_4x_outer)
             /* Fence once all stores of the loop above have been issued.  */
 847         sfence
 848         /* Check if only last 4 loads are needed.  */
             /* Unsigned compare: taken when the remainder in edx is at most
                4 * VEC_SIZE, in which case the tail loop is skipped.  */
 849         cmpl    $(VEC_SIZE * 4), %edx
 850         jbe     L(large_memcpy_4x_end)
 851
 852         /* Handle the remaining bytes (edx < 4 * PAGE_SIZE).  */
 853 L(loop_large_memcpy_4x_tail):
 854         /* Copy 4 * VEC at a time forward (VMOVU loads, VMOVA stores).  */
             /* NOTE(review): the stores here use VMOVA, not the
                STORE_ONE_SET macro used for the non-temporal stores in the
                main loop; VMOVA is presumably a regular aligned store --
                confirm against its definition.  */
 855         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 856         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 857         VMOVU   (%rsi), %VMM(0)
 858         VMOVU   VEC_SIZE(%rsi), %VMM(1)
 859         VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
 860         VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
 861         subq    $-(VEC_SIZE * 4), %rsi
             /* edx -= 4 * VEC_SIZE: bytes still to be copied.  */
 862         addl    $-(VEC_SIZE * 4), %edx
 863         VMOVA   %VMM(0), (%rdi)
 864         VMOVA   %VMM(1), VEC_SIZE(%rdi)
 865         VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
 866         VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
 867         subq    $-(VEC_SIZE * 4), %rdi
             /* Keep looping while more than 4 * VEC_SIZE bytes remain; the
                final 4 * VEC_SIZE or fewer are left to the end code.  */
 868         cmpl    $(VEC_SIZE * 4), %edx
 869         ja      L(loop_large_memcpy_4x_tail)
 870
 871 L(large_memcpy_4x_end):
 872         /* Load and store the last 4 * VEC, addressed backwards from the
                end of the buffers (rsi + rdx and rdi + rdx).  rdx is at most
                4 * VEC_SIZE here, so this window can start before the
                current pointers and overlap bytes already copied above.  */
 873         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
 874         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
 875         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
 876         VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
 877
 878         VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 879         VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 880         VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 881         VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
 882         VZEROUPPER_RETURN
 883 #endif
 884 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 885
 886 #if IS_IN (libc)
     /* Each strong_alias below pairs a __memmove* symbol (first argument)
        with the matching __memcpy* symbol (second argument).
        NOTE(review): strong_alias and the *_SYMBOL macros are defined
        outside this view; presumably strong_alias (a, b) makes b a second
        name for a -- confirm.  */
 887 # ifdef USE_MULTIARCH
     /* unaligned_erms variant: only when USE_MULTIARCH is defined.  */
 888 strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
 889               MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
 890 #  ifdef SHARED
     /* _chk form of the unaligned_erms variant: additionally requires
        SHARED.  */
 891 strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
 892               MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
 893 #  endif
 894 # endif
 895 # ifdef SHARED
     /* _chk form of the unaligned variant: requires SHARED but not
        USE_MULTIARCH.  */
 896 strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
 897               MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
 898 # endif
 899 #endif
     /* This alias sits outside the IS_IN (libc) guard, so it is emitted in
        every build of this file.  Note that the memcpy name is formed with
        MEMCPY_SYMBOL rather than MEMMOVE_SYMBOL.  */
 900 strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
 901               MEMCPY_SYMBOL (__memcpy, unaligned))