/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# define MEMCPY		__memcpy_sse2_unaligned
# define MEMCPY_CHK	__memcpy_chk_sse2_unaligned

# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4

# define CFI_PUSH(REG) \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
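
/* RETURN is used for exits in the middle of the function: it pops
   %ebx and returns, then re-issues CFI_PUSH because CFI directives
   are positional and the code that follows such an exit still has
   %ebx saved on the stack.  */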

	.section .text.sse2,"ax",@progbits

# if defined SHARED && IS_IN (libc)
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
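
/* Throughout the function: %eax = source, %edx = destination,
   %ecx = length in bytes.  */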

# ifdef USE_AS_MEMMOVE
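/* memmove must tolerate overlap: when the destination lies above the
   source the copy proceeds from the end backward, otherwise forward.
   The backward cases come first below.  */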

L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %ecx
	ja	L(mm_len_32_or_more_backward)
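
/* Each size class below loads chunks from both ends of the buffer
   before storing anything; the chunks may overlap in the middle, so
   any length in the range is covered without a byte loop.  */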

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):

/* Aligning the address of the destination.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
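
/* %xmm4-%xmm7 now hold the first 64 bytes of the source and %xmm0
   its last 16, so the aligned loop below may overwrite those regions
   freely; %esi marks the end of the destination.  */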
	leal	(%eax, %ebx), %eax

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_backward)
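
/* Main backward loop: 64 bytes per iteration, unaligned loads and
   aligned stores, walking from high addresses to low.  Copies of at
   least half the shared cache size were routed above to the
   non-temporal variant so they do not evict the entire cache.  */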

L(mm_main_loop_backward):
	prefetcht0 -128(%eax)
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	jnz	L(mm_main_loop_backward)
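
/* Write back the saved head and tail; these stores also cover the
   bytes the aligned loop did not reach.  */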
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	jnz	L(mm_len_5_8_bytes_backward)
	test	%ecx, %ecx
	jz	L(mm_return)
	testb	$2, %cl
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	POP (%esi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

/* Big length copy backward part.  */
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	jnz	L(mm_large_page_loop_backward)
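
/* movntdq stores are weakly ordered, so the copy must execute an
   sfence before returning to make them globally visible.  */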
	sfence
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_or_more_forward):

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):

/* Aligning the address of the destination.  */
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
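
/* %xmm4-%xmm7 hold the last 64 bytes of the source so the aligned
   loop below may overwrite that region freely; %esi marks the end of
   the destination.  */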

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)
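
/* Same cache-size heuristic as in the backward path: copies of at
   least half the shared cache take the non-temporal loop.  */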

L(mm_main_loop_forward):
	prefetcht0 128(%eax)
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movaps	%xmm0, (%ecx)
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	jnz	L(mm_main_loop_forward)
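
/* Write back the saved last 64 source bytes to the end of the
   destination; they also cover whatever the loop left over.  */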
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	jne	L(mm_len_5_8_bytes_forward)
	test	%ecx, %ecx
	jz	L(mm_return)
	testb	$2, %cl
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_return_pop_all):
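/* Restore the registers pushed on the 128-or-more paths and return
   DEST in %eax, since memmove returns its first argument.  */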

/* Big length copy forward part.  */
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	jmp	L(mm_return_pop_all)

# else

	cmpl	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	ja	L(large_page)
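
/* 17..128 bytes: copy 16-byte chunks from both ends, widening on
   each pass and returning as soon as the length is covered; the
   chunks may overlap in the middle.  */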

	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  */

/* We should stop two iterations before the termination
   (in order not to misprefetch).  */
	je	L(main_loop_just_one_iteration)

	je	L(main_loop_last_two_iterations)
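
/* The cached loop prefetches 128 bytes ahead of its loads, so it has
   to stop early; the final one or two 64-byte blocks are copied by
   the unrolled tails below.  */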

L(main_loop_cache):
	prefetcht0 128(%ebx, %eax)
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jne	L(main_loop_cache)
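
/* %ebx walks the aligned destination while %eax holds the distance
   from destination to source, so (%ebx, %eax) addresses the matching
   source bytes; one register update advances both streams.  */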

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	RETURN

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movaps	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	RETURN

L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
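
/* For very large copies the first and last 128 bytes are copied
   unaligned up front; the aligned middle then goes through
   non-temporal stores so the copy does not pollute the cache.  */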
	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores.  We align
   the address of the destination.  */
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	jne	L(main_loop_large_page)
	sfence
	RETURN

L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
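
/* The remaining small cases copy from both ends of the buffer with
   the widest moves that fit: words, dwords, then 8-byte chunks.  */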

	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)

L(len_9_16_bytes):
	movq	-8(%eax, %ecx), %xmm1
	movq	(%eax), %xmm0
	movq	%xmm1, -8(%edx, %ecx)
	movq	%xmm0, (%edx)
	RETURN

	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)

# ifdef USE_AS_MEMPCPY
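/* mempcpy returns DEST + LEN, a pointer one past the last byte
   written, rather than DEST.  */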