sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S

   1 /* memcpy optimized with AVX512 for KNL hardware.
   2    Copyright (C) 2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 #if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
  22     && (defined SHARED \
  23         || defined USE_AS_MEMMOVE \
  24         || !defined USE_MULTIARCH)
  25
  26 #include "asm-syntax.h"
  27 #ifndef MEMCPY
  28 # define MEMCPY         __memcpy_avx512_no_vzeroupper
  29 # define MEMCPY_CHK     __memcpy_chk_avx512_no_vzeroupper
  30 # define MEMPCPY        __mempcpy_avx512_no_vzeroupper
  31 # define MEMPCPY_CHK    __mempcpy_chk_avx512_no_vzeroupper
  32 #endif
  33
  34         .section .text.avx512,"ax",@progbits
  35 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
  36 ENTRY (MEMPCPY_CHK)
     /* Checked mempcpy entry.  Compares %rcx against the byte count in %rdx
        and transfers to __chk_fail when %rcx is smaller (unsigned compare).
        NOTE(review): %rcx is presumably the destination object size passed
        by the fortify wrapper -- confirm against the caller.
        There is no ret here: when the check passes, execution continues
        into the code assembled right after this stub.  */
  37         cmpq    %rdx, %rcx
  38         jb      HIDDEN_JUMPTARGET (__chk_fail)
  39 END (MEMPCPY_CHK)
  40
  41 ENTRY (MEMPCPY)
     /* mempcpy entry.  Sets the return value to dst + n and then joins the
        shared copy code at L(start), i.e. after the point where MEMCPY
        sets up its own return value.  */
  42         movq    %rdi, %rax
  43         addq    %rdx, %rax      /* %rax = dst + n.  */
  44         jmp     L(start)
  45 END (MEMPCPY)
  46 #endif
  47
  48 #if !defined USE_AS_BCOPY
  49 ENTRY (MEMCPY_CHK)
     /* Checked memcpy entry.  Transfers to __chk_fail when %rcx is below the
        byte count in %rdx (unsigned compare).
        NOTE(review): %rcx is presumably the destination object size passed
        by the fortify wrapper -- confirm against the caller.
        There is no ret here: when the check passes, execution continues
        into the code assembled right after this stub.  */
  50         cmpq    %rdx, %rcx
  51         jb      HIDDEN_JUMPTARGET (__chk_fail)
  52 END (MEMCPY_CHK)
  53 #endif
  54
  55 ENTRY (MEMCPY)
     /* Main entry.  Register roles used by every path below:
          %rdi = destination, %rsi = source, %rdx = byte count (n),
          %rax = return value (dst, or dst + n under USE_AS_MEMPCPY),
          %rcx = source end (src + n), %r9 = destination end (dst + n).
        Sizes above 512 bytes branch to L(512bytesormore); everything else
        falls through to L(check).  */
  56         mov     %rdi, %rax
  57 #ifdef USE_AS_MEMPCPY
  58         add     %rdx, %rax
  59 #endif
  60 L(start):
     /* Shared entry point; MEMPCPY jumps here with %rax already set.  */
  61         lea     (%rsi, %rdx), %rcx
  62         lea     (%rdi, %rdx), %r9
  63         cmp     $512, %rdx
  64         ja      L(512bytesormore)
  65
  66 L(check):
     /* Size dispatch for %rdx <= 512.  Expects %rcx = %rsi + %rdx and
        %r9 = %rdi + %rdx.  Also the re-entry point used by the non-temporal
        loops (jmp L(check)) to finish their remaining bytes.
        Fall-through case: 256 <= %rdx <= 512.  The first 256 bytes and the
        last 256 bytes are copied; the two halves overlap whenever
        %rdx < 512.  Every load is issued before the first store.  */
  67         cmp     $16, %rdx
  68         jbe     L(less_16bytes)
  69         cmp     $256, %rdx
  70         jb      L(less_256bytes)
  71         vmovups (%rsi), %zmm0
  72         vmovups 0x40(%rsi), %zmm1
  73         vmovups 0x80(%rsi), %zmm2
  74         vmovups 0xC0(%rsi), %zmm3
  75         vmovups -0x100(%rcx), %zmm4
  76         vmovups -0xC0(%rcx), %zmm5
  77         vmovups -0x80(%rcx), %zmm6
  78         vmovups -0x40(%rcx), %zmm7
  79         vmovups %zmm0, (%rdi)
  80         vmovups %zmm1, 0x40(%rdi)
  81         vmovups %zmm2, 0x80(%rdi)
  82         vmovups %zmm3, 0xC0(%rdi)
  83         vmovups %zmm4, -0x100(%r9)
  84         vmovups %zmm5, -0xC0(%r9)
  85         vmovups %zmm6, -0x80(%r9)
  86         vmovups %zmm7, -0x40(%r9)
  87         ret
  88
  89 L(less_256bytes):
     /* Reached from L(check) with 16 < %rdx < 256, so %dl holds the whole
        count and byte-sized compares suffice from here on.
        Fall-through case: 128 <= n < 256.  Copies the first 128 and the
        last 128 bytes (possibly overlapping), all loads before any store.  */
  90         cmp     $128, %dl
  91         jb      L(less_128bytes)
  92         vmovups (%rsi), %zmm0
  93         vmovups 0x40(%rsi), %zmm1
  94         vmovups -0x80(%rcx), %zmm2
  95         vmovups -0x40(%rcx), %zmm3
  96         vmovups %zmm0, (%rdi)
  97         vmovups %zmm1, 0x40(%rdi)
  98         vmovups %zmm2, -0x80(%r9)
  99         vmovups %zmm3, -0x40(%r9)
 100         ret
 101
 102 L(less_128bytes):
     /* 16 < n < 128.  Fall-through case: 64 <= n < 128, handled with 32-byte
        (%ymm) moves: first 64 and last 64 bytes, possibly overlapping, all
        loads before any store.  */
 103         cmp     $64, %dl
 104         jb      L(less_64bytes)
 105         vmovdqu (%rsi), %ymm0
 106         vmovdqu 0x20(%rsi), %ymm1
 107         vmovdqu -0x40(%rcx), %ymm2
 108         vmovdqu -0x20(%rcx), %ymm3
 109         vmovdqu %ymm0, (%rdi)
 110         vmovdqu %ymm1, 0x20(%rdi)
 111         vmovdqu %ymm2, -0x40(%r9)
 112         vmovdqu %ymm3, -0x20(%r9)
 113         ret
 114
 115 L(less_64bytes):
     /* 16 < n < 64.  Fall-through case: 32 <= n < 64, copied as the first
        32 and the last 32 bytes (possibly overlapping), both loads before
        either store.  */
 116         cmp     $32, %dl
 117         jb      L(less_32bytes)
 118         vmovdqu (%rsi), %ymm0
 119         vmovdqu -0x20(%rcx), %ymm1
 120         vmovdqu %ymm0, (%rdi)
 121         vmovdqu %ymm1, -0x20(%r9)
 122         ret
 123
 124 L(less_32bytes):
     /* 16 < n < 32 (counts of 16 or less were routed to L(less_16bytes)).
        Copies the first 16 and the last 16 bytes with %xmm moves; the two
        pieces overlap, and both loads precede both stores.  */
 125         vmovdqu (%rsi), %xmm0
 126         vmovdqu -0x10(%rcx), %xmm1
 127         vmovdqu %xmm0, (%rdi)
 128         vmovdqu %xmm1, -0x10(%r9)
 129         ret
 130
 131 L(less_16bytes):
     /* Reached from L(check) with n <= 16, so %dl holds the whole count.
        Fall-through case: 8 <= n <= 16, copied as the first 8 and the last
        8 bytes.  %rsi and %rcx are reused to hold the loaded data; the
        pointers are not needed afterwards.  */
 132         cmp     $8, %dl
 133         jb      L(less_8bytes)
 134         movq    (%rsi), %rsi
 135         movq    -0x8(%rcx), %rcx
 136         movq    %rsi, (%rdi)
 137         movq    %rcx, -0x8(%r9)
 138         ret
 139
 140 L(less_8bytes):
     /* n < 8.  Fall-through case: 4 <= n < 8, copied as the first 4 and the
        last 4 bytes, with %esi / %ecx reused as data registers.  */
 141         cmp     $4, %dl
 142         jb      L(less_4bytes)
 143         mov     (%rsi), %esi
 144         mov     -0x4(%rcx), %ecx
 145         mov     %esi, (%rdi)
 146         mov     %ecx, -0x4(%r9)
 147         ret
 148
 149 L(less_4bytes):
     /* n < 4.  Fall-through case: 2 <= n < 4, copied as the first 2 and the
        last 2 bytes using 16-bit moves.  */
 150         cmp     $2, %dl
 151         jb      L(less_2bytes)
 152         mov     (%rsi), %si
 153         mov     -0x2(%rcx), %cx
 154         mov     %si, (%rdi)
 155         mov     %cx, -0x2(%r9)
 156         ret
 157
 158 L(less_2bytes):
     /* n < 2.  A single byte is copied when n == 1; for n == 0 nothing is
        touched and control goes straight to the ret.  */
 159         cmp     $1, %dl
 160         jb      L(less_1bytes)
 161         mov     (%rsi), %cl
 162         mov     %cl, (%rdi)
 163 L(less_1bytes):
 164         ret
 165
 166 L(512bytesormore):
     /* n > 512.  Loads a size threshold into %r8 (a build-time constant or
        the variable __x86_shared_cache_size_half -- presumably half the
        shared cache size, judging by the name):
          n >= threshold  -> L(preloop_large)   (non-temporal stores)
          n >  1024       -> L(1024bytesormore) (512-byte loop)
        Fall-through case: 512 < n <= 1024.  The first 512 and the last 512
        source bytes are prefetched and loaded into %zmm0-%zmm15, then
        stored; the two halves overlap whenever n < 1024.  All loads are
        issued before the first store.  */
 167 #ifdef SHARED_CACHE_SIZE_HALF
 168         mov     $SHARED_CACHE_SIZE_HALF, %r8
 169 #else
 170         mov     __x86_shared_cache_size_half(%rip), %r8
 171 #endif
 172         cmp     %r8, %rdx
 173         jae     L(preloop_large)
 174         cmp     $1024, %rdx
 175         ja      L(1024bytesormore)
 176         prefetcht1 (%rsi)
 177         prefetcht1 0x40(%rsi)
 178         prefetcht1 0x80(%rsi)
 179         prefetcht1 0xC0(%rsi)
 180         prefetcht1 0x100(%rsi)
 181         prefetcht1 0x140(%rsi)
 182         prefetcht1 0x180(%rsi)
 183         prefetcht1 0x1C0(%rsi)
 184         prefetcht1 -0x200(%rcx)
 185         prefetcht1 -0x1C0(%rcx)
 186         prefetcht1 -0x180(%rcx)
 187         prefetcht1 -0x140(%rcx)
 188         prefetcht1 -0x100(%rcx)
 189         prefetcht1 -0xC0(%rcx)
 190         prefetcht1 -0x80(%rcx)
 191         prefetcht1 -0x40(%rcx)
 192         vmovups (%rsi), %zmm0
 193         vmovups 0x40(%rsi), %zmm1
 194         vmovups 0x80(%rsi), %zmm2
 195         vmovups 0xC0(%rsi), %zmm3
 196         vmovups 0x100(%rsi), %zmm4
 197         vmovups 0x140(%rsi), %zmm5
 198         vmovups 0x180(%rsi), %zmm6
 199         vmovups 0x1C0(%rsi), %zmm7
 200         vmovups -0x200(%rcx), %zmm8
 201         vmovups -0x1C0(%rcx), %zmm9
 202         vmovups -0x180(%rcx), %zmm10
 203         vmovups -0x140(%rcx), %zmm11
 204         vmovups -0x100(%rcx), %zmm12
 205         vmovups -0xC0(%rcx), %zmm13
 206         vmovups -0x80(%rcx), %zmm14
 207         vmovups -0x40(%rcx), %zmm15
 208         vmovups %zmm0, (%rdi)
 209         vmovups %zmm1, 0x40(%rdi)
 210         vmovups %zmm2, 0x80(%rdi)
 211         vmovups %zmm3, 0xC0(%rdi)
 212         vmovups %zmm4, 0x100(%rdi)
 213         vmovups %zmm5, 0x140(%rdi)
 214         vmovups %zmm6, 0x180(%rdi)
 215         vmovups %zmm7, 0x1C0(%rdi)
 216         vmovups %zmm8, -0x200(%r9)
 217         vmovups %zmm9, -0x1C0(%r9)
 218         vmovups %zmm10, -0x180(%r9)
 219         vmovups %zmm11, -0x140(%r9)
 220         vmovups %zmm12, -0x100(%r9)
 221         vmovups %zmm13, -0xC0(%r9)
 222         vmovups %zmm14, -0x80(%r9)
 223         vmovups %zmm15, -0x40(%r9)
 224         ret
 225
 226 L(1024bytesormore):
     /* 1024 < n < threshold.  Chooses the copy direction from the pointer
        order: dst > src goes to the backward variant, otherwise this
        forward variant runs.
        The last 512 source bytes are loaded into %zmm8-%zmm15 before any
        store, and %r9 is lowered to dst + n - 512, which is both the loop
        bound and the address where that saved tail is finally written.  */
 227         cmp     %rsi, %rdi
 228         ja      L(1024bytesormore_bkw)
 229         sub     $512, %r9
 230         vmovups -0x200(%rcx), %zmm8
 231         vmovups -0x1C0(%rcx), %zmm9
 232         vmovups -0x180(%rcx), %zmm10
 233         vmovups -0x140(%rcx), %zmm11
 234         vmovups -0x100(%rcx), %zmm12
 235         vmovups -0xC0(%rcx), %zmm13
 236         vmovups -0x80(%rcx), %zmm14
 237         vmovups -0x40(%rcx), %zmm15
 238         prefetcht1 (%rsi)
 239         prefetcht1 0x40(%rsi)
 240         prefetcht1 0x80(%rsi)
 241         prefetcht1 0xC0(%rsi)
 242         prefetcht1 0x100(%rsi)
 243         prefetcht1 0x140(%rsi)
 244         prefetcht1 0x180(%rsi)
 245         prefetcht1 0x1C0(%rsi)
 246
 247 /* Loop with unaligned memory access.  */
 248 L(gobble_512bytes_loop):
     /* Each iteration loads 512 bytes, advances %rsi, prefetches the next
        512 source bytes, then stores and advances %rdi.  The loop repeats
        while %rdi < %r9.  */
 249         vmovups (%rsi), %zmm0
 250         vmovups 0x40(%rsi), %zmm1
 251         vmovups 0x80(%rsi), %zmm2
 252         vmovups 0xC0(%rsi), %zmm3
 253         vmovups 0x100(%rsi), %zmm4
 254         vmovups 0x140(%rsi), %zmm5
 255         vmovups 0x180(%rsi), %zmm6
 256         vmovups 0x1C0(%rsi), %zmm7
 257         add     $512, %rsi
 258         prefetcht1 (%rsi)
 259         prefetcht1 0x40(%rsi)
 260         prefetcht1 0x80(%rsi)
 261         prefetcht1 0xC0(%rsi)
 262         prefetcht1 0x100(%rsi)
 263         prefetcht1 0x140(%rsi)
 264         prefetcht1 0x180(%rsi)
 265         prefetcht1 0x1C0(%rsi)
 266         vmovups %zmm0, (%rdi)
 267         vmovups %zmm1, 0x40(%rdi)
 268         vmovups %zmm2, 0x80(%rdi)
 269         vmovups %zmm3, 0xC0(%rdi)
 270         vmovups %zmm4, 0x100(%rdi)
 271         vmovups %zmm5, 0x140(%rdi)
 272         vmovups %zmm6, 0x180(%rdi)
 273         vmovups %zmm7, 0x1C0(%rdi)
 274         add     $512, %rdi
 275         cmp     %r9, %rdi
 276         jb      L(gobble_512bytes_loop)
     /* Write the tail saved before the loop to the last 512 bytes of the
        destination (%r9 = dst + n - 512).  */
 277         vmovups %zmm8, (%r9)
 278         vmovups %zmm9, 0x40(%r9)
 279         vmovups %zmm10, 0x80(%r9)
 280         vmovups %zmm11, 0xC0(%r9)
 281         vmovups %zmm12, 0x100(%r9)
 282         vmovups %zmm13, 0x140(%r9)
 283         vmovups %zmm14, 0x180(%r9)
 284         vmovups %zmm15, 0x1C0(%r9)
 285         ret
 286
 287 L(1024bytesormore_bkw):
     /* Backward variant, taken when dst > src.  The first 512 source bytes
        are loaded into %zmm8-%zmm15 (%zmm15 holds the lowest 64 bytes)
        before any store.  %rdi is raised to dst + 512, which is both the
        loop bound and the base from which that saved head is finally
        written at negative offsets.  */
 288         add     $512, %rdi
 289         vmovups 0x1C0(%rsi), %zmm8
 290         vmovups 0x180(%rsi), %zmm9
 291         vmovups 0x140(%rsi), %zmm10
 292         vmovups 0x100(%rsi), %zmm11
 293         vmovups 0xC0(%rsi), %zmm12
 294         vmovups 0x80(%rsi), %zmm13
 295         vmovups 0x40(%rsi), %zmm14
 296         vmovups (%rsi), %zmm15
 297         prefetcht1 -0x40(%rcx)
 298         prefetcht1 -0x80(%rcx)
 299         prefetcht1 -0xC0(%rcx)
 300         prefetcht1 -0x100(%rcx)
 301         prefetcht1 -0x140(%rcx)
 302         prefetcht1 -0x180(%rcx)
 303         prefetcht1 -0x1C0(%rcx)
 304         prefetcht1 -0x200(%rcx)
 305
 306 /* Backward loop with unaligned memory access.  */
 307 L(gobble_512bytes_loop_bkw):
     /* Each iteration loads the 512 bytes ending at %rcx, lowers %rcx,
        prefetches the preceding 512 source bytes, then stores the block
        ending at %r9 and lowers %r9.  The loop repeats while %r9 > %rdi.  */
 308         vmovups -0x40(%rcx), %zmm0
 309         vmovups -0x80(%rcx), %zmm1
 310         vmovups -0xC0(%rcx), %zmm2
 311         vmovups -0x100(%rcx), %zmm3
 312         vmovups -0x140(%rcx), %zmm4
 313         vmovups -0x180(%rcx), %zmm5
 314         vmovups -0x1C0(%rcx), %zmm6
 315         vmovups -0x200(%rcx), %zmm7
 316         sub     $512, %rcx
 317         prefetcht1 -0x40(%rcx)
 318         prefetcht1 -0x80(%rcx)
 319         prefetcht1 -0xC0(%rcx)
 320         prefetcht1 -0x100(%rcx)
 321         prefetcht1 -0x140(%rcx)
 322         prefetcht1 -0x180(%rcx)
 323         prefetcht1 -0x1C0(%rcx)
 324         prefetcht1 -0x200(%rcx)
 325         vmovups %zmm0, -0x40(%r9)
 326         vmovups %zmm1, -0x80(%r9)
 327         vmovups %zmm2, -0xC0(%r9)
 328         vmovups %zmm3, -0x100(%r9)
 329         vmovups %zmm4, -0x140(%r9)
 330         vmovups %zmm5, -0x180(%r9)
 331         vmovups %zmm6, -0x1C0(%r9)
 332         vmovups %zmm7, -0x200(%r9)
 333         sub     $512, %r9
 334         cmp     %rdi, %r9
 335         ja      L(gobble_512bytes_loop_bkw)
     /* Write the head saved before the loop to the first 512 bytes of the
        destination (%rdi = dst + 512).  */
 336         vmovups %zmm8, -0x40(%rdi)
 337         vmovups %zmm9, -0x80(%rdi)
 338         vmovups %zmm10, -0xC0(%rdi)
 339         vmovups %zmm11, -0x100(%rdi)
 340         vmovups %zmm12, -0x140(%rdi)
 341         vmovups %zmm13, -0x180(%rdi)
 342         vmovups %zmm14, -0x1C0(%rdi)
 343         vmovups %zmm15, -0x200(%rdi)
 344         ret
 345
 346 L(preloop_large):
     /* n >= threshold: copy with non-temporal stores.  dst > src goes to the
        backward variant, otherwise this forward variant runs.
        The first 128 source bytes are saved in %zmm4/%zmm5, because the
        loop starts at the next 128-byte boundary of the destination and so
        skips the first 1..128 destination bytes.  After the loop the saved
        head is written to the original destination, and the remaining
        <= 256 bytes are finished by L(check).
        The original destination is kept in %r11.  It must not be taken
        from %rax: on the mempcpy paths %rax holds dst + n, so storing the
        head through %rax would write past the end of the destination and
        leave the first bytes uncopied.  %r11 is not used anywhere else in
        this function.  */
 347         cmp     %rsi, %rdi
 348         ja      L(preloop_large_bkw)
 349         vmovups (%rsi), %zmm4
 350         vmovups 0x40(%rsi), %zmm5
 351
             mov     %rdi, %r11      /* %r11 = original dst.  */
 352 /* Align destination for access with non-temporal stores in the loop.  */
     /* %r8 becomes (original dst - aligned dst), a value in -128..-1; the
        source pointer is advanced and the count reduced by the same
        distance.  */
 353         mov     %rdi, %r8
 354         and     $-0x80, %rdi
 355         add     $0x80, %rdi
 356         sub     %rdi, %r8
 357         sub     %r8, %rsi
 358         add     %r8, %rdx
 359 L(gobble_256bytes_nt_loop):
     /* 256 bytes per iteration; repeats while more than 256 bytes remain.  */
 360         prefetcht1 0x200(%rsi)
 361         prefetcht1 0x240(%rsi)
 362         prefetcht1 0x280(%rsi)
 363         prefetcht1 0x2C0(%rsi)
 364         prefetcht1 0x300(%rsi)
 365         prefetcht1 0x340(%rsi)
 366         prefetcht1 0x380(%rsi)
 367         prefetcht1 0x3C0(%rsi)
 368         vmovdqu64 (%rsi), %zmm0
 369         vmovdqu64 0x40(%rsi), %zmm1
 370         vmovdqu64 0x80(%rsi), %zmm2
 371         vmovdqu64 0xC0(%rsi), %zmm3
 372         vmovntdq %zmm0, (%rdi)
 373         vmovntdq %zmm1, 0x40(%rdi)
 374         vmovntdq %zmm2, 0x80(%rdi)
 375         vmovntdq %zmm3, 0xC0(%rdi)
 376         sub     $256, %rdx
 377         add     $256, %rsi
 378         add     $256, %rdi
 379         cmp     $256, %rdx
 380         ja      L(gobble_256bytes_nt_loop)
 381         sfence
     /* Store the saved head at the original destination start.  */
 382         vmovups %zmm4, (%r11)
 383         vmovups %zmm5, 0x40(%r11)
 384         jmp     L(check)
 385
 386 L(preloop_large_bkw):
     /* Backward non-temporal variant, taken when dst > src.  The last 128
        source bytes are saved in %zmm4/%zmm5, because the loop starts at
        the destination end rounded down to a 128-byte boundary and so
        skips up to 127 trailing destination bytes.  After the loop the
        saved tail is written just below the original destination end, and
        the remaining <= 256 bytes at the start are finished by L(check).  */
 387         vmovups -0x80(%rcx), %zmm4
 388         vmovups -0x40(%rcx), %zmm5
 389
 390 /* Align end of destination for access with non-temporal stores.  */
     /* %r8 first holds (dst end - aligned dst end), a value in 0..127, which
        is removed from the source end pointer and from the count; the
        final add restores %r8 to the original destination end.  */
 391         mov     %r9, %r8
 392         and     $-0x80, %r9
 393         sub     %r9, %r8
 394         sub     %r8, %rcx
 395         sub     %r8, %rdx
 396         add     %r9, %r8
 397 L(gobble_256bytes_nt_loop_bkw):
     /* 256 bytes per iteration, moving downwards; repeats while more than
        256 bytes remain.  */
 398         prefetcht1 -0x400(%rcx)
 399         prefetcht1 -0x3C0(%rcx)
 400         prefetcht1 -0x380(%rcx)
 401         prefetcht1 -0x340(%rcx)
 402         prefetcht1 -0x300(%rcx)
 403         prefetcht1 -0x2C0(%rcx)
 404         prefetcht1 -0x280(%rcx)
 405         prefetcht1 -0x240(%rcx)
 406         vmovdqu64 -0x100(%rcx), %zmm0
 407         vmovdqu64 -0xC0(%rcx), %zmm1
 408         vmovdqu64 -0x80(%rcx), %zmm2
 409         vmovdqu64 -0x40(%rcx), %zmm3
 410         vmovntdq %zmm0, -0x100(%r9)
 411         vmovntdq %zmm1, -0xC0(%r9)
 412         vmovntdq %zmm2, -0x80(%r9)
 413         vmovntdq %zmm3, -0x40(%r9)
 414         sub     $256, %rdx
 415         sub     $256, %rcx
 416         sub     $256, %r9
 417         cmp     $256, %rdx
 418         ja      L(gobble_256bytes_nt_loop_bkw)
 419         sfence
     /* Store the saved tail; %r8 is the original destination end.  */
 420         vmovups %zmm4, -0x80(%r8)
 421         vmovups %zmm5, -0x40(%r8)
 422         jmp     L(check)
 423 END (MEMCPY)
 424 #endif