sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S

   1 /* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   2    Copyright (C) 2016-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include <isa-level.h>
  21
  22 #if ISA_SHOULD_BUILD (4)
  23
  24 # include "asm-syntax.h"
  25
  26         .section .text.avx512,"ax",@progbits
  27 ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
     /* Checked entry point.  Compares RCX with the length in RDX and
        jumps to __chk_fail when RCX is below RDX (unsigned compare).
        RCX is presumably the size of the destination object supplied by
        the caller -- confirm against the callers of this symbol.

        There is no ret or jmp on the passing path: execution runs off
        the end of this symbol into the code assembled directly after it
        (__mempcpy_avx512_no_vzeroupper in this file), so the two must
        stay adjacent.  */
  28         cmp     %RDX_LP, %RCX_LP
  29         jb      HIDDEN_JUMPTARGET (__chk_fail)
  30 END (__mempcpy_chk_avx512_no_vzeroupper)
  31
  32 ENTRY (__mempcpy_avx512_no_vzeroupper)
     /* Sets the return value RAX = RDI + RDX (destination plus length,
        i.e. one past the last byte that will be written) and then joins
        the shared copy code at L(start), which lives inside
        __memmove_avx512_no_vzeroupper and leaves RAX untouched.  */
  33         mov     %RDI_LP, %RAX_LP
  34         add     %RDX_LP, %RAX_LP
  35         jmp     L(start)
  36 END (__mempcpy_avx512_no_vzeroupper)
  37
  38 ENTRY (__memmove_chk_avx512_no_vzeroupper)
     /* Checked entry point.  Compares RCX with the length in RDX and
        jumps to __chk_fail when RCX is below RDX (unsigned compare).
        RCX is presumably the size of the destination object supplied by
        the caller -- confirm against the callers of this symbol.

        There is no ret or jmp on the passing path: execution runs off
        the end of this symbol into the code assembled directly after it
        (__memmove_avx512_no_vzeroupper in this file), so the two must
        stay adjacent.  */
  39         cmp     %RDX_LP, %RCX_LP
  40         jb      HIDDEN_JUMPTARGET (__chk_fail)
  41 END (__memmove_chk_avx512_no_vzeroupper)
  42
  43 ENTRY (__memmove_avx512_no_vzeroupper)
     /* Copy RDX bytes from the buffer at RSI to the buffer at RDI.

        Overlapping buffers are handled: the short-copy cases load every
        byte into registers before the first store, and the long-copy
        loops pick a forward or backward direction by comparing RDI with
        RSI.

        RAX (the return value) is set once at the top and is not modified
        again by any path below.

        Register roles after L(start):
          RSI / RDI    current source / destination start
          RCX / R9     source end / destination end (start + RDX)
          RDX          number of bytes still to copy
          R8, R11      scratch
          ZMM0-ZMM15   data in flight (YMM/XMM views for short copies)

        Every path ends in a plain ret; this function never executes
        vzeroupper.  */
             /* Return value: the destination pointer as passed in.  */
  44         mov     %RDI_LP, %RAX_LP
  45 # ifdef USE_AS_MEMPCPY
             /* mempcpy build: return destination + length instead.  */
  46         add     %RDX_LP, %RAX_LP
  47 # endif
     /* Shared body.  __mempcpy_avx512_no_vzeroupper jumps here with RAX
        already set.  */
  48 L(start):
  49 # ifdef __ILP32__
  50         /* Clear the upper 32 bits.  */
  51         mov     %edx, %edx
  52 # endif
             /* RCX = one past the last source byte, R9 = one past the last
                destination byte.  */
  53         lea     (%rsi, %rdx), %rcx
  54         lea     (%rdi, %rdx), %r9
  55         cmp     $512, %rdx
  56         ja      L(512bytesormore)
  57
     /* Lengths 0..512.  The non-temporal loops also jump back here to
        finish their remainder (RDX is at most 256 when they do).  Each
        case below loads a head block starting at RSI and a tail block
        ending at RCX -- the two may overlap -- and only then stores them
        at RDI and ending at R9.  */
  58 L(check):
  59         cmp     $16, %rdx
  60         jbe     L(less_16bytes)
  61         cmp     $256, %rdx
  62         jb      L(less_256bytes)
             /* 256..512 bytes: first 256 and last 256 bytes.  */
  63         vmovups (%rsi), %zmm0
  64         vmovups 0x40(%rsi), %zmm1
  65         vmovups 0x80(%rsi), %zmm2
  66         vmovups 0xC0(%rsi), %zmm3
  67         vmovups -0x100(%rcx), %zmm4
  68         vmovups -0xC0(%rcx), %zmm5
  69         vmovups -0x80(%rcx), %zmm6
  70         vmovups -0x40(%rcx), %zmm7
  71         vmovups %zmm0, (%rdi)
  72         vmovups %zmm1, 0x40(%rdi)
  73         vmovups %zmm2, 0x80(%rdi)
  74         vmovups %zmm3, 0xC0(%rdi)
  75         vmovups %zmm4, -0x100(%r9)
  76         vmovups %zmm5, -0xC0(%r9)
  77         vmovups %zmm6, -0x80(%r9)
  78         vmovups %zmm7, -0x40(%r9)
  79         ret
  80
     /* 17..255 bytes.  The length now fits in DL, so the remaining size
        tests compare the byte register.  */
  81 L(less_256bytes):
  82         cmp     $128, %dl
  83         jb      L(less_128bytes)
             /* 128..255 bytes: first 128 and last 128 bytes.  */
  84         vmovups (%rsi), %zmm0
  85         vmovups 0x40(%rsi), %zmm1
  86         vmovups -0x80(%rcx), %zmm2
  87         vmovups -0x40(%rcx), %zmm3
  88         vmovups %zmm0, (%rdi)
  89         vmovups %zmm1, 0x40(%rdi)
  90         vmovups %zmm2, -0x80(%r9)
  91         vmovups %zmm3, -0x40(%r9)
  92         ret
  93
  94 L(less_128bytes):
  95         cmp     $64, %dl
  96         jb      L(less_64bytes)
             /* 64..127 bytes: first 64 and last 64 bytes, as 32-byte
                halves.  */
  97         vmovdqu (%rsi), %ymm0
  98         vmovdqu 0x20(%rsi), %ymm1
  99         vmovdqu -0x40(%rcx), %ymm2
 100         vmovdqu -0x20(%rcx), %ymm3
 101         vmovdqu %ymm0, (%rdi)
 102         vmovdqu %ymm1, 0x20(%rdi)
 103         vmovdqu %ymm2, -0x40(%r9)
 104         vmovdqu %ymm3, -0x20(%r9)
 105         ret
 106
 107 L(less_64bytes):
 108         cmp     $32, %dl
 109         jb      L(less_32bytes)
             /* 32..63 bytes: first 32 and last 32 bytes.  */
 110         vmovdqu (%rsi), %ymm0
 111         vmovdqu -0x20(%rcx), %ymm1
 112         vmovdqu %ymm0, (%rdi)
 113         vmovdqu %ymm1, -0x20(%r9)
 114         ret
 115
     /* 17..31 bytes: first 16 and last 16 bytes.  */
 116 L(less_32bytes):
 117         vmovdqu (%rsi), %xmm0
 118         vmovdqu -0x10(%rcx), %xmm1
 119         vmovdqu %xmm0, (%rdi)
 120         vmovdqu %xmm1, -0x10(%r9)
 121         ret
 122
     /* 0..16 bytes, copied through general-purpose registers.  The data
        is loaded into RSI and RCX themselves, which destroys the source
        pointers; they are not needed again on these paths.  */
 123 L(less_16bytes):
 124         cmp     $8, %dl
 125         jb      L(less_8bytes)
             /* 8..16 bytes: first 8 and last 8 bytes.  */
 126         movq    (%rsi), %rsi
 127         movq    -0x8(%rcx), %rcx
 128         movq    %rsi, (%rdi)
 129         movq    %rcx, -0x8(%r9)
 130         ret
 131
 132 L(less_8bytes):
 133         cmp     $4, %dl
 134         jb      L(less_4bytes)
             /* 4..7 bytes: first 4 and last 4 bytes.  */
 135         mov     (%rsi), %esi
 136         mov     -0x4(%rcx), %ecx
 137         mov     %esi, (%rdi)
 138         mov     %ecx, -0x4(%r9)
 139         ret
 140
 141 L(less_4bytes):
 142         cmp     $2, %dl
 143         jb      L(less_2bytes)
             /* 2..3 bytes: first 2 and last 2 bytes.  */
 144         mov     (%rsi), %si
 145         mov     -0x2(%rcx), %cx
 146         mov     %si, (%rdi)
 147         mov     %cx, -0x2(%r9)
 148         ret
 149
     /* 0 or 1 byte.  A zero length skips the copy and falls straight to
        the ret.  */
 150 L(less_2bytes):
 151         cmp     $1, %dl
 152         jb      L(less_1bytes)
 153         mov     (%rsi), %cl
 154         mov     %cl, (%rdi)
 155 L(less_1bytes):
 156         ret
 157
     /* More than 512 bytes.  R8 receives a size threshold, taken from the
        SHARED_CACHE_SIZE_HALF macro when it is defined and otherwise
        from the variable __x86_shared_cache_size_half.  Lengths at or
        above the threshold use the non-temporal-store loops.  */
 158 L(512bytesormore):
 159 # ifdef SHARED_CACHE_SIZE_HALF
 160         mov     $SHARED_CACHE_SIZE_HALF, %r8
 161 # else
             /* NOTE(review): this is a 64-bit load.  The variable's
                declaration is not visible here; confirm it is 8 bytes
                wide in every configuration this file is built for,
                including __ILP32__.  */
 162         mov     __x86_shared_cache_size_half(%rip), %r8
 163 # endif
 164         cmp     %r8, %rdx
 165         jae     L(preloop_large)
 166         cmp     $1024, %rdx
 167         ja      L(1024bytesormore)
             /* 513..1024 bytes: prefetch, then load the first 512 and the
                last 512 bytes into ZMM0-ZMM15 before storing any of
                them.  */
 168         prefetcht1 (%rsi)
 169         prefetcht1 0x40(%rsi)
 170         prefetcht1 0x80(%rsi)
 171         prefetcht1 0xC0(%rsi)
 172         prefetcht1 0x100(%rsi)
 173         prefetcht1 0x140(%rsi)
 174         prefetcht1 0x180(%rsi)
 175         prefetcht1 0x1C0(%rsi)
 176         prefetcht1 -0x200(%rcx)
 177         prefetcht1 -0x1C0(%rcx)
 178         prefetcht1 -0x180(%rcx)
 179         prefetcht1 -0x140(%rcx)
 180         prefetcht1 -0x100(%rcx)
 181         prefetcht1 -0xC0(%rcx)
 182         prefetcht1 -0x80(%rcx)
 183         prefetcht1 -0x40(%rcx)
 184         vmovups (%rsi), %zmm0
 185         vmovups 0x40(%rsi), %zmm1
 186         vmovups 0x80(%rsi), %zmm2
 187         vmovups 0xC0(%rsi), %zmm3
 188         vmovups 0x100(%rsi), %zmm4
 189         vmovups 0x140(%rsi), %zmm5
 190         vmovups 0x180(%rsi), %zmm6
 191         vmovups 0x1C0(%rsi), %zmm7
 192         vmovups -0x200(%rcx), %zmm8
 193         vmovups -0x1C0(%rcx), %zmm9
 194         vmovups -0x180(%rcx), %zmm10
 195         vmovups -0x140(%rcx), %zmm11
 196         vmovups -0x100(%rcx), %zmm12
 197         vmovups -0xC0(%rcx), %zmm13
 198         vmovups -0x80(%rcx), %zmm14
 199         vmovups -0x40(%rcx), %zmm15
 200         vmovups %zmm0, (%rdi)
 201         vmovups %zmm1, 0x40(%rdi)
 202         vmovups %zmm2, 0x80(%rdi)
 203         vmovups %zmm3, 0xC0(%rdi)
 204         vmovups %zmm4, 0x100(%rdi)
 205         vmovups %zmm5, 0x140(%rdi)
 206         vmovups %zmm6, 0x180(%rdi)
 207         vmovups %zmm7, 0x1C0(%rdi)
 208         vmovups %zmm8, -0x200(%r9)
 209         vmovups %zmm9, -0x1C0(%r9)
 210         vmovups %zmm10, -0x180(%r9)
 211         vmovups %zmm11, -0x140(%r9)
 212         vmovups %zmm12, -0x100(%r9)
 213         vmovups %zmm13, -0xC0(%r9)
 214         vmovups %zmm14, -0x80(%r9)
 215         vmovups %zmm15, -0x40(%r9)
 216         ret
 217
     /* More than 1024 bytes but below the threshold in R8.  A destination
        above the source is copied backward, anything else forward.

        Forward setup: R9 becomes destination end - 512, and the last 512
        source bytes are saved in ZMM8-ZMM15 before the loop can
        overwrite them.  */
 218 L(1024bytesormore):
 219         cmp     %rsi, %rdi
 220         ja      L(1024bytesormore_bkw)
 221         sub     $512, %r9
 222         vmovups -0x200(%rcx), %zmm8
 223         vmovups -0x1C0(%rcx), %zmm9
 224         vmovups -0x180(%rcx), %zmm10
 225         vmovups -0x140(%rcx), %zmm11
 226         vmovups -0x100(%rcx), %zmm12
 227         vmovups -0xC0(%rcx), %zmm13
 228         vmovups -0x80(%rcx), %zmm14
 229         vmovups -0x40(%rcx), %zmm15
 230         prefetcht1 (%rsi)
 231         prefetcht1 0x40(%rsi)
 232         prefetcht1 0x80(%rsi)
 233         prefetcht1 0xC0(%rsi)
 234         prefetcht1 0x100(%rsi)
 235         prefetcht1 0x140(%rsi)
 236         prefetcht1 0x180(%rsi)
 237         prefetcht1 0x1C0(%rsi)
 238
 239 /* Loop with unaligned memory access.  */
     /* Each iteration loads 512 bytes, prefetches the next 512 source
        bytes, stores the loaded block and advances RSI/RDI.  It repeats
        while RDI is below R9; the saved tail is then written at R9 to
        complete the final 512 bytes.  */
 240 L(gobble_512bytes_loop):
 241         vmovups (%rsi), %zmm0
 242         vmovups 0x40(%rsi), %zmm1
 243         vmovups 0x80(%rsi), %zmm2
 244         vmovups 0xC0(%rsi), %zmm3
 245         vmovups 0x100(%rsi), %zmm4
 246         vmovups 0x140(%rsi), %zmm5
 247         vmovups 0x180(%rsi), %zmm6
 248         vmovups 0x1C0(%rsi), %zmm7
 249         add     $512, %rsi
 250         prefetcht1 (%rsi)
 251         prefetcht1 0x40(%rsi)
 252         prefetcht1 0x80(%rsi)
 253         prefetcht1 0xC0(%rsi)
 254         prefetcht1 0x100(%rsi)
 255         prefetcht1 0x140(%rsi)
 256         prefetcht1 0x180(%rsi)
 257         prefetcht1 0x1C0(%rsi)
 258         vmovups %zmm0, (%rdi)
 259         vmovups %zmm1, 0x40(%rdi)
 260         vmovups %zmm2, 0x80(%rdi)
 261         vmovups %zmm3, 0xC0(%rdi)
 262         vmovups %zmm4, 0x100(%rdi)
 263         vmovups %zmm5, 0x140(%rdi)
 264         vmovups %zmm6, 0x180(%rdi)
 265         vmovups %zmm7, 0x1C0(%rdi)
 266         add     $512, %rdi
 267         cmp     %r9, %rdi
 268         jb      L(gobble_512bytes_loop)
 269         vmovups %zmm8, (%r9)
 270         vmovups %zmm9, 0x40(%r9)
 271         vmovups %zmm10, 0x80(%r9)
 272         vmovups %zmm11, 0xC0(%r9)
 273         vmovups %zmm12, 0x100(%r9)
 274         vmovups %zmm13, 0x140(%r9)
 275         vmovups %zmm14, 0x180(%r9)
 276         vmovups %zmm15, 0x1C0(%r9)
 277         ret
 278
     /* Backward setup: RDI becomes destination start + 512 and serves as
        the loop's lower bound.  The first 512 source bytes are saved in
        ZMM8-ZMM15 (ZMM15 holds the lowest 64 bytes) before the loop can
        overwrite them.  */
 279 L(1024bytesormore_bkw):
 280         add     $512, %rdi
 281         vmovups 0x1C0(%rsi), %zmm8
 282         vmovups 0x180(%rsi), %zmm9
 283         vmovups 0x140(%rsi), %zmm10
 284         vmovups 0x100(%rsi), %zmm11
 285         vmovups 0xC0(%rsi), %zmm12
 286         vmovups 0x80(%rsi), %zmm13
 287         vmovups 0x40(%rsi), %zmm14
 288         vmovups (%rsi), %zmm15
 289         prefetcht1 -0x40(%rcx)
 290         prefetcht1 -0x80(%rcx)
 291         prefetcht1 -0xC0(%rcx)
 292         prefetcht1 -0x100(%rcx)
 293         prefetcht1 -0x140(%rcx)
 294         prefetcht1 -0x180(%rcx)
 295         prefetcht1 -0x1C0(%rcx)
 296         prefetcht1 -0x200(%rcx)
 297
 298 /* Backward loop with unaligned memory access.  */
     /* Each iteration loads the 512 bytes ending at RCX, prefetches the
        512 bytes below them, stores the block ending at R9 and moves
        RCX/R9 down.  It repeats while R9 is above RDI; the saved head is
        then written to the 512 bytes ending at RDI, which is the
        original destination start.  */
 299 L(gobble_512bytes_loop_bkw):
 300         vmovups -0x40(%rcx), %zmm0
 301         vmovups -0x80(%rcx), %zmm1
 302         vmovups -0xC0(%rcx), %zmm2
 303         vmovups -0x100(%rcx), %zmm3
 304         vmovups -0x140(%rcx), %zmm4
 305         vmovups -0x180(%rcx), %zmm5
 306         vmovups -0x1C0(%rcx), %zmm6
 307         vmovups -0x200(%rcx), %zmm7
 308         sub     $512, %rcx
 309         prefetcht1 -0x40(%rcx)
 310         prefetcht1 -0x80(%rcx)
 311         prefetcht1 -0xC0(%rcx)
 312         prefetcht1 -0x100(%rcx)
 313         prefetcht1 -0x140(%rcx)
 314         prefetcht1 -0x180(%rcx)
 315         prefetcht1 -0x1C0(%rcx)
 316         prefetcht1 -0x200(%rcx)
 317         vmovups %zmm0, -0x40(%r9)
 318         vmovups %zmm1, -0x80(%r9)
 319         vmovups %zmm2, -0xC0(%r9)
 320         vmovups %zmm3, -0x100(%r9)
 321         vmovups %zmm4, -0x140(%r9)
 322         vmovups %zmm5, -0x180(%r9)
 323         vmovups %zmm6, -0x1C0(%r9)
 324         vmovups %zmm7, -0x200(%r9)
 325         sub     $512, %r9
 326         cmp     %rdi, %r9
 327         ja      L(gobble_512bytes_loop_bkw)
 328         vmovups %zmm8, -0x40(%rdi)
 329         vmovups %zmm9, -0x80(%rdi)
 330         vmovups %zmm10, -0xC0(%rdi)
 331         vmovups %zmm11, -0x100(%rdi)
 332         vmovups %zmm12, -0x140(%rdi)
 333         vmovups %zmm13, -0x180(%rdi)
 334         vmovups %zmm14, -0x1C0(%rdi)
 335         vmovups %zmm15, -0x200(%rdi)
 336         ret
 337
     /* Length at or above the threshold in R8: copy with non-temporal
        stores.  A destination above the source goes backward.

        Forward setup: the first 128 source bytes are saved in ZMM4/ZMM5
        and the original destination in R11, because the aligned loop
        skips the unaligned head of the destination.  */
 338 L(preloop_large):
 339         cmp     %rsi, %rdi
 340         ja      L(preloop_large_bkw)
 341         vmovups (%rsi), %zmm4
 342         vmovups 0x40(%rsi), %zmm5
 343
 344         mov     %rdi, %r11
 345 /* Align destination for access with non-temporal stores in the loop.  */
             /* RDI is rounded up to the next 128-byte boundary; it always
                advances, by 1..128 bytes.  R8 = old RDI - new RDI, a
                non-positive value, so subtracting it from RSI advances
                the source by the same amount and adding it to RDX
                shrinks the remaining count.  */
 346         mov     %rdi, %r8
 347         and     $-0x80, %rdi
 348         add     $0x80, %rdi
 349         sub     %rdi, %r8
 350         sub     %r8, %rsi
 351         add     %r8, %rdx
     /* 256 bytes per iteration: prefetch the source 0x200..0x3FF bytes
        ahead, load unaligned, store non-temporally to the aligned
        destination.  Repeats while more than 256 bytes remain.  */
 352 L(gobble_256bytes_nt_loop):
 353         prefetcht1 0x200(%rsi)
 354         prefetcht1 0x240(%rsi)
 355         prefetcht1 0x280(%rsi)
 356         prefetcht1 0x2C0(%rsi)
 357         prefetcht1 0x300(%rsi)
 358         prefetcht1 0x340(%rsi)
 359         prefetcht1 0x380(%rsi)
 360         prefetcht1 0x3C0(%rsi)
 361         vmovdqu64 (%rsi), %zmm0
 362         vmovdqu64 0x40(%rsi), %zmm1
 363         vmovdqu64 0x80(%rsi), %zmm2
 364         vmovdqu64 0xC0(%rsi), %zmm3
 365         vmovntdq %zmm0, (%rdi)
 366         vmovntdq %zmm1, 0x40(%rdi)
 367         vmovntdq %zmm2, 0x80(%rdi)
 368         vmovntdq %zmm3, 0xC0(%rdi)
 369         sub     $256, %rdx
 370         add     $256, %rsi
 371         add     $256, %rdi
 372         cmp     $256, %rdx
 373         ja      L(gobble_256bytes_nt_loop)
             /* Fence after the non-temporal stores, write the saved head
                at the original destination, then let L(check) copy the
                remaining bytes between the advanced RSI/RDI and the
                unchanged ends in RCX/R9.  */
 374         sfence
 375         vmovups %zmm4, (%r11)
 376         vmovups %zmm5, 0x40(%r11)
 377         jmp     L(check)
 378
     /* Backward setup: the last 128 source bytes are saved in ZMM4/ZMM5
        because the aligned loop skips the unaligned end of the
        destination.  */
 379 L(preloop_large_bkw):
 380         vmovups -0x80(%rcx), %zmm4
 381         vmovups -0x40(%rcx), %zmm5
 382
 383 /* Align end of destination for access with non-temporal stores.  */
             /* R9 is rounded down to a 128-byte boundary.  R8 briefly
                holds the misalignment (0..127), which is subtracted from
                RCX and RDX, and is then restored to the original
                destination end by adding R9 back.  */
 384         mov     %r9, %r8
 385         and     $-0x80, %r9
 386         sub     %r9, %r8
 387         sub     %r8, %rcx
 388         sub     %r8, %rdx
 389         add     %r9, %r8
     /* 256 bytes per iteration, moving downward: prefetch the source
        0x240..0x400 bytes below RCX, load unaligned, store
        non-temporally below the aligned R9.  Repeats while more than 256
        bytes remain.  */
 390 L(gobble_256bytes_nt_loop_bkw):
 391         prefetcht1 -0x400(%rcx)
 392         prefetcht1 -0x3C0(%rcx)
 393         prefetcht1 -0x380(%rcx)
 394         prefetcht1 -0x340(%rcx)
 395         prefetcht1 -0x300(%rcx)
 396         prefetcht1 -0x2C0(%rcx)
 397         prefetcht1 -0x280(%rcx)
 398         prefetcht1 -0x240(%rcx)
 399         vmovdqu64 -0x100(%rcx), %zmm0
 400         vmovdqu64 -0xC0(%rcx), %zmm1
 401         vmovdqu64 -0x80(%rcx), %zmm2
 402         vmovdqu64 -0x40(%rcx), %zmm3
 403         vmovntdq %zmm0, -0x100(%r9)
 404         vmovntdq %zmm1, -0xC0(%r9)
 405         vmovntdq %zmm2, -0x80(%r9)
 406         vmovntdq %zmm3, -0x40(%r9)
 407         sub     $256, %rdx
 408         sub     $256, %rcx
 409         sub     $256, %r9
 410         cmp     $256, %rdx
 411         ja      L(gobble_256bytes_nt_loop_bkw)
             /* Fence after the non-temporal stores, write the saved tail
                to the 128 bytes ending at the original destination end
                (R8), then let L(check) copy the remaining bytes between
                the unchanged RSI/RDI and the lowered ends in RCX/R9.  */
 412         sfence
 413         vmovups %zmm4, -0x80(%r8)
 414         vmovups %zmm5, -0x40(%r8)
 415         jmp     L(check)
 416 END (__memmove_avx512_no_vzeroupper)
 417
 /* Expose the memcpy names on top of the memmove entry points defined in
    this file.  strong_alias is defined outside this file; the argument
    order is presumably (existing symbol, alias name) -- confirm against
    its definition.  */
 418 strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
 419 strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
 420 #endif