sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

   1 /* memset with unaligned store and rep stosb
   2    Copyright (C) 2016-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 /* memset is implemented as:
  20    1. Use overlapping store to avoid branch.
  21    2. If size is less than VEC, use integer register stores.
  22    3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
  23    4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
  24    5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
  25       4 VEC stores and store 4 * VEC at a time until done.  */
  26
  27 #include <sysdep.h>
  28
  29 #ifndef MEMSET_CHK_SYMBOL
  30 # define MEMSET_CHK_SYMBOL(p,s)         MEMSET_SYMBOL(p, s)
  31 #endif
  32
  33 #ifndef WMEMSET_CHK_SYMBOL
  34 # define WMEMSET_CHK_SYMBOL(p,s)        WMEMSET_SYMBOL(p, s)
  35 #endif
  36
  37 #ifndef VZEROUPPER
  38 # if VEC_SIZE > 16
  39 #  define VZEROUPPER                    vzeroupper
  40 #  define VZEROUPPER_SHORT_RETURN       vzeroupper; ret
  41 # else
  42 #  define VZEROUPPER
  43 # endif
  44 #endif
  45
  46 #ifndef VZEROUPPER_SHORT_RETURN
  47 # define VZEROUPPER_SHORT_RETURN        rep; ret
  48 #endif
  49
  50 #ifndef MOVQ
  51 # if VEC_SIZE > 16
  52 #  define MOVQ                          vmovq
  53 #  define MOVD                          vmovd
  54 # else
  55 #  define MOVQ                          movq
  56 #  define MOVD                          movd
  57 # endif
  58 #endif
  59
  60 #if VEC_SIZE == 64
  61 # define LOOP_4X_OFFSET (VEC_SIZE * 4)
  62 #else
  63 # define LOOP_4X_OFFSET (0)
  64 #endif
  65
  66 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
  67 # define END_REG        rcx
  68 # define LOOP_REG       rdi
  69 # define LESS_VEC_REG   rax
  70 #else
  71 # define END_REG        rdi
  72 # define LOOP_REG       rdx
  73 # define LESS_VEC_REG   rdi
  74 #endif
  75
  76 #ifdef USE_XMM_LESS_VEC
  77 # define XMM_SMALL      1
  78 #else
  79 # define XMM_SMALL      0
  80 #endif
  81
  82 #ifdef USE_LESS_VEC_MASK_STORE
  83 # define SET_REG64      rcx
  84 # define SET_REG32      ecx
  85 # define SET_REG16      cx
  86 # define SET_REG8       cl
  87 #else
  88 # define SET_REG64      rsi
  89 # define SET_REG32      esi
  90 # define SET_REG16      si
  91 # define SET_REG8       sil
  92 #endif
  93
  94 #define PAGE_SIZE 4096
  95
  96 /* Macro to calculate size of small memset block for aligning
  97    purposes.  */
  98 #define SMALL_MEMSET_ALIGN(mov_sz,      ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
  99
 100
 101 #ifndef SECTION
 102 # error SECTION is not defined!
 103 #endif
 104
 105         .section SECTION(.text), "ax", @progbits
 106 #if IS_IN (libc)
 107 # if defined SHARED
 108 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 109         cmp     %RDX_LP, %RCX_LP
 110         jb      HIDDEN_JUMPTARGET (__chk_fail)
 111 END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 112 # endif
 113
 114 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 115         shl     $2, %RDX_LP
 116         WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 117         WMEMSET_VDUP_TO_VEC0_LOW()
 118         cmpq    $VEC_SIZE, %rdx
 119         jb      L(less_vec_from_wmemset)
 120         WMEMSET_VDUP_TO_VEC0_HIGH()
 121         jmp     L(entry_from_wmemset)
 122 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 123 #endif
 124
 125 #if defined SHARED && IS_IN (libc)
 126 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 127         cmp     %RDX_LP, %RCX_LP
 128         jb      HIDDEN_JUMPTARGET (__chk_fail)
 129 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 130 #endif
 131
 132 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 133         MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 134 # ifdef __ILP32__
 135         /* Clear the upper 32 bits.  */
 136         mov     %edx, %edx
 137 # endif
 138         cmpq    $VEC_SIZE, %rdx
 139         jb      L(less_vec)
 140         MEMSET_VDUP_TO_VEC0_HIGH()
 141 L(entry_from_wmemset):
 142         cmpq    $(VEC_SIZE * 2), %rdx
 143         ja      L(more_2x_vec)
 144         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 145         VMOVU   %VMM(0), -VEC_SIZE(%rdi,%rdx)
 146         VMOVU   %VMM(0), (%rdi)
 147         VZEROUPPER_RETURN
 148 #if defined USE_MULTIARCH && IS_IN (libc)
 149 END (MEMSET_SYMBOL (__memset, unaligned))
 150
 151 # if defined SHARED && IS_IN (libc)
 152 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 153         cmp     %RDX_LP, %RCX_LP
 154         jb      HIDDEN_JUMPTARGET (__chk_fail)
 155 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 156 # endif
 157
 158 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 159         MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 160 # ifdef __ILP32__
 161         /* Clear the upper 32 bits.  */
 162         mov     %edx, %edx
 163 # endif
 164         cmp     $VEC_SIZE, %RDX_LP
 165         jb      L(less_vec)
 166         MEMSET_VDUP_TO_VEC0_HIGH ()
 167         cmp     $(VEC_SIZE * 2), %RDX_LP
 168         ja      L(stosb_more_2x_vec)
 169         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 170         VMOVU   %VMM(0), (%rdi)
 171         VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 172         VZEROUPPER_RETURN
 173 #endif
 174
 175         .p2align 4,, 4
 176 L(last_2x_vec):
 177 #ifdef USE_LESS_VEC_MASK_STORE
 178         VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
 179         VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 180 #else
 181         VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi)
 182         VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi)
 183 #endif
 184         VZEROUPPER_RETURN
 185
 186         /* If have AVX512 mask instructions put L(less_vec) close to
 187            entry as it doesn't take much space and is likely a hot target.
 188          */
 189 #ifdef USE_LESS_VEC_MASK_STORE
 190         .p2align 4,, 10
 191 L(less_vec):
 192 L(less_vec_from_wmemset):
 193         /* Less than 1 VEC.  */
 194 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 195 #  error Unsupported VEC_SIZE!
 196 # endif
 197         /* Clear high bits from edi. Only keeping bits relevant to page
 198            cross check. Note that we are using rax which is set in
 199            MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 200         andl    $(PAGE_SIZE - 1), %edi
 201         /* Check if VEC_SIZE store cross page. Mask stores suffer
 202            serious performance degradation when it has to fault suppress.
 203          */
 204         cmpl    $(PAGE_SIZE - VEC_SIZE), %edi
 205         /* This is generally considered a cold target.  */
 206         ja      L(cross_page)
 207 # if VEC_SIZE > 32
 208         movq    $-1, %rcx
 209         bzhiq   %rdx, %rcx, %rcx
 210         kmovq   %rcx, %k1
 211 # else
 212         movl    $-1, %ecx
 213         bzhil   %edx, %ecx, %ecx
 214         kmovd   %ecx, %k1
 215 # endif
 216         vmovdqu8 %VMM(0), (%rax){%k1}
 217         VZEROUPPER_RETURN
 218
 219 # if defined USE_MULTIARCH && IS_IN (libc)
 220         /* Include L(stosb_local) here if including L(less_vec) between
 221            L(stosb_more_2x_vec) and ENTRY. This is to cache align the
 222            L(stosb_more_2x_vec) target.  */
 223         .p2align 4,, 10
 224 L(stosb_local):
 225         movzbl  %sil, %eax
 226         mov     %RDX_LP, %RCX_LP
 227         mov     %RDI_LP, %RDX_LP
 228         rep     stosb
 229         mov     %RDX_LP, %RAX_LP
 230         VZEROUPPER_RETURN
 231 # endif
 232 #endif
 233
 234 #if defined USE_MULTIARCH && IS_IN (libc)
 235         .p2align 4
 236 L(stosb_more_2x_vec):
 237         cmp     __x86_rep_stosb_threshold(%rip), %RDX_LP
 238         ja      L(stosb_local)
 239 #endif
 240         /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 241            and (4x, 8x] jump to target.  */
 242 L(more_2x_vec):
 243         /* Store next 2x vec regardless.  */
 244         VMOVU   %VMM(0), (%rdi)
 245         VMOVU   %VMM(0), (VEC_SIZE * 1)(%rdi)
 246
 247
 248         /* Two different methods of setting up pointers / compare. The two
 249            methods are based on the fact that EVEX/AVX512 mov instructions take
 250            more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
 251            machines also have fast LEA_BID. Both setup and END_REG to avoid complex
 252            address mode. For EVEX/AVX512 this saves code size and keeps a few
 253            targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
 254            bottlenecks.  */
 255 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 256         /* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 257         addq    %rdx, %END_REG
 258 #endif
 259
 260         cmpq    $(VEC_SIZE * 4), %rdx
 261         jbe     L(last_2x_vec)
 262
 263
 264 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 265         /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
 266            LEA_BID.  */
 267
 268         /* END_REG is rcx for EVEX/AVX512.  */
 269         leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
 270 #endif
 271
 272         /* Store next 2x vec regardless.  */
 273         VMOVU   %VMM(0), (VEC_SIZE * 2)(%rax)
 274         VMOVU   %VMM(0), (VEC_SIZE * 3)(%rax)
 275
 276
 277 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 278         /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
 279            extra offset to addresses in loop. Used for AVX512 to save space
 280            as no way to get (VEC_SIZE * 4) in imm8.  */
 281 # if LOOP_4X_OFFSET == 0
 282         subq    $-(VEC_SIZE * 4), %LOOP_REG
 283 # endif
 284         /* Avoid imm32 compare here to save code size.  */
 285         cmpq    %rdi, %rcx
 286 #else
 287         addq    $-(VEC_SIZE * 4), %END_REG
 288         cmpq    $(VEC_SIZE * 8), %rdx
 289 #endif
 290         jbe     L(last_4x_vec)
 291 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 292         /* Set LOOP_REG (rdx).  */
 293         leaq    (VEC_SIZE * 4)(%rax), %LOOP_REG
 294 #endif
 295         /* Align dst for loop.  */
 296         andq    $(VEC_SIZE * -1), %LOOP_REG
 297         .p2align 4
 298 L(loop):
 299         VMOVA   %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
 300         VMOVA   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
 301         VMOVA   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
 302         VMOVA   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 303         subq    $-(VEC_SIZE * 4), %LOOP_REG
 304         cmpq    %END_REG, %LOOP_REG
 305         jb      L(loop)
 306         .p2align 4,, MOV_SIZE
 307 L(last_4x_vec):
 308         VMOVU   %VMM(0), LOOP_4X_OFFSET(%END_REG)
 309         VMOVU   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
 310         VMOVU   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
 311         VMOVU   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
 312 L(return_vzeroupper):
 313 #if VEC_SIZE > 16
 314         ZERO_UPPER_VEC_REGISTERS_RETURN
 315 #else
 316         ret
 317 #endif
 318
 319         .p2align 4,, 10
 320 #ifndef USE_LESS_VEC_MASK_STORE
 321 # if defined USE_MULTIARCH && IS_IN (libc)
 322         /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
 323            range for 2-byte jump encoding.  */
 324 L(stosb_local):
 325         movzbl  %sil, %eax
 326         mov     %RDX_LP, %RCX_LP
 327         mov     %RDI_LP, %RDX_LP
 328         rep     stosb
 329         mov     %RDX_LP, %RAX_LP
 330         VZEROUPPER_RETURN
 331 # endif
 332         /* Define L(less_vec) only if not otherwise defined.  */
 333         .p2align 4
 334 L(less_vec):
 335         /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
 336            xmm). This is only does anything for AVX2.  */
 337         MEMSET_VDUP_TO_VEC0_LOW ()
 338 L(less_vec_from_wmemset):
 339 #endif
 340 L(cross_page):
 341 #if VEC_SIZE > 32
 342         cmpl    $32, %edx
 343         jge     L(between_32_63)
 344 #endif
 345 #if VEC_SIZE > 16
 346         cmpl    $16, %edx
 347         jge     L(between_16_31)
 348 #endif
 349 #ifndef USE_XMM_LESS_VEC
 350         MOVQ    %VMM_128(0), %SET_REG64
 351 #endif
 352         cmpl    $8, %edx
 353         jge     L(between_8_15)
 354         cmpl    $4, %edx
 355         jge     L(between_4_7)
 356         cmpl    $1, %edx
 357         jg      L(between_2_3)
 358         jl      L(between_0_0)
 359         movb    %SET_REG8, (%LESS_VEC_REG)
 360 L(between_0_0):
 361         ret
 362
 363         /* Align small targets only if not doing so would cross a fetch line.
 364          */
 365 #if VEC_SIZE > 32
 366         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 367         /* From 32 to 63.  No branch when size == 32.  */
 368 L(between_32_63):
 369         VMOVU   %VMM_256(0), (%LESS_VEC_REG)
 370         VMOVU   %VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 371         VZEROUPPER_RETURN
 372 #endif
 373
 374 #if VEC_SIZE >= 32
 375         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 376 L(between_16_31):
 377         /* From 16 to 31.  No branch when size == 16.  */
 378         VMOVU   %VMM_128(0), (%LESS_VEC_REG)
 379         VMOVU   %VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 380         ret
 381 #endif
 382
 383         /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
 384          */
 385         .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 386 L(between_8_15):
 387         /* From 8 to 15.  No branch when size == 8.  */
 388 #ifdef USE_XMM_LESS_VEC
 389         MOVQ    %VMM_128(0), (%rdi)
 390         MOVQ    %VMM_128(0), -8(%rdi, %rdx)
 391 #else
 392         movq    %SET_REG64, (%LESS_VEC_REG)
 393         movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
 394 #endif
 395         ret
 396
 397         /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
 398          */
 399         .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 400 L(between_4_7):
 401         /* From 4 to 7.  No branch when size == 4.  */
 402 #ifdef USE_XMM_LESS_VEC
 403         MOVD    %VMM_128(0), (%rdi)
 404         MOVD    %VMM_128(0), -4(%rdi, %rdx)
 405 #else
 406         movl    %SET_REG32, (%LESS_VEC_REG)
 407         movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
 408 #endif
 409         ret
 410
 411         /* 4 * XMM_SMALL for the third mov for AVX2.  */
 412         .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 413 L(between_2_3):
 414         /* From 2 to 3.  No branch when size == 2.  */
 415 #ifdef USE_XMM_LESS_VEC
 416         movb    %SET_REG8, (%rdi)
 417         movb    %SET_REG8, 1(%rdi)
 418         movb    %SET_REG8, -1(%rdi, %rdx)
 419 #else
 420         movw    %SET_REG16, (%LESS_VEC_REG)
 421         movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
 422 #endif
 423         ret
 424 END (MEMSET_SYMBOL (__memset, unaligned_erms))