sysdeps/x86_64/multiarch/strcpy-evex.S

   1 /* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20 #if ISA_SHOULD_BUILD (4)
  21
  22
  23         /* Use evex-masked stores for small sizes. Turned off at the
  24            moment.  */
  25 # define USE_EVEX_MASKED_STORE  0
  26         /* Use movsb in page cross case to save code size.  */
  27 # define USE_MOVSB_IN_PAGE_CROSS        1
  28
  29 # include <sysdep.h>
  30
  31 # ifndef VEC_SIZE
  32 #  include "x86-evex256-vecs.h"
  33 # endif
  34
  35 # ifndef STRCPY
  36 #  define STRCPY        __strcpy_evex
  37 # endif
  38
  39
  40 # ifdef USE_AS_WCSCPY
  41 #  define VMOVU_MASK    vmovdqu32
  42 #  define VPMIN vpminud
  43 #  define VPTESTN       vptestnmd
  44 #  define VPTEST        vptestmd
  45 #  define VPCMPEQ       vpcmpeqd
  46 #  define CHAR_SIZE     4
  47
  48 #  define REP_MOVS      rep movsd
  49
  50 #  define USE_WIDE_CHAR
  51 # else
  52 #  define VMOVU_MASK    vmovdqu8
  53 #  define VPMIN vpminub
  54 #  define VPTESTN       vptestnmb
  55 #  define VPTEST        vptestmb
  56 #  define VPCMPEQ       vpcmpeqb
  57 #  define CHAR_SIZE     1
  58
  59 #  define REP_MOVS      rep movsb
  60 # endif
  61
  62 # include "reg-macros.h"
  63
  64
  65 # ifdef USE_AS_STPCPY
  66 #  define END_REG       rax
  67 # else
  68 #  define END_REG       rdi, %rdx, CHAR_SIZE
  69 # endif
  70
  71 # ifdef USE_AS_STRCAT
  72 #  define PAGE_ALIGN_REG        edx
  73 #  define PAGE_ALIGN_REG_64     rdx
  74 # else
  75 #  define PAGE_ALIGN_REG        eax
  76 #  define PAGE_ALIGN_REG_64     rax
  77 # endif
  78
  79 # define VZERO  VMM(7)
  80 # define VZERO_128      VMM_128(7)
  81
  82
  83 # define PAGE_SIZE      4096
  84 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  85
  86
  87         .section SECTION(.text), "ax", @progbits
  88 ENTRY(STRCPY)
  89 # ifdef USE_AS_STRCAT
  90         movq    %rdi, %rax
  91 #  include "strcat-strlen-evex.h.S"
  92 # endif
  93
  94         movl    %esi, %PAGE_ALIGN_REG
  95         andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
  96         cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
  97         ja      L(page_cross)
  98 L(page_cross_continue):
  99         VMOVU   (%rsi), %VMM(0)
 100 # if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
 101         movq    %rdi, %rax
 102 # endif
 103
 104
 105         /* Two short string implementations. One with traditional
 106            branching approach and one with masked instructions (which
 107            have potential for dramatically bad perf if dst splits a
 108            page and is not in the TLB).  */
 109 # if USE_EVEX_MASKED_STORE
 110         VPTEST  %VMM(0), %VMM(0), %k0
 111         KMOV    %k0, %VRCX
 112 #  ifdef USE_AS_WCSCPY
 113         subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
 114 #  else
 115         inc     %VRCX
 116 #  endif
 117         jz      L(more_1x_vec)
 118         KMOV    %VRCX, %k1
 119         KXOR    %k0, %k1, %k1
 120
 121         VMOVU_MASK %VMM(0), (%rdi){%k1}
 122
 123 #  ifdef USE_AS_STPCPY
 124         bsf     %VRCX, %VRCX
 125         leaq    (%rdi, %rcx, CHAR_SIZE), %rax
 126 #  endif
 127         ret
 128
 129 # else
 130         VPTESTN %VMM(0), %VMM(0), %k0
 131         KMOV    %k0, %VRCX
 132         test    %VRCX, %VRCX
 133         jz      L(more_1x_vec)
 134
 135         xorl    %edx, %edx
 136         bsf     %VRCX, %VRDX
 137 #  ifdef USE_AS_STPCPY
 138         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 139 #  endif
 140
 141         /* Use mask bits in rcx to detect which copy we need. If the low
 142            mask is zero then there must be a bit set in the upper half.
 143            I.e if rcx != 0 and ecx == 0, then match must be upper 32
 144            bits so we use L(copy_32_63).  */
 145 #  if VEC_SIZE == 64
 146 #   ifdef USE_AS_WCSCPY
 147         testb   %cl, %cl
 148 #   else
 149         testl   %ecx, %ecx
 150 #   endif
 151         jz      L(copy_32_63)
 152 #  endif
 153
 154 #  ifdef USE_AS_WCSCPY
 155         testb   $0xf, %cl
 156 #  else
 157         testw   %cx, %cx
 158 #  endif
 159         jz      L(copy_16_31)
 160
 161
 162 #  ifdef USE_AS_WCSCPY
 163         testb   $0x3, %cl
 164 #  else
 165         testb   %cl, %cl
 166 #  endif
 167         jz      L(copy_8_15)
 168
 169
 170 #  ifdef USE_AS_WCSCPY
 171         vmovd   %VMM_128(0), (%rdi)
 172         /* No need to copy, we know its zero.  */
 173         movl    $0, (%END_REG)
 174
 175         ret
 176 #  else
 177
 178         testb   $0x7, %cl
 179         jz      L(copy_4_7)
 180
 181
 182         test    %edx, %edx
 183         jz      L(set_null_term)
 184
 185         /* NB: make this `vmovw` if support for AVX512-FP16 is added.
 186          */
 187         vmovd   %VMM_128(0), %esi
 188         movw    %si, (%rdi)
 189
 190         .p2align 4,, 1
 191 L(set_null_term):
 192         /* No need to copy, we know its zero.  */
 193         movb    $0, (%END_REG)
 194         ret
 195 #  endif
 196
 197 #  if VEC_SIZE == 64
 198         .p2align 4,, 6
 199 L(copy_32_63):
 200         VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
 201         VMOVU   %VMM_256(0), (%rdi)
 202         VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 203         ret
 204 #  endif
 205
 206
 207         .p2align 4,, 6
 208 L(copy_16_31):
 209         /* Use xmm1 explicitly here as it won't require a `vzeroupper`
 210            and will save code size.  */
 211         vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
 212         VMOVU   %VMM_128(0), (%rdi)
 213         vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
 214         ret
 215
 216         .p2align 4,, 8
 217 L(copy_8_15):
 218 #  ifdef USE_AS_WCSCPY
 219         movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
 220 #  else
 221         movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
 222 #  endif
 223         vmovq   %VMM_128(0), (%rdi)
 224         movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
 225         ret
 226 # endif
 227
 228
 229 # ifndef USE_AS_WCSCPY
 230         .p2align 4,, 12
 231 L(copy_4_7):
 232         movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
 233         vmovd   %VMM_128(0), (%rdi)
 234         movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
 235         ret
 236 # endif
 237
 238
 239         .p2align 4,, 8
 240 L(more_1x_vec):
 241 # if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 242         VMOVU   %VMM(0), (%rdi)
 243 # endif
 244         subq    %rsi, %rdi
 245         andq    $-(VEC_SIZE), %rsi
 246         addq    %rsi, %rdi
 247         VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
 248
 249         /* Ideally we store after moves to minimize impact of potential
 250            false-dependencies.  */
 251 # if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
 252         VMOVU   %VMM(0), (%rax)
 253 # endif
 254
 255         VPTESTN %VMM(1), %VMM(1), %k0
 256         KMOV    %k0, %VRCX
 257         test    %VRCX, %VRCX
 258         jnz     L(ret_vec_x1)
 259
 260         VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
 261         VMOVU   %VMM(1), VEC_SIZE(%rdi)
 262
 263         VPTESTN %VMM(2), %VMM(2), %k0
 264         KMOV    %k0, %VRCX
 265         test    %VRCX, %VRCX
 266         jnz     L(ret_vec_x2)
 267
 268         VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
 269         VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
 270
 271         VPTESTN %VMM(3), %VMM(3), %k0
 272         KMOV    %k0, %VRDX
 273         test    %VRDX, %VRDX
 274         jnz     L(ret_vec_x3)
 275
 276         VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
 277         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 278         VPTESTN %VMM(4), %VMM(4), %k0
 279         KMOV    %k0, %VRCX
 280         test    %VRCX, %VRCX
 281         jnz     L(ret_vec_x4)
 282
 283         VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
 284
 285
 286         /* Align for 4x loop.  */
 287         subq    %rsi, %rdi
 288
 289         /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
 290            we covered before aligning.  */
 291         subq    $-(VEC_SIZE * 5), %rsi
 292         andq    $-(VEC_SIZE * 4), %rsi
 293
 294
 295         /* Load first half of the loop before entry.  */
 296         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 297         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 298         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 299         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 300
 301         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 302         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 303         VPTESTN %VMM(4), %VMM(4), %k2
 304         VPTESTN %VMM(6), %VMM(6), %k4
 305         KORTEST %k2, %k4
 306         jnz     L(loop_4x_done)
 307
 308         .p2align 4,, 11
 309 L(loop_4x_vec):
 310
 311         VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
 312         VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
 313         VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
 314         VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
 315
 316         subq    $(VEC_SIZE * -4), %rsi
 317
 318         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 319         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 320         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 321         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 322
 323
 324         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 325         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 326         VPTESTN %VMM(4), %VMM(4), %k2
 327         VPTESTN %VMM(6), %VMM(6), %k4
 328         KORTEST %k2, %k4
 329         jz      L(loop_4x_vec)
 330
 331 L(loop_4x_done):
 332         VPTESTN %VMM(0), %VMM(0), %k0
 333         KMOV    %k0, %VRCX
 334         /* Restore rdi (%rdi).  */
 335         addq    %rsi, %rdi
 336         test    %VRCX, %VRCX
 337         jnz     L(ret_vec_x0_end)
 338         VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
 339
 340         KMOV    %k2, %VRCX
 341         test    %VRCX, %VRCX
 342         jnz     L(ret_vec_x1)
 343         VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
 344
 345         VPTESTN %VMM(2), %VMM(2), %k0
 346         KMOV    %k0, %VRCX
 347         test    %VRCX, %VRCX
 348         jnz     L(ret_vec_x2)
 349         VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
 350         /* Place L(ret_vec_x4) here to save code size.  We get a
 351            meaningfuly benefit doing this for stpcpy.  */
 352         KMOV    %k4, %VRDX
 353 L(ret_vec_x3):
 354         bsf     %VRDX, %VRDX
 355         VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 356         VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 357 # ifdef USE_AS_STPCPY
 358         leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 359 # endif
 360 L(return_end):
 361         ret
 362
 363         .p2align 4,, 6
 364 L(ret_vec_x0_end):
 365         bsf     %VRCX, %VRCX
 366 # ifdef USE_AS_STPCPY
 367         leaq    (%rdi, %rcx, CHAR_SIZE), %rax
 368 # endif
 369         inc     %VRCX
 370         VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 371         VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 372         ret
 373
 374         .p2align 4,, 8
 375 L(ret_vec_x1):
 376         bsf     %VRCX, %VRCX
 377         VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 378         VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 379 # ifdef USE_AS_STPCPY
 380         leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 381 # endif
 382         ret
 383
 384         .p2align 4,, 4
 385 L(ret_vec_x2):
 386         bsf     %VRCX, %VRCX
 387         VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 388         VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 389 # ifdef USE_AS_STPCPY
 390         leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 391 # endif
 392         ret
 393
 394         /* ret_vec_x3 reuses return code after the loop.  */
 395         .p2align 4,, 6
 396 L(ret_vec_x4):
 397         bsf     %VRCX, %VRCX
 398         VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 399         VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 400 # ifdef USE_AS_STPCPY
 401         leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 402 # endif
 403         ret
 404
 405
 406         .p2align 4,, 4
 407 L(page_cross):
 408 # ifndef USE_AS_STRCAT
 409         vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
 410 # endif
 411         movq    %rsi, %rcx
 412         andq    $(VEC_SIZE * -1), %rcx
 413
 414         VPCMPEQ (%rcx), %VZERO, %k0
 415         KMOV    %k0, %VRCX
 416 # ifdef USE_AS_WCSCPY
 417         andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
 418         shrl    $2, %PAGE_ALIGN_REG
 419 # endif
 420         shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 421
 422 # if USE_MOVSB_IN_PAGE_CROSS
 423         /* Optimizing more aggressively for space as this is very cold
 424            code. This saves 2x cache lines.  */
 425
 426         /* This adds once to the later result which will get correct
 427            copy bounds. NB: this can never zero-out a non-zero RCX as
 428            to be in the page cross case rsi cannot be aligned and we
 429            already right-shift rcx by the misalignment.  */
 430         shl     %VRCX
 431         jz      L(page_cross_continue)
 432 #  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
 433         movq    %rdi, %rax
 434 #  endif
 435         bsf     %VRCX, %VRCX
 436         REP_MOVS
 437
 438 #  ifdef USE_AS_STPCPY
 439         leaq    -CHAR_SIZE(%rdi), %rax
 440 #  endif
 441         ret
 442
 443
 444 # else
 445         /* Check if we found zero-char before end of page.  */
 446         test    %VRCX, %VRCX
 447         jz      L(page_cross_continue)
 448
 449         /* Traditional copy case, essentially same as used in non-page-
 450            cross case but since we can't reuse VMM(0) we need twice as
 451            many loads from rsi.  */
 452
 453 #  ifndef USE_AS_STRCAT
 454         xorl    %edx, %edx
 455 #  endif
 456         /* Dependency on rdi must already have been satisfied.  */
 457         bsf     %VRCX, %VRDX
 458 #  ifdef USE_AS_STPCPY
 459         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 460 #  elif !defined USE_AS_STRCAT
 461         movq    %rdi, %rax
 462 #  endif
 463
 464 #  if VEC_SIZE == 64
 465 #   ifdef USE_AS_WCSCPY
 466         testb   %cl, %cl
 467 #   else
 468         test    %ecx, %ecx
 469 #   endif
 470         jz      L(page_cross_copy_32_63)
 471 #  endif
 472
 473 #  ifdef USE_AS_WCSCPY
 474         testb   $0xf, %cl
 475 #  else
 476         testw   %cx, %cx
 477 #  endif
 478         jz      L(page_cross_copy_16_31)
 479
 480 #  ifdef USE_AS_WCSCPY
 481         testb   $0x3, %cl
 482 #  else
 483         testb   %cl, %cl
 484 #  endif
 485         jz      L(page_cross_copy_8_15)
 486
 487 #  ifdef USE_AS_WCSCPY
 488         movl    (%rsi), %esi
 489         movl    %esi, (%rdi)
 490         movl    $0, (%END_REG)
 491         ret
 492 #  else
 493
 494         testb   $0x7, %cl
 495         jz      L(page_cross_copy_4_7)
 496
 497         test    %edx, %edx
 498         jz      L(page_cross_set_null_term)
 499         movzwl  (%rsi), %ecx
 500         movw    %cx, (%rdi)
 501 L(page_cross_set_null_term):
 502         movb    $0, (%END_REG)
 503         ret
 504
 505
 506         .p2align 4,, 4
 507 L(page_cross_copy_4_7):
 508         movl    (%rsi), %ecx
 509         movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
 510         movl    %ecx, (%rdi)
 511         movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
 512         ret
 513 #  endif
 514
 515 #  if VEC_SIZE == 64
 516         .p2align 4,, 4
 517 L(page_cross_copy_32_63):
 518         VMOVU   (%rsi), %VMM_256(0)
 519         VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
 520         VMOVU   %VMM_256(0), (%rdi)
 521         VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 522         ret
 523 #  endif
 524
 525         .p2align 4,, 4
 526 L(page_cross_copy_16_31):
 527         vmovdqu (%rsi), %xmm0
 528         vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
 529         vmovdqu %xmm0, (%rdi)
 530         vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
 531         ret
 532
 533         .p2align 4,, 4
 534 L(page_cross_copy_8_15):
 535         movq    (%rsi), %rcx
 536         movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 537         movq    %rcx, (%rdi)
 538         movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
 539         ret
 540 # endif
 541 END(STRCPY)
 542 #endif