sysdeps/x86_64/multiarch/strncat-evex.S

   1 /* {wcs|str}ncat  with 256/512-bit EVEX.
   2    Copyright (C) 2022 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (4)
  22
  23         /* Use evex-masked stores for small sizes. Turned off at the
  24            moment.  */
  25 # define USE_EVEX_MASKED_STORE  0
             /* With the value 0 the `# else` branch of the first-vector
                handling below is assembled: short copies are done with
                overlapping ordinary loads/stores instead of masked
                stores.  */
  26
  27 # include <sysdep.h>
  28
             /* Default to the 256-bit EVEX vector configuration unless the
                including file has already selected a VEC_SIZE.  */
  29 # ifndef VEC_SIZE
  30 #  include "x86-evex256-vecs.h"
  31 # endif
  32
             /* Symbol name of this implementation.  An including file may
                predefine STRNCAT to assemble the same code under another
                name.  */
  33 # ifndef STRNCAT
  34 #  define STRNCAT       __strncat_evex
  35 # endif
  36
  37
  38 # ifdef USE_AS_WCSCPY
             /* Wide-character build: a CHAR is 4 bytes, so every
                per-CHAR instruction is the dword flavour.  */
  39 #  define MOVCHAR       movl
  40 #  define VMOVU_MASK    vmovdqu32
  41 #  define VPMIN vpminud
  42 #  define VPTESTN       vptestnmd
  43 #  define VPTEST        vptestmd
  44 #  define VPCMPEQ       vpcmpeqd
  45 #  define CHAR_SIZE     4
  46
  47 #  define REP_MOVS      rep movsd
  48
             /* The mask register must differ from the FIND_FIRST_ONE
                destination (VRCX at every call site in the non-masked
                paths) because the macro writes dst before it reads src.  */
  49 #  define VMASK_REG     VR10
             /* dst = index of the lowest set bit of src, or CHAR_PER_VEC
                when src is zero.  Relies on bsf leaving dst untouched for
                a zero src (see the comments at the call sites).  */
  50 #  define FIND_FIRST_ONE(src, dst)      movl $CHAR_PER_VEC, %dst; bsf %src, %dst
  51
  52 #  define USE_WIDE_CHAR
  53 # else
             /* Byte build: a CHAR is 1 byte, so every per-CHAR instruction
                is the byte flavour.  */
  54 #  define MOVCHAR       movb
  55 #  define VMOVU_MASK    vmovdqu8
  56 #  define VPMIN vpminub
  57 #  define VPTESTN       vptestnmb
  58 #  define VPTEST        vptestmb
  59 #  define VPCMPEQ       vpcmpeqb
  60 #  define CHAR_SIZE     1
  61
  62 #  define REP_MOVS      rep movsb
  63
             /* tzcnt reads src before writing dst, so the mask can live in
                the same register as the result.  */
  64 #  define VMASK_REG     VRCX
             /* dst = index of the lowest set bit of src.  tzcnt yields the
                operand width for a zero src; the call sites treat that
                value as CHAR_PER_VEC (assumes the VR* aliases have one bit
                per CHAR of a VEC - confirm in reg-macros.h).  */
  65 #  define FIND_FIRST_ONE(src, dst)      tzcnt %src, %dst
  66
  67 # endif
  68
  69 # include "strncpy-or-cat-overflow-def.h"
  70
  71 # include "reg-macros.h"
  72
  73
  74 # define VZERO  VMM(7)
  75 # define VZERO_128      VMM_128(7)
             /* VZERO is the zero operand of the VPCMPEQ in L(page_cross).
                NOTE(review): nothing visible in this file clears VMM(7);
                presumably the included strcat-strlen-evex.h.S does -
                confirm.  VZERO_128 is not referenced in the visible
                code.  */
  76
             /* Page size assumed by the page-cross check on src.  */
  77 # define PAGE_SIZE      4096
             /* Number of CHARs held by one vector register.  */
  78 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  79
  80         .section SECTION(.text), "ax", @progbits
             /* {char|wchar_t} *STRNCAT (dst, src, n)

                Appends at most n CHARs of src to dst and null-terminates
                the result.  The end of dst is located by the included
                strcat-strlen-evex.h.S (not visible here; presumably it
                leaves rdi pointing at dst's null terminator - confirm).

                In:  rdi = dst, rsi = src, rdx = n (in CHARs).
                Out: rax = dst.  */
  81 ENTRY(STRNCAT)
     # ifdef __ILP32__
             /* x32: size_t is only 32 bits wide, so the upper half of rdx
                is not guaranteed to be zero.  Zero-extend the length
                before the 64-bit tests and address arithmetic below.  */
             movl    %edx, %edx
     # endif

             /* The return value is always the original dst.  */
  82         movq    %rdi, %rax
  83
  84         /* NB: It's safe to filter out zero-length strings WITHOUT
  85            setting null-term. Destination MUST be a null-terminated
  86            string so essentially the work is already done.  */
  87 # ifdef USE_AS_WCSCPY
             /* (n - 1) >> 56 is non-zero for n == 0 (n - 1 wraps to
                all-ones) and for any n so large that n * CHAR_SIZE could
                not be a valid size.  Both cases go to L(zero_len).  */
  88         leaq    -1(%rdx), %rcx
  89         shrq    $56, %rcx
  90         jnz     L(zero_len)
  91 # else
             /* Signed test: n == 0 and n with the top bit set both go to
                L(zero_len), which reuses the flags set here to tell the
                two cases apart.  */
  92         test    %rdx, %rdx
  93         jle     L(zero_len)
  94 # endif
  95
  96 # include "strcat-strlen-evex.h.S"
  97
             /* ecx = offset of src within its page.  If a full VEC_SIZE
                load starting at src would run past the end of the page,
                take the careful path.  L(page_cross) also consumes ecx.  */
  98         movl    %esi, %ecx
  99         andl    $(PAGE_SIZE - 1), %ecx
 100         cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
 101         ja      L(page_cross)
 102 L(page_cross_continue):
             /* Load the first (unaligned) VEC of src.  k0 gets one bit per
                CHAR, set where that CHAR is zero.  */
 103         VMOVU   (%rsi), %VMM(0)
 104         VPTESTN %VMM(0), %VMM(0), %k0
 105
 106         /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
 107            <= CHAR_PER_VEC with masked instructions (which have
 108            potential for dramatically bad perf if dst splits a page and
 109            is not in the TLB).  */
 110 # if USE_EVEX_MASKED_STORE
 111         KMOV    %k0, %VRCX
 112         FIND_FIRST_ONE (VRCX, VR8)
             /* n <= index of the first null (or there is no null in this
                VEC): copy exactly n CHARs.  */
 113         cmpq    %r8, %rdx
 114         jbe     L(less_1x_vec)
 115
             /* No null in the first VEC and the limit reaches beyond
                it.  */
 116         test    %VRCX, %VRCX
 117         jz      L(more_1x_vec)
 118
             /* blsmsk sets every bit up to and including the lowest set
                bit, so one masked store writes the string together with
                its null terminator.  */
 119         blsmsk  %VRCX, %VRCX
 120         KMOV    %VRCX, %k1
 121         VMOVU_MASK %VMM(0), (%rdi){%k1}
 122         ret
 123
 124 L(less_1x_vec):
             /* Build a mask with the low n bits set, write the terminator
                at dst[n] and store the n CHARs with a masked store.  */
 125         mov     $-1, %VRCX
 126         bzhi    %VRDX, %VRCX, %VRCX
 127         KMOV    %VRCX, %k1
 128         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 129         VMOVU_MASK %VMM(0), (%rdi){%k1}
 130
 131         ret
 132 # else
 133         KMOV    %k0, %VMASK_REG
 134         /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
 135            %VMASK_REG, %VRCX` for wcsncat.  */
 136         FIND_FIRST_ONE (VMASK_REG, VRCX)
             /* n <= index of the first null (or there is no null in this
                VEC): copy exactly n CHARs.  */
 137         cmpq    %rcx, %rdx
 138         jbe     L(less_1x_vec)
 139
 140         /* If there were no zero-CHARs (rcx was zero before
 141            FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
 142         cmpl    $CHAR_PER_VEC, %ecx
 143         je      L(more_1x_vec)
 144
             /* The null terminator lies inside the first VEC and within
                the limit: its index becomes the copy length.  */
 145         movl    %ecx, %edx
 146
 147 L(less_1x_vec):
             /* edx = number of CHARs to copy (at most CHAR_PER_VEC) and
                VMM(0) holds the first VEC of src.  Dispatch on the byte
                size of the copy.  Each size class below writes a head and
                an overlapping tail and then the null terminator at
                dst[edx].  */
 148 #  if VEC_SIZE == 64
 149         cmpl    $(32 / CHAR_SIZE), %edx
 150         jae     L(copy_32_63)
 151 #  endif
 152
 153         cmpl    $(16 / CHAR_SIZE), %edx
 154         jae     L(copy_16_31)
 155
 156
 157         cmpl    $(8 / CHAR_SIZE), %edx
 158         jae     L(copy_8_15)
 159
 160 #  ifdef USE_AS_WCSCPY
             /* Fewer than 2 wide CHARs: store the first CHAR, then the
                terminator.  For edx == 0 the terminator overwrites it.  */
 161         vmovd   %VMM_128(0), (%rdi)
 162         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 163         ret
 164 #  else
 165
 166         cmpl    $4, %edx
 167         jae     L(copy_4_7)
 168
             /* 0 to 3 bytes.  Keep src[0] in ecx; for lengths 2 and 3 also
                copy src[1..2] as one word (for length 2 the extra byte at
                dst[2] is overwritten by the terminator below).  */
 169         movzbl  (%rsi), %ecx
 170         cmpl    $1, %edx
 171         jbe     L(set_null_term)
 172
 173         movzwl  1(%rsi), %esi
 174         movw    %si, 1(%rdi)
 175
 176         .p2align 4,, 1
 177 L(set_null_term):
             /* Store src[0], then the terminator.  For edx == 0 both
                stores hit dst[0] and the terminator wins.  */
 178         movb    %cl, (%rdi)
 179         MOVCHAR $0, (%rdi, %rdx)
 180         ret
 181 #  endif
 182
 183 #  if VEC_SIZE == 64
             /* The copy_* blocks share one pattern: the head comes from
                the low part of VMM(0) (the first VEC of src, already
                loaded), the tail is reloaded from memory so that it ends
                exactly at src + edx CHARs, and the two stores may overlap.
                The terminator is written last at dst[edx].  */
 184         .p2align 4,, 6
 185 L(copy_32_63):
 186         VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
 187         VMOVU   %VMM_256(0), (%rdi)
 188         VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
 189         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 190         ret
 191 #  endif
 192         .p2align 4,, 6
 193 L(copy_16_31):
 194         /* Use xmm1 explicitly here as it won't require a `vzeroupper`
 195            and will save code size.  */
             /* 16-byte head from VMM(0) plus 16-byte tail ending at
                src + edx CHARs.  */
 196         vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
 197         VMOVU   %VMM_128(0), (%rdi)
 198         vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
 199         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 200         ret
 201
 202         .p2align 4,, 2
 203 L(copy_8_15):
             /* 8-byte head from VMM(0) plus 8-byte tail carried in rcx.  */
 204         movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
 205         vmovq   %VMM_128(0), (%rdi)
 206         movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
 207         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 208         ret
 209
 210 #  ifndef USE_AS_WCSCPY
 211         .p2align 4,, 12
 212 L(copy_4_7):
             /* Byte build only: 4-byte head from VMM(0) plus 4-byte tail
                carried in ecx.  */
 213         movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
 214         vmovd   %VMM_128(0), (%rdi)
 215         movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
 216         MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
 217         ret
 218 #  endif
 219
 220 # endif
 221         .p2align 4,, 4
 222 L(zero_len):
             /* Reached from the length filter at entry with n == 0 or with
                an out-of-range n.  In the byte build the flags of the
                entry `test %rdx, %rdx` are still live here; the wide build
                clobbered them with shrq and has to test again.  */
 223 # ifdef USE_AS_WCSCPY
 224         test    %rdx, %rdx
 225 # endif
             /* n != 0 means the length was out of range: hand over to
                OVERFLOW_STRCAT (from strncpy-or-cat-overflow-def.h;
                presumably the unbounded {wcs|str}cat - confirm).
                n == 0: nothing to do, rax already holds dst.  */
 226         jne     OVERFLOW_STRCAT
 227         ret
 228
 229         .p2align 4,, 8
 230 L(more_1x_vec):
             /* The first VEC holds no null and the limit reaches beyond
                it, so it can be stored whole.  */
 231         VMOVU   %VMM(0), (%rdi)
 232
 233         /* We are going to align rsi here so will need to be able to re-
 234            adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 235            so rsi + rdx * CHAR_SIZE cannot overflow.  */
 236
             /* rdx = src + n * CHAR_SIZE - VEC_SIZE and rdi = dst - src,
                both taken before rsi is rounded down to a VEC_SIZE
                boundary.  */
 237         leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
 238         subq    %rsi, %rdi
 239         andq    $-(VEC_SIZE), %rsi
 240 L(loop_last_4x_vec):
             /* Also entered from L(loop_4x_vec) with the same register
                convention.  After these adjustments rdi is the dst pointer
                that corresponds to the aligned rsi, and rdx is the
                remaining limit in CHARs counted from
                (VEC_SIZE * 1)(%rsi).  */
 241         addq    %rsi, %rdi
 242         subq    %rsi, %rdx
 243 # ifdef USE_AS_WCSCPY
             /* Convert the byte distance into CHARs.  */
 244         shrq    $2, %rdx
 245 # endif
 246
 247         /* Will need this regardless.  */
 248         VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
 249         VPTESTN %VMM(1), %VMM(1), %k0
 250         KMOV    %k0, %VMASK_REG
 251
             /* More than 2 VECs of limit left: take the longer path.
                Otherwise fall through into L(last_2x_vec).  */
 252         cmpq    $(CHAR_PER_VEC * 2), %rdx
 253         ja      L(more_2x_vec)
 254
 255 L(last_2x_vec):
             /* At most 2 VECs of limit remain.  VMM(1) is the VEC at
                (VEC_SIZE * 1)(%rsi), VMASK_REG is its zero-CHAR mask and
                edx is the limit counted from the start of that VEC.  */
 256         FIND_FIRST_ONE (VMASK_REG, VRCX)
             /* The limit ends at or before the first null of VEC 1.  */
 257         cmpl    %ecx, %edx
 258         jbe     L(ret_vec_x1_len)
 259
 260         /* If there were no zero-CHARs (rcx was zero before
 261            FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
 262         cmpl    $CHAR_PER_VEC, %ecx
 263         jne     L(ret_vec_x1)
 264
             /* VEC 1 has no null: store it and examine VEC 2, with the
                limit rebased to the start of VEC 2.  */
 265         VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
 266         VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
 267         VPTESTN %VMM(2), %VMM(2), %k0
 268         KMOV    %k0, %VRCX
 269         addl    $-CHAR_PER_VEC, %edx
             /* bzhi keeps only the mask bits below the limit.  ZF set
                means no null before the limit, so the limit (edx) is the
                end position.  The result in VR8 itself is not used.  */
 270         bzhi    %VRDX, %VRCX, %VR8
 271         jz      L(ret_vec_x2_len)
 272 L(ret_vec_x2):
             /* VRCX = non-zero zero-CHAR mask of VEC 2; edx becomes the
                index of the null inside VEC 2.  */
 273         bsf     %VRCX, %VRDX
 274 L(ret_vec_x2_len):
             /* Copy the VEC_SIZE bytes that end right before CHAR index
                edx of VEC 2 (overlapping data that was stored earlier) and
                write the terminator at that index.  */
 275         VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 276         MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
 277         VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
 278         ret
 279
 280         .p2align 4,, 4
 281 L(ret_vec_x1_len):
             /* The end position is the limit rather than a null.  */
 282         movl    %edx, %ecx
 283 L(ret_vec_x1):
             /* ecx = end index inside VEC 1.  Same scheme as
                L(ret_vec_x2_len), one VEC earlier.  */
 284         VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 285         MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
 286         VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
             /* NOTE(review): this is the only exit in the file that uses
                VZEROUPPER_RETURN; every other one is a plain `ret`.
                Confirm whether the difference is intended.  */
 287         VZEROUPPER_RETURN
 288
 289
 290         .p2align 4,, 8
 291 L(last_4x_vec):
             /* Entered from L(more_4x_vec) once VECs 1-4 have been copied
                and at most 8 VECs of limit were left.  Move the window
                forward by 4 VECs: rebase the limit, preload the VEC that
                becomes the new VEC 1 together with its zero-CHAR mask, and
                advance both pointers.  */
 292         addl    $-(CHAR_PER_VEC * 4), %edx
 293         VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
 294         VPTESTN %VMM(1), %VMM(1), %k0
 295         KMOV    %k0, %VMASK_REG
 296         subq    $-(VEC_SIZE * 4), %rsi
 297         subq    $-(VEC_SIZE * 4), %rdi
             /* At most 2 VECs of limit left: finish in L(last_2x_vec).
                Otherwise fall through into L(more_2x_vec).  */
 298         cmpl    $(CHAR_PER_VEC * 2), %edx
 299         jbe     L(last_2x_vec)
 300         .p2align 4,, 8
 301 L(more_2x_vec):
             /* More than 2 VECs of limit remain, so VECs 1 and 2 only have
                to be checked for a null.  VMASK_REG holds the zero-CHAR
                mask of VEC 1.  */
 302 # ifdef USE_AS_WCSCPY
             /* NOTE(review): in the wide build the bsf destination (ecx)
                differs from its source (VR10); presumably ecx is cleared
                to avoid depending on its stale value - confirm.  */
 303         xorl    %ecx, %ecx
 304 # endif
             /* bsf clears ZF when the mask is non-zero, i.e. VEC 1
                contains a null at index ecx.  */
 305         bsf     %VMASK_REG, %VRCX
 306         jnz     L(ret_vec_x1)
 307
 308         VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
 309         VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
 310         VPTESTN %VMM(2), %VMM(2), %k0
 311         KMOV    %k0, %VRCX
             /* L(ret_vec_x2) expects the raw mask in VRCX, hence test
                rather than bsf.  */
 312         test    %VRCX, %VRCX
 313         jnz     L(ret_vec_x2)
 314
 315         VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
 316         VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
 317         VPTESTN %VMM(3), %VMM(3), %k0
 318         KMOV    %k0, %VMASK_REG
 319
 320         cmpq    $(CHAR_PER_VEC * 4), %rdx
 321         ja      L(more_4x_vec)
 322
 323         /* Adjust length before going to L(ret_vec_x3_len) or
 324            L(ret_vec_x3).  */
 325         addl    $(CHAR_PER_VEC * -2), %edx
 326
             /* From here on this mirrors L(last_2x_vec), two VECs further
                on: edx is the limit counted from the start of VEC 3.  */
 327         FIND_FIRST_ONE (VMASK_REG, VRCX)
 328         cmpl    %ecx, %edx
 329         jbe     L(ret_vec_x3_len)
 330
 331         /* If there were no zero-CHARs (rcx was zero before
 332            FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
 333         cmpl    $CHAR_PER_VEC, %ecx
 334         jne     L(ret_vec_x3)
 335
 336         VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
 337         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 338         VPTESTN %VMM(4), %VMM(4), %k0
 339         KMOV    %k0, %VRCX
 340         addl    $-CHAR_PER_VEC, %edx
             /* ZF set: no null in VEC 4 before the limit, so the limit
                (edx) is the end position.  */
 341         bzhi    %VRDX, %VRCX, %VR8
 342         jz      L(ret_vec_x4_len)
 343 L(ret_vec_x4):
             /* VRCX = non-zero zero-CHAR mask of VEC 4; edx becomes the
                index of the null inside VEC 4.  */
 344         bsf     %VRCX, %VRDX
 345 L(ret_vec_x4_len):
             /* Copy the VEC_SIZE bytes that end right before CHAR index
                edx of VEC 4 and write the terminator at that index.  */
 346         VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 347         MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
 348         VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
 349         ret
 350
 351         .p2align 4,, 4
 352 L(ret_vec_x3_len):
             /* The end position is the limit rather than a null.  */
 353         movl    %edx, %ecx
 354 L(ret_vec_x3):
             /* ecx = end index inside VEC 3.  */
 355         VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 356         MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
 357         VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 358         ret
 359
 360         .p2align 4,, 8
 361 L(more_4x_vec):
             /* More than 4 VECs of limit remain, so VECs 3 and 4 only have
                to be checked for a null.  VMASK_REG holds the zero-CHAR
                mask of VEC 3.  */
 362 # ifdef USE_AS_WCSCPY
 363         xorl    %ecx, %ecx
 364 # endif
 365         bsf     %VMASK_REG, %VRCX
 366         jnz     L(ret_vec_x3)
 367
 368         VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
 369         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 370         VPTESTN %VMM(4), %VMM(4), %k0
 371         KMOV    %k0, %VRCX
 372         test    %VRCX, %VRCX
 373         jnz     L(ret_vec_x4)
 374
 375         VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
 376
 377         /* Check if we are near the end before aligning.  */
 378         cmpq    $(CHAR_PER_VEC * 8), %rdx
 379         jbe     L(last_4x_vec)
 380
 381
 382         /* Add rsi to rdx (length) before aligning rsi. NB: Since we
 383            filtered out huge lengths this cannot overflow.  */
 384 # ifdef USE_AS_WCSCPY
 385         leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
 386 # else
 387         addq    %rsi, %rdx
 388 # endif
 389
 390         /* Subtract rsi from rdi before aligning (add back will have
 391            correct rdi for aligned rsi).  */
 392         subq    %rsi, %rdi
             /* Step past the 5 VECs handled so far and round down to a
                4 * VEC_SIZE boundary.  Rounding down can only land on data
                that has already been copied.  */
 393         subq    $-(VEC_SIZE * 5), %rsi
 394         andq    $(VEC_SIZE * -4), %rsi
 395
 396         /* Load first half of the loop before entry.  */
 397         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 398         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 399         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 400         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 401
             /* The unsigned minimum of two VECs has a zero CHAR exactly
                where either input has one, so k2 covers VMM(0)/VMM(1) and
                k4 covers VMM(2)/VMM(3).  */
 402         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 403         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 404         VPTESTN %VMM(4), %VMM(4), %k2
 405         VPTESTN %VMM(6), %VMM(6), %k4
 406
 407         /* Offset rsi by VEC_SIZE so that we can jump to
 408            L(loop_last_4x_vec).  */
 409         addq    $-(VEC_SIZE), %rsi
             /* A null somewhere in these 4 VECs: skip the loop.  */
 410         KORTEST %k2, %k4
 411         jnz     L(loop_4x_done)
 412
 413         /* Store loop end in r9.  */
 414         leaq    -(VEC_SIZE * 5)(%rdx), %r9
 415
 416         .p2align 4,, 11
 417 L(loop_4x_vec):
             /* Main loop, 4 VECs per iteration.  On entry VMM(0..3) hold 4
                null-free VECs loaded from (VEC_SIZE * 1..4)(%rsi), rdi
                holds dst - src so that (%rdi, %rsi) addresses dst, and r9
                is the loop end bound for rsi.  */
 418         VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
 419         VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
 420         VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
 421         VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
 422
             /* Advance by 4 VECs.  Once rsi reaches the bound in r9, the
                length-limited tail code finishes the copy.  */
 423         subq    $(VEC_SIZE * -4), %rsi
 424         cmpq    %rsi, %r9
 425         jbe     L(loop_last_4x_vec)
 426
 427         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
 428         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
 429         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
 430         VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
 431
             /* k2 / k4 = zero-CHAR masks of min(VMM(0), VMM(1)) and
                min(VMM(2), VMM(3)).  Keep looping while both are empty.  */
 432         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 433         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 434         VPTESTN %VMM(4), %VMM(4), %k2
 435         VPTESTN %VMM(6), %VMM(6), %k4
 436         KORTEST %k2, %k4
 437         jz      L(loop_4x_vec)
 438
 439 L(loop_4x_done):
             /* One of VMM(0..3), loaded from (VEC_SIZE * 1..4)(%rsi),
                contains a null.  Find the first such VEC, storing every
                null-free VEC that precedes it.  */
 440         VPTESTN %VMM(0), %VMM(0), %k0
 441         KMOV    %k0, %VRCX
 442         /* Restore rdi (dst).  */
 443         addq    %rsi, %rdi
 444
 445         /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
 446            test with bsf.  */
 447         bsf     %VRCX, %VRCX
 448         jnz     L(ret_vec_x1)
 449         VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
 450
             /* VMM(0) has no null, so k2 (computed from
                min(VMM(0), VMM(1))) is exactly the zero-CHAR mask of
                VMM(1).  L(ret_vec_x2) wants the raw mask.  */
 451         KMOV    %k2, %VRCX
 452         test    %VRCX, %VRCX
 453         jnz     L(ret_vec_x2)
 454         VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
 455
 456         VPTESTN %VMM(2), %VMM(2), %k0
 457         KMOV    %k0, %VRCX
 458         bsf     %VRCX, %VRCX
 459         jnz     L(ret_vec_x3)
 460         VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
 461
             /* The null must be in VMM(3), and k4 is now exactly its
                zero-CHAR mask.  Copy the VEC_SIZE bytes whose last CHAR is
                that null, so the terminator is carried by the vector store
                and no separate MOVCHAR is needed.  */
 462         KMOV    %k4, %VRCX
 463         bsf     %VRCX, %VRCX
 464         VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 465         VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 466         ret
 467
 468
 469         .p2align 4,, 4
 470 L(page_cross):
             /* A VEC_SIZE load from src would cross a page boundary.  On
                entry ecx = src & (PAGE_SIZE - 1).  Instead, compare the
                aligned VEC that contains src against VZERO and shift out
                the mask bits of the CHARs that precede src.  */
 471         movq    %rsi, %r8
 472         andq    $(VEC_SIZE * -1), %r8
 473         VPCMPEQ (%r8), %VZERO, %k0
 474
 475 # ifdef USE_AS_WCSCPY
             /* ecx = misalignment of src within the VEC, in CHARs.  */
 476         KMOV    %k0, %VR9
 477         shrl    $2, %ecx
 478         andl    $(CHAR_PER_VEC - 1), %ecx
 479         shrx    %VRCX, %VR9, %VRCX
 480 # else
             /* shrx only uses the low bits of the count, i.e. the
                misalignment of src (assumes VRCX is as wide as a VEC has
                bytes - confirm in reg-macros.h).  */
 481         KMOV    %k0, %VRCX
 482         shrx    %VRSI, %VRCX, %VRCX
 483 # endif
 484
             /* r8 = number of CHARs from src up to the end of the aligned
                VEC.  */
 485         subl    %esi, %r8d
 486         andl    $(VEC_SIZE - 1), %r8d
 487 # ifdef USE_AS_WCSCPY
 488         shrl    $2, %r8d
 489 # endif
             /* The limit ends inside this VEC: handle it without reading
                any further.  */
 490         cmpq    %r8, %rdx
 491         jbe     L(page_cross_small)
 492         /* Optimizing more for space as this is very cold code. This
 493            saves 2x cache lines.  */
 494
 495         /* This adds one to the later bsf result, which makes the copy
 496            count include the null terminator. NB: this can never
 497            zero-out a non-zero RCX as to be in the page cross case rsi
 498            cannot be aligned and we already right-shift rcx by the
                misalignment.  */
 499         shl     %VRCX
             /* No null before the end of this VEC while the limit reaches
                beyond it: the string continues onto the next page, so go
                back to the regular path.  */
 500         jz      L(page_cross_continue)
             /* rcx = index of the null + 1: copy the string together with
                its terminator.  */
 501         bsf     %VRCX, %VRCX
 502         REP_MOVS
 503         ret
 504
 505 L(page_cross_small):
             /* The limit (edx) ends within the current aligned VEC.  VRCX
                is the zero-CHAR mask shifted so that bit 0 corresponds to
                src[0].  tzcnt sets ZF when its result is 0, i.e. src[0] is
                the null: nothing to copy, only the terminator is
                written.  */
 506         tzcnt   %VRCX, %VRCX
 507         jz      L(page_cross_setz)
             /* ecx = min (index of the first null, n).  */
 508         cmpl    %edx, %ecx
 509         cmova   %edx, %ecx
 510
             /* Copy ecx CHARs.  rep movs leaves rdi just past the last
                CHAR copied.  NOTE(review): this spells out what the
                REP_MOVS macro already provides.  */
 511 # ifdef USE_AS_WCSCPY
 512         rep     movsd
 513 # else
 514         rep     movsb
 515 # endif
 516 L(page_cross_setz):
             /* Terminate the result at the current end of dst.  */
 517         MOVCHAR $0, (%rdi)
 518         ret
 519 END(STRNCAT)
 520 #endif