sysdeps/x86_64/multiarch/strncpy-evex.S

   1 /* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   2    Copyright (C) 2022 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (4)
  22
  23         /* Use evex-masked stores for small sizes. Turned off at the
  24            moment.  */
             /* With the value 0 the `# if USE_EVEX_MASKED_STORE` regions
                below are compiled out and lengths <= CHAR_PER_VEC are
                handled by the branchy L(less_1x_vec) code instead.  */
  25 # define USE_EVEX_MASKED_STORE  0
  26
  27
  28 # include <sysdep.h>
             /* If the including file did not pick a vector width, pull in
                the evex256 register definitions (presumably VEC_SIZE == 32;
                confirm against x86-evex256-vecs.h).  */
  29 # ifndef VEC_SIZE
  30 #  include "x86-evex256-vecs.h"
  31 # endif
  32
  33
             /* Name of the entry point.  It can be pre-defined by an
                including file to build this code under another symbol.  */
  34 # ifndef STRNCPY
  35 #  define STRNCPY       __strncpy_evex
  36 # endif
  37
  38 # ifdef USE_AS_WCSCPY
             /* Wide-character build: a CHAR is 4 bytes, so select the
                dword forms of the vector and string instructions.  */
  39 #  define VMOVU_MASK    vmovdqu32
  40 #  define VPCMPEQ       vpcmpeqd
  41 #  define VPMIN vpminud
  42 #  define VPTESTN       vptestnmd
  43 #  define VPTEST        vptestmd
  44 #  define CHAR_SIZE     4
  45
             /* Both string ops use the AT&T `l` (dword) suffix.  `movsd`
                is the Intel spelling and is also the mnemonic of the SSE2
                scalar-double move, so spell the string op `movsl` to
                match `stosl`.  The encoding is unchanged.  */
  46 #  define REP_MOVS      rep movsl
  47 #  define REP_STOS      rep stosl
  48
  49 #  define USE_WIDE_CHAR
  50
  51 # else
             /* Byte build: a CHAR is 1 byte, so select the byte forms.  */
  52 #  define VMOVU_MASK    vmovdqu8
  53 #  define VPCMPEQ       vpcmpeqb
  54 #  define VPMIN vpminub
  55 #  define VPTESTN       vptestnmb
  56 #  define VPTEST        vptestmb
  57 #  define CHAR_SIZE     1
  58
  59 #  define REP_MOVS      rep movsb
  60 #  define REP_STOS      rep stosb
  61 # endif
  62
             /* NOTE(review): OVERFLOW_STRCPY, used by
                L(best_effort_strncpy), is presumably defined by this
                header -- confirm.  */
  63 # include "strncpy-or-cat-overflow-def.h"
  64
  65 # define PAGE_SIZE      4096
             /* Number of CHARs (bytes, or 4-byte wide chars) in one VEC.  */
  66 # define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
  67
  68 # include "reg-macros.h"
  69
  70
             /* Vector register 7 is kept all-zero for the whole function
                (it is cleared right after the length filter at entry).  It
                is used both as the comparand for VPCMPEQ and as the source
                of zero-fill stores.  The _256 / _128 names are the narrower
                views of the same register.  */
  71 # define VZERO  VMM(7)
  72 # define VZERO_256      VMM_256(7)
  73 # define VZERO_128      VMM_128(7)
  74
             /* View of the zero register that is VEC_SIZE / 2 bytes wide;
                used by L(zfill_less_vec).  */
  75 # if VEC_SIZE == 64
  76 #  define VZERO_HALF    VZERO_256
  77 # else
  78 #  define VZERO_HALF    VZERO_128
  79 # endif
  80
  81         .section SECTION(.text), "ax", @progbits
  82 ENTRY(STRNCPY)
             /* Register roles on entry (SysV AMD64):
                  rdi = dst, rsi = src, rdx = n (maximum number of CHARs).
                Copies CHARs from src up to and including the first zero
                CHAR, but never more than n, then zero-fills dst up to n.
                Non-STPCPY builds return dst in rax.  USE_AS_STPCPY builds
                return a pointer to the first zero CHAR written, or
                dst + n if none was written.  */
     # ifdef __ILP32__
             /* On x32 size_t is 32 bits wide and the upper half of rdx is
                not guaranteed to be zero.  All length arithmetic below is
                64-bit, so clear the upper 32 bits first.  */
             movl    %edx, %edx
     # endif
  83         /* Filter zero length strings and very long strings.  Zero
  84            length strings just return, very long strings are handled by
  85            just running rep stos{b|l} to zero set (which will almost
  86            certainly segfault), if that succeeds then just calling
  87            OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
             /* From here on rdx holds length - 1.  */
  88 # ifdef USE_AS_WCSCPY
  89         decq    %rdx
  90         movq    %rdx, %rax
  91         /* 56 is end of max supported address space.  */
             /* Any bit at or above bit 56 of (length - 1) means the length
                was zero (wrapped to -1) or far too large.  */
  92         shr     $56, %rax
  93         jnz     L(zero_len)
  94 # else
  95         decq    %rdx
  96         /* If the flag needs to become `jb` replace `dec` with `sub`.
  97          */
             /* Signed test: taken when length - 1 is negative, i.e. the
                length was zero or had its top bit set.  */
  98         jl      L(zero_len)
  99 # endif
 100
             /* Clear the zero register, then check whether an unaligned
                VEC_SIZE load from src could run past the end of its
                page.  */
 101         vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
 102         movl    %esi, %eax
 103         andl    $(PAGE_SIZE - 1), %eax
 104         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
 105         ja      L(page_cross)
 106
 107 L(page_cross_continue):
             /* Load the first (possibly unaligned) VEC of src.  VRCX gets
                one bit per CHAR, set where that CHAR is zero.  */
 108         VMOVU   (%rsi), %VMM(0)
 109         VPTESTN %VMM(0), %VMM(0), %k0
 110         KMOV    %k0, %VRCX
 111
 112         /* If no STPCPY just save end ahead of time.  */
             /* Here that means the return value: rax = dst.  */
 113 # ifndef USE_AS_STPCPY
 114         movq    %rdi, %rax
 115 # endif
 116
 117
             /* rdx is length - 1.  The flags set here are consumed by the
                `jae` / `jb` below; nothing in between modifies them.  */
 118         cmpq    $(CHAR_PER_VEC), %rdx
 119
 120         /* If USE_EVEX_MASK_STORE is enabled then we just handle length
 121            <= CHAR_PER_VEC with masked instructions (which have
 122            potential for dramatically bad perf if dst splits a page and
 123            is not in the TLB).  */
 124 # if USE_EVEX_MASKED_STORE
 125         /* `jae` because length rdx is now length - 1.  */
 126         jae     L(more_1x_vec)
 127
 128         /* If there were multiple zero-CHAR matches in the first VEC,
 129            VRCX will be overset but that's fine since any oversets were
 130            at zero-positions anyways.  */
 131
 132 #  ifdef USE_AS_STPCPY
             /* rax = min (index of first zero CHAR, length - 1), plus one
                (via the carry of the compare) when no zero CHAR lies
                within the length; then turn it into a pointer.  */
 133         tzcnt   %VRCX, %VRAX
 134         cmpl    %eax, %edx
 135         cmovb   %edx, %eax
 136 #   ifdef USE_AS_WCSCPY
 137         adcl    $0, %eax
 138         leaq    (%rdi, %rax, CHAR_SIZE), %rax
 139 #   else
 140         adcq    %rdi, %rax
 141 #   endif
 142 #  endif
             /* mask - 1 sets every bit below the lowest set bit.  */
 143         dec     %VRCX
 144
 145         /* Zero out all non-zero CHAR's after the first zero match.  */
 146         KMOV    %VRCX, %k1
 147
 148         /* Use VZERO as destination so this can be reused for
 149            L(zfill_less_vec) (which if jumped to by subsequent logic
 150            will have zeroed out VZERO).  */
 151         VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
 152 L(zfill_less_vec):
 153         /* Get mask for what we need to set.  */
             /* edx + 1 is the number of CHARs to store; bzhi keeps that
                many low bits of an all-ones value.  */
 154         incl    %edx
 155         mov     $-1, %VRCX
 156         bzhi    %VRDX, %VRCX, %VRCX
 157         KMOV    %VRCX, %k1
 158         VMOVU_MASK %VZERO, (%rdi){%k1}
 159         ret
 160
 161         .p2align 4,, 4
 162 L(zero_len):
             /* rdx == -1 means the original length was zero: return dst.
                Anything else is a huge length.  */
 163         cmpq    $-1, %rdx
 164         jne     L(best_effort_strncpy)
 165         movq    %rdi, %rax
 166         ret
 167
 168         .p2align 4,, 8
 169 L(more_1x_vec):
 170 # else
 171         /* `jb` because length rdx is now length - 1.  */
             /* Taken when length <= CHAR_PER_VEC.  */
 172         jb      L(less_1x_vec)
 173 # endif
 174
 175
 176         /* This may overset but that's fine because we still need to zero
 177            fill.  */
 178         VMOVU   %VMM(0), (%rdi)
 179
 180
 181         /* Length must be >= CHAR_PER_VEC so match here means we must
 182            zero-fill.  */
 183         test    %VRCX, %VRCX
 184         jnz     L(zfill)
 185
 186
 187         /* We are going to align rsi here so will need to be able to re-
 188            adjust rdi/rdx afterwards. NB: We filtered out huge lengths
 189            so rsi + rdx * CHAR_SIZE cannot overflow.  */
             /* rdx = address of the last CHAR to copy, minus VEC_SIZE.
                rdi = dst - src.
                rsi = src rounded down to a VEC_SIZE boundary.
                L(loop_last_4x_vec) turns rdi / rdx back into a pointer
                and a count relative to the aligned rsi.  */
 190         leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
 191         subq    %rsi, %rdi
 192         andq    $-(VEC_SIZE), %rsi
 193
 194 L(loop_last_4x_vec):
             /* On entry rdi holds dst - src and rdx holds an end address
                (set up either just above or before L(loop_4x_vec)).
                Afterwards rdi is the dst address matching the aligned rsi
                and rdx is (remaining length - 1) in CHARs, counted from
                the CHAR at VEC_SIZE(%rsi).  */
 195         addq    %rsi, %rdi
 196         subq    %rsi, %rdx
 197 # ifdef USE_AS_WCSCPY
             /* Byte difference -> CHAR count.  */
 198         shrq    $2, %rdx
 199 # endif
 200
             /* VRCX = zero-CHAR mask of the next aligned VEC.  */
 201         VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
 202         VPTESTN %VMM(1), %VMM(1), %k0
 203         KMOV    %k0, %VRCX
 204
 205         /* -1 because of the `dec %rdx` earlier.  */
 206         cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
 207         ja      L(more_2x_vec)
 208
 209 L(last_2x_vec):
 210         /* This will need to be computed no matter what. We do it
 211            ahead of time for CHAR_PER_VEC == 64 because we can't adjust
 212            the value of `tzcnt` with a shift.  */
 213 # if CHAR_PER_VEC == 64
 214         tzcntq  %rcx, %rcx
 215 # endif
 216
             /* At most 2x VEC remain.  If fewer than CHAR_PER_VEC + 1
                CHARs remain, only VEC 1 is involved.  */
 217         cmpl    $(CHAR_PER_VEC), %edx
 218         jb      L(ret_vec_x1_len)
 219
 220         /* Separate logic for CHAR_PER_VEC == 64 because we already did
 221            `tzcnt` on VRCX.  */
 222 # if CHAR_PER_VEC == 64
 223         /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
 224         cmpb    $CHAR_PER_VEC, %cl
 225         jnz     L(ret_vec_x1_no_bsf)
 226 # else
 227         test    %VRCX, %VRCX
 228         jnz     L(ret_vec_x1)
 229 # endif
 230
 231
 232
             /* No zero CHAR in VEC 1: store it and get the zero-CHAR mask
                of VEC 2.  */
 233         VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
 234         VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
 235         KMOV    %k0, %VRCX
 236
 237 # if CHAR_PER_VEC < 64
 238         /* This essentially adds CHAR_PER_VEC to computed result.  */
 239         shlq    $CHAR_PER_VEC, %rcx
 240 # else
 241         tzcntq  %rcx, %rcx
 242         addl    $CHAR_PER_VEC, %ecx
 243 # endif
 244
 245         .p2align 4,, 4
 246 L(ret_vec_x1_len):
 247         /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
 248            already been done.  */
 249 # if CHAR_PER_VEC < 64
 250         tzcntq  %rcx, %rcx
 251 # endif
             /* ecx = index of the first zero CHAR counted from VEC 1.
                edx = length - 1 counted from VEC 1.  If the zero CHAR is
                not before the last requested CHAR there is nothing to
                zero-fill.  The carry flag of this compare is consumed by
                the `adc` in the STPCPY return-value code below.  */
 252         cmpl    %ecx, %edx
 253         jbe     L(ret_vec_x1_len_no_zfill)
 254         /* Fall through (expectation) is copy len < buffer len.  */
             /* Zero the VEC that ends at the last requested CHAR.  */
 255         VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 256 L(ret_vec_x1_len_no_zfill_mov):
             /* From here on only copy up to the zero CHAR at index ecx.  */
 257         movl    %ecx, %edx
 258 # ifdef USE_AS_STPCPY
 259         /* clear flags.  */
             /* In particular CF, so the `adc` below adds nothing.  */
 260         xorl    %ecx, %ecx
 261 # endif
 262 L(ret_vec_x1_len_no_zfill):
             /* Copy the (possibly overlapping) VEC that ends at CHAR index
                rdx.  */
 263         VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 264         VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 265 # ifdef USE_AS_STPCPY
             /* Return value is the address of CHAR rdx, plus one CHAR if
                CF is set (no zero CHAR was copied).  */
 266 #  ifdef USE_AS_WCSCPY
 267         adcq    $0, %rdx
 268         leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 269 #  else
 270         leal    (VEC_SIZE)(%rdx), %eax
 271         adcq    %rdi, %rax
 272 #  endif
 273 # endif
 274         ret
 275
 276
 277         .p2align 4,, 10
 278 L(ret_vec_x1):
             /* VEC 1 contains a zero CHAR and at least CHAR_PER_VEC + 1
                CHARs remain.  ecx becomes the index of that zero CHAR.  */
 279         bsf     %VRCX, %VRCX
 280 L(ret_vec_x1_no_bsf):
             /* Zero the VEC that ends at the last requested CHAR.  */
 281         VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
             /* edx = number of CHARs after the zero CHAR that still need
                zeroing.  If that is less than one VEC the store above
                already covers them; just finish the copy.  */
 282         subl    %ecx, %edx
 283         cmpl    $CHAR_PER_VEC, %edx
 284         jb      L(ret_vec_x1_len_no_zfill_mov)
 285         /* Fall through (expectation) is copy len < buffer len.  */
             /* Store VEC 1, then a VEC of zeros starting at the zero CHAR.  */
 286         VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
 287         VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
 288 # ifdef USE_AS_STPCPY
             /* Return the address of the zero CHAR.  */
 289         leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
 290 # endif
 291         ret
 292
 293         .p2align 4,, 8
 294 L(last_4x_vec):
 295         /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
 296            $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
 297            using `movzbl`.  */
             /* Reached from L(more_4x_vec) after VEC 1..4 were stored.
                Reduce the count modulo 4 * CHAR_PER_VEC and step rsi / rdi
                forward by 4x VEC so the code below can be reused.  */
 298 # if CHAR_PER_VEC == 64
 299         movzbl  %dl, %edx
 300 # else
 301         andl    $(CHAR_PER_VEC * 4 - 1), %edx
 302 # endif
 303         VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
 304         VPTESTN %VMM(1), %VMM(1), %k0
 305         KMOV    %k0, %VRCX
 306         subq    $-(VEC_SIZE * 4), %rsi
 307         subq    $-(VEC_SIZE * 4), %rdi
 308         cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
 309         jbe     L(last_2x_vec)
 310         .p2align 4,, 8
 311 L(more_2x_vec):
             /* More than 2x VEC remain.  VMM(1) / VRCX describe VEC 1.  */
 312         VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
 313         test    %VRCX, %VRCX
 314         /* Must fill at least 2x VEC.  */
 315         jnz     L(zfill_vec1)
 316
 317         VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
 318         VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
 319         VPTESTN %VMM(2), %VMM(2), %k0
 320         KMOV    %k0, %VRCX
 321         test    %VRCX, %VRCX
 322         /* Must fill at least 1x VEC.  */
 323         jnz     L(zfill_vec2)
 324
             /* VEC 3 is only loaded here; whether it can be stored whole
                depends on the remaining length.  */
 325         VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
 326         VPTESTN %VMM(3), %VMM(3), %k0
 327         KMOV    %k0, %VRCX
 328
 329         /* Check if len is more 4x VEC. -1 because rdx is len - 1.  */
 330         cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
 331         ja      L(more_4x_vec)
 332
             /* Rebase edx to VEC 4.  A borrow means the copy ends inside
                VEC 3.  */
 333         subl    $(CHAR_PER_VEC * 3), %edx
 334         jb      L(ret_vec_x3_len)
 335
 336         test    %VRCX, %VRCX
 337         jnz     L(ret_vec_x3)
 338
             /* No zero CHAR in VEC 3: store it and look at VEC 4.  ecx
                becomes the index of the first zero CHAR in VEC 4.  */
 339         VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
 340         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 341         KMOV    %k0, %VRCX
 342         tzcnt   %VRCX, %VRCX
             /* The carry flag of this compare feeds the `adc` below when
                the branch is taken.  */
 343         cmpl    %ecx, %edx
 344         jbe     L(ret_vec_x4_len_no_zfill)
 345         /* Fall through (expectation) is copy len < buffer len.  */
             /* Zero the VEC ending at the last requested CHAR, then copy
                only up to the zero CHAR.  */
 346         VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 347         movl    %ecx, %edx
 348 L(ret_vec_x4_len_no_zfill):
 349         VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 350         VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 351 # ifdef USE_AS_STPCPY
             /* Return the address of CHAR rdx in VEC 4, plus one CHAR if CF
                is set.  On the fall-through path CF is clear because the
                `jbe` above was not taken.  */
 352 #  ifdef USE_AS_WCSCPY
 353         adcq    $0, %rdx
 354         leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
 355 #  else
 356         leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
 357         adcq    %rdi, %rax
 358 #  endif
 359 # endif
 360         ret
 361
 362
 363 L(ret_vec_x3_len):
             /* The copy ends inside VEC 3.  edx was rebased to VEC 4 by the
                caller; add one VEC back so it is relative to VEC 3.  */
 364         addl    $(CHAR_PER_VEC * 1), %edx
 365         tzcnt   %VRCX, %VRCX
             /* The carry flag of this compare feeds the `adc` below when
                the branch is taken.  */
 366         cmpl    %ecx, %edx
 367         jbe     L(ret_vec_x3_len_no_zfill)
 368         /* Fall through (expectation) is copy len < buffer len.  */
             /* Zero the VEC ending at the last requested CHAR.  */
 369         VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 370 L(ret_vec_x3_len_no_zfill_mov):
             /* From here on only copy up to the zero CHAR at index ecx.  */
 371         movl    %ecx, %edx
 372 # ifdef USE_AS_STPCPY
 373         /* clear flags.  */
 374         xorl    %ecx, %ecx
 375 # endif
 376         .p2align 4,, 4
 377 L(ret_vec_x3_len_no_zfill):
             /* Copy the (possibly overlapping) VEC that ends at CHAR index
                rdx of VEC 3.  */
 378         VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 379         VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 380 # ifdef USE_AS_STPCPY
 381 #  ifdef USE_AS_WCSCPY
 382         adcq    $0, %rdx
 383         leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
 384 #  else
 385         leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
 386         adcq    %rdi, %rax
 387 #  endif
 388 # endif
 389         ret
 390
 391
 392         .p2align 4,, 8
 393 L(ret_vec_x3):
             /* VEC 3 contains a zero CHAR and the length reaches into
                VEC 4.  edx is relative to VEC 4 here, hence the
                VEC_SIZE * 4 base of the zeroing store.  */
 394         bsf     %VRCX, %VRCX
 395         VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
             /* A negative result means the store above already covers
                everything after the zero CHAR.  */
 396         subl    %ecx, %edx
 397         jl      L(ret_vec_x3_len_no_zfill_mov)
             /* Otherwise store VEC 3 and a VEC of zeros starting at the
                zero CHAR.  */
 398         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 399         VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
 400 # ifdef USE_AS_STPCPY
 401         leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
 402 # endif
 403         ret
 404
 405         .p2align 4,, 8
 406 L(more_4x_vec):
             /* More than 4x VEC remain.  VMM(3) / VRCX describe VEC 3.  */
 407         VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
 408         test    %VRCX, %VRCX
 409         jnz     L(zfill_vec3)
 410
 411         VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
 412         VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
 413         VPTESTN %VMM(4), %VMM(4), %k0
 414         KMOV    %k0, %VRCX
 415         test    %VRCX, %VRCX
 416         jnz     L(zfill_vec4)
 417
 418         /* Recheck length before aligning.  */
 419         cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
 420         jbe     L(last_4x_vec)
 421
 422         /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
             /* rdx becomes an end address and rdi becomes dst - src, the
                form L(loop_last_4x_vec) and L(loop_4x_done) convert back
                from.  */
 423 # ifdef USE_AS_WCSCPY
 424         leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
 425 # else
 426         addq    %rsi, %rdx
 427 # endif
 428         subq    %rsi, %rdi
             /* Step past the 4x VEC already copied and round down to a
                VEC_SIZE * 4 boundary.  */
 429         subq    $-(VEC_SIZE * 5), %rsi
 430         andq    $(VEC_SIZE * -4), %rsi
 431
 432
 433         /* Load first half of the loop before entry.  */
 434         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 435         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 436         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 437         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 438
             /* An unsigned minimum is zero wherever either input CHAR is
                zero, so k2 covers VEC 0/1 and k4 covers VEC 2/3.  */
 439         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 440         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 441         VPTESTN %VMM(4), %VMM(4), %k2
 442         VPTESTN %VMM(6), %VMM(6), %k4
 443
 444
 445         /* Offset rsi by VEC_SIZE so that we can jump to
 446            L(loop_last_4x_vec).  */
 447         addq    $-(VEC_SIZE), %rsi
 448         KORTEST %k2, %k4
 449         jnz     L(loop_4x_done)
 450
 451         /* Store loop end in r9.  */
 452         leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
 453
 454         .p2align 4,, 11
 455 L(loop_4x_vec):
             /* Main loop: 4x VEC per iteration.  rdi holds dst - src, so
                (%rdi, %rsi) addresses the destination.  The four VECs
                stored here were loaded and checked on the previous
                iteration (or before loop entry).  */
 456         VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
 457         VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
 458         VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
 459         VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
 460
             /* Advance, and leave the loop once rsi reaches the end marker
                in r9.  */
 461         subq    $(VEC_SIZE * -4), %rsi
 462         cmpq    %rsi, %r9
 463         jbe     L(loop_last_4x_vec)
 464
 465         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
 466         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
 467         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
 468         VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
 469
 470         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 471         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 472         VPTESTN %VMM(4), %VMM(4), %k2
 473         VPTESTN %VMM(6), %VMM(6), %k4
 474         KORTEST %k2, %k4
 475         jz      L(loop_4x_vec)
 476
 477 L(loop_4x_done):
             /* A zero CHAR is somewhere in VMM(0)..VMM(3).  Find the first
                VEC that has one, storing the earlier ones on the way.  */
 478         /* Restore rdx (length).  */
 479         subq    %rsi, %rdx
 480 # ifdef USE_AS_WCSCPY
 481         shrq    $2, %rdx
 482 # endif
 483         VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
 484         /* Restore rdi (dst).  */
 485         addq    %rsi, %rdi
 486         VPTESTN %VMM(0), %VMM(0), %k0
 487         KMOV    %k0, %VRCX
 488         test    %VRCX, %VRCX
 489         jnz     L(zfill_vec1)
 490
             /* k2 is the mask of min (VMM(0), VMM(1)).  VMM(0) has no zero
                CHAR, so any bit set here belongs to VMM(1).  */
 491         VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
 492         KMOV    %k2, %VRCX
 493         test    %VRCX, %VRCX
 494         jnz     L(zfill_vec2)
 495
 496         VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
 497         VPTESTN %VMM(2), %VMM(2), %k0
 498         KMOV    %k0, %VRCX
 499         test    %VRCX, %VRCX
 500         jnz     L(zfill_vec3)
 501
             /* Only VMM(3) is left, so the bits in k4 belong to it.  Falls
                through into L(zfill_vec4).  */
 502         VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
 503         KMOV    %k4, %VRCX
 504         // Zfill more....
 505
 506         .p2align 4,, 4
 507 L(zfill_vec4):
             /* Entry points for "zero CHAR found in VEC 4 / VEC 2, whole
                VEC already stored".  Together with the fall-through into
                L(zfill_vec2), VEC 4 advances rdi by 4x VEC and lowers rdx
                by 3x CHAR_PER_VEC.  rdx counts from VEC 1 while rdi points
                at VEC 0, hence the difference of one.  */
 508         subq    $(VEC_SIZE * -2), %rdi
 509         addq    $(CHAR_PER_VEC * -2), %rdx
 510 L(zfill_vec2):
 511         subq    $(VEC_SIZE * -2), %rdi
 512         addq    $(CHAR_PER_VEC * -1), %rdx
 513 L(zfill):
 514         /* VRCX must be non-zero.  */
 515         bsf     %VRCX, %VRCX
 516
 517         /* Adjust length / dst for zfill.  */
             /* Afterwards rdi points at the zero CHAR and rdx is the number
                of CHARs that follow it within the requested length.  */
 518         subq    %rcx, %rdx
 519 # ifdef USE_AS_WCSCPY
 520         leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
 521 # else
 522         addq    %rcx, %rdi
 523 # endif
 524 # ifdef USE_AS_STPCPY
             /* STPCPY variants return the address of the zero CHAR.  */
 525         movq    %rdi, %rax
 526 # endif
 527 L(zfill_from_page_cross):
 528
 529         /* From here on out it's just a zero-fill of the rdx + 1 CHARs
 530            starting at rdi.  */
 531         cmpq    $CHAR_PER_VEC, %rdx
 532         jb      L(zfill_less_vec)
 533
 534 L(zfill_more_1x_vec):
             /* One VEC of zeros at the start and one ending at the last
                CHAR; they overlap when fewer than 2x VEC are needed.  */
 535         VMOVU   %VZERO, (%rdi)
 536         VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
 537         cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
 538         ja      L(zfill_more_2x_vec)
 539 L(zfill_done0):
 540         ret
 540
 541         /* Coming from vec1/vec2 we must be able to zfill at least 2x
 542            VEC.  */
 543         .p2align 4,, 8
 544 L(zfill_vec3):
             /* Zero CHAR found in VEC 3: step rdi / rdx forward by 2x VEC,
                then share the VEC 1 code.  */
 545         subq    $(VEC_SIZE * -2), %rdi
 546         addq    $(CHAR_PER_VEC * -2), %rdx
 547         .p2align 4,, 2
 548 L(zfill_vec1):
 549         bsfq    %rcx, %rcx
 550         /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
 551          */
             /* Afterwards rdi points at the zero CHAR and rdx is the number
                of CHARs that follow it within the requested length.  */
 552         leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
 553         subq    %rcx, %rdx
 554 # ifdef USE_AS_STPCPY
 555         movq    %rdi, %rax
 556 # endif
 557
 558
             /* No "less than one VEC" check here, unlike L(zfill): see the
                comment above L(zfill_vec3).  */
 559         VMOVU   %VZERO, (%rdi)
 560         VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
 561         cmpq    $(CHAR_PER_VEC * 2), %rdx
 562         jb      L(zfill_done0)
 563 L(zfill_more_2x_vec):
             /* Second VEC from each end.  */
 564         VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
 565         VMOVU   %VZERO, (VEC_SIZE)(%rdi)
             /* Done if at most 4x VEC are needed in total.  */
 566         subq    $(CHAR_PER_VEC * 4 - 1), %rdx
 567         jbe     L(zfill_done)
 568
             /* rdx = end address of the fill minus 4x VEC.  */
 569 # ifdef USE_AS_WCSCPY
 570         leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
 571 # else
 572         addq    %rdi, %rdx
 573 # endif
 574
 575         VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
 576         VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
 577
 578
             /* Third and fourth VEC from the end.  With the two stored at
                L(zfill_more_1x_vec) / L(zfill_more_2x_vec) or above, the
                last 4x VEC are now zeroed.  */
 579         VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
 580         VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
 581
 582         subq    $-(VEC_SIZE * 4), %rdi
 583         cmpq    %rdi, %rdx
 584         jbe     L(zfill_done)
 585
 586         /* Align rdi and zfill loop.  */
 587         andq    $-(VEC_SIZE), %rdi
 588         .p2align 4,, 12
 589 L(zfill_loop_4x_vec):
             /* Aligned stores, 4x VEC per iteration, until rdi reaches rdx
                (the start of the already-zeroed tail).  */
 590         VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
 591         VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
 592         VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
 593         VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
 594         subq    $-(VEC_SIZE * 4), %rdi
 595         cmpq    %rdi, %rdx
 596         ja      L(zfill_loop_4x_vec)
 597 L(zfill_done):
 598         ret
 599
 600
 601         /* Less 1x VEC case if we are not using evex masked store.  */
 602 # if !USE_EVEX_MASKED_STORE
 603         .p2align 4,, 8
 604 L(copy_1x):
 605         /* Special case for copy 1x. It can be handled quickly and many
 606            buffer sizes have convenient alignment.  */
             /* Expects VMM(0) = first VEC of src, VRCX = its zero-CHAR mask
                and rdx = CHAR_PER_VEC - 1 (length - 1).
                NOTE(review): the only visible branch to this label is the
                `je` at the top of L(less_1x_vec) -- confirm whether that
                branch can ever be taken.  */
 607         VMOVU   %VMM(0), (%rdi)
 608         /* If no zeros then we are done.  */
             /* Test the full-width mask register.  With CHAR_PER_VEC == 64
                the mask has 64 significant bits, so testing only ecx would
                miss a zero CHAR in the upper half of the VEC.  */
 609         test    %VRCX, %VRCX
 610         jz      L(ret_1x_1x)
 611
 612         /* Need to zfill, note we know that length <= CHAR_PER_VEC so we
 613            only handle the small case here.  */
 614         bsf     %VRCX, %VRCX
 615 L(zfill_less_vec_no_bsf):
 616         /* Adjust length / dst then just zfill less_vec.  */
             /* rcx = index of the first zero CHAR.  Afterwards rdi points
                at it and rdx is the number of CHARs after it that still
                need zeroing.  */
 617         subq    %rcx, %rdx
 618 #  ifdef USE_AS_WCSCPY
 619         leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
 620 #  else
 621         addq    %rcx, %rdi
 622 #  endif
 623 #  ifdef USE_AS_STPCPY
             /* STPCPY variants return the address of the zero CHAR.  */
 624         movq    %rdi, %rax
 625 #  endif
 626
 627 L(zfill_less_vec):
             /* Zero rdx + 1 CHARs (less than one VEC) starting at rdi.
                With at least half a VEC to fill, two possibly overlapping
                half-VEC stores cover it.  */
 628         cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
 629         jb      L(zfill_less_half)
 630
 631         VMOVU   %VZERO_HALF, (%rdi)
 632         VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 633         ret
 634 #  ifdef USE_AS_STPCPY
 635 L(ret_1x_1x):
             /* No zero CHAR was copied: return dst + length.  */
 636         leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
 637         ret
 638 #  endif
 639
 640
 641 #  if VEC_SIZE == 64
 642         .p2align 4,, 4
 643 L(copy_32_63):
             /* Only built for VEC_SIZE == 64.  Length - 1 (edx) is at least
                32 / CHAR_SIZE and less than CHAR_PER_VEC.  ecx is the
                index of the first zero CHAR in VMM(0), as produced by
                `tzcnt` in L(less_1x_vec).  */
 644         /* Overfill to avoid branches.  */
             /* Copy the first 32 bytes and the 32 bytes ending at the last
                requested CHAR; the two may overlap.  */
 645         VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
 646         VMOVU   %VMM_256(0), (%rdi)
 647         VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 648
 649         /* We are taking advantage of the fact that to be here we must
 650            be writing null-term as (%rdi, %rcx) we have a byte of lee-
 651            way for overwriting.  */
             /* If the zero CHAR lies before the last requested CHAR, the
                CHARs after it must be zeroed.  Otherwise the carry flag of
                this compare feeds the `adc` below.  */
 652         cmpl    %ecx, %edx
 653         ja      L(zfill_less_vec_no_bsf)
 654 #   ifndef USE_AS_STPCPY
 655 L(ret_1x_1x):
 656 #   else
 657 #    ifdef USE_AS_WCSCPY
 658         adcq    $0, %rdx
 659         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 660 #    else
 661         movl    %edx, %eax
 662         adcq    %rdi, %rax
 663 #    endif
 664 #   endif
 665         ret
 666 #  endif
 667
 668         .p2align 4,, 4
 669 L(copy_16_31):
             /* Length - 1 (edx) is at least 16 / CHAR_SIZE.  ecx is the
                index of the first zero CHAR in VMM(0).  */
 670         /* Overfill to avoid branches.  */
             /* Copy the first 16 bytes and the 16 bytes ending at the last
                requested CHAR; the two may overlap.  */
 671         vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
 672         VMOVU   %VMM_128(0), (%rdi)
 673         vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
             /* The flags of this compare are consumed by the `jbe` / `ja`
                below and, for STPCPY, by the later `adc`.  */
 674         cmpl    %ecx, %edx
 675
 676         /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
 677            we have a larger copy block for 32-63 so this just falls
 678            through to zfill 16-31. If VEC_SIZE == 32 then we check for
 679            full zfill of less 1x VEC.  */
 680 #  if VEC_SIZE == 64
 681         jbe     L(ret_16_31)
             /* Point rdi at the zero CHAR; edx = CHARs after it that still
                need zeroing.  */
 682         subl    %ecx, %edx
 683 #   ifdef USE_AS_WCSCPY
 684         leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
 685 #   else
 686         addq    %rcx, %rdi
 687 #   endif
 688 #   ifdef USE_AS_STPCPY
 689         movq    %rdi, %rax
 690 #   endif
 691 L(zfill_less_half):
 692 L(zfill_less_32):
             /* Zero edx + 1 CHARs (less than 32 bytes) starting at rdi.  */
 693         cmpl    $(16 / CHAR_SIZE), %edx
 694         jb      L(zfill_less_16)
 695         VMOVU   %VZERO_128, (%rdi)
 696         VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 697 #   ifdef USE_AS_STPCPY
             /* rax was already set by whoever branched here, so do not
                fall into the return-value code below.  */
 698         ret
 699 #   endif
 700 L(ret_16_31):
 701 #   ifdef USE_AS_STPCPY
 702 #    ifdef USE_AS_WCSCPY
 703         adcq    $0, %rdx
 704         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 705 #    else
 706         movl    %edx, %eax
 707         adcq    %rdi, %rax
 708 #    endif
 709 #   endif
 710         ret
 711 #  else
 712         /* VEC_SIZE == 32 begins.  */
 713         ja      L(zfill_less_vec_no_bsf)
 714 #   ifndef USE_AS_STPCPY
 715 L(ret_1x_1x):
 716 #   else
 717 #    ifdef USE_AS_WCSCPY
 718         adcq    $0, %rdx
 719         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 720 #    else
 721         movl    %edx, %eax
 722         adcq    %rdi, %rax
 723 #    endif
 724 #   endif
 725         ret
 726 #  endif
 727
 728
 729         .p2align 4,, 4
 730 L(copy_8_15):
             /* Length - 1 (edx) is at least 8 / CHAR_SIZE and less than
                16 / CHAR_SIZE.  ecx is the index of the first zero CHAR in
                VMM(0).  */
 731         /* Overfill to avoid branches.  */
             /* rsi is reused as a scratch register for the trailing 8
                bytes; src is not needed afterwards.  */
 732         movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 733         vmovq   %VMM_128(0), (%rdi)
 734         movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
             /* For STPCPY the carry flag of this compare feeds the `adc` at
                L(ret_8_15).  */
 735         cmpl    %ecx, %edx
 736         jbe     L(ret_8_15)
             /* Point rdi at the zero CHAR; edx = CHARs after it that still
                need zeroing.  */
 737         subl    %ecx, %edx
 738 #  ifdef USE_AS_WCSCPY
 739         leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
 740 #  else
 741         addq    %rcx, %rdi
 742 #  endif
 743 #  ifdef USE_AS_STPCPY
 744         movq    %rdi, %rax
 745 #  endif
 746         .p2align 4,, 8
 747 #  if VEC_SIZE == 32
 748 L(zfill_less_half):
 749 #  endif
 750 L(zfill_less_16):
             /* Zero edx + 1 CHARs (less than 16 bytes) starting at rdi.
                rcx is the zero source for the general-purpose stores here
                and in L(zfill_less_8).  */
 751         xorl    %ecx, %ecx
 752         cmpl    $(8 / CHAR_SIZE), %edx
 753         jb      L(zfill_less_8)
 754         movq    %rcx, (%rdi)
 755         movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
 756 #  ifndef USE_AS_STPCPY
 757 L(ret_8_15):
 758 #  endif
 759         ret
 760
 761         .p2align 4,, 8
 762 L(less_1x_vec):
             /* Length <= CHAR_PER_VEC.  VMM(0) holds the first VEC of src
                and VRCX its zero-CHAR mask.  The flags are still those of
                `cmpq $(CHAR_PER_VEC), %rdx`.
                NOTE(review): the only visible branch to this label is the
                `jb` after that compare, which implies ZF is clear, so this
                `je` looks never-taken -- confirm.  */
 763         je      L(copy_1x)
 764
 765         /* We will need `tzcnt` result for all other copy sizes.  */
 766         tzcnt   %VRCX, %VRCX
             /* Dispatch on length - 1 (edx) to a two-overlapping-stores
                copy of the matching size.  */
 767 #  if VEC_SIZE == 64
 768         cmpl    $(32 / CHAR_SIZE), %edx
 769         jae     L(copy_32_63)
 770 #  endif
 771
 772         cmpl    $(16 / CHAR_SIZE), %edx
 773         jae     L(copy_16_31)
 774
 775         cmpl    $(8 / CHAR_SIZE), %edx
 776         jae     L(copy_8_15)
 777 #  ifdef USE_AS_WCSCPY
             /* Wide chars: edx is 0 or 1 here.  ecx == 0 means the very
                first CHAR is zero, so everything is zero-fill.  */
 778         testl   %ecx, %ecx
 779         jz      L(zfill_less_8_set_ret)
 780
             /* Copy the first CHAR and the CHAR at index rdx (the same
                CHAR when rdx is 0).  */
 781         movl    (%rsi, %rdx, CHAR_SIZE), %esi
 782         vmovd   %VMM_128(0), (%rdi)
 783         movl    %esi, (%rdi, %rdx, CHAR_SIZE)
 784 #   ifdef USE_AS_STPCPY
 785         cmpl    %ecx, %edx
 786 L(ret_8_15):
             /* Return the address of CHAR rdx, plus one CHAR if CF is
                set.  */
 787         adcq    $0, %rdx
 788         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 789 #   endif
 790         ret
 791 L(zfill_less_8_set_ret):
 792         xorl    %ecx, %ecx
 793 #   ifdef USE_AS_STPCPY
 794         movq    %rdi, %rax
 795 #   endif
 796 L(zfill_less_8):
             /* ecx is zero here.  Zero the first CHAR and the CHAR at
                index rdx.  */
 797         movl    %ecx, (%rdi)
 798         movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
 799         ret
 800 #  else
             /* Byte build: edx < 8 here.  */
 801         cmpl    $3, %edx
 802         jb      L(copy_0_3)
 803         /* Overfill to avoid branches.  */
             /* Copy the first 4 bytes and the 4 bytes ending at the last
                requested byte.  */
 804         movl    -3(%rsi, %rdx), %esi
 805         vmovd   %VMM_128(0), (%rdi)
 806         movl    %esi, -3(%rdi, %rdx)
             /* For STPCPY the carry flag of this compare feeds the `adc` at
                L(ret_4_7).  */
 807         cmpl    %ecx, %edx
 808         jbe     L(ret_4_7)
             /* Point rdi at the zero byte; rdx = bytes after it that still
                need zeroing.  */
 809         subq    %rcx, %rdx
 810         addq    %rcx, %rdi
 811 #   ifdef USE_AS_STPCPY
 812         movq    %rdi, %rax
 813 #   endif
 814         xorl    %ecx, %ecx
 815         .p2align 4,, 8
 816 L(zfill_less_8):
             /* Zero edx + 1 bytes (less than 8) starting at rdi.  ecx is
                zero on entry.  */
 817         cmpl    $3, %edx
 818         jb      L(zfill_less_3)
 819         movl    %ecx, (%rdi)
 820         movl    %ecx, -3(%rdi, %rdx)
 821 #   ifdef USE_AS_STPCPY
             /* rax was already set; do not fall into the return-value code
                below.  */
 822         ret
 823 #   endif
 824
 825 L(ret_4_7):
 826 #   ifdef USE_AS_STPCPY
 827 L(ret_8_15):
             /* rax = rdi + rdx, plus one if CF is set.  */
 828         movl    %edx, %eax
 829         adcq    %rdi, %rax
 830 #   endif
 831         ret
 832
 833         .p2align 4,, 4
 834 L(zfill_less_3):
             /* Zero 1, 2 or 3 bytes: a 2-byte store at the start (skipped
                when only one byte is needed) and a 1-byte store at the
                end.  */
 835         testl   %edx, %edx
 836         jz      L(zfill_1)
 837         movw    %cx, (%rdi)
 838 L(zfill_1):
 839         movb    %cl, (%rdi, %rdx)
 840         ret
 841
 842         .p2align 4,, 8
 843 L(copy_0_3):
             /* Length is 1, 2 or 3 bytes.  r8d = first 4 bytes of src taken
                from the already-loaded VMM(0).  */
 844         vmovd   %VMM_128(0), %r8d
 845         testl   %edx, %edx
 846         jz      L(copy_1)
 847         movw    %r8w, (%rdi)
             /* If the zero byte lies before the last requested byte, the
                tail needs zeroing.  */
 848         cmpl    %ecx, %edx
 849         ja      L(zfill_from_1)
 850         movzbl  (%rsi, %rdx), %r8d
 851 #   ifdef USE_AS_STPCPY
 852         movl    %edx, %eax
 853         adcq    %rdi, %rax
 854         movb    %r8b, (%rdi, %rdx)
 855         ret
 856 #   endif
 857
 858 L(copy_1):
             /* Store the final byte, held in r8b, at index rdx.  */
 859 #   ifdef USE_AS_STPCPY
 860         movl    %edx, %eax
 861         cmpl    %ecx, %edx
 862         adcq    %rdi, %rax
 863 #   endif
             /* NOTE(review): this region sits in the `#  else` arm of
                `#  ifdef USE_AS_WCSCPY` above, so the USE_AS_WCSCPY branch
                below cannot be compiled -- confirm.  */
 864 #   ifdef USE_AS_WCSCPY
 865         vmovd   %VMM_128(0), (%rdi)
 866 #   else
 867         movb    %r8b, (%rdi, %rdx)
 868 #   endif
 869         ret
 870 #  endif
 871
 872
 873 #  ifndef USE_AS_WCSCPY
 874         .p2align 4,, 8
 875 L(zfill_from_1):
             /* Reached from L(copy_0_3) with rdx = 1 or 2 and the zero byte
                at index rcx < rdx; the first two bytes of src were already
                stored.  */
 876 #   ifdef USE_AS_STPCPY
             /* Return the address of the zero byte.  */
 877         leaq    (%rdi, %rcx), %rax
 878 #   endif
             /* Zero the last two requested bytes.  */
 879         movw    $0, -1(%rdi, %rdx)
 880         ret
 881 #  endif
 882
 883         .p2align 4,, 4
 884 L(zero_len):
             /* Undo the `decq` done at entry.  A zero result means the
                length was zero: return dst untouched.  Anything else is a
                huge length.  */
 885         incq    %rdx
 886         jne     L(best_effort_strncpy)
 887         movq    %rdi, %rax
 888         ret
 889 # endif
 890
 891
 892         .p2align 4,, 4
 893         .p2align 6,, 8
 894 L(page_cross):
             /* An unaligned VEC_SIZE load from src might run into the next
                page.  Compare against the aligned VEC that contains src
                instead, then shift the mask right so bit 0 corresponds to
                the CHAR at src.  */
 895         movq    %rsi, %rax
 896         andq    $(VEC_SIZE * -1), %rax
 897         VPCMPEQ (%rax), %VZERO, %k0
 898         KMOV    %k0, %VRCX
 899 # ifdef USE_AS_WCSCPY
             /* The shift count is in CHARs: (src / 4) mod CHAR_PER_VEC.  */
 900         movl    %esi, %r8d
 901         shrl    $2, %r8d
 902         andl    $(CHAR_PER_VEC - 1), %r8d
 903         shrx    %VR8, %VRCX, %VRCX
 904 # else
 905         shrx    %VRSI, %VRCX, %VRCX
 906 # endif
 907
 908         /* Compute amount of bytes we checked.  */
             /* (aligned - src) mod VEC_SIZE is the distance from src to the
                end of the aligned VEC; it is converted to CHARs for the
                wide build.  */
 909         subl    %esi, %eax
 910         andl    $(VEC_SIZE - 1), %eax
 911 # ifdef USE_AS_WCSCPY
 912         shrl    $2, %eax
 913 # endif
 914
 915         /* If rax > rdx then we are finishing the copy at the end of the
 916            page.  */
 917         cmpq    %rax, %rdx
 918         jb      L(page_cross_small)
 919
 920
 921         /* If rcx is zero (no zero-CHAR found) then continue.  */
 922         test    %VRCX, %VRCX
 923         jz      L(page_cross_continue)
 924
 925         /* We found zero-CHAR so need to copy then zfill (we know we
 926            didn't cover all of length here).  */
 927         bsf     %VRCX, %VRCX
 928 L(movsb_and_zfill):
             /* rcx = number of CHARs to copy including the zero CHAR.
                rdx = what is left of length - 1 after those.  */
 929         incl    %ecx
 930         subq    %rcx, %rdx
 931 # ifdef USE_AS_STPCPY
             /* Return the address the zero CHAR will be copied to.  */
 932         leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 933 # else
 934         movq    %rdi, %rax
 935 # endif
 936
             /* The string copy advances rsi / rdi past the copied CHARs and
                leaves rcx at zero.  */
 937         REP_MOVS
             /* Zero the CHAR right after the copied terminator, then let
                the common zero-fill code handle the rdx + 1 CHARs starting
                at rdi.  */
 938 # ifdef USE_AS_WCSCPY
 939         movl    $0, (%rdi)
 940 # else
 941         movb    $0, (%rdi)
 942 # endif
 943         jmp     L(zfill_from_page_cross)
 944
 945 L(page_cross_small):
             /* The whole request lies within the CHARs already checked, so
                src is never read past the aligned VEC.  ecx = index of the
                first zero CHAR (or the operand width if there is none).  */
 946         tzcnt   %VRCX, %VRCX
             /* When the branch is taken, the carry flag of this compare
                feeds the `adc` below.  */
 947         cmpl    %ecx, %edx
 948         jbe     L(page_cross_copy_only)
 949
 950         /* Do a zfill of the tail before copying.  */
             /* Save dst in r9 and the zero-CHAR index in r8d, then store
                (length - 1 - index) zero CHARs starting just past where
                the zero CHAR will be copied.  */
 951         movq    %rdi, %r9
 952         xorl    %eax, %eax
 953
 954         movl    %ecx, %r8d
 955
 956         subl    %ecx, %edx
 957         leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
 958         movl    %edx, %ecx
 959         REP_STOS
             /* Restore dst and set edx to the index of the zero CHAR, the
                last CHAR left to copy.  */
 960         movq    %r9, %rdi
 961         movl    %r8d, %edx
 962 L(page_cross_copy_only):
             /* Copy edx + 1 CHARs.  `leal` leaves the flags untouched.  */
 963         leal    1(%rdx), %ecx
 964 # ifdef USE_AS_STPCPY
             /* Return the address of CHAR rdx, plus one CHAR if CF is set.
                On the fall-through path CF is clear: the `subl` above did
                not borrow and nothing after it changes the flags.  */
 965 #  ifdef USE_AS_WCSCPY
 966         adcl    $0, %edx
 967         leaq    (%rdi, %rdx, CHAR_SIZE), %rax
 968 #  else
 969         movl    %edx, %eax
 970         adcq    %rdi, %rax
 971 #  endif
 972 # else
 973         movq    %rdi, %rax
 974 # endif
 975         REP_MOVS
 976         ret
 977
 978
 979 L(best_effort_strncpy):
             /* Reached from L(zero_len) for lengths rejected by the filter
                at entry.  Zero-fill rdx CHARs at dst with a string store,
                keeping dst in r8 meanwhile.  */
 980         movq    %rdx, %rcx
 981         xorl    %eax, %eax
 982         movq    %rdi, %r8
 983         /* The length is >= 2^63. We very much so expect to segfault at
 984            rep stos. If that doesn't happen then just strcpy to finish.
 985          */
             /* NOTE(review): for USE_AS_WCSCPY the entry filter tests bits
                56 and up of length - 1, so the bound quoted above looks
                specific to the byte build -- confirm.  */
 986         REP_STOS
             /* Tail-call the unbounded copy with the original dst; rsi
                (src) is not modified on the way here.  */
 987         movq    %r8, %rdi
 988         jmp     OVERFLOW_STRCPY
 989 END(STRNCPY)
 990 #endif