/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0
# include "x86-evex256-vecs.h"

# define STRNCPY __strncpy_evex
# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK vmovdqu32
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd

#  define REP_MOVS rep movsd
#  define REP_STOS rep stosl

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK vmovdqu8
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb

#  define REP_MOVS rep movsb
#  define REP_STOS rep stosb
# endif
# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"
# define VZERO_256 VMM_256(7)
# define VZERO_128 VMM_128(7)

# if VEC_SIZE == 64
#  define VZERO_HALF VZERO_256
# else
#  define VZERO_HALF VZERO_128
# endif
	.section SECTION(.text), "ax", @progbits

	/* Filter zero length strings and very long strings.  Zero
	   length strings just return, very long strings are handled by
	   just running rep stos{b|l} to zero-set the destination (which
	   will almost certainly segfault); if that succeeds then just
	   call OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
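
	/* Roughly, the fallback policy above corresponds to the
	   following C sketch (the function name is purely illustrative
	   and not part of this file; assumes <string.h>):

	       char *
	       best_effort_strncpy (char *dst, const char *src, size_t len)
	       {
	         if (len == 0)
	           return dst;
	         // A length this enormous cannot describe a real buffer,
	         // so zeroing it with rep stos{b|l} is expected to fault.
	         // If it somehow succeeds, the buffer really is that big
	         // and finishing with a plain strcpy is safe.
	         memset (dst, 0, len);
	         strcpy (dst, src);
	         return dst;
	       }
	*/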
	/* 56 is the end of the max supported address space.  */

	/* If the flag needs to become `jb`, replace `dec` with `sub`.  */
	vpxorq %VZERO_128, %VZERO_128, %VZERO_128
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax

L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
	VPTESTN %VMM(0), %VMM(0), %k0

	/* If not STPCPY, just save the return value ahead of time.  */
# ifndef USE_AS_STPCPY
	cmpq $(CHAR_PER_VEC), %rdx

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which have
	   the potential for dramatically bad perf if dst splits a page
	   and is not in the TLB).  */
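
	/* A rough intrinsics sketch of that masked-store idea for the
	   byte variant with VEC_SIZE == 32 (illustration only; `chars`
	   and `before_first_nul` are made-up names, not code from this
	   file):

	       // Zero every char at or after the first null-terminator,
	       __m256i v = _mm256_maskz_mov_epi8 (before_first_nul, chars);
	       // then store only the first len bytes of the result.
	       _mm256_mask_storeu_epi8 (dst, _bzhi_u32 ((unsigned) -1, len), v);

	   Nothing past len is written, but if dst splits a page that is
	   not in the TLB the masked store can be very slow, which is
	   why this path is behind USE_EVEX_MASKED_STORE.  */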
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset, but that's fine since any oversets were
	   at zero-positions anyway.  */
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rax, CHAR_SIZE), %rax

	/* Zero out all non-zero CHARs after the first zero match.  */

	/* Use VZERO as the destination so this can be reused for
	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
	/* Get mask for what we need to set.  */
	bzhi %VRDX, %VRCX, %VRCX
	VMOVU_MASK %VZERO, (%rdi){%k1}

	jne L(best_effort_strncpy)

	/* `jb` because length rdx is now length - 1.  */

	/* This may overset but that's fine because we still need to
	   zero fill.  */
	VMOVU %VMM(0), (%rdi)

	/* Length must be >= CHAR_PER_VEC so a match here means we must
	   zero-fill.  */
	/* We are going to align rsi here so we will need to be able to
	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
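
	/* Sketch of the pointer bookkeeping done in this region, with
	   the register names used as C variables (illustrative only;
	   the dst - src rebase happens nearby so that rdi can be
	   restored after rsi is aligned):

	       rdx = (char *) rsi + rdx * CHAR_SIZE - VEC_SIZE;   // end-of-copy marker
	       rdi = rdi - rsi;                                   // dst kept as dst - src
	       rsi = rsi & -(uintptr_t) VEC_SIZE;                 // align src down to VEC_SIZE
	*/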
	leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	andq $-(VEC_SIZE), %rsi

# ifdef USE_AS_WCSCPY
	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	/* This will need to be computed no matter what.  We do it ahead
	   of time for CHAR_PER_VEC == 64 because we can't adjust the
	   value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64

	cmpl $(CHAR_PER_VEC), %edx
	/* Separate logic for CHAR_PER_VEC == 64 because we already did
	   the `tzcnt` above.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb $CHAR_PER_VEC, %cl
	jnz L(ret_vec_x1_no_bsf)
	VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the computed result.  */
	shlq $CHAR_PER_VEC, %rcx
	addl $CHAR_PER_VEC, %ecx
	/* If CHAR_PER_VEC < 64 we still need to `tzcnt`, otherwise it
	   has already been done.  */
# if CHAR_PER_VEC < 64

	jbe L(ret_vec_x1_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
# ifdef USE_AS_STPCPY

L(ret_vec_x1_len_no_zfill):
	VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
	leal (VEC_SIZE)(%rdx), %eax
L(ret_vec_x1_no_bsf):
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)

	cmpl $CHAR_PER_VEC, %edx
	jb L(ret_vec_x1_len_no_zfill_mov)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size.  */
# if CHAR_PER_VEC == 64

	andl $(CHAR_PER_VEC * 4 - 1), %edx
	VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0

	subq $-(VEC_SIZE * 4), %rsi
	subq $-(VEC_SIZE * 4), %rdi
	cmpl $(CHAR_PER_VEC * 2 - 1), %edx

	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)

	/* Must fill at least 2x VEC.  */

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0

	/* Must fill at least 1x VEC.  */

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN %VMM(3), %VMM(3), %k0
	/* Check if len is more than 4x VEC.  -1 because rdx is
	   len - 1.  */
	cmpq $(CHAR_PER_VEC * 4 - 1), %rdx

	subl $(CHAR_PER_VEC * 3), %edx

	VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)

	jbe L(ret_vec_x4_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x4_len_no_zfill):
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
	leal (VEC_SIZE * 4 + 0)(%rdx), %eax
	addl $(CHAR_PER_VEC * 1), %edx

	jbe L(ret_vec_x3_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
# ifdef USE_AS_STPCPY

L(ret_vec_x3_len_no_zfill):
	VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
	leal (VEC_SIZE * 3 + 0)(%rdx), %eax

	VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)

	jl L(ret_vec_x3_len_no_zfill_mov)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax

	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0

	/* Recheck length before aligning.  */
	cmpq $(CHAR_PER_VEC * 8 - 1), %rdx

	/* Align rsi to VEC_SIZE * 4; we need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq (%rsi, %rdx, CHAR_SIZE), %rdx

	subq $-(VEC_SIZE * 5), %rsi
	andq $(VEC_SIZE * -4), %rsi
	/* Load first half of the loop before entry.  */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4

	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq $-(VEC_SIZE), %rsi

	/* Store loop end in r9.  */
	leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9

	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi
	jbe L(loop_last_4x_vec)

	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4

	/* Restore rdx (length).  */
# ifdef USE_AS_WCSCPY
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */
	VPTESTN %VMM(0), %VMM(0), %k0

	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0

	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)

	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx

	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -1), %rdx

	/* VRCX must be non-zero.  */
	/* Adjust length / dst for zfill.  */
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# ifdef USE_AS_STPCPY

L(zfill_from_page_cross):

	/* From here on out it's just memset (rdi, 0, rdx).  */
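	/* Equivalently, a sketch of the remaining work (rdx counts
	   CHARs, so it is scaled by CHAR_SIZE for the wide-char
	   variant):

	       memset (rdi, 0, rdx * CHAR_SIZE);

	   done below with unaligned head/tail VEC stores plus an
	   aligned 4x VEC loop rather than an actual call.  */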
	cmpq $CHAR_PER_VEC, %rdx

L(zfill_more_1x_vec):
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	ja L(zfill_more_2x_vec)
	/* Coming from vec1/vec2 we must be able to zfill at least
	   2x VEC.  */
	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx

	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.  */
	leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
# ifdef USE_AS_STPCPY
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2), %rdx

L(zfill_more_2x_vec):
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VZERO, (VEC_SIZE)(%rdi)
	subq $(CHAR_PER_VEC * 4 - 1), %rdx

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rdx

	VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)

	VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq $-(VEC_SIZE * 4), %rdi
	/* Align rdi for the zfill loop.  */
	andq $-(VEC_SIZE), %rdi

L(zfill_loop_4x_vec):
	VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
	subq $-(VEC_SIZE * 4), %rdi
	ja L(zfill_loop_4x_vec)
	/* Less than 1x VEC case if we are not using the evex masked
	   store.  */
# if !USE_EVEX_MASKED_STORE

	/* Special case for copy 1x.  It can be handled quickly and many
	   buffer sizes have convenient alignment.  */
	VMOVU %VMM(0), (%rdi)
	/* If no zeros then we are done.  */
	/* Need to zfill, and now we know that length <= CHAR_PER_VEC,
	   so we only handle the small case here.  */

L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# ifdef USE_AS_STPCPY
	cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb L(zfill_less_half)

	VMOVU %VZERO_HALF, (%rdi)
	VMOVU %VZERO_HALF, -((VEC_SIZE / 2) - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

# ifdef USE_AS_STPCPY
	leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax

	/* Overfill to avoid branches.  */
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
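
	/* This is the usual overlapping small-copy trick; as a C sketch
	   for the byte variant (assuming 32 <= n <= 64):

	       // The two 32-byte copies together cover [0, n).  They may
	       // overlap in the middle, which is harmless because the
	       // overlapping region is written with the same source
	       // bytes both times.
	       memcpy (dst, src, 32);
	       memcpy (dst + n - 32, src + n - 32, 32);
	*/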
	/* We are taking advantage of the fact that to be here we must
	   be writing a null-term at (%rdi, %rcx), so we have a byte of
	   leeway for overwriting.  */
	ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax

	/* Overfill to avoid branches.  */
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
	   we have a larger copy block for 32-63 so this just falls
	   through to zfill 16-31.  If VEC_SIZE == 32 then we check for
	   full zfill of less than 1x VEC.  */
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# ifdef USE_AS_STPCPY

	cmpl $(16 / CHAR_SIZE), %edx

	VMOVU %VZERO_128, (%rdi)
	VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY

# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax

	/* VEC_SIZE == 32 begins.  */
	ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax

	/* Overfill to avoid branches.  */
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq %VMM_128(0), (%rdi)
	movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# ifdef USE_AS_STPCPY

	cmpl $(8 / CHAR_SIZE), %edx

	movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
	/* We will need the `tzcnt` result for all other copy sizes.  */

	cmpl $(32 / CHAR_SIZE), %edx

	cmpl $(16 / CHAR_SIZE), %edx

	cmpl $(8 / CHAR_SIZE), %edx
# ifdef USE_AS_WCSCPY
	jz L(zfill_less_8_set_ret)
	movl (%rsi, %rdx, CHAR_SIZE), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax

L(zfill_less_8_set_ret):
# ifdef USE_AS_STPCPY
	movl %ecx, (%rdi, %rdx, CHAR_SIZE)

	/* Overfill to avoid branches.  */
	movl -3(%rsi, %rdx), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, -3(%rdi, %rdx)

# ifdef USE_AS_STPCPY
	movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY

# ifdef USE_AS_STPCPY
	movb %cl, (%rdi, %rdx)

	vmovd %VMM_128(0), %r8d
	movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
	movb %r8b, (%rdi, %rdx)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
	movb %r8b, (%rdi, %rdx)

# ifndef USE_AS_WCSCPY
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rcx), %rax

	movw $0, -1(%rdi, %rdx)

	jne L(best_effort_strncpy)

	andq $(VEC_SIZE * -1), %rax
	VPCMPEQ (%rax), %VZERO, %k0
# ifdef USE_AS_WCSCPY
	andl $(CHAR_PER_VEC - 1), %r8d
	shrx %VR8, %VRCX, %VRCX

	shrx %VRSI, %VRCX, %VRCX
	/* Compute the number of bytes we checked.  */
	andl $(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY

	/* If rax > rdx then we are finishing the copy at the end of the
	   page.  */
	jb L(page_cross_small)
	/* If rcx is non-zero then continue.  */
	jz L(page_cross_continue)

	/* We found a zero-CHAR so we need to copy then zfill (we know
	   we didn't cover all of the length here).  */
# ifdef USE_AS_STPCPY
	leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax

# ifdef USE_AS_WCSCPY
	jmp L(zfill_from_page_cross)

	jbe L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi

L(page_cross_copy_only):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
L(best_effort_strncpy):
	/* The length is >= 2^63.  We very much expect to segfault at
	   the rep stos.  If that doesn't happen then just strcpy to
	   finish.  */