sysdeps/x86_64/multiarch/strncat-avx2.S

   1 /* strncat with AVX2
   2    Copyright (C) 2022-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (3)
  22
  23 # include <sysdep.h>
  24
  25 # ifndef VEC_SIZE
  26 #  include "x86-avx-vecs.h"
  27 # endif
  28
  29 # ifndef STRNCAT
  30 #  define STRNCAT       __strncat_avx2
  31 # endif
  32
  33 # ifdef USE_AS_WCSCPY
  34 #  define MOVCHAR       movl
  35 #  define VPCMPEQ       vpcmpeqd
  36 #  define VPMIN vpminud
  37 #  define CHAR_SIZE     4
  38 # else
  39 #  define MOVCHAR       movb
  40 #  define VPCMPEQ       vpcmpeqb
  41 #  define VPMIN vpminub
  42 #  define CHAR_SIZE     1
  43 # endif
  44
  45 # include "strncpy-or-cat-overflow-def.h"
  46
  47 # define PAGE_SIZE      4096
  48
  49 # define VZERO  VMM(7)
  50 # define VZERO_128      VMM_128(7)
  51
  52         .section SECTION(.text), "ax", @progbits
  53 ENTRY(STRNCAT)
  54 # ifdef __ILP32__
  55         /* Clear the upper 32 bits.  */
  56         movl    %edx, %edx
  57 # endif
  58         /* Filter zero length strings and very long strings.  Zero
  59            length strings just return, very long strings are handled by
  60            using the non-length variant {wcs|str}cat.  */
  61         movq    %rdi, %rax
  62 # ifdef USE_AS_WCSCPY
  63         leaq    -1(%rdx), %rcx
  64         shr     $56, %rcx
  65         jnz     L(zero_len)
  66         salq    $2, %rdx
  67 # else
  68         test    %rdx, %rdx
  69         jl      L(zero_len)
  70 # endif
  71         vpxor   %VZERO_128, %VZERO_128, %VZERO_128
  72
  73 # include "strcat-strlen-avx2.h.S"
  74
  75         movl    %esi, %ecx
  76         andl    $(PAGE_SIZE - 1), %ecx
  77         cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
  78         ja      L(page_cross)
  79 L(page_cross_continue):
  80         VMOVU   (%rsi), %VMM(0)
  81         VPCMPEQ %VMM(0), %VZERO, %VMM(6)
  82         vpmovmskb %VMM(6), %ecx
  83
  84         tzcnt   %ecx, %r8d
  85         cmpq    %r8, %rdx
  86         jbe     L(less_1x_vec)
  87
  88         testl   %ecx, %ecx
  89         jz      L(more_1x_vec)
  90
  91         /* Hoist this to save code size.  */
  92
  93         movl    %r8d, %edx
  94
  95 L(less_1x_vec):
  96         COND_VZEROUPPER
  97
  98         cmpl    $16, %edx
  99         jae     L(copy_16_31)
 100         cmpl    $8, %edx
 101         jae     L(copy_8_15)
 102
 103
 104 # ifdef USE_AS_WCSCPY
 105         vmovd   %VMM_128(0), (%rdi)
 106         MOVCHAR $0, (%rdi, %rdx)
 107         ret
 108 # else
 109         cmpl    $4, %edx
 110         jae     L(copy_4_7)
 111
 112         movzbl  (%rsi), %ecx
 113         cmpl    $1, %edx
 114         jbe     L(set_null_term)
 115
 116         /* NB: make this `vmovw` if support for AVX512-FP16 is added.
 117          */
 118         movzwl  1(%rsi), %esi
 119         movw    %si, 1(%rdi)
 120
 121         .p2align 4,, 1
 122 L(set_null_term):
 123         movb    %cl, (%rdi)
 124         MOVCHAR $0, (%rdi, %rdx)
 125         ret
 126
 127         .p2align 4,, 11
 128 L(copy_4_7):
 129         movl    -(4)(%rsi, %rdx), %ecx
 130         vmovd   %xmm0, (%rdi)
 131         movl    %ecx, -(4)(%rdi, %rdx)
 132         MOVCHAR $0, (%rdi, %rdx)
 133         ret
 134 # endif
 135
 136
 137         .p2align 4,, 10
 138 L(copy_16_31):
 139         VMOVU   -(16)(%rsi, %rdx), %xmm1
 140         VMOVU   %xmm0, (%rdi)
 141         VMOVU   %xmm1, -(16)(%rdi, %rdx)
 142         MOVCHAR $0, (%rdi, %rdx)
 143         ret
 144
 145         .p2align 4,, 10
 146 L(copy_8_15):
 147         movq    -(8)(%rsi, %rdx), %rcx
 148         vmovq   %xmm0, (%rdi)
 149         movq    %rcx, -(8)(%rdi, %rdx)
 150         MOVCHAR $0, (%rdi, %rdx)
 151         ret
 152
 153         .p2align 4,, 8
 154         .p2align 6,, 14
 155 L(more_1x_vec):
 156         VMOVU   %VMM(0), (%rdi)
 157
 158         /* Align rsi (src) and just rdx/rdi (length/dst).  */
 159         addq    %rsi, %rdx
 160         subq    %rsi, %rdi
 161         orq     $(VEC_SIZE - 1), %rsi
 162         incq    %rsi
 163         addq    %rsi, %rdi
 164 L(loop_last_4x_vec):
 165         subq    %rsi, %rdx
 166         VMOVA   0(%rsi), %VMM(1)
 167         VPCMPEQ %VMM(1), %VZERO, %VMM(6)
 168         vpmovmskb %VMM(6), %ecx
 169         cmpq    $(VEC_SIZE * 2), %rdx
 170         ja      L(more_2x_vec)
 171 L(last_2x_vec):
 172         tzcnt   %ecx, %ecx
 173         cmpl    %ecx, %edx
 174         jbe     L(ret_vec_x1_len)
 175
 176         cmpl    $VEC_SIZE, %ecx
 177         jnz     L(ret_vec_x1)
 178
 179         VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
 180         VMOVU   %VMM(1), (%rdi)
 181         VPCMPEQ %VMM(2), %VZERO, %VMM(6)
 182         vpmovmskb %VMM(6), %ecx
 183         addl    $-VEC_SIZE, %edx
 184         bzhil   %edx, %ecx, %r8d
 185         jz      L(ret_vec_x2_len)
 186 L(ret_vec_x2):
 187         bsfl    %ecx, %edx
 188 L(ret_vec_x2_len):
 189         VMOVU   (%rsi, %rdx), %VMM(0)
 190         MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx)
 191         VMOVU   %VMM(0), (%rdi, %rdx)
 192 L(return_vzeroupper):
 193         ZERO_UPPER_VEC_REGISTERS_RETURN
 194
 195
 196         .p2align 4,, 12
 197 L(ret_vec_x1_len):
 198         movl    %edx, %ecx
 199 L(ret_vec_x1):
 200         VMOVU   -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
 201         MOVCHAR $0, (%rdi, %rcx)
 202         VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rcx)
 203         VZEROUPPER_RETURN
 204
 205         .p2align 4,, 8
 206 L(last_4x_vec):
 207         subq    $-(VEC_SIZE * 4), %rsi
 208         VMOVA   0(%rsi), %VMM(1)
 209         VPCMPEQ %VMM(1), %VZERO, %VMM(6)
 210         vpmovmskb %VMM(6), %ecx
 211         subq    $-(VEC_SIZE * 4), %rdi
 212         addl    $-(VEC_SIZE * 4), %edx
 213         cmpl    $(VEC_SIZE * 2), %edx
 214         jbe     L(last_2x_vec)
 215         .p2align 4,, 8
 216 L(more_2x_vec):
 217         /* L(ret_vec_x1) expects ecx to have position of first match so
 218            test with bsf.  */
 219         bsfl    %ecx, %ecx
 220         jnz     L(ret_vec_x1)
 221
 222         VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
 223         VMOVU   %VMM(1), (%rdi)
 224
 225         VPCMPEQ %VMM(2), %VZERO, %VMM(6)
 226         vpmovmskb %VMM(6), %ecx
 227         testl   %ecx, %ecx
 228         jnz     L(ret_vec_x2)
 229
 230
 231         VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
 232         VMOVU   %VMM(2), (VEC_SIZE * 1)(%rdi)
 233
 234         VPCMPEQ %VMM(3), %VZERO, %VMM(6)
 235         vpmovmskb %VMM(6), %ecx
 236
 237         /* Check if length is greater than 4x VEC.  */
 238         cmpq    $(VEC_SIZE * 4), %rdx
 239         ja      L(more_4x_vec)
 240
 241         addl    $(VEC_SIZE * -2), %edx
 242
 243         tzcnt   %ecx, %ecx
 244         cmpl    %ecx, %edx
 245         jbe     L(ret_vec_x3_len)
 246
 247         cmpl    $VEC_SIZE, %ecx
 248         jnz     L(ret_vec_x3)
 249
 250         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
 251         VMOVU   %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
 252         VPCMPEQ %VMM(4), %VZERO, %VMM(6)
 253         vpmovmskb %VMM(6), %ecx
 254         addl    $-VEC_SIZE, %edx
 255         bzhil   %edx, %ecx, %r8d
 256         jz      L(ret_vec_x4_len)
 257 L(ret_vec_x4):
 258         bsfl    %ecx, %edx
 259 L(ret_vec_x4_len):
 260         VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
 261         MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx)
 262         VMOVU   %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
 263         VZEROUPPER_RETURN
 264
 265         .p2align 4,, 4
 266 L(ret_vec_x3_len):
 267         movl    %edx, %ecx
 268 L(ret_vec_x3):
 269         VMOVU   (VEC_SIZE)(%rsi, %rcx), %VMM(0)
 270         MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx)
 271         VMOVU   %VMM(0), (VEC_SIZE)(%rdi, %rcx)
 272         VZEROUPPER_RETURN
 273
 274
 275         .p2align 4,, 8
 276 L(more_4x_vec):
 277         bsfl    %ecx, %ecx
 278         jnz     L(ret_vec_x3)
 279
 280         VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
 281         VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
 282         VPCMPEQ %VMM(4), %VZERO, %VMM(6)
 283         vpmovmskb %VMM(6), %ecx
 284         testl   %ecx, %ecx
 285         jnz     L(ret_vec_x4)
 286
 287         VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
 288
 289
 290         /* Recheck length before aligning.  */
 291         cmpq    $(VEC_SIZE * 8), %rdx
 292         jbe     L(last_4x_vec)
 293
 294         /* Align rsi (src) and just rdx/rdi (length/dst).  */
 295         addq    %rsi, %rdx
 296         subq    %rsi, %rdi
 297         subq    $-(VEC_SIZE * 4), %rsi
 298         andq    $(VEC_SIZE * -4), %rsi
 299
 300         /* Do first half of loop ahead of time so loop can just start by
 301            storing.  */
 302         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 303         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 304         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 305         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 306
 307         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 308         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 309         VPMIN   %VMM(4), %VMM(6), %VMM(6)
 310         VPCMPEQ %VMM(6), %VZERO, %VMM(6)
 311         vpmovmskb %VMM(6), %r8d
 312         addq    %rsi, %rdi
 313         testl   %r8d, %r8d
 314         jnz     L(loop_4x_done)
 315
 316         /* Use r9 for end of region before handling last 4x VEC
 317            specially.  */
 318         leaq    -(VEC_SIZE * 4)(%rdx), %r9
 319
 320         .p2align 4,, 11
 321 L(loop_4x_vec):
 322
 323         VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
 324         VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
 325         subq    $(VEC_SIZE * -4), %rsi
 326         VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
 327         VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
 328
 329         subq    $(VEC_SIZE * -4), %rdi
 330         cmpq    %rsi, %r9
 331         jbe     L(loop_last_4x_vec)
 332
 333         VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
 334         VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
 335         VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
 336         VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
 337
 338         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 339         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 340         VPMIN   %VMM(4), %VMM(6), %VMM(6)
 341         VPCMPEQ %VMM(6), %VZERO, %VMM(6)
 342
 343         vpmovmskb %VMM(6), %r8d
 344
 345         testl   %r8d, %r8d
 346         jz      L(loop_4x_vec)
 347
 348 L(loop_4x_done):
 349         VPCMPEQ %VMM(0), %VZERO, %VMM(6)
 350         vpmovmskb %VMM(6), %ecx
 351         /* L(ret_vec_x1) expects ecx to have position of first match so
 352            test with bsf.  */
 353         bsfl    %ecx, %ecx
 354         jnz     L(ret_vec_x1)
 355         VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
 356
 357         VPCMPEQ %VMM(1), %VZERO, %VMM(6)
 358         vpmovmskb %VMM(6), %ecx
 359
 360         testl   %ecx, %ecx
 361         jnz     L(ret_vec_x2)
 362         VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
 363
 364         VPCMPEQ %VMM(2), %VZERO, %VMM(6)
 365         vpmovmskb %VMM(6), %ecx
 366         bsfl    %ecx, %ecx
 367         jnz     L(ret_vec_x3)
 368
 369         VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
 370         bsfl    %r8d, %r8d
 371         VMOVU   (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
 372         VMOVU   %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
 373         VZEROUPPER_RETURN
 374
 375
 376
 377         .p2align 4,, 4
 378 L(page_cross):
 379         movq    %rsi, %r8
 380         andq    $(VEC_SIZE * -1), %r8
 381
 382         VPCMPEQ (%r8), %VZERO, %VMM(6)
 383
 384         vpmovmskb %VMM(6), %ecx
 385         shrxl   %esi, %ecx, %ecx
 386
 387         subl    %esi, %r8d
 388         andl    $(VEC_SIZE - 1), %r8d
 389         cmpq    %r8, %rdx
 390         jb      L(page_cross_small)
 391
 392         /* Optimizing more aggressively for space as this is very cold
 393            code. This saves 2x cache lines.  */
 394
 395         /* This adds once to the later result which will get correct
 396            copy bounds. NB: this can never zero-out a non-zero RCX as
 397            to be in the page cross case rsi cannot be aligned and we
 398            already right-shift rcx by the misalignment.  */
 399         shll    $CHAR_SIZE, %ecx
 400         jz      L(page_cross_continue)
 401         bsfl    %ecx, %ecx
 402         rep     movsb
 403         VZEROUPPER_RETURN
 404
 405 L(page_cross_small):
 406         tzcntl  %ecx, %ecx
 407         jz      L(page_cross_setz)
 408         cmpl    %edx, %ecx
 409         cmova   %edx, %ecx
 410         rep     movsb
 411 L(page_cross_setz):
 412         MOVCHAR $0, (%rdi)
 413         VZEROUPPER_RETURN
 414 L(zero_len):
 415 # ifdef USE_AS_WCSCPY
 416         test    %rdx, %rdx
 417 # endif
 418         jnz     OVERFLOW_STRCAT
 419         ret
 420
 421
 422 END(STRNCAT)
 423 #endif