sysdeps/x86_64/multiarch/strcpy-avx2.S

   1 /* strcpy with AVX2
   2    Copyright (C) 2011-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (3)
  22
  23 # include <sysdep.h>
  24
  25 # ifndef VEC_SIZE
             /* Only pull in the default vector configuration when the
                including file has not already defined VEC_SIZE.
                NOTE(review): presumably "x86-avx-vecs.h" supplies VEC_SIZE,
                VMM(), VMM_128(), VMOVU/VMOVA, SECTION() and the vzeroupper
                macros used below -- confirm against that header.  */
  26 #  include "x86-avx-vecs.h"
  27 # endif
  28
             /* Name of the symbol emitted by ENTRY/END below.  A file that
                includes this one can predefine STRCPY to reuse the body
                under another name.  */
  29 # ifndef STRCPY
  30 #  define STRCPY        __strcpy_avx2
  31 # endif
  32
  33         /* Use movsb in page cross case to save code size.  */
             /* Selects between the two implementations under L(page_cross):
                non-zero assembles the `rep movsb' version, zero assembles the
                branchy overlapping-copy version.  */
  34 # define USE_MOVSB_IN_PAGE_CROSS        1
  35
             /* Element width.  With USE_AS_WCSCPY characters are 4 bytes wide
                and the compare/min instructions operate on dwords; otherwise
                characters are single bytes.  */
  36 # ifdef USE_AS_WCSCPY
  37 #  define VPCMPEQ       vpcmpeqd
  38 #  define VPMIN vpminud
  39 #  define CHAR_SIZE     4
  40 # else
  41 #  define VPCMPEQ       vpcmpeqb
  42 #  define VPMIN vpminub
  43 #  define CHAR_SIZE     1
  44 # endif
  45
             /* Page size used by the entry check that decides whether the
                first unaligned vector load may cross a page boundary.  */
  46 # define PAGE_SIZE      4096
  47
             /* Address operand, used as `(%END_REG)', that refers to the
                terminator position in the destination on the short-copy
                paths.  With USE_AS_STPCPY rax has already been set to
                dst + index-of-terminator, so rax alone is used.  Otherwise
                the macro expands inside the parentheses to `(%rdi, %rdx)',
                i.e. dst base plus the terminator index held in rdx.  */
  48 # ifdef USE_AS_STPCPY
  49 #  define END_REG       rax
  50 # else
  51 #  define END_REG       rdi, %rdx
  52 # endif
  53
             /* Scratch register for the page-offset computation at entry.
                With USE_AS_STRCAT rax is loaded with the return value before
                that computation, so ecx is used instead of eax.  */
  54 # ifdef USE_AS_STRCAT
  55 #  define PAGE_ALIGN_REG        ecx
  56 # else
  57 #  define PAGE_ALIGN_REG        eax
  58 # endif
  59
             /* Vector register 7 is zeroed at function entry and then only
                read, as the all-zero operand of every terminator compare.
                VZERO_128 names the same register through VMM_128().  */
  60 # define VZERO  VMM(7)
  61 # define VZERO_128      VMM_128(7)
  62
  63         .section SECTION(.text), "ax", @progbits
             /* STRCPY: copy the zero-terminated string at rsi to rdi.

                The same body is assembled as several variants:
                  - default:        rax = original rdi.
                  - USE_AS_STPCPY:  rax = address of the terminator written
                                    to the destination.
                  - USE_AS_STRCAT:  rax = original rdi; the file
                                    "strcat-strlen-avx2.h.S" is included
                                    before the copy (NOTE(review): presumably
                                    it advances rdi to the end of the existing
                                    destination string -- not visible here).
                  - USE_AS_WCSCPY:  elements are CHAR_SIZE (4) bytes wide.

                Registers written by the code visible in this file: rax, rcx,
                rdx, rsi, rdi, vector registers 0-4, 6 and 7, and the flags.

                This first part handles a terminator found inside the first
                vector loaded from src; longer strings continue at
                L(more_1x_vec) and sources close to the end of a page are
                first routed through L(page_cross).  */
  64 ENTRY(STRCPY)
             /* VZERO = 0.  It is only read from here on.  */
  65         vpxor   %VZERO_128, %VZERO_128, %VZERO_128
  66
  67 # ifdef USE_AS_STRCAT
             /* Return value is the original destination.  */
  68         movq    %rdi, %rax
  69 #  include "strcat-strlen-avx2.h.S"
  70 # endif
  71
             /* Compute src's offset within its page.  If it is greater than
                PAGE_SIZE - VEC_SIZE, fewer than VEC_SIZE bytes remain in the
                page and an unaligned vector load from rsi would touch the
                next page, so that case is checked separately.  */
  72         movl    %esi, %PAGE_ALIGN_REG
  73         andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
  74         cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
  75         ja      L(page_cross)
             /* Also entered from L(page_cross) when no terminator was found
                before the end of the aligned block containing src.  */
  76 L(page_cross_continue):
  77 # if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
             /* Plain copy: return value is dst.  */
  78         movq    %rdi, %rax
  79 # endif
             /* Load the first vector of src, compare every element against
                zero and collect the result as a byte mask in ecx.  */
  80         VMOVU   (%rsi), %VMM(0)
  81         VPCMPEQ %VMM(0), %VZERO, %VMM(6)
  82         vpmovmskb %VMM(6), %ecx
  83
             /* Empty mask: no terminator in the first vector.  */
  84         testl   %ecx, %ecx
  85         jz      L(more_1x_vec)
  86
  87         /* No longer need ymm registers so just vzeroupper so it doesn't
  88            need to be duplicated at each return statement.  */
  89         COND_VZEROUPPER
  90
             /* rdx = byte index of the terminator within the first vector.
                NOTE(review): edx is cleared first, presumably to avoid a
                dependency of bsf on the previous value of its destination.  */
  91         xorl    %edx, %edx
  92         bsfl    %ecx, %edx
  93 # ifdef USE_AS_STPCPY
             /* Return value: address of the terminator in dst.  This is also
                what END_REG refers to in this configuration.  */
  94         leaq    (%rdi, %rdx), %rax
  95 # endif
  96
  97         /* Use mask bits in rcx to detect which copy we need. If the low
  98            mask is zero then there must be a bit set in the upper half.
  99            I.e if ecx != 0 and cx == 0, then match must be upper 16
 100            bits so we use L(copy_16_31).  */
 101         testw   %cx, %cx
 102         jz      L(copy_16_31)
 103
             /* Low 8 mask bits clear: terminator index is in 8..15.  */
 104         testb   %cl, %cl
 105         jz      L(copy_8_15)
 106 # ifdef USE_AS_WCSCPY
             /* Terminator is the first or second wide character: store the
                first 4 bytes of src, then a zero dword at the terminator
                position.  */
 107         vmovd   %xmm0, (%rdi)
 108         movl    $0, (%END_REG)
 109         ret
 110 # else
             /* Mask bits 0..2 clear: terminator index is in 3..7.  */
 111         testb   $0x7, %cl
 112         jz      L(copy_4_7)
 113
             /* Terminator index is 0, 1 or 2.  Index 0 is the empty string
                and only needs the terminator.  Otherwise store the first two
                bytes and then the terminator.  */
 114         testl   %edx, %edx
 115         jz      L(set_null_term)
 116         vmovd   %xmm0, %ecx
 117         movw    %cx, (%rdi)
 118
 119         .p2align 4,, 2
 120 L(set_null_term):
 121         movb    $0, (%END_REG)
 122         ret
 123
 124         .p2align 4,, 12
             /* Terminator index 3..7: copy with two 4-byte stores that may
                overlap, the first 4 bytes of src and the 4 bytes that end at
                the terminator.  The tail is loaded before anything is
                stored.  */
 125 L(copy_4_7):
 126         movl    -3(%rsi, %rdx), %ecx
 127         vmovd   %xmm0, (%rdi)
 128         movl    %ecx, -3(%END_REG)
 129         ret
 130 # endif
 131
 132         .p2align 4,, 10
             /* Terminator index 16..31: same scheme with two 16-byte moves;
                the second one covers the 16 bytes ending with the terminator
                element.  */
 133 L(copy_16_31):
 134         VMOVU   -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
 135         VMOVU   %xmm0, (%rdi)
 136         VMOVU   %xmm1, -(16 - CHAR_SIZE)(%END_REG)
 137         ret
 138
 139         .p2align 4,, 10
             /* Terminator index 8..15.  The byte variant loads the 8 bytes
                ending with the terminator.  The wide variant loads only the
                4-byte character before the terminator; the 32-bit load
                zero-extends into rcx, so the 8-byte store below writes that
                character followed by a zero terminator.  */
 140 L(copy_8_15):
 141 # ifdef USE_AS_WCSCPY
 142         movl    -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 143 # else
 144         movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
 145 # endif
 146         vmovq   %xmm0, (%rdi)
 147         movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
 148         ret
 149
 150
 151         .p2align 4,, 8
             /* Reached when the first vector of src (still in VMM(0)) holds
                no terminator.  Copies up to four more vectors one at a time,
                then falls into a loop that handles four vectors per
                iteration.  */
 152 L(more_1x_vec):
 153 # if defined USE_AS_STPCPY || defined USE_AS_STRCAT
             /* rax does not hold the current destination in these variants,
                so the first vector is stored through rdi before rdi is
                modified below.  */
 154         VMOVU   %VMM(0), (%rdi)
 155 # endif
             /* Round rsi up to the last byte of its VEC_SIZE-aligned block
                and move rdi by the same amount (rdi - rsi is preserved), so
                that 1(%rsi) is a VEC_SIZE-aligned source address and 1(%rdi)
                is the matching destination address.  */
 156         subq    %rsi, %rdi
 157         orq     $(VEC_SIZE - 1), %rsi
 158         addq    %rsi, %rdi
 159         VMOVA   1(%rsi), %VMM(1)
 160
 161         /* Try and order stores after as many loads as is reasonable to
 162            avoid potential false dependencies.  */
 163 # if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
             /* rax still equals the original dst here.  */
 164         VMOVU   %VMM(0), (%rax)
 165 # endif
             /* For each following vector: test it for a terminator before
                storing it; on a hit the matching L(ret_vec_xN) finishes the
                copy.  The load of the next vector is issued before the store
                of the previous one.  */
 166         VPCMPEQ %VMM(1), %VZERO, %VMM(6)
 167         vpmovmskb %VMM(6), %ecx
 168         testl   %ecx, %ecx
 169         jnz     L(ret_vec_x1)
 170
 171         VMOVA   (VEC_SIZE + 1)(%rsi), %VMM(2)
 172         VMOVU   %VMM(1), 1(%rdi)
 173
 174         VPCMPEQ %VMM(2), %VZERO, %VMM(6)
 175         vpmovmskb %VMM(6), %ecx
 176         testl   %ecx, %ecx
 177         jnz     L(ret_vec_x2)
 178
 179         VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
 180         VMOVU   %VMM(2), (VEC_SIZE + 1)(%rdi)
 181
 182         VPCMPEQ %VMM(3), %VZERO, %VMM(6)
 183         vpmovmskb %VMM(6), %ecx
 184         testl   %ecx, %ecx
 185         jnz     L(ret_vec_x3)
 186
 187         VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
 188         VMOVU   %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
 189         VPCMPEQ %VMM(4), %VZERO, %VMM(6)
             /* This mask goes to edx rather than ecx because L(ret_vec_x4)
                takes its mask in edx.  */
 190         vpmovmskb %VMM(6), %edx
 191         testl   %edx, %edx
 192         jnz     L(ret_vec_x4)
 193
 194         VMOVU   %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
 195
 196         /* Subtract rsi from rdi before aligning. Adding back rsi will
 197            get proper rdi (dst) for new src.  */
             /* After the incq rsi is VEC_SIZE-aligned; the orq moves it to
                the last byte of the enclosing (VEC_SIZE * 4)-aligned block,
                so 1(%rsi) is aligned to VEC_SIZE * 4.  That address is never
                past the end of the data copied so far, so the loop may
                re-copy some bytes that were already stored.  */
 198         subq    %rsi, %rdi
 199         incq    %rsi
 200         orq     $(VEC_SIZE * 4 - 1), %rsi
 201
 202         /* Do first half of loop ahead of time so loop can just start by
 203            storing.  */
 204         VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
 205         VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
 206         VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
 207         VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
 208
             /* The element-wise unsigned minimum of the four vectors is zero
                exactly where at least one of them has a zero element, so a
                single compare detects a terminator anywhere in the four.  */
 209         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 210         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 211         VPMIN   %VMM(4), %VMM(6), %VMM(6)
 212         VPCMPEQ %VMM(6), %VZERO, %VMM(6)
 213         vpmovmskb %VMM(6), %edx
             /* Restore rdi as the destination matching the realigned rsi.  */
 214         addq    %rsi, %rdi
 215
 216         testl   %edx, %edx
 217         jnz     L(loop_4x_done)
 218
 219         .p2align 4,, 11
             /* Loop invariant at the top: VMM(0)..VMM(3) hold four source
                vectors loaded from 1(%rsi) that contain no terminator, and
                1(%rdi) is where they belong in the destination.  */
 220 L(loop_4x_vec):
 221
 222         VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
 223         VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
             /* rsi += VEC_SIZE * 4, written as a subtraction of the negated
                constant.  NOTE(review): presumably chosen because the negative
                immediate has a shorter encoding -- confirm.  */
 224         subq    $(VEC_SIZE * -4), %rsi
 225         VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
 226         VMOVU   %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
 227
 228
 229         VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
 230         VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
 231         VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
 232         VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
 233
 234         VPMIN   %VMM(0), %VMM(1), %VMM(4)
 235         VPMIN   %VMM(2), %VMM(3), %VMM(6)
 236         VPMIN   %VMM(4), %VMM(6), %VMM(6)
 237         VPCMPEQ %VMM(6), %VZERO, %VMM(6)
 238
 239         vpmovmskb %VMM(6), %edx
             /* rdi += VEC_SIZE * 4, keeping it in step with rsi.  */
 240         subq    $(VEC_SIZE * -4), %rdi
 241         testl   %edx, %edx
 242         jz      L(loop_4x_vec)
 243
             /* One of VMM(0)..VMM(3) contains the terminator.  Test them in
                order, storing each vector that is terminator-free.  */
 244 L(loop_4x_done):
 245         VPCMPEQ %VMM(0), %VZERO, %VMM(6)
 246         vpmovmskb %VMM(6), %ecx
 247         testl   %ecx, %ecx
 248         jnz     L(ret_vec_x1)
 249         VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
 250
 251         VPCMPEQ %VMM(1), %VZERO, %VMM(6)
 252         vpmovmskb %VMM(6), %ecx
 253         testl   %ecx, %ecx
 254         jnz     L(ret_vec_x2)
 255         VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
 256
 257         VPCMPEQ %VMM(2), %VZERO, %VMM(6)
 258         vpmovmskb %VMM(6), %ecx
 259         testl   %ecx, %ecx
 260         jnz     L(ret_vec_x3)
 261         VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
             /* Terminator is in the fourth vector.  When falling through from
                above, the first three vectors had no zero element, so the
                combined mask in edx is the fourth vector's mask.  rdx becomes
                the terminator's byte index inside that vector, and one
                unaligned VEC_SIZE load/store pair copies the VEC_SIZE bytes
                that end with the terminator element, overlapping data that
                was already stored.  */
 262 L(ret_vec_x4):
 263         bsfl    %edx, %edx
 264         VMOVU   ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
 265         VMOVU   %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 266 # ifdef USE_AS_STPCPY
             /* Return value: address of the terminator in dst.  */
 267         leaq    (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 268 # endif
             /* NOTE(review): this label is not referenced by the code visible
                in this file.  */
 269 L(return_end):
 270         VZEROUPPER_RETURN
 271
 272         .p2align 4,, 8
             /* Terminator is in the vector at 1(%rsi); ecx holds its mask.
                Same tail copy as L(ret_vec_x4) with the index in rcx.  Entered
                both from L(more_1x_vec) and from L(loop_4x_done).  */
 273 L(ret_vec_x1):
 274         bsfl    %ecx, %ecx
 275         VMOVU   (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
 276         VMOVU   %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 277 # ifdef USE_AS_STPCPY
 278         leaq    1(%rcx, %rdi), %rax
 279 # endif
             /* NOTE(review): this label is not referenced by the code visible
                in this file; presumably it is used by the vzeroupper return
                macros defined elsewhere -- confirm.  */
 280 L(return_vzeroupper):
 281         ZERO_UPPER_VEC_REGISTERS_RETURN
 282
 283         .p2align 4,, 8
             /* Terminator is in the vector at (VEC_SIZE + 1)(%rsi).  */
 284 L(ret_vec_x2):
 285         bsfl    %ecx, %ecx
 286         VMOVU   ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
 287         VMOVU   %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 288 # ifdef USE_AS_STPCPY
 289         leaq    (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 290 # endif
 291         VZEROUPPER_RETURN
 292
 293         .p2align 4,, 8
             /* Terminator is in the vector at (VEC_SIZE * 2 + 1)(%rsi).  */
 294 L(ret_vec_x3):
 295         bsfl    %ecx, %ecx
 296         VMOVU   ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
 297         VMOVU   %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 298 # ifdef USE_AS_STPCPY
 299         leaq    (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 300 # endif
 301         VZEROUPPER_RETURN
 302
 303
 304         .p2align 4,, 4
             /* Reached from the entry check when fewer than VEC_SIZE bytes
                remain between src and the end of its page, so an unaligned
                vector load from rsi could touch the following page.

                Instead, the VEC_SIZE-aligned block containing src is compared
                against zero (an aligned block lies within a single page) and
                the resulting mask is shifted right by src's misalignment so
                that bit 0 corresponds to the byte at rsi.  If no terminator
                is found the string continues into the next page and the
                normal path at L(page_cross_continue) is used.  */
 305 L(page_cross):
 306         movq    %rsi, %rcx
 307         andq    $(VEC_SIZE * -1), %rcx
 308
 309         VPCMPEQ (%rcx), %VZERO, %VMM(6)
 310         vpmovmskb %VMM(6), %ecx
             /* Only the low bits of esi are used as the shift count, i.e.
                the misalignment of src within the aligned block.  */
 311         shrxl   %esi, %ecx, %ecx
 312 # if USE_MOVSB_IN_PAGE_CROSS
 313         /* Optimizing more aggressively for space as this is very cold
 314            code. This saves 2x cache lines.  */
 315
 316         /* This adds CHAR_SIZE to the later bsf result, i.e. the copy
 317            length includes the terminator. NB: this can never zero-out
 318            a non-zero RCX as to be in the page cross case rsi cannot be
 319            aligned and we already right-shift rcx by the misalignment.  */
 320         shll    $CHAR_SIZE, %ecx
             /* ZF comes from the shift: an empty mask means no terminator
                before the end of the aligned block.  */
 321         jz      L(page_cross_continue)
             /* rcx = number of bytes to copy, terminator included.  */
 322         bsfl    %ecx, %ecx
 323 #  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
             /* Plain copy: return value is dst.  Must be taken before
                `rep movsb' advances rdi.  */
 324         movq    %rdi, %rax
 325 #  endif
 326         rep     movsb
 327 #  ifdef USE_AS_STPCPY
             /* rdi now points just past the copied terminator; step back
                one character to return the terminator's address.  */
 328         leaq    -CHAR_SIZE(%rdi), %rax
 329 #  endif
 330
 331         VZEROUPPER_RETURN
 332
 333 # else
             /* NOTE(review): this alternative is not assembled while
                USE_MOVSB_IN_PAGE_CROSS is non-zero.  Unlike the first-vector
                paths at function entry, its offsets (-3, -7, -15) and the
                byte-sized terminator store are not parameterized by
                CHAR_SIZE, so it appears to be valid only for CHAR_SIZE == 1
                -- confirm before enabling it together with USE_AS_WCSCPY.  */
 334         testl   %ecx, %ecx
 335         jz      L(page_cross_continue)
 336
 337         /* Traditional copy case, essentially same as used in non-page-
 338            cross case but since we can't reuse VMM(0) we need twice as
 339            many loads from rsi.  */
 340 #  ifndef USE_AS_STRCAT
 341         xorl    %edx, %edx
 342 #  endif
             /* rdx = byte index of the terminator relative to rsi.  */
 343         bsfl    %ecx, %edx
 344 #  ifdef USE_AS_STPCPY
 345         leaq    (%rdi, %rdx), %rax
 346 #  elif !defined USE_AS_STRCAT
 347         movq    %rdi, %rax
 348 #  endif
 349
 350         /* vzeroupper early to avoid duplicating at each return.  */
 351         COND_VZEROUPPER
 352
             /* Low 16 mask bits clear: terminator index is in 16..31.  */
 353         testw   %cx, %cx
 354         jz      L(page_cross_copy_16_31)
 355
             /* Low 8 mask bits clear: terminator index is in 8..15.  */
 356         testb   %cl, %cl
 357         jz      L(page_cross_copy_8_15)
 358
             /* Mask bits 0..2 clear: terminator index is in 3..7.  The
                operand is the 8-bit register cl, so the byte form of test
                is required (same test as on the non-page-cross path).  */
 359         testb   $0x7, %cl
 360         jz      L(page_cross_copy_4_7)
 361
             /* Terminator index is 0, 1 or 2: for a non-empty string copy
                the first two bytes, then write the terminator.  */
 362         testl   %edx, %edx
 363         jz      L(page_cross_set_null_term)
 364         movzwl  (%rsi), %ecx
 365         movw    %cx, (%rdi)
 366 L(page_cross_set_null_term):
 367         movb    $0, (%END_REG)
 368         ret
 369
 370         .p2align 4,, 4
             /* Two possibly overlapping 4-byte moves: the first 4 bytes of
                src and the 4 bytes ending at the terminator.  Both loads are
                done before the stores.  */
 371 L(page_cross_copy_4_7):
 372         movl    (%rsi), %ecx
 373         movl    -3(%rsi, %rdx), %esi
 374         movl    %ecx, (%rdi)
 375         movl    %esi, -3(%END_REG)
 376         ret
 377
 378         .p2align 4,, 4
             /* Same scheme with 8-byte moves.  */
 379 L(page_cross_copy_8_15):
 380         movq    (%rsi), %rcx
 381         movq    -7(%rsi, %rdx), %rsi
 382         movq    %rcx, (%rdi)
 383         movq    %rsi, -7(%END_REG)
 384         ret
 385
 386
 387         .p2align 4,, 3
             /* Same scheme with 16-byte moves.  */
 388 L(page_cross_copy_16_31):
 389         VMOVU   (%rsi), %xmm0
 390         VMOVU   -15(%rsi, %rdx), %xmm1
 391         VMOVU   %xmm0, (%rdi)
 392         VMOVU   %xmm1, -15(%END_REG)
 393         ret
 394 # endif
 395
 396 END(STRCPY)
 397 #endif