sysdeps/x86_64/multiarch/memrchr-sse2.S

   1 /* memrchr optimized with SSE2.
   2    Copyright (C) 2017-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 /* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
  22    so we need this to build for ISA V2 builds. */
  23 #if ISA_SHOULD_BUILD (2)
  24
  25 # ifndef MEMRCHR
  26 #  define MEMRCHR __memrchr_sse2
  27 # endif
  28
  29 # include <sysdep.h>
  30 # define VEC_SIZE                       16
  31 # define PAGE_SIZE                      4096
  32
  33         .text
  34 ENTRY_P2ALIGN(MEMRCHR, 6)
  35 # ifdef __ILP32__
  36         /* Clear upper bits.  */
  37         mov     %RDX_LP, %RDX_LP
  38 # endif
  39         movd    %esi, %xmm0
  40
  41         /* Get end pointer.  */
  42         leaq    (%rdx, %rdi), %rcx
  43
  44         punpcklbw %xmm0, %xmm0
  45         punpcklwd %xmm0, %xmm0
  46         pshufd  $0, %xmm0, %xmm0
  47
  48         /* Check if we can load 1x VEC without cross a page.  */
  49         testl   $(PAGE_SIZE - VEC_SIZE), %ecx
  50         jz      L(page_cross)
  51
  52         /* NB: This load happens regardless of whether rdx (len) is zero. Since
  53            it doesn't cross a page and the standard gurantees any pointer have
  54            at least one-valid byte this load must be safe. For the entire
  55            history of the x86 memrchr implementation this has been possible so
  56            no code "should" be relying on a zero-length check before this load.
  57            The zero-length check is moved to the page cross case because it is
  58            1) pretty cold and including it pushes the hot case len <= VEC_SIZE
  59            into 2-cache lines.  */
  60         movups  -(VEC_SIZE)(%rcx), %xmm1
  61         pcmpeqb %xmm0, %xmm1
  62         pmovmskb %xmm1, %eax
  63
  64         subq    $VEC_SIZE, %rdx
  65         ja      L(more_1x_vec)
  66 L(ret_vec_x0_test):
  67         /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
  68            zero.  */
  69         bsrl    %eax, %eax
  70         jz      L(ret_0)
  71         /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
  72            if out of bounds.  */
  73         addl    %edx, %eax
  74         jl      L(zero_0)
  75         /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
  76            ptr.  */
  77         addq    %rdi, %rax
  78 L(ret_0):
  79         ret
  80
  81         .p2align 4,, 5
  82 L(ret_vec_x0):
  83         bsrl    %eax, %eax
  84         leaq    -(VEC_SIZE)(%rcx, %rax), %rax
  85         ret
  86
  87         .p2align 4,, 2
  88 L(zero_0):
  89         xorl    %eax, %eax
  90         ret
  91
  92
  93         .p2align 4,, 8
  94 L(more_1x_vec):
  95         testl   %eax, %eax
  96         jnz     L(ret_vec_x0)
  97
  98         /* Align rcx (pointer to string).  */
  99         decq    %rcx
 100         andq    $-VEC_SIZE, %rcx
 101
 102         movq    %rcx, %rdx
 103         /* NB: We could consistenyl save 1-byte in this pattern with `movaps
 104            %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
 105            it adds more frontend uops (even if the moves can be eliminated) and
 106            some percentage of the time actual backend uops.  */
 107         movaps  -(VEC_SIZE)(%rcx), %xmm1
 108         pcmpeqb %xmm0, %xmm1
 109         subq    %rdi, %rdx
 110         pmovmskb %xmm1, %eax
 111
 112         cmpq    $(VEC_SIZE * 2), %rdx
 113         ja      L(more_2x_vec)
 114 L(last_2x_vec):
 115         subl    $VEC_SIZE, %edx
 116         jbe     L(ret_vec_x0_test)
 117
 118         testl   %eax, %eax
 119         jnz     L(ret_vec_x0)
 120
 121         movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
 122         pcmpeqb %xmm0, %xmm1
 123         pmovmskb %xmm1, %eax
 124
 125         subl    $VEC_SIZE, %edx
 126         bsrl    %eax, %eax
 127         jz      L(ret_1)
 128         addl    %edx, %eax
 129         jl      L(zero_0)
 130         addq    %rdi, %rax
 131 L(ret_1):
 132         ret
 133
 134         /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
 135            causes the hot pause (length <= VEC_SIZE) to span multiple cache
 136            lines.  Naturally aligned % 16 to 8-bytes.  */
 137 L(page_cross):
 138         /* Zero length check.  */
 139         testq   %rdx, %rdx
 140         jz      L(zero_0)
 141
 142         leaq    -1(%rcx), %r8
 143         andq    $-(VEC_SIZE), %r8
 144
 145         movaps  (%r8), %xmm1
 146         pcmpeqb %xmm0, %xmm1
 147         pmovmskb %xmm1, %esi
 148         /* Shift out negative alignment (because we are starting from endptr and
 149            working backwards).  */
 150         negl    %ecx
 151         /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
 152            explicitly.  */
 153         andl    $(VEC_SIZE - 1), %ecx
 154         shl     %cl, %esi
 155         movzwl  %si, %eax
 156         leaq    (%rdi, %rdx), %rcx
 157         cmpq    %rdi, %r8
 158         ja      L(more_1x_vec)
 159         subl    $VEC_SIZE, %edx
 160         bsrl    %eax, %eax
 161         jz      L(ret_2)
 162         addl    %edx, %eax
 163         jl      L(zero_1)
 164         addq    %rdi, %rax
 165 L(ret_2):
 166         ret
 167
 168         /* Fits in aliging bytes.  */
 169 L(zero_1):
 170         xorl    %eax, %eax
 171         ret
 172
 173         .p2align 4,, 5
 174 L(ret_vec_x1):
 175         bsrl    %eax, %eax
 176         leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
 177         ret
 178
 179         .p2align 4,, 8
 180 L(more_2x_vec):
 181         testl   %eax, %eax
 182         jnz     L(ret_vec_x0)
 183
 184         movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
 185         pcmpeqb %xmm0, %xmm1
 186         pmovmskb %xmm1, %eax
 187         testl   %eax, %eax
 188         jnz     L(ret_vec_x1)
 189
 190
 191         movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
 192         pcmpeqb %xmm0, %xmm1
 193         pmovmskb %xmm1, %eax
 194
 195         subq    $(VEC_SIZE * 4), %rdx
 196         ja      L(more_4x_vec)
 197
 198         addl    $(VEC_SIZE), %edx
 199         jle     L(ret_vec_x2_test)
 200
 201 L(last_vec):
 202         testl   %eax, %eax
 203         jnz     L(ret_vec_x2)
 204
 205         movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
 206         pcmpeqb %xmm0, %xmm1
 207         pmovmskb %xmm1, %eax
 208
 209         subl    $(VEC_SIZE), %edx
 210         bsrl    %eax, %eax
 211         jz      L(ret_3)
 212         addl    %edx, %eax
 213         jl      L(zero_2)
 214         addq    %rdi, %rax
 215 L(ret_3):
 216         ret
 217
 218         .p2align 4,, 6
 219 L(ret_vec_x2_test):
 220         bsrl    %eax, %eax
 221         jz      L(zero_2)
 222         addl    %edx, %eax
 223         jl      L(zero_2)
 224         addq    %rdi, %rax
 225         ret
 226
 227 L(zero_2):
 228         xorl    %eax, %eax
 229         ret
 230
 231
 232         .p2align 4,, 5
 233 L(ret_vec_x2):
 234         bsrl    %eax, %eax
 235         leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
 236         ret
 237
 238         .p2align 4,, 5
 239 L(ret_vec_x3):
 240         bsrl    %eax, %eax
 241         leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
 242         ret
 243
 244         .p2align 4,, 8
 245 L(more_4x_vec):
 246         testl   %eax, %eax
 247         jnz     L(ret_vec_x2)
 248
 249         movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
 250         pcmpeqb %xmm0, %xmm1
 251         pmovmskb %xmm1, %eax
 252
 253         testl   %eax, %eax
 254         jnz     L(ret_vec_x3)
 255
 256         addq    $-(VEC_SIZE * 4), %rcx
 257         cmpq    $(VEC_SIZE * 4), %rdx
 258         jbe     L(last_4x_vec)
 259
 260         /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
 261            keeping the code from spilling to the next cache line.  */
 262         addq    $(VEC_SIZE * 4 - 1), %rcx
 263         andq    $-(VEC_SIZE * 4), %rcx
 264         leaq    (VEC_SIZE * 4)(%rdi), %rdx
 265         andq    $-(VEC_SIZE * 4), %rdx
 266
 267         .p2align 4,, 11
 268 L(loop_4x_vec):
 269         movaps  (VEC_SIZE * -1)(%rcx), %xmm1
 270         movaps  (VEC_SIZE * -2)(%rcx), %xmm2
 271         movaps  (VEC_SIZE * -3)(%rcx), %xmm3
 272         movaps  (VEC_SIZE * -4)(%rcx), %xmm4
 273         pcmpeqb %xmm0, %xmm1
 274         pcmpeqb %xmm0, %xmm2
 275         pcmpeqb %xmm0, %xmm3
 276         pcmpeqb %xmm0, %xmm4
 277
 278         por     %xmm1, %xmm2
 279         por     %xmm3, %xmm4
 280         por     %xmm2, %xmm4
 281
 282         pmovmskb %xmm4, %esi
 283         testl   %esi, %esi
 284         jnz     L(loop_end)
 285
 286         addq    $-(VEC_SIZE * 4), %rcx
 287         cmpq    %rdx, %rcx
 288         jne     L(loop_4x_vec)
 289
 290         subl    %edi, %edx
 291
 292         /* Ends up being 1-byte nop.  */
 293         .p2align 4,, 2
 294 L(last_4x_vec):
 295         movaps  -(VEC_SIZE)(%rcx), %xmm1
 296         pcmpeqb %xmm0, %xmm1
 297         pmovmskb %xmm1, %eax
 298
 299         cmpl    $(VEC_SIZE * 2), %edx
 300         jbe     L(last_2x_vec)
 301
 302         testl   %eax, %eax
 303         jnz     L(ret_vec_x0)
 304
 305
 306         movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
 307         pcmpeqb %xmm0, %xmm1
 308         pmovmskb %xmm1, %eax
 309
 310         testl   %eax, %eax
 311         jnz     L(ret_vec_end)
 312
 313         movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
 314         pcmpeqb %xmm0, %xmm1
 315         pmovmskb %xmm1, %eax
 316
 317         subl    $(VEC_SIZE * 3), %edx
 318         ja      L(last_vec)
 319         bsrl    %eax, %eax
 320         jz      L(ret_4)
 321         addl    %edx, %eax
 322         jl      L(zero_3)
 323         addq    %rdi, %rax
 324 L(ret_4):
 325         ret
 326
 327         /* Ends up being 1-byte nop.  */
 328         .p2align 4,, 3
 329 L(loop_end):
 330         pmovmskb %xmm1, %eax
 331         sall    $16, %eax
 332         jnz     L(ret_vec_end)
 333
 334         pmovmskb %xmm2, %eax
 335         testl   %eax, %eax
 336         jnz     L(ret_vec_end)
 337
 338         pmovmskb %xmm3, %eax
 339         /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
 340            then it won't affect the result in esi (VEC4). If ecx is non-zero
 341            then CHAR in VEC3 and bsrq will use that position.  */
 342         sall    $16, %eax
 343         orl     %esi, %eax
 344         bsrl    %eax, %eax
 345         leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
 346         ret
 347
 348 L(ret_vec_end):
 349         bsrl    %eax, %eax
 350         leaq    (VEC_SIZE * -2)(%rax, %rcx), %rax
 351         ret
 352         /* Use in L(last_4x_vec). In the same cache line. This is just a spare
 353            aligning bytes.  */
 354 L(zero_3):
 355         xorl    %eax, %eax
 356         ret
 357         /* 2-bytes from next cache line.  */
 358 END(MEMRCHR)
 359 #endif