sysdeps/x86_64/multiarch/strchr-sse2.S

   1 /* strchr optimized with SSE2.
   2    Copyright (C) 2009-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
   21 /* Build when MINIMUM_X86_ISA_LEVEL <= 2: there is no V2 implementation,
   22    so this file is also needed for ISA V2 builds.  */
  23 #if ISA_SHOULD_BUILD (2)
  24
   25 # ifndef STRCHR
      /* Default name for the entry point defined below.  If STRCHR is
         already defined when this point is reached, that name is kept.  */
   26 #  define STRCHR __strchr_sse2
   27 # endif
  28
  29 # include <sysdep.h>
  30
   31         .text
      /* STRCHR: find the first occurrence of a byte in a NUL-terminated
         string, scanning 16 bytes at a time with SSE2.

         In:   %rdi = string, %esi = byte to find (low byte, %sil).
         Out:  %rax = address of the first byte equal to %sil.  If the
               terminating NUL comes first, %rax is the address of that
               NUL when AS_STRCHRNUL is defined and 0 otherwise.
         Clobbers: %rax, %rcx, %rdx, %rdi, %r8, %r9, %xmm0-%xmm6, flags.

         Throughout, a "hit" is a byte equal to either the target byte or
         NUL.  Each 16-byte compare yields a 16-bit mask via pmovmskb, and
         four masks are packed into one 64-bit word so that a single bsfq
         gives the offset of the first hit within a 64-byte window.  */
   32 ENTRY (STRCHR)
              /* Broadcast the target byte to all 16 lanes of %xmm1
                 (punpcklbw, punpcklwd, pshufd).  Interleaved with that is
                 a test of the offset of %rdi within a 4096-byte span:
                 an offset above 4032 means a 64-byte read starting at
                 %rdi would run past the end of that span (presumably a
                 page, going by the label name), so take L(cross_page).  */
   33         movd    %esi, %xmm1
   34         movl    %edi, %eax
   35         andl    $4095, %eax
   36         punpcklbw %xmm1, %xmm1
   37         cmpl    $4032, %eax
   38         punpcklwd %xmm1, %xmm1
   39         pshufd  $0, %xmm1, %xmm1
   40         jg      L(cross_page)
              /* First 16 bytes, unaligned load.  %xmm3 = 0, and
                 %xmm0 = (byte == target) | (byte == 0); %eax = hit mask.  */
   41         movdqu  (%rdi), %xmm0
   42         pxor    %xmm3, %xmm3
   43         movdqa  %xmm0, %xmm4
   44         pcmpeqb %xmm1, %xmm0
   45         pcmpeqb %xmm3, %xmm4
   46         por     %xmm4, %xmm0
   47         pmovmskb %xmm0, %eax
   48         test    %eax, %eax
   49         je      L(next_48_bytes)
              /* Hit in the first 16 bytes: %eax = index of the first one.  */
   50         bsf     %eax, %eax
   51 # ifdef AS_STRCHRNUL
   52         leaq    (%rdi,%rax), %rax
   53 # else
              /* If the first hit is not the target byte it is the NUL
                 terminator, so return 0 (taken from %rdx) instead.  */
   54         movl    $0, %edx
   55         leaq    (%rdi,%rax), %rax
   56         cmpb    %sil, (%rax)
   57         cmovne  %rdx, %rax
   58 # endif
   59         ret
   60
   61         .p2align 3
   62 L(next_48_bytes):
              /* No hit in bytes 0..15.  Test bytes 16..63 with three more
                 unaligned loads and pack their masks into %rax at bit
                 positions 16, 32 and 48, so that bit N corresponds to
                 byte N from %rdi.  The last compare overwrites the zero
                 held in %xmm3; later code uses other registers for zero.  */
   63         movdqu  16(%rdi), %xmm0
   64         movdqa  %xmm0, %xmm4
   65         pcmpeqb %xmm1, %xmm0
   66         pcmpeqb %xmm3, %xmm4
   67         por     %xmm4, %xmm0
   68         pmovmskb %xmm0, %ecx
   69         movdqu  32(%rdi), %xmm0
   70         movdqa  %xmm0, %xmm4
   71         pcmpeqb %xmm1, %xmm0
   72         salq    $16, %rcx
   73         pcmpeqb %xmm3, %xmm4
   74         por     %xmm4, %xmm0
   75         pmovmskb %xmm0, %eax
   76         movdqu  48(%rdi), %xmm0
   77         pcmpeqb %xmm0, %xmm3
   78         salq    $32, %rax
   79         pcmpeqb %xmm1, %xmm0
   80         orq     %rcx, %rax
   81         por     %xmm3, %xmm0
   82         pmovmskb %xmm0, %ecx
   83         salq    $48, %rcx
   84         orq     %rcx, %rax
   85         testq   %rax, %rax
   86         jne     L(return)
   87 L(loop_start):
   88         /* We use this alignment to force the loop to be aligned to 8 but
   89            not 16 bytes.  This gives better scheduling on AMD processors.  */
   90         .p2align 4
              /* Loop setup: %xmm6 = 0, and %rdi is rounded down to a
                 64-byte boundary.  The loop adds 64 before each load, so
                 it begins at the first aligned block above the incoming
                 %rdi.  Both paths that reach this point have already
                 tested every byte from the string start up to that block.  */
   91         pxor    %xmm6, %xmm6
   92         andq    $-64, %rdi
   93         .p2align 3
   94 L(loop64):
              /* For each of the four aligned 16-byte chunks compute
                 min (chunk ^ target, chunk) with unsigned byte minimum:
                 a lane is 0 exactly when the byte is the target or NUL.
                 %xmm2/%xmm3/%xmm4 keep the results for chunks 1..3, while
                 %xmm5 accumulates the minimum over all four, so a single
                 compare against zero detects a hit anywhere in the block.  */
   95         addq    $64, %rdi
   96         movdqa  (%rdi), %xmm5
   97         movdqa  16(%rdi), %xmm2
   98         movdqa  32(%rdi), %xmm3
   99         pxor    %xmm1, %xmm5
  100         movdqa  48(%rdi), %xmm4
  101         pxor    %xmm1, %xmm2
  102         pxor    %xmm1, %xmm3
  103         pminub  (%rdi), %xmm5
  104         pxor    %xmm1, %xmm4
  105         pminub  16(%rdi), %xmm2
  106         pminub  32(%rdi), %xmm3
  107         pminub  %xmm2, %xmm5
  108         pminub  48(%rdi), %xmm4
  109         pminub  %xmm3, %xmm5
  110         pminub  %xmm4, %xmm5
  111         pcmpeqb %xmm6, %xmm5
  112         pmovmskb %xmm5, %eax
  113
  114         testl   %eax, %eax
  115         je      L(loop64)
  116
              /* A hit exists somewhere in this block.  %xmm5 was merged
                 with the other chunks, so chunk 0 is recomputed from
                 memory; chunks 1..3 are recovered by comparing the saved
                 minima with zero.  */
  117         movdqa  (%rdi), %xmm5
  118         movdqa  %xmm5, %xmm0
  119         pcmpeqb %xmm1, %xmm5
  120         pcmpeqb %xmm6, %xmm0
  121         por     %xmm0, %xmm5
  122         pcmpeqb %xmm6, %xmm2
  123         pcmpeqb %xmm6, %xmm3
  124         pcmpeqb %xmm6, %xmm4
  125
              /* Pack the four masks into %rax: chunk 0 in bits 0..15
                 (%rcx), chunk 1 in bits 16..31, chunk 2 in bits 32..47
                 (%r8), chunk 3 in bits 48..63 (%rdx).  */
  126         pmovmskb %xmm5, %ecx
  127         pmovmskb %xmm2, %eax
  128         salq    $16, %rax
  129         pmovmskb %xmm3, %r8d
  130         pmovmskb %xmm4, %edx
  131         salq    $32, %r8
  132         orq     %r8, %rax
  133         orq     %rcx, %rax
  134         salq    $48, %rdx
  135         orq     %rdx, %rax
  136         .p2align 3
  137 L(return):
              /* %rax = nonzero hit mask in which bit N corresponds to
                 byte N from %rdi; bsfq picks the first hit.  */
  138         bsfq    %rax, %rax
  139 # ifdef AS_STRCHRNUL
  140         leaq    (%rdi,%rax), %rax
  141 # else
              /* Return 0 when the first hit is the NUL terminator rather
                 than the target byte.  */
  142         movl    $0, %edx
  143         leaq    (%rdi,%rax), %rax
  144         cmpb    %sil, (%rax)
  145         cmovne  %rdx, %rax
  146 # endif
  147         ret
  148         .p2align 4
  149
  150 L(cross_page):
              /* The offset of %rdi within its 4096-byte span exceeds 4032,
                 so instead of reading 64 bytes from %rdi, read the aligned
                 64-byte block that contains it (%rdx = %rdi & -64) with
                 aligned loads.  %xmm2 = 0, and %xmm0 is a copy of the
                 broadcast target that is consumed by the last chunk.
                 Masks: chunk 0 in %r8, chunk 1 in %rax << 16, chunk 2 in
                 %r9 << 32, chunk 3 in %rcx << 48, all merged into %rax.  */
  151         movq    %rdi, %rdx
  152         pxor    %xmm2, %xmm2
  153         andq    $-64, %rdx
  154         movdqa  %xmm1, %xmm0
  155         movdqa  (%rdx), %xmm3
  156         movdqa  %xmm3, %xmm4
  157         pcmpeqb %xmm1, %xmm3
  158         pcmpeqb %xmm2, %xmm4
  159         por     %xmm4, %xmm3
  160         pmovmskb %xmm3, %r8d
  161         movdqa  16(%rdx), %xmm3
  162         movdqa  %xmm3, %xmm4
  163         pcmpeqb %xmm1, %xmm3
  164         pcmpeqb %xmm2, %xmm4
  165         por     %xmm4, %xmm3
  166         pmovmskb %xmm3, %eax
  167         movdqa  32(%rdx), %xmm3
  168         movdqa  %xmm3, %xmm4
  169         pcmpeqb %xmm1, %xmm3
  170         salq    $16, %rax
  171         pcmpeqb %xmm2, %xmm4
  172         por     %xmm4, %xmm3
  173         pmovmskb %xmm3, %r9d
  174         movdqa  48(%rdx), %xmm3
  175         pcmpeqb %xmm3, %xmm2
  176         salq    $32, %r9
  177         pcmpeqb %xmm3, %xmm0
  178         orq     %r9, %rax
  179         orq     %r8, %rax
  180         por     %xmm2, %xmm0
  181         pmovmskb %xmm0, %ecx
  182         salq    $48, %rcx
  183         orq     %rcx, %rax
              /* %cl = %rdi - %rdx, a value in 0..63.  Shifting right by
                 that amount drops the bits for bytes that precede the
                 string and makes bit N correspond to byte N from %rdi,
                 which is what L(return) expects.  With no hit left, carry
                 on with the aligned loop from the next 64-byte block.  */
  184         movl    %edi, %ecx
  185         subb    %dl, %cl
  186         shrq    %cl, %rax
  187         testq   %rax, %rax
  188         jne     L(return)
  189         jmp     L(loop_start)
  190
  191 END (STRCHR)
 192 #endif