1 /* SSE2 version of strlen.
2 Copyright (C) 2012-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* Long lived registers in strlen(s), strnlen(s, n) are:
25 %r10 (s+n) & (~(64-1))
33 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
35 pcmpeqb (%rax), %xmm8; /* Each byte of %xmm8 becomes 0xff where it equals the matching byte of bytes 0-15 at %rax, else 0. For this to flag zero bytes %xmm8-%xmm11 must be zero on entry -- TODO confirm at the call sites. */ \
36 pcmpeqb 16(%rax), %xmm9; /* Same comparison for bytes 16-31, result in %xmm9. */ \
37 pcmpeqb 32(%rax), %xmm10; /* Same comparison for bytes 32-47, result in %xmm10. */ \
38 pcmpeqb 48(%rax), %xmm11; /* Same comparison for bytes 48-63, result in %xmm11. */ \
39 pmovmskb %xmm8, %esi; /* %esi bits 0-15 = top bit of each byte of %xmm8 (match mask for bytes 0-15). */ \
40 pmovmskb %xmm9, %edx; /* %edx bits 0-15 = match mask for bytes 16-31. */ \
41 pmovmskb %xmm10, %r8d; /* %r8d bits 0-15 = match mask for bytes 32-47. */ \
42 pmovmskb %xmm11, %ecx; /* %ecx bits 0-15 = match mask for bytes 48-63. */ \
51 /* Do not read anything when n==0. */
58 /* Initialize long lived registers. */
73 /* Offsets 4032-4047 will be aligned down to 4032 and thus fit into the page. */
75 /* We cannot unify this branching as it would be ~6 cycles slower. */
79 /* Test if end is among first 64 bytes. */
80 # define STRNLEN_PROLOG \
87 # define STRNLEN_PROLOG andq $-64, %rax; /* Clear the low 6 bits of %rax, i.e. round it down to a multiple of 64. */
90 /* Ignore bits in mask that come before start of string. */
105 /* Test first 16 bytes unaligned. */
106 movdqu (%rax), %xmm12 /* Load 16 bytes from the address in %rax; movdqu has no alignment requirement. */
107 pcmpeqb %xmm8, %xmm12 /* Each byte of %xmm12 becomes 0xff where it equals the matching byte of %xmm8, else 0 (presumably %xmm8 is all zero here -- confirm). */
108 pmovmskb %xmm12, %edx /* %edx bits 0-15 = top bit of each byte of %xmm12, i.e. the per-byte match mask. */
111 bsf %edx, %eax /* %eax = index of the lowest set bit of %edx; the result is undefined when %edx is 0. If eax is zeroed, a 16-bit bsf can be used. */
115 /* Same as FIND_ZERO except we do not check the first 16 bytes. */
117 pcmpeqb 16(%rax), %xmm9 /* Each byte of %xmm9 becomes 0xff where it equals the matching byte of bytes 16-31 at %rax, else 0. */
118 pcmpeqb 32(%rax), %xmm10 /* Same comparison for bytes 32-47, result in %xmm10. */
119 pcmpeqb 48(%rax), %xmm11 /* Same comparison for bytes 48-63, result in %xmm11. */
121 pmovmskb %xmm10, %r8d /* %r8d bits 0-15 = match mask for bytes 32-47. */
122 pmovmskb %xmm11, %ecx /* %ecx bits 0-15 = match mask for bytes 48-63. */
130 /* When no zero byte is found xmm9-11 are zero so we do not have to
141 /* We must do this check to correctly handle strnlen (s, -1). */
164 pminub 16(%rax), %xmm8 /* %xmm8 = bytewise unsigned minimum of %xmm8 and bytes 16-31 at %rax. */
165 pminub 32(%rax), %xmm8 /* Fold bytes 32-47 into the running minimum. */
166 pminub 48(%rax), %xmm8 /* Fold bytes 48-63 into the running minimum. */
167 pcmpeqb %xmm11, %xmm8 /* Set each byte of %xmm8 to 0xff where the running minimum equals the matching byte of %xmm11, else 0. Presumably %xmm11 is zero, so a set byte means one of the merged 16-byte blocks has a zero at that position -- confirm. */
176 je L(first) /* Do not read when end is at page boundary. */
199 /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
203 movdqa 64(%rax), %xmm8 /* Aligned 16-byte load of bytes 64-79 past %rax; movdqa requires a 16-byte aligned address. */
204 pminub 80(%rax), %xmm8 /* %xmm8 = bytewise unsigned minimum of %xmm8 and bytes 80-95. */
205 pminub 96(%rax), %xmm8 /* Fold bytes 96-111 into the running minimum. */
206 pminub 112(%rax), %xmm8 /* Fold bytes 112-127; %xmm8 now holds the per-position minimum of the four blocks at 64..127(%rax). */
207 pcmpeqb %xmm11, %xmm8 /* Set each byte of %xmm8 to 0xff where that minimum equals the matching byte of %xmm11, else 0 (presumably %xmm11 is zero -- confirm). */
215 pminub 16(%rax), %xmm8 /* Second unrolled half: fold bytes 16-31 at %rax into %xmm8. */
216 pminub 32(%rax), %xmm8 /* Fold bytes 32-47 into the running minimum. */
217 pminub 48(%rax), %xmm8 /* Fold bytes 48-63 into the running minimum. */
218 pcmpeqb %xmm11, %xmm8 /* Same equality test against %xmm11 as in the first half. */
239 libc_hidden_builtin_def (strlen)