1 /* Find character CH in a NUL terminated string.
2 Highly optimized version for ix85, x>=5.
3 Copyright (C) 1995-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 #include "asm-syntax.h"
23 /* This version is especially optimized for the i586 (and following?)
24 processors. This is mainly done by using the two pipelines. The
25 version optimized for i486 is weak in this aspect because to get
26 as much parallelism we have to execute some *more* instructions.
28 The code below is structured to reflect the pairing of the instructions
29 as *I think* it is. I have no processor data book to verify this.
30 If you find something you think is incorrect let me know. */
33 /* The magic value which is used throughout in the whole code. */
34 #define magic 0xfefefeff
36 #define PARMS 4+16 /* space for 4 saved regs */
44 pushl %edi /* Save callee-safe registers. */
45 cfi_adjust_cfa_offset (-4)
47 cfi_adjust_cfa_offset (-4)
50 cfi_adjust_cfa_offset (-4)
52 cfi_adjust_cfa_offset (-4)
57 movl %eax, %edi /* duplicate string pointer for later */
58 cfi_rel_offset (edi, 12)
59 xorl %ecx, %ecx /* clear %ecx */
61 /* At the moment %edx contains C. What we need for the
62 algorithm is C in all bytes of the dword. Avoid
63 operations on 16 bit words because these require an
64 prefix byte (and one more cycle). */
65 movb %dl, %dh /* now it is 0|0|c|c */
66 movb %dl, %cl /* we construct the lower half in %ecx */
68 shll $16, %edx /* now %edx is c|c|0|0 */
69 movb %cl, %ch /* now %ecx is 0|0|c|c */
71 orl %ecx, %edx /* and finally c|c|c|c */
72 andl $3, %edi /* mask alignment bits */
74 jz L(11) /* alignment is 0 => start loop */
76 movb %dl, %cl /* 0 is needed below */
77 jp L(0) /* exactly two bits set */
79 xorb (%eax), %cl /* is byte the one we are looking for? */
80 jz L(out) /* yes => return pointer */
82 xorb %dl, %cl /* load single byte and test for NUL */
83 je L(3) /* yes => return NULL */
85 movb 1(%eax), %cl /* load single byte */
88 cmpb %cl, %dl /* is byte == C? */
89 je L(out) /* aligned => return pointer */
91 cmpb $0, %cl /* is byte NUL? */
92 je L(3) /* yes => return NULL */
99 L(0): movb (%eax), %cl /* load single byte */
101 cmpb %cl, %dl /* is byte == C? */
102 je L(out) /* aligned => return pointer */
104 cmpb $0, %cl /* is byte NUL? */
105 je L(3) /* yes => return NULL */
107 incl %eax /* increment pointer */
109 cfi_rel_offset (esi, 8)
110 cfi_rel_offset (ebx, 4)
111 cfi_rel_offset (ebp, 0)
113 /* The following code is the preparation for the loop. The
114 four instruction up to `L1' will not be executed in the loop
115 because the same code is found at the end of the loop, but
116 there it is executed in parallel with other instructions. */
117 L(11): movl (%eax), %ecx
123 /* The main loop: it looks complex and indeed it is. I would
124 love to say `it was hard to write, so it should he hard to
125 read' but I will give some more hints. To fully understand
126 this code you should first take a look at the i486 version.
127 The basic algorithm is the same, but here the code organized
128 in a way which permits to use both pipelines all the time.
130 I tried to make it a bit more understandable by indenting
131 the code according to stage in the algorithm. It goes as
133 check for 0 in 1st word
134 check for C in 1st word
135 check for 0 in 2nd word
136 check for C in 2nd word
137 check for 0 in 3rd word
138 check for C in 3rd word
139 check for 0 in 4th word
140 check for C in 4th word
142 Please note that doing the test for NUL before the test for
143 C allows us to overlap the test for 0 in the next word with
146 L(1): xorl %ecx, %ebp /* (word^magic) */
147 addl %ecx, %edi /* add magic word */
149 leal 4(%eax), %eax /* increment pointer */
150 jnc L(4) /* previous addl caused overflow? */
152 movl %ecx, %ebx /* duplicate original word */
153 orl $magic, %ebp /* (word^magic)|magic */
155 addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */
156 jne L(4) /* yes => we found word with NUL */
158 movl $magic, %esi /* load magic value */
159 xorl %edx, %ebx /* clear words which are C */
162 addl %ebx, %esi /* (word+magic) */
165 jnc L(5) /* previous addl caused overflow? */
168 xorl %ebx, %esi /* (word+magic)^word */
171 orl $magic, %esi /* ((word+magic)^word)|magic */
173 addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
174 jne L(5) /* yes => we found word with C */
267 /* We know there is no NUL byte but a C byte in the word.
268 %ebx contains NUL in this particular byte. */
269 L(5): subl $4, %eax /* adjust pointer */
270 testb %bl, %bl /* first byte == C? */
272 jz L(out) /* yes => return pointer */
274 incl %eax /* increment pointer */
275 testb %bh, %bh /* second byte == C? */
277 jz L(out) /* yes => return pointer */
279 shrl $16, %ebx /* make upper bytes accessible */
280 incl %eax /* increment pointer */
282 cmp $0, %bl /* third byte == C */
283 je L(out) /* yes => return pointer */
285 incl %eax /* increment pointer */
287 L(out): popl %ebp /* restore saved registers */
288 cfi_adjust_cfa_offset (-4)
291 cfi_adjust_cfa_offset (-4)
295 cfi_adjust_cfa_offset (-4)
298 cfi_adjust_cfa_offset (-4)
303 cfi_adjust_cfa_offset (16)
304 cfi_rel_offset (edi, 12)
305 cfi_rel_offset (esi, 8)
306 cfi_rel_offset (ebx, 4)
307 cfi_rel_offset (ebp, 0)
308 /* We know there is a NUL byte in the word. But we have to test
309 whether there is an C byte before it in the word. */
310 L(4): subl $4, %eax /* adjust pointer */
311 cmpb %dl, %cl /* first byte == C? */
313 je L(out) /* yes => return pointer */
315 cmpb $0, %cl /* first byte == NUL? */
316 je L(3) /* yes => return NULL */
318 incl %eax /* increment pointer */
320 cmpb %dl, %ch /* second byte == C? */
321 je L(out) /* yes => return pointer */
323 cmpb $0, %ch /* second byte == NUL? */
324 je L(3) /* yes => return NULL */
326 shrl $16, %ecx /* make upper bytes accessible */
327 incl %eax /* increment pointer */
329 cmpb %dl, %cl /* third byte == C? */
330 je L(out) /* yes => return pointer */
332 cmpb $0, %cl /* third byte == NUL? */
333 je L(3) /* yes => return NULL */
335 incl %eax /* increment pointer */
337 /* The test four the fourth byte is necessary! */
338 cmpb %dl, %ch /* fourth byte == C? */
339 je L(out) /* yes => return pointer */
341 L(3): xorl %eax, %eax
346 weak_alias (strchr, index)
347 libc_hidden_builtin_def (strchr)