sysdeps/i386/i586/strchr.S

   1 /* Find character CH in a NUL terminated string.
   2    Highly optimized version for ix85, x>=5.
   3    Copyright (C) 1995,1996,1997,2000,2003,2005 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 #include <sysdep.h>
  23 #include "asm-syntax.h"
  24 #include "bp-sym.h"
  25 #include "bp-asm.h"
  26
  27 /* This version is especially optimized for the i586 (and following?)
  28    processors.  This is mainly done by using the two pipelines.  The
  29    version optimized for i486 is weak in this aspect because to get
  30    as much parallelism we have to execute some *more* instructions.
  31
  32    The code below is structured to reflect the pairing of the instructions
  33    as *I think* it is.  I have no processor data book to verify this.
  34    If you find something you think is incorrect let me know.  */
  35
  36
  37 /* The magic value which is used throughout in the whole code.  */
  38 #define magic 0xfefefeff
  39
  40 #define PARMS   LINKAGE+16      /* space for 4 saved regs */
  41 #define RTN     PARMS
  42 #define STR     RTN+RTN_SIZE
  43 #define CHR     STR+PTR_SIZE
  44
  45         .text
  46 ENTRY (BP_SYM (strchr))
  47         ENTER
  48
  49         pushl %edi              /* Save callee-safe registers.  */
  50         cfi_adjust_cfa_offset (-4)
  51         pushl %esi
  52         cfi_adjust_cfa_offset (-4)
  53
  54         pushl %ebx
  55         cfi_adjust_cfa_offset (-4)
  56         pushl %ebp
  57         cfi_adjust_cfa_offset (-4)
  58
  59         movl STR(%esp), %eax
  60         movl CHR(%esp), %edx
  61         CHECK_BOUNDS_LOW (%eax, STR(%esp))
  62
  63         movl %eax, %edi         /* duplicate string pointer for later */
  64         cfi_rel_offset (edi, 12)
  65         xorl %ecx, %ecx         /* clear %ecx */
  66
  67         /* At the moment %edx contains C.  What we need for the
  68            algorithm is C in all bytes of the dword.  Avoid
  69            operations on 16 bit words because these require an
  70            prefix byte (and one more cycle).  */
  71         movb %dl, %dh           /* now it is 0|0|c|c */
  72         movb %dl, %cl           /* we construct the lower half in %ecx */
  73
  74         shll $16, %edx          /* now %edx is c|c|0|0 */
  75         movb %cl, %ch           /* now %ecx is 0|0|c|c */
  76
  77         orl %ecx, %edx          /* and finally c|c|c|c */
  78         andl $3, %edi           /* mask alignment bits */
  79
  80         jz L(11)                /* alignment is 0 => start loop */
  81
  82         movb %dl, %cl           /* 0 is needed below */
  83         jp L(0)                 /* exactly two bits set */
  84
  85         xorb (%eax), %cl        /* is byte the one we are looking for? */
  86         jz L(2)                 /* yes => return pointer */
  87
  88         xorb %dl, %cl           /* load single byte and test for NUL */
  89         je L(3)                 /* yes => return NULL */
  90
  91         movb 1(%eax), %cl       /* load single byte */
  92         incl %eax
  93
  94         cmpb %cl, %dl           /* is byte == C? */
  95         je L(2)                 /* aligned => return pointer */
  96
  97         cmpb $0, %cl            /* is byte NUL? */
  98         je L(3)                 /* yes => return NULL */
  99
 100         incl %eax
 101         decl %edi
 102
 103         jne L(11)
 104
 105 L(0):   movb (%eax), %cl        /* load single byte */
 106
 107         cmpb %cl, %dl           /* is byte == C? */
 108         je L(2)                 /* aligned => return pointer */
 109
 110         cmpb $0, %cl            /* is byte NUL? */
 111         je L(3)                 /* yes => return NULL */
 112
 113         incl %eax               /* increment pointer */
 114
 115         cfi_rel_offset (esi, 8)
 116         cfi_rel_offset (ebx, 4)
 117         cfi_rel_offset (ebp, 0)
 118
 119         /* The following code is the preparation for the loop.  The
 120            four instruction up to `L1' will not be executed in the loop
 121            because the same code is found at the end of the loop, but
 122            there it is executed in parallel with other instructions.  */
 123 L(11):  movl (%eax), %ecx
 124         movl $magic, %ebp
 125
 126         movl $magic, %edi
 127         addl %ecx, %ebp
 128
 129         /* The main loop: it looks complex and indeed it is.  I would
 130            love to say `it was hard to write, so it should he hard to
 131            read' but I will give some more hints.  To fully understand
 132            this code you should first take a look at the i486 version.
 133            The basic algorithm is the same, but here the code organized
 134            in a way which permits to use both pipelines all the time.
 135
 136            I tried to make it a bit more understandable by indenting
 137            the code according to stage in the algorithm.  It goes as
 138            follows:
 139                 check for 0 in 1st word
 140                         check for C in 1st word
 141                                         check for 0 in 2nd word
 142                                                 check for C in 2nd word
 143                 check for 0 in 3rd word
 144                         check for C in 3rd word
 145                                         check for 0 in 4th word
 146                                                 check for C in 4th word
 147
 148            Please note that doing the test for NUL before the test for
 149            C allows us to overlap the test for 0 in the next word with
 150            the test for C.  */
 151
 152 L(1):   xorl %ecx, %ebp                 /* (word^magic) */
 153         addl %ecx, %edi                 /* add magic word */
 154
 155         leal 4(%eax), %eax              /* increment pointer */
 156         jnc L(4)                        /* previous addl caused overflow? */
 157
 158                 movl %ecx, %ebx         /* duplicate original word */
 159         orl $magic, %ebp                /* (word^magic)|magic */
 160
 161         addl $1, %ebp                   /* (word^magic)|magic == 0xffffffff? */
 162         jne L(4)                                /* yes => we found word with NUL */
 163
 164                 movl $magic, %esi       /* load magic value */
 165                 xorl %edx, %ebx         /* clear words which are C */
 166
 167                                         movl (%eax), %ecx
 168                 addl %ebx, %esi         /* (word+magic) */
 169
 170                                         movl $magic, %edi
 171                 jnc L(5)                /* previous addl caused overflow? */
 172
 173                                         movl %edi, %ebp
 174                 xorl %ebx, %esi         /* (word+magic)^word */
 175
 176                                         addl %ecx, %ebp
 177                 orl $magic, %esi        /* ((word+magic)^word)|magic */
 178
 179                 addl $1, %esi           /* ((word+magic)^word)|magic==0xf..f?*/
 180                 jne L(5)                /* yes => we found word with C */
 181
 182                                         xorl %ecx, %ebp
 183                                         addl %ecx, %edi
 184
 185                                         leal 4(%eax), %eax
 186                                         jnc L(4)
 187
 188                                                 movl %ecx, %ebx
 189                                         orl $magic, %ebp
 190
 191                                         addl $1, %ebp
 192                                         jne L(4)
 193
 194                                                 movl $magic, %esi
 195                                                 xorl %edx, %ebx
 196
 197         movl (%eax), %ecx
 198                                                 addl %ebx, %esi
 199
 200         movl $magic, %edi
 201                                                 jnc L(5)
 202
 203         movl %edi, %ebp
 204                                                 xorl %ebx, %esi
 205
 206         addl %ecx, %ebp
 207                                                 orl $magic, %esi
 208
 209                                                 addl $1, %esi
 210                                                 jne L(5)
 211
 212         xorl %ecx, %ebp
 213         addl %ecx, %edi
 214
 215         leal 4(%eax), %eax
 216         jnc L(4)
 217
 218                 movl %ecx, %ebx
 219         orl $magic, %ebp
 220
 221         addl $1, %ebp
 222         jne L(4)
 223
 224                 movl $magic, %esi
 225                 xorl %edx, %ebx
 226
 227                                         movl (%eax), %ecx
 228                 addl %ebx, %esi
 229
 230                                         movl $magic, %edi
 231                 jnc L(5)
 232
 233                                         movl %edi, %ebp
 234                 xorl %ebx, %esi
 235
 236                                         addl %ecx, %ebp
 237                 orl $magic, %esi
 238
 239                 addl $1, %esi
 240                 jne L(5)
 241
 242                                         xorl %ecx, %ebp
 243                                         addl %ecx, %edi
 244
 245                                         leal 4(%eax), %eax
 246                                         jnc L(4)
 247
 248                                                 movl %ecx, %ebx
 249                                         orl $magic, %ebp
 250
 251                                         addl $1, %ebp
 252                                         jne L(4)
 253
 254                                                 movl $magic, %esi
 255                                                 xorl %edx, %ebx
 256
 257         movl (%eax), %ecx
 258                                                 addl %ebx, %esi
 259
 260         movl $magic, %edi
 261                                                 jnc L(5)
 262
 263         movl %edi, %ebp
 264                                                 xorl %ebx, %esi
 265
 266         addl %ecx, %ebp
 267                                                 orl $magic, %esi
 268
 269                                                 addl $1, %esi
 270
 271                                                 je L(1)
 272
 273         /* We know there is no NUL byte but a C byte in the word.
 274            %ebx contains NUL in this particular byte.  */
 275 L(5):   subl $4, %eax           /* adjust pointer */
 276         testb %bl, %bl          /* first byte == C? */
 277
 278         jz L(2)                 /* yes => return pointer */
 279
 280         incl %eax               /* increment pointer */
 281         testb %bh, %bh          /* second byte == C? */
 282
 283         jz L(2)                 /* yes => return pointer */
 284
 285         shrl $16, %ebx          /* make upper bytes accessible */
 286         incl %eax               /* increment pointer */
 287
 288         cmp $0, %bl             /* third byte == C */
 289         je L(2)                 /* yes => return pointer */
 290
 291         incl %eax               /* increment pointer */
 292
 293 L(2):   CHECK_BOUNDS_HIGH (%eax, STR(%esp), jb)
 294         RETURN_BOUNDED_POINTER (STR(%esp))
 295 L(out): popl %ebp               /* restore saved registers */
 296         cfi_adjust_cfa_offset (-4)
 297         cfi_restore (ebp)
 298         popl %ebx
 299         cfi_adjust_cfa_offset (-4)
 300         cfi_restore (ebx)
 301
 302         popl %esi
 303         cfi_adjust_cfa_offset (-4)
 304         cfi_restore (esi)
 305         popl %edi
 306         cfi_adjust_cfa_offset (-4)
 307         cfi_restore (edi)
 308
 309         LEAVE
 310         RET_PTR
 311
 312         cfi_adjust_cfa_offset (16)
 313         cfi_rel_offset (edi, 12)
 314         cfi_rel_offset (esi, 8)
 315         cfi_rel_offset (ebx, 4)
 316         cfi_rel_offset (ebp, 0)
 317         /* We know there is a NUL byte in the word.  But we have to test
 318            whether there is an C byte before it in the word.  */
 319 L(4):   subl $4, %eax           /* adjust pointer */
 320         cmpb %dl, %cl           /* first byte == C? */
 321
 322         je L(2)                 /* yes => return pointer */
 323
 324         cmpb $0, %cl            /* first byte == NUL? */
 325         je L(3)                 /* yes => return NULL */
 326
 327         incl %eax               /* increment pointer */
 328
 329         cmpb %dl, %ch           /* second byte == C? */
 330         je L(2)                 /* yes => return pointer */
 331
 332         cmpb $0, %ch            /* second byte == NUL? */
 333         je L(3)                 /* yes => return NULL */
 334
 335         shrl $16, %ecx          /* make upper bytes accessible */
 336         incl %eax               /* increment pointer */
 337
 338         cmpb %dl, %cl           /* third byte == C? */
 339         je L(2)                 /* yes => return pointer */
 340
 341         cmpb $0, %cl            /* third byte == NUL? */
 342         je L(3)                 /* yes => return NULL */
 343
 344         incl %eax               /* increment pointer */
 345
 346         /* The test four the fourth byte is necessary!  */
 347         cmpb %dl, %ch           /* fourth byte == C? */
 348         je L(2)                 /* yes => return pointer */
 349
 350 L(3):   xorl %eax, %eax
 351         RETURN_NULL_BOUNDED_POINTER
 352         jmp L(out)
 353 END (BP_SYM (strchr))
 354
 355 #undef index
 356 weak_alias (BP_SYM (strchr), BP_SYM (index))
 357 libc_hidden_builtin_def (strchr)