1 /* Optimized strcasestr implementation for PowerPC64/POWER8.
2 Copyright (C) 2016-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <locale-defines.h>
22 /* char * [r3] strcasestr (char *s [r3], char *pat [r4]) */
24 /* The performance gain is obtained by comparing 16 bytes at a time. */
26 /* When the first char of the pattern (r4) has been hit ITERATIONS times in
27 the string (r3), fall back to the default. */
/* Symbol names used by this file: the entry point (STRCASESTR) and the
   names under which the strlen, strnlen and strchr routines are reached.
   NOTE(review): each helper name below has two alternative definitions;
   the preprocessor conditionals that select one of them are not visible
   here - confirm against the full file. */
31 # define STRCASESTR __strcasestr
35 /* For builds without IFUNC support, local calls should be made to internal
36 GLIBC symbol (created by libc_hidden_builtin_def). */
38 # define STRLEN __GI_strlen
40 # define STRLEN strlen
45 /* For builds without IFUNC support, local calls should be made to internal
46 GLIBC symbol (created by libc_hidden_builtin_def). */
48 # define STRNLEN __GI_strnlen
50 # define STRNLEN __strnlen
/* Same pair of alternatives for strchr: the internal __GI_ symbol or the
   plain name. */
56 # define STRCHR __GI_strchr
58 # define STRCHR strchr
62 /* Convert 16 bytes of v4 and reg to lowercase and compare.
   Inputs: v4 and the vector register passed as `reg'; v1 and v2 hold the
   range bounds the bytes are tested against ('A' - 1 and 'Z' + 1 in every
   byte, as loaded by the caller).
   Each operand gets two unsigned byte compares: v6 marks bytes greater
   than v1 and v7 marks bytes that v2 is greater than, so a byte marked in
   both lies strictly between the two bounds.
   The final vcmpequb. compares reg with v4 byte-wise into v6 and, being
   the record form, also updates CR6 for the branch that follows.
   Clobbers v6, v7 and CR6.
   NOTE(review): the steps that combine the two masks and apply the case
   offset are not visible between the compares shown here - confirm
   against the full file. */
63 #define TOLOWER(reg) \
64 vcmpgtub v6, v4, v1; \
65 vcmpgtub v7, v2, v4; \
69 vcmpgtub v6, reg, v1; \
70 vcmpgtub v7, v2, reg; \
74 vcmpequb. v6, reg, v4;
76 /* TODO: change these to the actual instructions when the minimum required
77 binutils allows it. */
/* VCLZD_V8_v7 stands for `vclzd v8, v7'.  It is given once as the mnemonic
   and once as a raw .long word for assemblers that do not know the
   mnemonic.
   NOTE(review): the conditional choosing between the two definitions is
   not visible here. */
79 #define VCLZD_V8_v7 vclzd v8, v7;
81 #define VCLZD_V8_v7 .long 0x11003fc2
/* Size of the stack frame created in the prologue: the minimum frame plus
   48 bytes.  The prologue stores five 8-byte registers (r27-r31) at
   -40(r1) .. -8(r1) before the frame is created, so they land inside these
   extra bytes; presumably 48 rather than 40 keeps the frame a multiple of
   16 - confirm. */
84 #define FRAMESIZE (FRAME_MIN_SIZE+48)
85 /* TODO: change this to .machine power8 when the minimum required binutils allows it. */
/* char *strcasestr (char *s [r3], char *pat [r4])

   Register roles that can be read off the code below:
     r27  counter compared against ITERATIONS (fallback threshold)
     r29  address returned when a match is found
     r30  copy of the pattern pointer; r4 is reloaded from it on each retry
     r31  length of the pattern (subtracted from r3 on one return path)
     r8   copy of the current position in r3
     r11  value loaded from LOCALE_CTYPE_TOLOWER for the byte-wise paths
     v0   compared against the data to detect null bytes (presumably zero)
     v1, v2, v3  TOLOWER constants: 'A' - 1, 'Z' + 1 and 32 in every byte

   r27-r31 and LR are stored before the frame is created; the frame is
   FRAMESIZE bytes.  The result is returned in r3: the match address, or 0
   when the pattern was not found. */
88 EALIGN (STRCASESTR, 4, 0)
90 mflr r0 /* Copy the return address (LR) into r0. */
91 std r31, -8(r1) /* Save caller's r31. */
92 std r30, -16(r1) /* Save caller's r30. */
93 std r29, -24(r1) /* Save caller's r29. */
94 std r28, -32(r1) /* Save caller's r28. */
95 std r27, -40(r1) /* Save caller's r27. */
96 std r0, 16(r1) /* Store the link register value at 16(r1). */
103 stdu r1, -FRAMESIZE(r1) /* Create the stack frame (old r1 is stored at the new r1). */
104 cfi_adjust_cfa_offset(FRAMESIZE)
108 cmpdi cr7, r3, 0 /* Input validation: is r3 zero? */
115 /* Load first byte from r4 and check if it is null. */
120 ld r10, __libc_tsd_LOCALE@got@tprel(r2) /* r10 = thread-pointer offset of __libc_tsd_LOCALE, read through r2. */
121 add r9, r10, __libc_tsd_LOCALE@tls /* r9 = address of this thread's __libc_tsd_LOCALE. */
123 ld r9, LOCALE_CTYPE_TOUPPER(r9) /* r9 = field at LOCALE_CTYPE_TOUPPER (the upper-case table, going by its name). */
124 sldi r10, r6, 2 /* r10 = r6 * 4: offset used to convert r6 to upper case. */
127 ld r10, __libc_tsd_LOCALE@got@tprel(r2) /* Same thread-local lookup as above. */
128 add r11, r10, __libc_tsd_LOCALE@tls /* r11 = address of this thread's __libc_tsd_LOCALE. */
130 ld r11, LOCALE_CTYPE_TOLOWER(r11) /* r11 = field at LOCALE_CTYPE_TOLOWER (the lower-case table, going by its name). */
131 sldi r10, r6, 2 /* r10 = r6 * 4: offset used to convert r6 to lower case. */
134 /* Check if the first char is present. */
147 beq cr7, L(skipcheck)
150 /* Move r3 to the first occurrence. */
158 /* Reg r27 is used to count the number of iterations. */
160 /* If first char of search str is not present. */
164 /* Find the length of pattern. */
169 cmpdi cr7, r3, 0 /* If search str is null. */
178 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
183 /* Locales not matching ASCII for single bytes. */
184 ld r10, __libc_tsd_LOCALE@got@tprel(r2) /* Same thread-local lookup as above. */
185 add r9, r10, __libc_tsd_LOCALE@tls /* r9 = address of this thread's __libc_tsd_LOCALE. */
188 addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES /* r7 += offset of the _NL_CTYPE_NONASCII_CASE entry among the values. */
191 beq cr7, L(bytebybyte)
193 /* If len(r4) < 16 handle byte by byte. */
194 /* For shorter strings we will not use vector registers. */
196 blt cr7, L(bytebybyte)
198 /* Comparison values used for TOLOWER. */
199 /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
/* Outline of the vector search loop: */
212 1. Load 16 bytes from r3 and r4
213 2. Check if there is null, If yes, proceed byte by byte path.
214 3. Else,Convert both to lowercase and compare.
215 4. If they are same proceed to 1.
216 5. If they dont match, find if first char of r4 is present in the
217 loaded 16 byte of r3.
218 6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
221 mr r8, r3 /* Save r3 for future use. */
222 mr r4, r30 /* Restore r4 (r30 keeps the start of the pattern). */
224 lvx v5, 0, r4 /* Load 16 bytes from r4. */
227 /* If r4 is unaligned, load another 16 bytes. */
228 #ifdef __LITTLE_ENDIAN__
235 #ifdef __LITTLE_ENDIAN__
242 vcmpequb. v7, v0, v4 /* Check for null: v7 marks the bytes of v4 equal to v0. */
251 #ifdef __LITTLE_ENDIAN__
257 /* If r3 is unaligned, load another 16 bytes. */
259 #ifdef __LITTLE_ENDIAN__
260 vperm v4, v10, v4, v7 /* v4 = bytes picked from v10 and v4 under control vector v7. */
262 vperm v4, v4, v10, v7 /* Same merge with the two sources swapped. */
265 vcmpequb. v6, v0, v5 /* Check for null: v6 marks the bytes of v5 equal to v0. */
277 /* Convert both vectors to lower case (TOLOWER works on v4 and its argument). */
279 /* If both are same, branch to match. */
281 /* Find if the first char is present in next 15 bytes. */
282 #ifdef __LITTLE_ENDIAN__
284 vsldoi v7, v0, v4, 15 /* v7 = v0 and v4 concatenated, shifted left by 15 bytes. */
292 /* Shift r3 by 16 bytes and proceed. */
295 #ifdef __LITTLE_ENDIAN__
301 /* Shift r3 by 8 bytes and proceed. */
307 /* There is a match of 16 bytes, check next bytes. */
315 /* Load next 16 bytes of r3 and r4 and compare. */
319 /* Handle unaligned case. */
327 #ifdef __LITTLE_ENDIAN__
333 /* If r4 is unaligned, load another 16 bytes. */
335 #ifdef __LITTLE_ENDIAN__
336 vperm v11, v9, v6, v7 /* v11 = bytes picked from v9 and v6 under control vector v7. */
338 vperm v11, v6, v9, v7 /* Same merge with the two sources swapped. */
346 vcmpequb. v7, v0, v11 /* v7 marks the bytes of v11 equal to v0 (null check). */
354 beq cr7, L(nextload1)
355 /* Handle unaligned case. */
363 #ifdef __LITTLE_ENDIAN__
369 /* If r3 is unaligned, load another 16 bytes. */
371 #ifdef __LITTLE_ENDIAN__
372 vperm v4, v10, v4, v7 /* v4 = bytes picked from v10 and v4 under control vector v7. */
374 vperm v4, v4, v10, v7 /* Same merge with the two sources swapped. */
388 /* Convert both vectors to lower case (TOLOWER works on v4 and its argument). */
390 /* If both are same, branch to secondmatch. */
391 blt cr6, L(secondmatch) /* A record-form vector compare sets CR6[LT] when every byte matched. */
392 /* Continue the search. */
397 ld r10, __libc_tsd_LOCALE@got@tprel(r2) /* Same thread-local lookup as above. */
398 add r11, r10, __libc_tsd_LOCALE@tls /* r11 = address of this thread's __libc_tsd_LOCALE. */
400 ld r11, LOCALE_CTYPE_TOLOWER(r11) /* r11 = field at LOCALE_CTYPE_TOLOWER. */
402 lbz r5, 0(r3) /* Load byte from r3. */
403 lbz r6, 0(r4) /* Load next byte from r4. */
404 cmpdi cr7, r6, 0 /* Is the r4 byte null? */
406 cmpdi cr7, r5, 0 /* Is the r3 byte null? */
407 beq cr7, L(retnull) /* If yes, return. */
409 addi r4, r4, 1 /* Increment r4. */
410 sldi r10, r5, 2 /* r10 = r5 * 4: offset used to convert r5 to lower case. */
412 sldi r7, r6, 2 /* r7 = r6 * 4: offset used to convert r6 to lower case. */
414 cmpw cr7, r7, r10 /* Compare with byte from r4. */
429 /* When our iterations exceed ITERATIONS, fall back to default. */
431 cmpdi cr7, r27, ITERATIONS /* Compare the iteration counter with the limit. */
433 mr r4, r30 /* Restore r4. */
436 /* Handling byte by byte. */
441 cmpdi cr7, r27, ITERATIONS /* Compare the iteration counter with the limit. */
445 /* Check if the first char is present. */
458 beq cr7, L(skipcheck1)
461 /* Move r3 to first occurrence. */
469 ld r10, __libc_tsd_LOCALE@got@tprel(r2) /* Same thread-local lookup as above. */
470 add r11, r10, __libc_tsd_LOCALE@tls /* r11 = address of this thread's __libc_tsd_LOCALE. */
472 ld r11, LOCALE_CTYPE_TOLOWER(r11) /* r11 = field at LOCALE_CTYPE_TOLOWER. */
473 mr r4, r30 /* Restore r4. */
474 mr r8, r3 /* Save r3. */
479 lbz r5, 0(r3) /* Load byte from r3. */
480 addi r4, r4, 1 /* Increment r4. */
481 lbz r6, 0(r4) /* Load next byte from r4. */
482 cmpdi cr7, r6, 0 /* Is the r4 byte null? */
484 cmpdi cr7, r5, 0 /* Is the r3 byte null? */
485 beq cr7, L(retnull) /* If yes, return. */
486 sldi r10, r5, 2 /* r10 = r5 * 4: offset used to convert r5 to lower case. */
488 sldi r7, r6, 2 /* r7 = r6 * 4: offset used to convert r6 to lower case. */
490 cmpw cr7, r7, r10 /* Compare with byte from r4. */
494 /* Handling return values. */
497 subf r3, r31, r3 /* r3 = r3 - r31 (r31 = len of r4). */
502 mr r3, r29 /* Return point of match. */
507 li r3, 0 /* Substring was not found. */
518 addi r1, r1, FRAMESIZE /* Restore stack pointer. */
519 cfi_adjust_cfa_offset(-FRAMESIZE)
520 ld r0, 16(r1) /* Reload the saved link register value. */
523 ld r29, -24(r1) /* Restore caller's r29. */
524 ld r30, -16(r1) /* Restore caller's r30. */
525 ld r31, -8(r1) /* Restore caller's r31. */
532 mtlr r0 /* Move the return address back into LR. */
/* Publish the routine under its public name: strcasestr is declared as a
   weak alias of __strcasestr, the name STRCASESTR is defined to earlier in
   this file.  The two hidden-def lines are expected to provide the
   library-internal symbols (cf. the __GI_ names used for the helpers).
   NOTE(review): these macros are defined outside this file - confirm
   their exact effect there. */
536 weak_alias (__strcasestr, strcasestr)
537 libc_hidden_def (__strcasestr)
538 libc_hidden_builtin_def (strcasestr)