From 73f27d5e722ece05a66c124406cc8ca4305f4cbd Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 24 Aug 2010 11:35:01 -0700 Subject: [PATCH] Clean up SSE variable shifts --- ChangeLog | 17 ++++ sysdeps/i386/i686/multiarch/Makefile | 2 +- sysdeps/i386/i686/multiarch/varshift.S | 1 + sysdeps/i386/i686/multiarch/varshift.h | 1 + sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/strcspn-c.c | 154 ++------------------------------- sysdeps/x86_64/multiarch/strspn-c.c | 152 ++------------------------------ sysdeps/x86_64/multiarch/strstr.c | 62 +------------ sysdeps/x86_64/multiarch/varshift.S | 30 +++++++ sysdeps/x86_64/multiarch/varshift.h | 27 ++++++ 10 files changed, 93 insertions(+), 355 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/varshift.S create mode 100644 sysdeps/i386/i686/multiarch/varshift.h create mode 100644 sysdeps/x86_64/multiarch/varshift.S create mode 100644 sysdeps/x86_64/multiarch/varshift.h diff --git a/ChangeLog b/ChangeLog index 1da347c7d9..f8050d7b88 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2010-08-24 Richard Henderson + Ulrich Drepper + H.J. Lu + + * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add varshift. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Likewise. + * sysdeps/x86_64/multiarch/strcspn-c.c: Include "varshift.h". + Replace _mm_srli_si128 with __m128i_shift_right. Replace + _mm_alignr_epi8 with _mm_loadu_si128. + * sysdeps/x86_64/multiarch/strspn-c.c: Likewise. + * sysdeps/x86_64/multiarch/strstr.c: Include "varshift.h". + (__m128i_shift_right): Removed. + * sysdeps/i386/i686/multiarch/varshift.h: New file. + * sysdeps/i386/i686/multiarch/varshift.S: New file. + * sysdeps/x86_64/multiarch/varshift.h: New file. + * sysdeps/x86_64/multiarch/varshift.S: New file. + 2010-08-21 Mike Frysinger * configure.in: Move assembler checks to before sysdep dir checking. 
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 12bcfc273f..26f3e58064 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -9,7 +9,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ - memcmp-ssse3 memcmp-sse4 strcasestr-nonascii + memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/varshift.S b/sysdeps/i386/i686/multiarch/varshift.S new file mode 100644 index 0000000000..41afaf721c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/varshift.S @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/varshift.h b/sysdeps/i386/i686/multiarch/varshift.h new file mode 100644 index 0000000000..7c72c70d67 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/varshift.h @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index b124524b2e..27dc56321d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -10,7 +10,7 @@ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c +sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index daeebe1bf5..04aba46237 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -20,6 +20,7 @@ #include 
#include +#include "varshift.h" /* We use 0x2: _SIDD_SBYTE_OPS @@ -86,8 +87,6 @@ STRCSPN_SSE42 (const char *s, const char *a) const char *aligned; __m128i mask; - /* Fake initialization. gcc otherwise will warn. */ - asm ("" : "=xm" (mask)); int offset = (int) ((size_t) a & 15); if (offset != 0) { @@ -95,54 +94,7 @@ STRCSPN_SSE42 (const char *s, const char *a) aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - mask = _mm_srli_si128 (mask0, 1); - break; - case 2: - mask = _mm_srli_si128 (mask0, 2); - break; - case 3: - mask = _mm_srli_si128 (mask0, 3); - break; - case 4: - mask = _mm_srli_si128 (mask0, 4); - break; - case 5: - mask = _mm_srli_si128 (mask0, 5); - break; - case 6: - mask = _mm_srli_si128 (mask0, 6); - break; - case 7: - mask = _mm_srli_si128 (mask0, 7); - break; - case 8: - mask = _mm_srli_si128 (mask0, 8); - break; - case 9: - mask = _mm_srli_si128 (mask0, 9); - break; - case 10: - mask = _mm_srli_si128 (mask0, 10); - break; - case 11: - mask = _mm_srli_si128 (mask0, 11); - break; - case 12: - mask = _mm_srli_si128 (mask0, 12); - break; - case 13: - mask = _mm_srli_si128 (mask0, 13); - break; - case 14: - mask = _mm_srli_si128 (mask0, 14); - break; - case 15: - mask = _mm_srli_si128 (mask0, 15); - break; - } + mask = __m128i_shift_right (mask0, offset); /* Find where the NULL terminator is. */ int length = _mm_cmpistri (mask, mask, 0x3a); @@ -159,55 +111,10 @@ STRCSPN_SSE42 (const char *s, const char *a) if (index != 0) { - /* Combine mask0 and mask1. 
*/ - switch (offset) - { - case 1: - mask = _mm_alignr_epi8 (mask1, mask0, 1); - break; - case 2: - mask = _mm_alignr_epi8 (mask1, mask0, 2); - break; - case 3: - mask = _mm_alignr_epi8 (mask1, mask0, 3); - break; - case 4: - mask = _mm_alignr_epi8 (mask1, mask0, 4); - break; - case 5: - mask = _mm_alignr_epi8 (mask1, mask0, 5); - break; - case 6: - mask = _mm_alignr_epi8 (mask1, mask0, 6); - break; - case 7: - mask = _mm_alignr_epi8 (mask1, mask0, 7); - break; - case 8: - mask = _mm_alignr_epi8 (mask1, mask0, 8); - break; - case 9: - mask = _mm_alignr_epi8 (mask1, mask0, 9); - break; - case 10: - mask = _mm_alignr_epi8 (mask1, mask0, 10); - break; - case 11: - mask = _mm_alignr_epi8 (mask1, mask0, 11); - break; - case 12: - mask = _mm_alignr_epi8 (mask1, mask0, 12); - break; - case 13: - mask = _mm_alignr_epi8 (mask1, mask0, 13); - break; - case 14: - mask = _mm_alignr_epi8 (mask1, mask0, 14); - break; - case 15: - mask = _mm_alignr_epi8 (mask1, mask0, 15); - break; - } + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. 
*/ + mask = _mm_loadu_si128 ((__m128i *) a); } } } @@ -234,54 +141,7 @@ STRCSPN_SSE42 (const char *s, const char *a) aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } + value = __m128i_shift_right (value, offset); int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. 
*/ diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index be9e8ac0a8..ab58549f9b 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -20,6 +20,7 @@ #include #include +#include "varshift.h" /* We use 0x12: _SIDD_SBYTE_OPS @@ -71,54 +72,7 @@ __strspn_sse42 (const char *s, const char *a) aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - mask = _mm_srli_si128 (mask0, 1); - break; - case 2: - mask = _mm_srli_si128 (mask0, 2); - break; - case 3: - mask = _mm_srli_si128 (mask0, 3); - break; - case 4: - mask = _mm_srli_si128 (mask0, 4); - break; - case 5: - mask = _mm_srli_si128 (mask0, 5); - break; - case 6: - mask = _mm_srli_si128 (mask0, 6); - break; - case 7: - mask = _mm_srli_si128 (mask0, 7); - break; - case 8: - mask = _mm_srli_si128 (mask0, 8); - break; - case 9: - mask = _mm_srli_si128 (mask0, 9); - break; - case 10: - mask = _mm_srli_si128 (mask0, 10); - break; - case 11: - mask = _mm_srli_si128 (mask0, 11); - break; - case 12: - mask = _mm_srli_si128 (mask0, 12); - break; - case 13: - mask = _mm_srli_si128 (mask0, 13); - break; - case 14: - mask = _mm_srli_si128 (mask0, 14); - break; - case 15: - mask = _mm_srli_si128 (mask0, 15); - break; - } + mask = __m128i_shift_right (mask0, offset); /* Find where the NULL terminator is. */ int length = _mm_cmpistri (mask, mask, 0x3a); @@ -135,55 +89,10 @@ __strspn_sse42 (const char *s, const char *a) if (index != 0) { - /* Combine mask0 and mask1. 
*/ - switch (offset) - { - case 1: - mask = _mm_alignr_epi8 (mask1, mask0, 1); - break; - case 2: - mask = _mm_alignr_epi8 (mask1, mask0, 2); - break; - case 3: - mask = _mm_alignr_epi8 (mask1, mask0, 3); - break; - case 4: - mask = _mm_alignr_epi8 (mask1, mask0, 4); - break; - case 5: - mask = _mm_alignr_epi8 (mask1, mask0, 5); - break; - case 6: - mask = _mm_alignr_epi8 (mask1, mask0, 6); - break; - case 7: - mask = _mm_alignr_epi8 (mask1, mask0, 7); - break; - case 8: - mask = _mm_alignr_epi8 (mask1, mask0, 8); - break; - case 9: - mask = _mm_alignr_epi8 (mask1, mask0, 9); - break; - case 10: - mask = _mm_alignr_epi8 (mask1, mask0, 10); - break; - case 11: - mask = _mm_alignr_epi8 (mask1, mask0, 11); - break; - case 12: - mask = _mm_alignr_epi8 (mask1, mask0, 12); - break; - case 13: - mask = _mm_alignr_epi8 (mask1, mask0, 13); - break; - case 14: - mask = _mm_alignr_epi8 (mask1, mask0, 14); - break; - case 15: - mask = _mm_alignr_epi8 (mask1, mask0, 15); - break; - } + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. 
*/ + mask = _mm_loadu_si128 ((__m128i *) a); } } } @@ -210,54 +119,7 @@ __strspn_sse42 (const char *s, const char *a) aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } + value = __m128i_shift_right (value, offset); int length = _mm_cmpistri (mask, value, 0x12); /* No need to check CFlag since it is always 1. */ diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 45d7a550ac..b408b752fa 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -19,6 +19,7 @@ 02111-1307 USA. */ #include +#include "varshift.h" #ifndef STRSTR_SSE42 # define STRSTR_SSE42 __strstr_sse42 @@ -82,67 +83,6 @@ 5. failed string compare, go back to scanning */ -/* Fix-up of removal of unneeded data due to 16B aligned load - parameters: - value: 16B data loaded from 16B aligned address. - offset: Offset of target data address relative to 16B aligned load - address. 
- */ - -static __inline__ __m128i -__m128i_shift_right (__m128i value, int offset) -{ - switch (offset) - { - case 1: - value = _mm_srli_si128 (value, 1); - break; - case 2: - value = _mm_srli_si128 (value, 2); - break; - case 3: - value = _mm_srli_si128 (value, 3); - break; - case 4: - value = _mm_srli_si128 (value, 4); - break; - case 5: - value = _mm_srli_si128 (value, 5); - break; - case 6: - value = _mm_srli_si128 (value, 6); - break; - case 7: - value = _mm_srli_si128 (value, 7); - break; - case 8: - value = _mm_srli_si128 (value, 8); - break; - case 9: - value = _mm_srli_si128 (value, 9); - break; - case 10: - value = _mm_srli_si128 (value, 10); - break; - case 11: - value = _mm_srli_si128 (value, 11); - break; - case 12: - value = _mm_srli_si128 (value, 12); - break; - case 13: - value = _mm_srli_si128 (value, 13); - break; - case 14: - value = _mm_srli_si128 (value, 14); - break; - case 15: - value = _mm_srli_si128 (value, 15); - break; - } - return value; -} - /* Simple replacement of movdqu to address 4KB boundary cross issue. If EOS occurs within less than 16B before 4KB boundary, we don't cross to next page. */ diff --git a/sysdeps/x86_64/multiarch/varshift.S b/sysdeps/x86_64/multiarch/varshift.S new file mode 100644 index 0000000000..b50f98bb55 --- /dev/null +++ b/sysdeps/x86_64/multiarch/varshift.S @@ -0,0 +1,30 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+
+        .section .rodata
+        .hidden ___m128i_shift_right
+        .globl ___m128i_shift_right
+        .size ___m128i_shift_right, 31
+
+___m128i_shift_right:
+        .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
new file mode 100644
index 0000000000..d679739f69
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -0,0 +1,38 @@
+/* Helper for variable shifts of SSE registers.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+
+extern char ___m128i_shift_right[31] __attribute__((visibility("hidden")));
+
+/* Shift VALUE right by OFFSET bytes, filling the vacated high bytes
+   with zero.  OFFSET must be in the range 0..15 so that the 16-byte
+   load stays inside the 31-byte ___m128i_shift_right table.
+
+   The table holds the bytes 0..15 followed by fifteen bytes of -1, so
+   the control vector loaded at OFFSET is OFFSET, OFFSET + 1, ..., 15
+   and then OFFSET bytes of -1.  pshufb copies the indexed byte of
+   VALUE for the former and stores zero for every control byte whose
+   high bit is set.  */
+static __inline__ __m128i
+__m128i_shift_right (__m128i value, unsigned long offset)
+{
+  /* The shuffle control is the 16 table bytes starting at OFFSET.  */
+  const __m128i *control = (const __m128i *) (___m128i_shift_right + offset);
+  return _mm_shuffle_epi8 (value, _mm_loadu_si128 (control));
+}
-- 
2.11.4.GIT