/* Copyright (C) 2011-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif
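
/* Typical usage (an illustrative sketch, not part of the upstream header):
   x86-style sources keep their existing intrinsic calls and include the
   wrapper header instead of the Intel one, e.g.

       #include <x86gprintrin.h>

       unsigned long long hi;
       unsigned long long lo = _mulx_u64 (a, b, &hi);

   When compiling for powerpc64le, such calls resolve to the inline
   implementations below.  */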

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
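
/* Example (illustrative, not in the upstream header): _bzhi_u32 zeroes
   the bits of __X from bit index __Y upward, so _bzhi_u32 (0x12345678, 8)
   yields 0x78.  */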

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}
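
/* Example (illustrative): _mulx_u32 returns the low half of the 64-bit
   product and stores the high half through __P, so with unsigned int hi,
   _mulx_u32 (0x80000000, 4, &hi) returns 0 and sets hi to 2.  */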

#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  It is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* The loop runs once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
  while (__m != 0)
    {
      __c = __builtin_clzl (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return __result;
}
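
/* Example (illustrative): _pdep_u64 deposits the low-order bits of __X
   into the '1' positions of __M, lowest mask bit first, so
   _pdep_u64 (0x5, 0xF0) yields 0x50.  */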

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; /* Initial bit permute control.  */
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the Power8 Bit Permute instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also, if the pext mask is constant, then the popcount is
         constant and we can evaluate the following loop at compile
         time and use a constant bit permute vector.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
        {
          __c = __builtin_clzl (__m);
          __p = (__p << 8) | __c;
          __m ^= (__mask >> __c);
        }
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
         -funroll-loops can expand to a lot of code.  The while
         loop avoids unrolling, and the compiler commons the xor
         from clearing the mask bit with the (m != 0) test.  The
         result is a more compact loop setup and body.  */
      while (__m != 0)
        {
          unsigned long __t;
          __c = __builtin_clzl (__m);
          __t = (__X & (__mask >> __c)) >> (__p - __c);
          __m ^= (__mask >> __c);
          __result |= (__t);
          __p++;
        }
    }
  return __result;
}
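
/* Example (illustrative): _pext_u64 gathers the bits of __X selected by
   the '1' positions of __M into the low-order bits of the result, so
   _pext_u64 (0x50, 0xF0) yields 0x5.  For a value v that fits in
   popcount (__M) bits, _pext_u64 (_pdep_u64 (v, __M), __M) gives back v.  */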

/* These 32-bit implementations depend on 64-bit pdep/pext,
   which depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return _pdep_u64 (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return _pext_u64 (__X, __Y);
}

#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */

#endif /* _BMI2INTRIN_H_INCLUDED */