/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *      punpcklwd   $f9,$f9,$f5
 *          dmtc1   v0,$f8
 *      punpcklwd   $f19,$f19,$f5
 *          dmfc1   t9,$f9
 *          dmtc1   v0,$f9
 *          dmtc1   t9,$f20
 *          dmfc1   s0,$f19
 *      punpcklbh   $f20,$f20,$f2
 *
 *   where crap just gets copied back and forth between integer and floating-
 *   point registers ad nauseam.
 *
 * Instead of trying to work around the problems from these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */
#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float  __m32;
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
    return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddush %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddusb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("and %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pcmpeqw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
    /* nothing to do: the vectors live in the FP registers, so there is no
     * separate MMX state to clear; provided for API parity with x86 */
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmaddhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmulhuh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmullh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("or %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packushb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packsswh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
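/* For example, _MM_SHUFFLE (0, 0, 0, 0) == 0x00 replicates halfword 0 across
 * all four result positions and _MM_SHUFFLE (1, 0, 1, 0) == 0x44 repeats the
 * low two halfwords; both values are used as pshufh immediates below. */
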
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
    if (__builtin_constant_p (__w3) &&
        __builtin_constant_p (__w2) &&
        __builtin_constant_p (__w1) &&
        __builtin_constant_p (__w0))
    {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
    {
        /* TODO: handle other cases */
        uint64_t val = __w3;
        uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
        );
        return ret;
    }
    uint64_t val = ((uint64_t)__w3 << 48)
                 | ((uint64_t)__w2 << 32)
                 | ((uint64_t)__w1 << 16)
                 | ((uint64_t)__w0 <<  0);
    return *(__m64 *)&val;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
    if (__builtin_constant_p (__i1) &&
        __builtin_constant_p (__i0))
    {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__i1 == __i0)
    {
        uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
        );
        return ret;
    }
    uint64_t val = ((uint64_t)__i1 << 32)
                 | ((uint64_t)__i0 <<  0);
    return *(__m64 *)&val;
}

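/* Illustrative usage (not part of this header): callers typically build
 * per-component constants with these setters.  With literal arguments the
 * __builtin_constant_p branch folds each call into a single 64-bit constant;
 * with equal run-time arguments a single pshufh is emitted.
 *
 *   __m64 mask_00ff = _mm_set_pi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff);
 *   __m64 alpha_pos = _mm_set_pi32 (0xff000000, 0xff000000);
 */
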
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
    __m64 ret;
    asm("pshufh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__n)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psllh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsll %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsrl %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("psubh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
 * datatype, which allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

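/* Illustrative sketch (not part of this header): a load8888-style helper,
 * as referenced in the comment above, could expand a packed 32-bit pixel to
 * 16 bits per component using a single 32-bit load through __m32:
 *
 *   static inline __m64
 *   load8888 (const uint32_t *v)
 *   {
 *       return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
 *   }
 */
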
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("xor %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
    __m64 ret;
    asm("pextrh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__pos)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
    __m64 ret;
    asm("pinsrh_%3 %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2), "i" (__pos)
    );
    return ret;
}