/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 * close enough that they could have implemented the _mm_*-style intrinsic
 * interface and had a ton of optimized code available to them. Instead they
 * implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 * generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 * to implement them with inline assembly, but this is a nightmare because
 * gcc doesn't understand that the integer vector datatypes are actually in
 * floating-point registers, so you end up with braindead code like
 *
 *     punpcklwd    $f9,$f9,$f5
 *         dmtc1    v0,$f8
 *     punpcklwd    $f19,$f19,$f5
 *         dmfc1    t9,$f9
 *         dmtc1    v0,$f9
 *         dmtc1    t9,$f20
 *         dmfc1    s0,$f19
 *     punpcklbh    $f20,$f20,$f2
 *
 * where crap just gets copied back and forth between integer and floating-
 * point registers ad nauseam.
 *
 * Instead of trying to work around the problems with these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float  __m32;

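/* Because __m64 and __m32 are floating-point types, gcc keeps these values in
 * $f registers, which is where the MMI instructions below read and write their
 * operands; a __m32 access also lets the compiler issue a single 32-bit FP
 * load (lwc1) instead of a 64-bit one. */
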
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
    return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddush %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddusb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("and %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pcmpeqw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
    /* nothing to do: there is no EMMS-style MMX state to clear on Loongson */
}

/* pmaddhw: multiply 16-bit lanes and sum adjacent products into 32-bit lanes */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmaddhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* pmulhuh: high 16 bits of the unsigned 16x16-bit products */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmulhuh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* pmullh: low 16 bits of the 16x16-bit products */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmullh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("or %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* packushb: pack signed 16-bit lanes to unsigned 8-bit with saturation */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packushb %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* packsswh: pack signed 32-bit lanes to signed 16-bit with saturation */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packsswh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
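/* e.g. _MM_SHUFFLE (0, 0, 0, 0) == 0x00 selects halfword 0 for every result
 * lane (a broadcast) and _MM_SHUFFLE (1, 0, 1, 0) == 0x44 repeats the low two
 * halfwords; both patterns are used as pshufh selectors below. */
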
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
    if (__builtin_constant_p (__w3) &&
        __builtin_constant_p (__w2) &&
        __builtin_constant_p (__w1) &&
        __builtin_constant_p (__w0))
    {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
    {
        /* TODO: handle other cases */
        uint64_t val = __w3;
        uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
        );
        return ret;
    }
    uint64_t val = ((uint64_t)__w3 << 48)
                 | ((uint64_t)__w2 << 32)
                 | ((uint64_t)__w1 << 16)
                 | ((uint64_t)__w0 <<  0);
    return *(__m64 *)&val;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
    if (__builtin_constant_p (__i1) &&
        __builtin_constant_p (__i0))
    {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__i1 == __i0)
    {
        uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
           : "=f" (ret)
           : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
        );
        return ret;
    }
    uint64_t val = ((uint64_t)__i1 << 32)
                 | ((uint64_t)__i0 <<  0);
    return *(__m64 *)&val;
}

#undef _MM_SHUFFLE

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
    __m64 ret;
    asm("pshufh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__n)
    );
    return ret;
}

/* the shift counts below are passed in a floating-point register rather than
 * as immediates */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psllh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsll %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsrl %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("psubh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32 datatype
 * which allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

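/* A minimal usage sketch (hypothetical caller, not part of this header):
 * a load8888-style helper can widen a packed 32-bit a8r8g8b8 pixel into four
 * 16-bit lanes with a single 32-bit load:
 *
 *     static __m64
 *     load8888 (const uint32_t *p)
 *     {
 *         return _mm_unpacklo_pi8_f (*(const __m32 *)p, _mm_setzero_si64 ());
 *     }
 */
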
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklhw %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("xor %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* pextrh: extract the halfword of __m selected by __pos into the low 16 bits
 * of the result */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
    __m64 ret;
    asm("pextrh %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m), "f" (*(__m64 *)&__pos)
    );
    return ret;
}

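/* The "i" constraint substitutes the literal value of __pos into the mnemonic,
 * selecting one of the pinsrh_0..pinsrh_3 instructions, so __pos must be a
 * compile-time constant. */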
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
    __m64 ret;
    asm("pinsrh_%3 %0, %1, %2\n\t"
       : "=f" (ret)
       : "f" (__m1), "f" (__m2), "i" (__pos)
    );
    return ret;
}