Eliminate some uses of T2
[qemu/malc.git] / target-arm / op_neon.h
blob232375e18fc528d76893e9eeaf4d6fd715da1ca9
1 /*
2 * ARM NEON vector operations.
4 * Copyright (c) 2007 CodeSourcery.
5 * Written by Paul Brook
7 * This code is licenced under the GPL.
8 */
9 /* Note that for NEON an "l" prefix means it is a wide operation, unlike
10 scalar arm ops where it means a word size operation. */
12 /* ??? NEON ops should probably have their own float status. */
13 #define NFS &env->vfp.fp_status
14 #define NEON_OP(name) void OPPROTO op_neon_##name (void)
16 NEON_OP(getreg_T0)
18 T0 = *(uint32_t *)((char *) env + PARAM1);
21 NEON_OP(getreg_T1)
23 T1 = *(uint32_t *)((char *) env + PARAM1);
26 NEON_OP(getreg_T2)
28 T2 = *(uint32_t *)((char *) env + PARAM1);
31 NEON_OP(setreg_T0)
33 *(uint32_t *)((char *) env + PARAM1) = T0;
36 NEON_OP(setreg_T1)
38 *(uint32_t *)((char *) env + PARAM1) = T1;
41 NEON_OP(setreg_T2)
43 *(uint32_t *)((char *) env + PARAM1) = T2;
/* NEON_TYPEn declares a struct of n equal-width elements that together
   fill one 32-bit word.  v1 is always the lowest-numbered vector
   element; on big-endian hosts the field order is reversed so that the
   struct overlays a uint32_t correctly (see NEON_PACK/NEON_UNPACK).  */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
/* Copy from a uint32_t to a vector structure type.  Going through a
   union avoids strict-aliasing problems.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the current NEON_FN to each of the 1, 2 or 4 elements.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Define a binary elementwise op: T0 = NEON_FN(T0, T1) per element.  */
#define NEON_VOP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* Unary variant: only T0 is unpacked.  NEON_DO##n still mentions
   vsrc2, so this is only usable with a NEON_FN whose expansion ignores
   its third argument (unused macro arguments vanish at expansion).  */
#define NEON_VOP1(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* Pairwise operations: the result's low half comes from adjacent pairs
   of T0's elements, the high half from pairs of T1's.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define a pairwise op using the current NEON_FN.  */
#define NEON_POP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_PDO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
173 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
174 NEON_VOP(hadd_s8, neon_s8, 4)
175 NEON_VOP(hadd_u8, neon_u8, 4)
176 NEON_VOP(hadd_s16, neon_s16, 2)
177 NEON_VOP(hadd_u16, neon_u16, 2)
178 #undef NEON_FN
180 NEON_OP(hadd_s32)
182 int32_t src1 = T0;
183 int32_t src2 = T1;
184 int32_t dest;
186 dest = (src1 >> 1) + (src2 >> 1);
187 if (src1 & src2 & 1)
188 dest++;
189 T0 = dest;
190 FORCE_RET();
193 NEON_OP(hadd_u32)
195 uint32_t src1 = T0;
196 uint32_t src2 = T1;
197 uint32_t dest;
199 dest = (src1 >> 1) + (src2 >> 1);
200 if (src1 & src2 & 1)
201 dest++;
202 T0 = dest;
203 FORCE_RET();
206 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
207 NEON_VOP(rhadd_s8, neon_s8, 4)
208 NEON_VOP(rhadd_u8, neon_u8, 4)
209 NEON_VOP(rhadd_s16, neon_s16, 2)
210 NEON_VOP(rhadd_u16, neon_u16, 2)
211 #undef NEON_FN
213 NEON_OP(rhadd_s32)
215 int32_t src1 = T0;
216 int32_t src2 = T1;
217 int32_t dest;
219 dest = (src1 >> 1) + (src2 >> 1);
220 if ((src1 | src2) & 1)
221 dest++;
222 T0 = dest;
223 FORCE_RET();
226 NEON_OP(rhadd_u32)
228 uint32_t src1 = T0;
229 uint32_t src2 = T1;
230 uint32_t dest;
232 dest = (src1 >> 1) + (src2 >> 1);
233 if ((src1 | src2) & 1)
234 dest++;
235 T0 = dest;
236 FORCE_RET();
239 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
240 NEON_VOP(hsub_s8, neon_s8, 4)
241 NEON_VOP(hsub_u8, neon_u8, 4)
242 NEON_VOP(hsub_s16, neon_s16, 2)
243 NEON_VOP(hsub_u16, neon_u16, 2)
244 #undef NEON_FN
246 NEON_OP(hsub_s32)
248 int32_t src1 = T0;
249 int32_t src2 = T1;
250 int32_t dest;
252 dest = (src1 >> 1) - (src2 >> 1);
253 if ((~src1) & src2 & 1)
254 dest--;
255 T0 = dest;
256 FORCE_RET();
259 NEON_OP(hsub_u32)
261 uint32_t src1 = T0;
262 uint32_t src2 = T1;
263 uint32_t dest;
265 dest = (src1 >> 1) - (src2 >> 1);
266 if ((~src1) & src2 & 1)
267 dest--;
268 T0 = dest;
269 FORCE_RET();
272 /* ??? bsl, bif and bit are all the same op, just with the oparands in a
273 differnet order. It's currently easier to have 3 differnt ops than
274 rearange the operands. */
276 /* Bitwise Select. */
277 NEON_OP(bsl)
279 T0 = (T0 & T2) | (T1 & ~T2);
282 /* Bitwise Insert If True. */
283 NEON_OP(bit)
285 T0 = (T0 & T1) | (T2 & ~T1);
288 /* Bitwise Insert If False. */
289 NEON_OP(bif)
291 T0 = (T2 & T1) | (T0 & ~T1);
294 #define NEON_USAT(dest, src1, src2, type) do { \
295 uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
296 if (tmp != (type)tmp) { \
297 env->QF = 1; \
298 dest = ~0; \
299 } else { \
300 dest = tmp; \
301 }} while(0)
302 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
303 NEON_VOP(qadd_u8, neon_u8, 4)
304 #undef NEON_FN
305 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
306 NEON_VOP(qadd_u16, neon_u16, 2)
307 #undef NEON_FN
308 #undef NEON_USAT
310 #define NEON_SSAT(dest, src1, src2, type) do { \
311 int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
312 if (tmp != (type)tmp) { \
313 env->QF = 1; \
314 if (src2 > 0) { \
315 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
316 } else { \
317 tmp = 1 << (sizeof(type) * 8 - 1); \
320 dest = tmp; \
321 } while(0)
322 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
323 NEON_VOP(qadd_s8, neon_s8, 4)
324 #undef NEON_FN
325 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
326 NEON_VOP(qadd_s16, neon_s16, 2)
327 #undef NEON_FN
328 #undef NEON_SSAT
330 #define NEON_USAT(dest, src1, src2, type) do { \
331 uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
332 if (tmp != (type)tmp) { \
333 env->QF = 1; \
334 dest = 0; \
335 } else { \
336 dest = tmp; \
337 }} while(0)
338 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
339 NEON_VOP(qsub_u8, neon_u8, 4)
340 #undef NEON_FN
341 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
342 NEON_VOP(qsub_u16, neon_u16, 2)
343 #undef NEON_FN
344 #undef NEON_USAT
346 #define NEON_SSAT(dest, src1, src2, type) do { \
347 int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
348 if (tmp != (type)tmp) { \
349 env->QF = 1; \
350 if (src2 < 0) { \
351 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
352 } else { \
353 tmp = 1 << (sizeof(type) * 8 - 1); \
356 dest = tmp; \
357 } while(0)
358 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
359 NEON_VOP(qsub_s8, neon_s8, 4)
360 #undef NEON_FN
361 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
362 NEON_VOP(qsub_s16, neon_s16, 2)
363 #undef NEON_FN
364 #undef NEON_SSAT
366 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
367 NEON_VOP(cgt_s8, neon_s8, 4)
368 NEON_VOP(cgt_u8, neon_u8, 4)
369 NEON_VOP(cgt_s16, neon_s16, 2)
370 NEON_VOP(cgt_u16, neon_u16, 2)
371 NEON_VOP(cgt_s32, neon_s32, 1)
372 NEON_VOP(cgt_u32, neon_u32, 1)
373 #undef NEON_FN
375 #define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
376 NEON_VOP(cge_s8, neon_s8, 4)
377 NEON_VOP(cge_u8, neon_u8, 4)
378 NEON_VOP(cge_s16, neon_s16, 2)
379 NEON_VOP(cge_u16, neon_u16, 2)
380 NEON_VOP(cge_s32, neon_s32, 1)
381 NEON_VOP(cge_u32, neon_u32, 1)
382 #undef NEON_FN
384 #define NEON_FN(dest, src1, src2) do { \
385 int8_t tmp; \
386 tmp = (int8_t)src2; \
387 if (tmp < 0) { \
388 dest = src1 >> -tmp; \
389 } else { \
390 dest = src1 << tmp; \
391 }} while (0)
392 NEON_VOP(shl_s8, neon_s8, 4)
393 NEON_VOP(shl_u8, neon_u8, 4)
394 NEON_VOP(shl_s16, neon_s16, 2)
395 NEON_VOP(shl_u16, neon_u16, 2)
396 NEON_VOP(shl_s32, neon_s32, 1)
397 NEON_VOP(shl_u32, neon_u32, 1)
398 #undef NEON_FN
400 NEON_OP(shl_u64)
402 int8_t shift = T2;
403 uint64_t val = T0 | ((uint64_t)T1 << 32);
404 if (shift < 0) {
405 val >>= -shift;
406 } else {
407 val <<= shift;
409 T0 = val;
410 T1 = val >> 32;
411 FORCE_RET();
414 NEON_OP(shl_s64)
416 int8_t shift = T2;
417 int64_t val = T0 | ((uint64_t)T1 << 32);
418 if (shift < 0) {
419 val >>= -shift;
420 } else {
421 val <<= shift;
423 T0 = val;
424 T1 = val >> 32;
425 FORCE_RET();
428 #define NEON_FN(dest, src1, src2) do { \
429 int8_t tmp; \
430 tmp = (int8_t)src1; \
431 if (tmp < 0) { \
432 dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
433 } else { \
434 dest = src2 << tmp; \
435 }} while (0)
437 NEON_VOP(rshl_s8, neon_s8, 4)
438 NEON_VOP(rshl_u8, neon_u8, 4)
439 NEON_VOP(rshl_s16, neon_s16, 2)
440 NEON_VOP(rshl_u16, neon_u16, 2)
441 NEON_VOP(rshl_s32, neon_s32, 1)
442 NEON_VOP(rshl_u32, neon_u32, 1)
443 #undef NEON_FN
445 NEON_OP(rshl_u64)
447 int8_t shift = T2;
448 uint64_t val = T0 | ((uint64_t)T1 << 32);
449 if (shift < 0) {
450 val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
451 val >>= -shift;
452 } else {
453 val <<= shift;
455 T0 = val;
456 T1 = val >> 32;
457 FORCE_RET();
460 NEON_OP(rshl_s64)
462 int8_t shift = T2;
463 int64_t val = T0 | ((uint64_t)T1 << 32);
464 if (shift < 0) {
465 val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
466 } else {
467 val <<= shift;
469 T0 = val;
470 T1 = val >> 32;
471 FORCE_RET();
474 #define NEON_FN(dest, src1, src2) do { \
475 int8_t tmp; \
476 tmp = (int8_t)src1; \
477 if (tmp < 0) { \
478 dest = src2 >> -tmp; \
479 } else { \
480 dest = src2 << tmp; \
481 if ((dest >> tmp) != src2) { \
482 env->QF = 1; \
483 dest = ~0; \
485 }} while (0)
486 NEON_VOP(qshl_s8, neon_s8, 4)
487 NEON_VOP(qshl_s16, neon_s16, 2)
488 NEON_VOP(qshl_s32, neon_s32, 1)
489 #undef NEON_FN
491 NEON_OP(qshl_s64)
493 int8_t shift = T2;
494 int64_t val = T0 | ((uint64_t)T1 << 32);
495 if (shift < 0) {
496 val >>= -shift;
497 } else {
498 int64_t tmp = val;
499 val <<= shift;
500 if ((val >> shift) != tmp) {
501 env->QF = 1;
502 val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
505 T0 = val;
506 T1 = val >> 32;
507 FORCE_RET();
510 #define NEON_FN(dest, src1, src2) do { \
511 int8_t tmp; \
512 tmp = (int8_t)src1; \
513 if (tmp < 0) { \
514 dest = src2 >> -tmp; \
515 } else { \
516 dest = src2 << tmp; \
517 if ((dest >> tmp) != src2) { \
518 env->QF = 1; \
519 dest = src2 >> 31; \
521 }} while (0)
522 NEON_VOP(qshl_u8, neon_u8, 4)
523 NEON_VOP(qshl_u16, neon_u16, 2)
524 NEON_VOP(qshl_u32, neon_u32, 1)
525 #undef NEON_FN
527 NEON_OP(qshl_u64)
529 int8_t shift = T2;
530 uint64_t val = T0 | ((uint64_t)T1 << 32);
531 if (shift < 0) {
532 val >>= -shift;
533 } else {
534 uint64_t tmp = val;
535 val <<= shift;
536 if ((val >> shift) != tmp) {
537 env->QF = 1;
538 val = ~(uint64_t)0;
541 T0 = val;
542 T1 = val >> 32;
543 FORCE_RET();
546 #define NEON_FN(dest, src1, src2) do { \
547 int8_t tmp; \
548 tmp = (int8_t)src1; \
549 if (tmp < 0) { \
550 dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
551 } else { \
552 dest = src2 << tmp; \
553 if ((dest >> tmp) != src2) { \
554 dest = ~0; \
556 }} while (0)
557 NEON_VOP(qrshl_s8, neon_s8, 4)
558 NEON_VOP(qrshl_s16, neon_s16, 2)
559 NEON_VOP(qrshl_s32, neon_s32, 1)
560 #undef NEON_FN
562 #define NEON_FN(dest, src1, src2) do { \
563 int8_t tmp; \
564 tmp = (int8_t)src1; \
565 if (tmp < 0) { \
566 dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
567 } else { \
568 dest = src2 << tmp; \
569 if ((dest >> tmp) != src2) { \
570 env->QF = 1; \
571 dest = src2 >> 31; \
573 }} while (0)
574 NEON_VOP(qrshl_u8, neon_u8, 4)
575 NEON_VOP(qrshl_u16, neon_u16, 2)
576 NEON_VOP(qrshl_u32, neon_u32, 1)
577 #undef NEON_FN
579 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
580 NEON_VOP(max_s8, neon_s8, 4)
581 NEON_VOP(max_u8, neon_u8, 4)
582 NEON_VOP(max_s16, neon_s16, 2)
583 NEON_VOP(max_u16, neon_u16, 2)
584 NEON_VOP(max_s32, neon_s32, 1)
585 NEON_VOP(max_u32, neon_u32, 1)
586 NEON_POP(pmax_s8, neon_s8, 4)
587 NEON_POP(pmax_u8, neon_u8, 4)
588 NEON_POP(pmax_s16, neon_s16, 2)
589 NEON_POP(pmax_u16, neon_u16, 2)
590 #undef NEON_FN
592 NEON_OP(max_f32)
594 float32 f0 = vfp_itos(T0);
595 float32 f1 = vfp_itos(T1);
596 T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
597 FORCE_RET();
600 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
601 NEON_VOP(min_s8, neon_s8, 4)
602 NEON_VOP(min_u8, neon_u8, 4)
603 NEON_VOP(min_s16, neon_s16, 2)
604 NEON_VOP(min_u16, neon_u16, 2)
605 NEON_VOP(min_s32, neon_s32, 1)
606 NEON_VOP(min_u32, neon_u32, 1)
607 NEON_POP(pmin_s8, neon_s8, 4)
608 NEON_POP(pmin_u8, neon_u8, 4)
609 NEON_POP(pmin_s16, neon_s16, 2)
610 NEON_POP(pmin_u16, neon_u16, 2)
611 #undef NEON_FN
613 NEON_OP(min_f32)
615 float32 f0 = vfp_itos(T0);
616 float32 f1 = vfp_itos(T1);
617 T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
618 FORCE_RET();
621 #define NEON_FN(dest, src1, src2) \
622 dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
623 NEON_VOP(abd_s8, neon_s8, 4)
624 NEON_VOP(abd_u8, neon_u8, 4)
625 NEON_VOP(abd_s16, neon_s16, 2)
626 NEON_VOP(abd_u16, neon_u16, 2)
627 NEON_VOP(abd_s32, neon_s32, 1)
628 NEON_VOP(abd_u32, neon_u32, 1)
629 #undef NEON_FN
631 NEON_OP(abd_f32)
633 float32 f0 = vfp_itos(T0);
634 float32 f1 = vfp_itos(T1);
635 T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
636 ? float32_sub(f0, f1, NFS)
637 : float32_sub(f1, f0, NFS));
638 FORCE_RET();
641 #define NEON_FN(dest, src1, src2) dest = src1 + src2
642 NEON_VOP(add_u8, neon_u8, 4)
643 NEON_VOP(add_u16, neon_u16, 2)
644 NEON_POP(padd_u8, neon_u8, 4)
645 NEON_POP(padd_u16, neon_u16, 2)
646 #undef NEON_FN
648 NEON_OP(add_f32)
650 T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
651 FORCE_RET();
654 #define NEON_FN(dest, src1, src2) dest = src1 - src2
655 NEON_VOP(sub_u8, neon_u8, 4)
656 NEON_VOP(sub_u16, neon_u16, 2)
657 #undef NEON_FN
659 NEON_OP(sub_f32)
661 T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
662 FORCE_RET();
665 #define NEON_FN(dest, src1, src2) dest = src2 - src1
666 NEON_VOP(rsb_u8, neon_u8, 4)
667 NEON_VOP(rsb_u16, neon_u16, 2)
668 #undef NEON_FN
670 NEON_OP(rsb_f32)
672 T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
673 FORCE_RET();
676 #define NEON_FN(dest, src1, src2) dest = src1 * src2
677 NEON_VOP(mul_u8, neon_u8, 4)
678 NEON_VOP(mul_u16, neon_u16, 2)
679 #undef NEON_FN
681 NEON_OP(mul_f32)
683 T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
684 FORCE_RET();
687 NEON_OP(mul_p8)
689 T0 = helper_neon_mul_p8(T0, T1);
692 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
693 NEON_VOP(tst_u8, neon_u8, 4)
694 NEON_VOP(tst_u16, neon_u16, 2)
695 NEON_VOP(tst_u32, neon_u32, 1)
696 #undef NEON_FN
698 #define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
699 NEON_VOP(ceq_u8, neon_u8, 4)
700 NEON_VOP(ceq_u16, neon_u16, 2)
701 NEON_VOP(ceq_u32, neon_u32, 1)
702 #undef NEON_FN
704 #define NEON_QDMULH16(dest, src1, src2, round) do { \
705 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
706 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
707 env->QF = 1; \
708 tmp = (tmp >> 31) ^ ~SIGNBIT; \
710 tmp <<= 1; \
711 if (round) { \
712 int32_t old = tmp; \
713 tmp += 1 << 15; \
714 if ((int32_t)tmp < old) { \
715 env->QF = 1; \
716 tmp = SIGNBIT - 1; \
719 dest = tmp >> 16; \
720 } while(0)
721 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
722 NEON_VOP(qdmulh_s16, neon_s16, 2)
723 #undef NEON_FN
724 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
725 NEON_VOP(qrdmulh_s16, neon_s16, 2)
726 #undef NEON_FN
727 #undef NEON_QDMULH16
729 #define SIGNBIT64 ((uint64_t)1 << 63)
730 #define NEON_QDMULH32(dest, src1, src2, round) do { \
731 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
732 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
733 env->QF = 1; \
734 tmp = (tmp >> 63) ^ ~SIGNBIT64; \
735 } else { \
736 tmp <<= 1; \
738 if (round) { \
739 int64_t old = tmp; \
740 tmp += (int64_t)1 << 31; \
741 if ((int64_t)tmp < old) { \
742 env->QF = 1; \
743 tmp = SIGNBIT64 - 1; \
746 dest = tmp >> 32; \
747 } while(0)
748 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
749 NEON_VOP(qdmulh_s32, neon_s32, 1)
750 #undef NEON_FN
751 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
752 NEON_VOP(qrdmulh_s32, neon_s32, 1)
753 #undef NEON_FN
754 #undef NEON_QDMULH32
756 NEON_OP(recps_f32)
758 T0 = vfp_stoi(helper_recps_f32(vfp_itos(T0), vfp_itos(T1)));
759 FORCE_RET();
762 NEON_OP(rsqrts_f32)
764 T0 = vfp_stoi(helper_rsqrts_f32(vfp_itos(T0), vfp_itos(T1)));
765 FORCE_RET();
768 /* Floating point comparisons produce an integer result. */
769 #define NEON_VOP_FCMP(name, cmp) \
770 NEON_OP(name) \
772 if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
773 T0 = -1; \
774 else \
775 T0 = 0; \
776 FORCE_RET(); \
779 NEON_VOP_FCMP(ceq_f32, ==)
780 NEON_VOP_FCMP(cge_f32, >=)
781 NEON_VOP_FCMP(cgt_f32, >)
783 NEON_OP(acge_f32)
785 float32 f0 = float32_abs(vfp_itos(T0));
786 float32 f1 = float32_abs(vfp_itos(T1));
787 T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
788 FORCE_RET();
791 NEON_OP(acgt_f32)
793 float32 f0 = float32_abs(vfp_itos(T0));
794 float32 f1 = float32_abs(vfp_itos(T1));
795 T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
796 FORCE_RET();
799 /* Narrowing instructions. The named type is the destination type. */
800 NEON_OP(narrow_u8)
802 T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
803 | ((T1 << 16) & 0xff0000) | (T1 << 24);
804 FORCE_RET();
807 NEON_OP(narrow_sat_u8)
809 neon_u16 src;
810 neon_u8 dest;
811 #define SAT8(d, s) \
812 if (s > 0xff) { \
813 d = 0xff; \
814 env->QF = 1; \
815 } else { \
816 d = s; \
819 NEON_UNPACK(neon_u16, src, T0);
820 SAT8(dest.v1, src.v1);
821 SAT8(dest.v2, src.v2);
822 NEON_UNPACK(neon_u16, src, T1);
823 SAT8(dest.v3, src.v1);
824 SAT8(dest.v4, src.v2);
825 NEON_PACK(neon_u8, T0, dest);
826 FORCE_RET();
827 #undef SAT8
830 NEON_OP(narrow_sat_s8)
832 neon_s16 src;
833 neon_s8 dest;
834 #define SAT8(d, s) \
835 if (s != (uint8_t)s) { \
836 d = (s >> 15) ^ 0x7f; \
837 env->QF = 1; \
838 } else { \
839 d = s; \
842 NEON_UNPACK(neon_s16, src, T0);
843 SAT8(dest.v1, src.v1);
844 SAT8(dest.v2, src.v2);
845 NEON_UNPACK(neon_s16, src, T1);
846 SAT8(dest.v3, src.v1);
847 SAT8(dest.v4, src.v2);
848 NEON_PACK(neon_s8, T0, dest);
849 FORCE_RET();
850 #undef SAT8
853 NEON_OP(narrow_u16)
855 T0 = (T0 & 0xffff) | (T1 << 16);
858 NEON_OP(narrow_sat_u16)
860 if (T0 > 0xffff) {
861 T0 = 0xffff;
862 env->QF = 1;
864 if (T1 > 0xffff) {
865 T1 = 0xffff;
866 env->QF = 1;
868 T0 |= T1 << 16;
869 FORCE_RET();
872 NEON_OP(narrow_sat_s16)
874 if ((int32_t)T0 != (int16_t)T0) {
875 T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
876 env->QF = 1;
878 if ((int32_t)T1 != (int16_t) T1) {
879 T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
880 env->QF = 1;
882 T0 = (uint16_t)T0 | (T1 << 16);
883 FORCE_RET();
886 NEON_OP(narrow_sat_u32)
888 if (T1) {
889 T0 = 0xffffffffu;
890 env->QF = 1;
892 FORCE_RET();
895 NEON_OP(narrow_sat_s32)
897 int32_t sign = (int32_t)T1 >> 31;
899 if ((int32_t)T1 != sign) {
900 T0 = sign ^ 0x7fffffff;
901 env->QF = 1;
903 FORCE_RET();
906 /* Narrowing instructions. Named type is the narrow type. */
907 NEON_OP(narrow_high_u8)
909 T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
910 | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
911 FORCE_RET();
914 NEON_OP(narrow_high_u16)
916 T0 = (T0 >> 16) | (T1 & 0xffff0000);
917 FORCE_RET();
920 NEON_OP(narrow_high_round_u8)
922 T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
923 | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
924 FORCE_RET();
927 NEON_OP(narrow_high_round_u16)
929 T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
930 FORCE_RET();
933 NEON_OP(narrow_high_round_u32)
935 if (T0 >= 0x80000000u)
936 T0 = T1 + 1;
937 else
938 T0 = T1;
939 FORCE_RET();
942 /* Widening instructions. Named type is source type. */
943 NEON_OP(widen_s8)
945 uint32_t src;
947 src = T0;
948 T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
949 T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
952 NEON_OP(widen_u8)
954 T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
955 T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
958 NEON_OP(widen_s16)
960 int32_t src;
962 src = T0;
963 T0 = (int16_t)src;
964 T1 = src >> 16;
967 NEON_OP(widen_u16)
969 T1 = T0 >> 16;
970 T0 &= 0xffff;
973 NEON_OP(widen_s32)
975 T1 = (int32_t)T0 >> 31;
976 FORCE_RET();
979 NEON_OP(widen_high_u8)
981 T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
982 T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
985 NEON_OP(widen_high_u16)
987 T1 = T0 & 0xffff0000;
988 T0 <<= 16;
991 /* Long operations. The type is the wide type. */
992 NEON_OP(shll_u16)
994 int shift = PARAM1;
995 uint32_t mask;
997 mask = 0xffff >> (16 - shift);
998 mask |= mask << 16;
999 mask = ~mask;
1001 T0 = (T0 << shift) & mask;
1002 T1 = (T1 << shift) & mask;
1003 FORCE_RET();
1006 NEON_OP(shll_u64)
1008 int shift = PARAM1;
1010 T1 <<= shift;
1011 T1 |= T0 >> (32 - shift);
1012 T0 <<= shift;
1013 FORCE_RET();
1016 NEON_OP(addl_u16)
1018 uint32_t tmp;
1019 uint32_t high;
1021 tmp = env->vfp.scratch[0];
1022 high = (T0 >> 16) + (tmp >> 16);
1023 T0 = (uint16_t)(T0 + tmp);
1024 T0 |= (high << 16);
1025 tmp = env->vfp.scratch[1];
1026 high = (T1 >> 16) + (tmp >> 16);
1027 T1 = (uint16_t)(T1 + tmp);
1028 T1 |= (high << 16);
1029 FORCE_RET();
1032 NEON_OP(addl_u32)
1034 T0 += env->vfp.scratch[0];
1035 T1 += env->vfp.scratch[1];
1036 FORCE_RET();
1039 NEON_OP(addl_u64)
1041 uint64_t tmp;
1042 tmp = T0 | ((uint64_t)T1 << 32);
1043 tmp += env->vfp.scratch[0];
1044 tmp += (uint64_t)env->vfp.scratch[1] << 32;
1045 T0 = tmp;
1046 T1 = tmp >> 32;
1047 FORCE_RET();
1050 NEON_OP(subl_u16)
1052 uint32_t tmp;
1053 uint32_t high;
1055 tmp = env->vfp.scratch[0];
1056 high = (T0 >> 16) - (tmp >> 16);
1057 T0 = (uint16_t)(T0 - tmp);
1058 T0 |= (high << 16);
1059 tmp = env->vfp.scratch[1];
1060 high = (T1 >> 16) - (tmp >> 16);
1061 T1 = (uint16_t)(T1 - tmp);
1062 T1 |= (high << 16);
1063 FORCE_RET();
1066 NEON_OP(subl_u32)
1068 T0 -= env->vfp.scratch[0];
1069 T1 -= env->vfp.scratch[1];
1070 FORCE_RET();
1073 NEON_OP(subl_u64)
1075 uint64_t tmp;
1076 tmp = T0 | ((uint64_t)T1 << 32);
1077 tmp -= env->vfp.scratch[0];
1078 tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1079 T0 = tmp;
1080 T1 = tmp >> 32;
1081 FORCE_RET();
1084 #define DO_ABD(dest, x, y, type) do { \
1085 type tmp_x = x; \
1086 type tmp_y = y; \
1087 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
1088 } while(0)
1090 NEON_OP(abdl_u16)
1092 uint32_t tmp;
1093 uint32_t low;
1094 uint32_t high;
1096 DO_ABD(low, T0, T1, uint8_t);
1097 DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1098 low |= tmp << 16;
1099 DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1100 DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1101 high |= tmp << 16;
1102 T0 = low;
1103 T1 = high;
1104 FORCE_RET();
1107 NEON_OP(abdl_s16)
1109 uint32_t tmp;
1110 uint32_t low;
1111 uint32_t high;
1113 DO_ABD(low, T0, T1, int8_t);
1114 DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1115 low |= tmp << 16;
1116 DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1117 DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1118 high |= tmp << 16;
1119 T0 = low;
1120 T1 = high;
1121 FORCE_RET();
1124 NEON_OP(abdl_u32)
1126 uint32_t low;
1127 uint32_t high;
1129 DO_ABD(low, T0, T1, uint16_t);
1130 DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1131 T0 = low;
1132 T1 = high;
1133 FORCE_RET();
1136 NEON_OP(abdl_s32)
1138 uint32_t low;
1139 uint32_t high;
1141 DO_ABD(low, T0, T1, int16_t);
1142 DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1143 T0 = low;
1144 T1 = high;
1145 FORCE_RET();
1148 NEON_OP(abdl_u64)
1150 DO_ABD(T0, T0, T1, uint32_t);
1151 T1 = 0;
1154 NEON_OP(abdl_s64)
1156 DO_ABD(T0, T0, T1, int32_t);
1157 T1 = 0;
1159 #undef DO_ABD
1161 /* Widening multiple. Named type is the source type. */
1162 #define DO_MULL(dest, x, y, type1, type2) do { \
1163 type1 tmp_x = x; \
1164 type1 tmp_y = y; \
1165 dest = (type2)((type2)tmp_x * (type2)tmp_y); \
1166 } while(0)
1168 NEON_OP(mull_u8)
1170 uint32_t tmp;
1171 uint32_t low;
1172 uint32_t high;
1174 DO_MULL(low, T0, T1, uint8_t, uint16_t);
1175 DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1176 low |= tmp << 16;
1177 DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1178 DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1179 high |= tmp << 16;
1180 T0 = low;
1181 T1 = high;
1182 FORCE_RET();
1185 NEON_OP(mull_s8)
1187 uint32_t tmp;
1188 uint32_t low;
1189 uint32_t high;
1191 DO_MULL(low, T0, T1, int8_t, uint16_t);
1192 DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1193 low |= tmp << 16;
1194 DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1195 DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1196 high |= tmp << 16;
1197 T0 = low;
1198 T1 = high;
1199 FORCE_RET();
1202 NEON_OP(mull_u16)
1204 uint32_t low;
1205 uint32_t high;
1207 DO_MULL(low, T0, T1, uint16_t, uint32_t);
1208 DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1209 T0 = low;
1210 T1 = high;
1211 FORCE_RET();
1214 NEON_OP(mull_s16)
1216 uint32_t low;
1217 uint32_t high;
1219 DO_MULL(low, T0, T1, int16_t, uint32_t);
1220 DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1221 T0 = low;
1222 T1 = high;
1223 FORCE_RET();
1226 NEON_OP(addl_saturate_s32)
1228 uint32_t tmp;
1229 uint32_t res;
1231 tmp = env->vfp.scratch[0];
1232 res = T0 + tmp;
1233 if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1234 env->QF = 1;
1235 T0 = (T0 >> 31) ^ 0x7fffffff;
1236 } else {
1237 T0 = res;
1239 tmp = env->vfp.scratch[1];
1240 res = T1 + tmp;
1241 if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1242 env->QF = 1;
1243 T1 = (T1 >> 31) ^ 0x7fffffff;
1244 } else {
1245 T1 = res;
1247 FORCE_RET();
1250 NEON_OP(addl_saturate_s64)
1252 uint64_t src1;
1253 uint64_t src2;
1254 uint64_t res;
1256 src1 = T0 + ((uint64_t)T1 << 32);
1257 src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1258 res = src1 + src2;
1259 if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1260 env->QF = 1;
1261 T0 = ~(int64_t)src1 >> 63;
1262 T1 = T0 ^ 0x80000000;
1263 } else {
1264 T0 = res;
1265 T1 = res >> 32;
1267 FORCE_RET();
1270 NEON_OP(addl_saturate_u64)
1272 uint64_t src1;
1273 uint64_t src2;
1274 uint64_t res;
1276 src1 = T0 + ((uint64_t)T1 << 32);
1277 src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1278 res = src1 + src2;
1279 if (res < src1) {
1280 env->QF = 1;
1281 T0 = 0xffffffff;
1282 T1 = 0xffffffff;
1283 } else {
1284 T0 = res;
1285 T1 = res >> 32;
1287 FORCE_RET();
1290 NEON_OP(subl_saturate_s64)
1292 uint64_t src1;
1293 uint64_t src2;
1294 uint64_t res;
1296 src1 = T0 + ((uint64_t)T1 << 32);
1297 src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1298 res = src1 - src2;
1299 if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1300 env->QF = 1;
1301 T0 = ~(int64_t)src1 >> 63;
1302 T1 = T0 ^ 0x80000000;
1303 } else {
1304 T0 = res;
1305 T1 = res >> 32;
1307 FORCE_RET();
1310 NEON_OP(subl_saturate_u64)
1312 uint64_t src1;
1313 uint64_t src2;
1314 uint64_t res;
1316 src1 = T0 + ((uint64_t)T1 << 32);
1317 src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1318 if (src1 < src2) {
1319 env->QF = 1;
1320 T0 = 0;
1321 T1 = 0;
1322 } else {
1323 res = src1 - src2;
1324 T0 = res;
1325 T1 = res >> 32;
1327 FORCE_RET();
1330 NEON_OP(negl_u16)
1332 uint32_t tmp;
1333 tmp = T0 >> 16;
1334 tmp = -tmp;
1335 T0 = (-T0 & 0xffff) | (tmp << 16);
1336 tmp = T1 >> 16;
1337 tmp = -tmp;
1338 T1 = (-T1 & 0xffff) | (tmp << 16);
1339 FORCE_RET();
1342 NEON_OP(negl_u32)
1344 T0 = -T0;
1345 T1 = -T1;
1346 FORCE_RET();
1349 NEON_OP(negl_u64)
1351 uint64_t val;
1353 val = T0 | ((uint64_t)T1 << 32);
1354 val = -val;
1355 T0 = val;
1356 T1 = val >> 32;
1357 FORCE_RET();
1360 /* Scalar operations. */
1361 NEON_OP(dup_low16)
1363 T0 = (T0 & 0xffff) | (T0 << 16);
1364 FORCE_RET();
1367 NEON_OP(dup_high16)
1369 T0 = (T0 >> 16) | (T0 & 0xffff0000);
1370 FORCE_RET();
1373 /* Helper for VEXT */
1374 NEON_OP(extract)
1376 int shift = PARAM1;
1377 T0 = (T0 >> shift) | (T1 << (32 - shift));
1378 FORCE_RET();
1381 /* Pairwise add long. Named type is source type. */
1382 NEON_OP(paddl_s8)
1384 int8_t src1;
1385 int8_t src2;
1386 uint16_t result;
1387 src1 = T0 >> 24;
1388 src2 = T0 >> 16;
1389 result = (uint16_t)src1 + src2;
1390 src1 = T0 >> 8;
1391 src2 = T0;
1392 T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1393 FORCE_RET();
1396 NEON_OP(paddl_u8)
1398 uint8_t src1;
1399 uint8_t src2;
1400 uint16_t result;
1401 src1 = T0 >> 24;
1402 src2 = T0 >> 16;
1403 result = (uint16_t)src1 + src2;
1404 src1 = T0 >> 8;
1405 src2 = T0;
1406 T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1407 FORCE_RET();
1410 NEON_OP(paddl_s16)
1412 T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1413 FORCE_RET();
1416 NEON_OP(paddl_u16)
1418 T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1419 FORCE_RET();
1422 NEON_OP(paddl_s32)
1424 int64_t tmp;
1425 tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1426 T0 = tmp;
1427 T1 = tmp >> 32;
1428 FORCE_RET();
1431 NEON_OP(paddl_u32)
1433 uint64_t tmp;
1434 tmp = (uint64_t)T0 + (uint64_t)T1;
1435 T0 = tmp;
1436 T1 = tmp >> 32;
1437 FORCE_RET();
1440 /* Count Leading Sign/Zero Bits. */
1441 static inline int do_clz8(uint8_t x)
1443 int n;
1444 for (n = 8; x; n--)
1445 x >>= 1;
1446 return n;
1449 static inline int do_clz16(uint16_t x)
1451 int n;
1452 for (n = 16; x; n--)
1453 x >>= 1;
1454 return n;
1457 NEON_OP(clz_u8)
1459 uint32_t result;
1460 uint32_t tmp;
1462 tmp = T0;
1463 result = do_clz8(tmp);
1464 result |= do_clz8(tmp >> 8) << 8;
1465 result |= do_clz8(tmp >> 16) << 16;
1466 result |= do_clz8(tmp >> 24) << 24;
1467 T0 = result;
1468 FORCE_RET();
1471 NEON_OP(clz_u16)
1473 uint32_t result;
1474 uint32_t tmp;
1475 tmp = T0;
1476 result = do_clz16(tmp);
1477 result |= do_clz16(tmp >> 16) << 16;
1478 T0 = result;
1479 FORCE_RET();
1482 NEON_OP(cls_s8)
1484 uint32_t result;
1485 int8_t tmp;
1486 tmp = T0;
1487 result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1488 tmp = T0 >> 8;
1489 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1490 tmp = T0 >> 16;
1491 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1492 tmp = T0 >> 24;
1493 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1494 T0 = result;
1495 FORCE_RET();
1498 NEON_OP(cls_s16)
1500 uint32_t result;
1501 int16_t tmp;
1502 tmp = T0;
1503 result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1504 tmp = T0 >> 16;
1505 result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1506 T0 = result;
1507 FORCE_RET();
/* Count leading sign bits of the single 32-bit value in T0.  */
NEON_OP(cls_s32)
    int count;
    /* Complement negative values so both cases count leading zeros.  */
    if ((int32_t)T0 < 0)
        T0 = ~T0;
    for (count = 32; T0 > 0; count--)
        T0 = T0 >> 1;
    /* Minus one: the sign bit itself is not counted.  */
    T0 = count - 1;
    FORCE_RET();
1521 /* Bit count. */
/* Population count within each u8 lane of T0 (SWAR: sum 1-bit pairs,
   then nibbles, then bytes; no carry ever crosses a byte boundary).  */
NEON_OP(cnt_u8)
    T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);
    T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);
    T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);
    FORCE_RET();
/* Saturating absolute value and negation. */
1531 /* ??? Make these use NEON_VOP1 */
/* Saturating absolute value of one s8 lane: abs(-128) saturates to 127
   and sets the sticky saturation flag QF.  */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        env->QF = 1; \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
NEON_OP(qabs_s8)
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, T0);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, T0, vec);
    FORCE_RET();
#undef DO_QABS8
/* Saturating negation of one s8 lane: -(-128) saturates to 127 and
   sets the sticky saturation flag QF.  */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        env->QF = 1; \
    } else { \
        x = -x; \
    }} while (0)
NEON_OP(qneg_s8)
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, T0);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, T0, vec);
    FORCE_RET();
#undef DO_QNEG8
/* Saturating absolute value of one s16 lane: abs(-32768) saturates to
   32767 and sets the sticky saturation flag QF.  */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        env->QF = 1; \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
NEON_OP(qabs_s16)
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, T0);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, T0, vec);
    FORCE_RET();
#undef DO_QABS16
/* Saturating negation of one s16 lane: -(-32768) saturates to 32767
   and sets the sticky saturation flag QF.  */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        env->QF = 1; \
    } else { \
        x = -x; \
    }} while (0)
NEON_OP(qneg_s16)
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, T0);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, T0, vec);
    FORCE_RET();
#undef DO_QNEG16
/* Saturating absolute value of the s32 in T0: abs(INT32_MIN) saturates
   to INT32_MAX and sets the sticky saturation flag QF.  */
NEON_OP(qabs_s32)
    if (T0 == 0x80000000) {
        T0 = 0x7fffffff;
        env->QF = 1;
    } else if ((int32_t)T0 < 0) {
        T0 = -T0;
    FORCE_RET();
/* Saturating negation of the s32 in T0: -(INT32_MIN) saturates to
   INT32_MAX and sets the sticky saturation flag QF.  */
NEON_OP(qneg_s32)
    if (T0 == 0x80000000) {
        T0 = 0x7fffffff;
        env->QF = 1;
    } else {
        T0 = -T0;
    FORCE_RET();
/* Unary operations. */
/* Plain (non-saturating) per-lane absolute value; unlike the qabs ops,
   abs of the minimum negative value wraps rather than saturating.  */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
NEON_OP(abs_s32)
    if ((int32_t)T0 < 0)
        T0 = -T0;
    FORCE_RET();
#undef NEON_FN
1642 /* Transpose. Argument order is rather strange to avoid special casing
the translation code.
1644 On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
/* 8-bit transpose step: swap the odd bytes of rd with the even bytes
   of rm.  Input T0 = rm, T1 = rd; output T0 = rd, T1 = rm.  */
NEON_OP(trn_u8)
    uint32_t rd;
    uint32_t rm;
    rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
    rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
    T0 = rd;
    T1 = rm;
    FORCE_RET();
/* 16-bit transpose step: swap the high half of rd with the low half of
   rm.  Input T0 = rm, T1 = rd; output T0 = rd, T1 = rm.  */
NEON_OP(trn_u16)
    uint32_t rd;
    uint32_t rm;
    rd = (T0 << 16) | (T1 & 0xffff);
    rm = (T1 >> 16) | (T0 & 0xffff0000);
    T0 = rd;
    T1 = rm;
    FORCE_RET();
1667 /* Worker routines for zip and unzip. */
/* 8-bit unzip step: de-interleave the bytes of the T1:T0 pair; the
   even-numbered elements are gathered into T0, the odd ones into T1.  */
NEON_OP(unzip_u8)
    uint32_t rd;
    uint32_t rm;
    rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
        | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
    rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
        | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
    T0 = rd;
    T1 = rm;
    FORCE_RET();
/* 8-bit zip step: interleave the bytes of T0 and T1 pairwise; the low
   halves end up in T0, the high halves in T1.  */
NEON_OP(zip_u8)
    uint32_t rd;
    uint32_t rm;
    rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
        | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
    rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
        | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
    T0 = rd;
    T1 = rm;
    FORCE_RET();
/* 16-bit zip step: interleave the u16 halves of T0 and T1.  */
NEON_OP(zip_u16)
    uint32_t tmp;
    tmp = (T0 & 0xffff) | (T1 << 16);
    T1 = (T1 & 0xffff0000) | (T0 >> 16);
    T0 = tmp;
    FORCE_RET();
1704 /* Reciprocal/root estimate. */
/* Reciprocal and reciprocal square root estimates; the actual
   computation lives in C helpers.  */
NEON_OP(recpe_u32)
    T0 = helper_recpe_u32(T0);
NEON_OP(rsqrte_u32)
    T0 = helper_rsqrte_u32(T0);
NEON_OP(recpe_f32)
    FT0s = helper_recpe_f32(FT0s);
NEON_OP(rsqrte_f32)
    FT0s = helper_rsqrte_f32(FT0s);
/* Table lookup. This accesses the register file directly. */
/* Table lookup (VTBL/VTBX); PARAM1/PARAM2 are passed through to the
   helper, which reads the register file directly.  */
NEON_OP(tbl)
    helper_neon_tbl(PARAM1, PARAM2);
1731 NEON_OP(dup_u8)
1733 T0 = (T0 >> PARAM1) & 0xff;
1734 T0 |= T0 << 8;
1735 T0 |= T0 << 16;
1736 FORCE_RET();
1739 /* Helpers for element load/store. */
/* Element store helper: merge T0 into T2.  PARAM1 is the bit offset of
   the element; PARAM2 is the mask of T2 bits to preserve.  */
NEON_OP(insert_elt)
    int shift = PARAM1;
    uint32_t mask = PARAM2;
    T2 = (T2 & mask) | (T0 << shift);
    FORCE_RET();
/* Element load helper: extract from T2 the element selected by mask
   PARAM2, shifted down to bit 0 by PARAM1.  */
NEON_OP(extract_elt)
    int shift = PARAM1;
    uint32_t mask = PARAM2;
    T0 = (T2 & mask) >> shift;
    FORCE_RET();