target/loongarch: Implement xvmax/xvmin
[qemu/armbru.git] target/loongarch/vec_helper.c
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3 * QEMU LoongArch vector helper functions.
5 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
6 */
8 #include "qemu/osdep.h"
9 #include "cpu.h"
10 #include "exec/exec-all.h"
11 #include "exec/helper-proto.h"
12 #include "fpu/softfloat.h"
13 #include "internals.h"
14 #include "tcg/tcg.h"
15 #include "vec.h"
16 #include "tcg/tcg-gvec-desc.h"
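/*
 * Note: the helpers below operate element-wise on VReg operands.  The
 * 'desc' argument is a gvec descriptor: simd_oprsz(desc) yields the
 * operation size in bytes (16 for LSX, 32 for LASX), and loops derive
 * the element count from it, or from LSX_LEN/BIT for 128-bit-only
 * helpers.  The B/H/W/D element accessors (plus their unsigned and
 * 128-bit Q variants) are assumed to come from vec.h.
 */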
18 #define DO_ADD(a, b) (a + b)
19 #define DO_SUB(a, b) (a - b)
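/*
 * Widening "horizontal" ops: each destination element combines the
 * odd-numbered element of Vj with the even-numbered element of Vk,
 * both converted first to the wider destination element type TD.
 */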
21 #define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \
22 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
23 { \
24 int i; \
25 VReg *Vd = (VReg *)vd; \
26 VReg *Vj = (VReg *)vj; \
27 VReg *Vk = (VReg *)vk; \
28 typedef __typeof(Vd->E1(0)) TD; \
29 int oprsz = simd_oprsz(desc); \
31 for (i = 0; i < oprsz / (BIT / 8); i++) { \
32 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \
33 } \
36 DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
37 DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
38 DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)
40 void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
42 int i;
43 VReg *Vd = (VReg *)vd;
44 VReg *Vj = (VReg *)vj;
45 VReg *Vk = (VReg *)vk;
46 int oprsz = simd_oprsz(desc);
48 for (i = 0; i < oprsz / 16 ; i++) {
49 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
50 int128_makes64(Vk->D(2 * i)));
54 DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
55 DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
56 DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)
58 void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
60 int i;
61 VReg *Vd = (VReg *)vd;
62 VReg *Vj = (VReg *)vj;
63 VReg *Vk = (VReg *)vk;
64 int oprsz = simd_oprsz(desc);
66 for (i = 0; i < oprsz / 16; i++) {
67 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
68 int128_makes64(Vk->D(2 * i)));
72 DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
73 DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
74 DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)
76 void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
78 int i;
79 VReg *Vd = (VReg *)vd;
80 VReg *Vj = (VReg *)vj;
81 VReg *Vk = (VReg *)vk;
82 int oprsz = simd_oprsz(desc);
84 for (i = 0; i < oprsz / 16; i ++) {
85 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
86 int128_make64(Vk->UD(2 * i)));
90 DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
91 DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
92 DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)
94 void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
96 int i;
97 VReg *Vd = (VReg *)vd;
98 VReg *Vj = (VReg *)vj;
99 VReg *Vk = (VReg *)vk;
100 int oprsz = simd_oprsz(desc);
102 for (i = 0; i < oprsz / 16; i++) {
103 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
104 int128_make64(Vk->UD(2 * i)));
108 #define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \
109 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
111 int i; \
112 VReg *Vd = (VReg *)vd; \
113 VReg *Vj = (VReg *)vj; \
114 VReg *Vk = (VReg *)vk; \
115 typedef __typeof(Vd->E1(0)) TD; \
116 int oprsz = simd_oprsz(desc); \
118 for (i = 0; i < oprsz / (BIT / 8); i++) { \
119 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i) ,(TD)Vk->E2(2 * i)); \
123 #define DO_ODD(NAME, BIT, E1, E2, DO_OP) \
124 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
126 int i; \
127 VReg *Vd = (VReg *)vd; \
128 VReg *Vj = (VReg *)vj; \
129 VReg *Vk = (VReg *)vk; \
130 typedef __typeof(Vd->E1(0)) TD; \
131 int oprsz = simd_oprsz(desc); \
133 for (i = 0; i < oprsz / (BIT / 8); i++) { \
134 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \
138 void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
140 int i;
141 VReg *Vd = (VReg *)vd;
142 VReg *Vj = (VReg *)vj;
143 VReg *Vk = (VReg *)vk;
144 int oprsz = simd_oprsz(desc);
146 for (i = 0; i < oprsz / 16; i++) {
147 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
148 int128_makes64(Vk->D(2 * i)));
152 DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
153 DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
154 DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)
156 void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
158 int i;
159 VReg *Vd = (VReg *)vd;
160 VReg *Vj = (VReg *)vj;
161 VReg *Vk = (VReg *)vk;
162 int oprsz = simd_oprsz(desc);
164 for (i = 0; i < oprsz / 16; i++) {
165 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)),
166 int128_makes64(Vk->D(2 * i +1)));
170 DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
171 DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
172 DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)
174 void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
176 int i;
177 VReg *Vd = (VReg *)vd;
178 VReg *Vj = (VReg *)vj;
179 VReg *Vk = (VReg *)vk;
180 int oprsz = simd_oprsz(desc);
182 for (i = 0; i < oprsz / 16; i++) {
183 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
184 int128_makes64(Vk->D(2 * i)));
188 DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
189 DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
190 DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)
192 void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
194 int i;
195 VReg *Vd = (VReg *)vd;
196 VReg *Vj = (VReg *)vj;
197 VReg *Vk = (VReg *)vk;
198 int oprsz = simd_oprsz(desc);
200 for (i = 0; i < oprsz / 16; i++) {
201 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
202 int128_makes64(Vk->D(2 * i + 1)));
206 DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
207 DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
208 DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)
210 void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
212 int i;
213 VReg *Vd = (VReg *)vd;
214 VReg *Vj = (VReg *)vj;
215 VReg *Vk = (VReg *)vk;
216 int oprsz = simd_oprsz(desc);
218 for (i = 0; i < oprsz / 16; i++) {
219 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
220 int128_make64(Vk->UD(2 * i)));
224 DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
225 DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
226 DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)
228 void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
230 int i;
231 VReg *Vd = (VReg *)vd;
232 VReg *Vj = (VReg *)vj;
233 VReg *Vk = (VReg *)vk;
234 int oprsz = simd_oprsz(desc);
236 for (i = 0; i < oprsz / 16; i++) {
237 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
238 int128_make64(Vk->UD(2 * i + 1)));
242 DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
243 DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
244 DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)
246 void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
248 int i;
249 VReg *Vd = (VReg *)vd;
250 VReg *Vj = (VReg *)vj;
251 VReg *Vk = (VReg *)vk;
252 int oprsz = simd_oprsz(desc);
254 for (i = 0; i < oprsz / 16; i++) {
255 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
256 int128_make64(Vk->UD(2 * i)));
260 DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
261 DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
262 DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)
264 void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
266 int i;
267 VReg *Vd = (VReg *)vd;
268 VReg *Vj = (VReg *)vj;
269 VReg *Vk = (VReg *)vk;
270 int oprsz = simd_oprsz(desc);
272 for (i = 0; i < oprsz / 16; i++) {
273 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
274 int128_make64(Vk->UD(2 * i + 1)));
278 DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
279 DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
280 DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)
282 #define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
283 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
285 int i; \
286 VReg *Vd = (VReg *)vd; \
287 VReg *Vj = (VReg *)vj; \
288 VReg *Vk = (VReg *)vk; \
289 typedef __typeof(Vd->ES1(0)) TDS; \
290 typedef __typeof(Vd->EU1(0)) TDU; \
291 int oprsz = simd_oprsz(desc); \
293 for (i = 0; i < oprsz / (BIT / 8); i++) { \
294 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i) ,(TDS)Vk->ES2(2 * i)); \
298 #define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
299 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
301 int i; \
302 VReg *Vd = (VReg *)vd; \
303 VReg *Vj = (VReg *)vj; \
304 VReg *Vk = (VReg *)vk; \
305 typedef __typeof(Vd->ES1(0)) TDS; \
306 typedef __typeof(Vd->EU1(0)) TDU; \
307 int oprsz = simd_oprsz(desc); \
309 for (i = 0; i < oprsz / (BIT / 8); i++) { \
310 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 1), (TDS)Vk->ES2(2 * i + 1)); \
314 void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
316 int i;
317 VReg *Vd = (VReg *)vd;
318 VReg *Vj = (VReg *)vj;
319 VReg *Vk = (VReg *)vk;
320 int oprsz = simd_oprsz(desc);
322 for (i = 0; i < oprsz / 16; i++) {
323 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
324 int128_makes64(Vk->D(2 * i)));
328 DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
329 DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
330 DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)
332 void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
334 int i;
335 VReg *Vd = (VReg *)vd;
336 VReg *Vj = (VReg *)vj;
337 VReg *Vk = (VReg *)vk;
338 int oprsz = simd_oprsz(desc);
340 for (i = 0; i < oprsz / 16; i++) {
341 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
342 int128_makes64(Vk->D(2 * i + 1)));
346 DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
347 DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
348 DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)
350 #define DO_VAVG(a, b) ((a >> 1) + (b >> 1) + (a & b & 1))
351 #define DO_VAVGR(a, b) ((a >> 1) + (b >> 1) + ((a | b) & 1))
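/*
 * DO_VAVG averages without risking overflow: it adds the two halved
 * operands plus the carry bit both share (a & b & 1).  DO_VAVGR instead
 * rounds the average up by adding the bit if either operand has it set.
 */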
353 #define DO_3OP(NAME, BIT, E, DO_OP) \
354 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
356 int i; \
357 VReg *Vd = (VReg *)vd; \
358 VReg *Vj = (VReg *)vj; \
359 VReg *Vk = (VReg *)vk; \
360 int oprsz = simd_oprsz(desc); \
362 for (i = 0; i < oprsz / (BIT / 8); i++) { \
363 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \
367 DO_3OP(vavg_b, 8, B, DO_VAVG)
368 DO_3OP(vavg_h, 16, H, DO_VAVG)
369 DO_3OP(vavg_w, 32, W, DO_VAVG)
370 DO_3OP(vavg_d, 64, D, DO_VAVG)
371 DO_3OP(vavgr_b, 8, B, DO_VAVGR)
372 DO_3OP(vavgr_h, 16, H, DO_VAVGR)
373 DO_3OP(vavgr_w, 32, W, DO_VAVGR)
374 DO_3OP(vavgr_d, 64, D, DO_VAVGR)
375 DO_3OP(vavg_bu, 8, UB, DO_VAVG)
376 DO_3OP(vavg_hu, 16, UH, DO_VAVG)
377 DO_3OP(vavg_wu, 32, UW, DO_VAVG)
378 DO_3OP(vavg_du, 64, UD, DO_VAVG)
379 DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
380 DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
381 DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
382 DO_3OP(vavgr_du, 64, UD, DO_VAVGR)
384 #define DO_VABSD(a, b) ((a > b) ? (a -b) : (b-a))
386 DO_3OP(vabsd_b, 8, B, DO_VABSD)
387 DO_3OP(vabsd_h, 16, H, DO_VABSD)
388 DO_3OP(vabsd_w, 32, W, DO_VABSD)
389 DO_3OP(vabsd_d, 64, D, DO_VABSD)
390 DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
391 DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
392 DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
393 DO_3OP(vabsd_du, 64, UD, DO_VABSD)
395 #define DO_VABS(a) ((a < 0) ? (-a) : (a))
397 #define DO_VADDA(NAME, BIT, E) \
398 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
400 int i; \
401 VReg *Vd = (VReg *)vd; \
402 VReg *Vj = (VReg *)vj; \
403 VReg *Vk = (VReg *)vk; \
404 int oprsz = simd_oprsz(desc); \
406 for (i = 0; i < oprsz / (BIT / 8); i++) { \
407 Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \
411 DO_VADDA(vadda_b, 8, B)
412 DO_VADDA(vadda_h, 16, H)
413 DO_VADDA(vadda_w, 32, W)
414 DO_VADDA(vadda_d, 64, D)
416 #define DO_MIN(a, b) (a < b ? a : b)
417 #define DO_MAX(a, b) (a > b ? a : b)
419 #define VMINMAXI(NAME, BIT, E, DO_OP) \
420 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
422 int i; \
423 VReg *Vd = (VReg *)vd; \
424 VReg *Vj = (VReg *)vj; \
425 typedef __typeof(Vd->E(0)) TD; \
426 int oprsz = simd_oprsz(desc); \
428 for (i = 0; i < oprsz / (BIT / 8); i++) { \
429 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \
433 VMINMAXI(vmini_b, 8, B, DO_MIN)
434 VMINMAXI(vmini_h, 16, H, DO_MIN)
435 VMINMAXI(vmini_w, 32, W, DO_MIN)
436 VMINMAXI(vmini_d, 64, D, DO_MIN)
437 VMINMAXI(vmaxi_b, 8, B, DO_MAX)
438 VMINMAXI(vmaxi_h, 16, H, DO_MAX)
439 VMINMAXI(vmaxi_w, 32, W, DO_MAX)
440 VMINMAXI(vmaxi_d, 64, D, DO_MAX)
441 VMINMAXI(vmini_bu, 8, UB, DO_MIN)
442 VMINMAXI(vmini_hu, 16, UH, DO_MIN)
443 VMINMAXI(vmini_wu, 32, UW, DO_MIN)
444 VMINMAXI(vmini_du, 64, UD, DO_MIN)
445 VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
446 VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
447 VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
448 VMINMAXI(vmaxi_du, 64, UD, DO_MAX)
450 #define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \
451 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
453 int i; \
454 VReg *Vd = (VReg *)vd; \
455 VReg *Vj = (VReg *)vj; \
456 VReg *Vk = (VReg *)vk; \
457 typedef __typeof(Vd->E1(0)) T; \
459 for (i = 0; i < LSX_LEN/BIT; i++) { \
460 Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT; \
464 void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t v)
466 uint64_t l, h1, h2;
467 VReg *Vd = (VReg *)vd;
468 VReg *Vj = (VReg *)vj;
469 VReg *Vk = (VReg *)vk;
471 muls64(&l, &h1, Vj->D(0), Vk->D(0));
472 muls64(&l, &h2, Vj->D(1), Vk->D(1));
474 Vd->D(0) = h1;
475 Vd->D(1) = h2;
478 DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
479 DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
480 DO_VMUH(vmuh_w, 32, D, W, DO_MUH)
482 void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t v)
484 uint64_t l, h1, h2;
485 VReg *Vd = (VReg *)vd;
486 VReg *Vj = (VReg *)vj;
487 VReg *Vk = (VReg *)vk;
489 mulu64(&l, &h1, Vj->D(0), Vk->D(0));
490 mulu64(&l, &h2, Vj->D(1), Vk->D(1));
492 Vd->D(0) = h1;
493 Vd->D(1) = h2;
496 DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
497 DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
498 DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)
500 #define DO_MUL(a, b) (a * b)
502 DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
503 DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
504 DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)
506 DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
507 DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
508 DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)
510 DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
511 DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
512 DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
514 DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL)
515 DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL)
516 DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL)
518 DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
519 DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
520 DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
522 DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
523 DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
524 DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
526 #define DO_MADD(a, b, c) (a + b * c)
527 #define DO_MSUB(a, b, c) (a - b * c)
529 #define VMADDSUB(NAME, BIT, E, DO_OP) \
530 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
532 int i; \
533 VReg *Vd = (VReg *)vd; \
534 VReg *Vj = (VReg *)vj; \
535 VReg *Vk = (VReg *)vk; \
536 for (i = 0; i < LSX_LEN/BIT; i++) { \
537 Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \
541 VMADDSUB(vmadd_b, 8, B, DO_MADD)
542 VMADDSUB(vmadd_h, 16, H, DO_MADD)
543 VMADDSUB(vmadd_w, 32, W, DO_MADD)
544 VMADDSUB(vmadd_d, 64, D, DO_MADD)
545 VMADDSUB(vmsub_b, 8, B, DO_MSUB)
546 VMADDSUB(vmsub_h, 16, H, DO_MSUB)
547 VMADDSUB(vmsub_w, 32, W, DO_MSUB)
548 VMADDSUB(vmsub_d, 64, D, DO_MSUB)
550 #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \
551 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
553 int i; \
554 VReg *Vd = (VReg *)vd; \
555 VReg *Vj = (VReg *)vj; \
556 VReg *Vk = (VReg *)vk; \
557 typedef __typeof(Vd->E1(0)) TD; \
559 for (i = 0; i < LSX_LEN/BIT; i++) { \
560 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
564 VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL)
565 VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL)
566 VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL)
567 VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL)
568 VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL)
569 VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL)
571 #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \
572 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
574 int i; \
575 VReg *Vd = (VReg *)vd; \
576 VReg *Vj = (VReg *)vj; \
577 VReg *Vk = (VReg *)vk; \
578 typedef __typeof(Vd->E1(0)) TD; \
580 for (i = 0; i < LSX_LEN/BIT; i++) { \
581 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \
582 (TD)Vk->E2(2 * i + 1)); \
586 VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL)
587 VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL)
588 VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL)
589 VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL)
590 VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL)
591 VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL)
593 #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
594 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
596 int i; \
597 VReg *Vd = (VReg *)vd; \
598 VReg *Vj = (VReg *)vj; \
599 VReg *Vk = (VReg *)vk; \
600 typedef __typeof(Vd->ES1(0)) TS1; \
601 typedef __typeof(Vd->EU1(0)) TU1; \
603 for (i = 0; i < LSX_LEN/BIT; i++) { \
604 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \
605 (TS1)Vk->ES2(2 * i)); \
609 VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
610 VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
611 VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
613 #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
614 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
616 int i; \
617 VReg *Vd = (VReg *)vd; \
618 VReg *Vj = (VReg *)vj; \
619 VReg *Vk = (VReg *)vk; \
620 typedef __typeof(Vd->ES1(0)) TS1; \
621 typedef __typeof(Vd->EU1(0)) TU1; \
623 for (i = 0; i < LSX_LEN/BIT; i++) { \
624 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \
625 (TS1)Vk->ES2(2 * i + 1)); \
629 VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
630 VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
631 VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
633 #define DO_DIVU(N, M) (unlikely(M == 0) ? 0 : N / M)
634 #define DO_REMU(N, M) (unlikely(M == 0) ? 0 : N % M)
635 #define DO_DIV(N, M) (unlikely(M == 0) ? 0 :\
636 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
637 #define DO_REM(N, M) (unlikely(M == 0) ? 0 :\
638 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
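/*
 * The division macros guard the two undefined cases: a zero divisor
 * yields 0, and the signed INT_MIN / -1 overflow (detected via N == -N)
 * yields N for DO_DIV and 0 for DO_REM.
 */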
640 #define VDIV(NAME, BIT, E, DO_OP) \
641 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
643 int i; \
644 VReg *Vd = (VReg *)vd; \
645 VReg *Vj = (VReg *)vj; \
646 VReg *Vk = (VReg *)vk; \
647 for (i = 0; i < LSX_LEN/BIT; i++) { \
648 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \
652 VDIV(vdiv_b, 8, B, DO_DIV)
653 VDIV(vdiv_h, 16, H, DO_DIV)
654 VDIV(vdiv_w, 32, W, DO_DIV)
655 VDIV(vdiv_d, 64, D, DO_DIV)
656 VDIV(vdiv_bu, 8, UB, DO_DIVU)
657 VDIV(vdiv_hu, 16, UH, DO_DIVU)
658 VDIV(vdiv_wu, 32, UW, DO_DIVU)
659 VDIV(vdiv_du, 64, UD, DO_DIVU)
660 VDIV(vmod_b, 8, B, DO_REM)
661 VDIV(vmod_h, 16, H, DO_REM)
662 VDIV(vmod_w, 32, W, DO_REM)
663 VDIV(vmod_d, 64, D, DO_REM)
664 VDIV(vmod_bu, 8, UB, DO_REMU)
665 VDIV(vmod_hu, 16, UH, DO_REMU)
666 VDIV(vmod_wu, 32, UW, DO_REMU)
667 VDIV(vmod_du, 64, UD, DO_REMU)
669 #define VSAT_S(NAME, BIT, E) \
670 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t v) \
672 int i; \
673 VReg *Vd = (VReg *)vd; \
674 VReg *Vj = (VReg *)vj; \
675 typedef __typeof(Vd->E(0)) TD; \
677 for (i = 0; i < LSX_LEN/BIT; i++) { \
678 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \
679 Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \
683 VSAT_S(vsat_b, 8, B)
684 VSAT_S(vsat_h, 16, H)
685 VSAT_S(vsat_w, 32, W)
686 VSAT_S(vsat_d, 64, D)
688 #define VSAT_U(NAME, BIT, E) \
689 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t v) \
691 int i; \
692 VReg *Vd = (VReg *)vd; \
693 VReg *Vj = (VReg *)vj; \
694 typedef __typeof(Vd->E(0)) TD; \
696 for (i = 0; i < LSX_LEN/BIT; i++) { \
697 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i); \
701 VSAT_U(vsat_bu, 8, UB)
702 VSAT_U(vsat_hu, 16, UH)
703 VSAT_U(vsat_wu, 32, UW)
704 VSAT_U(vsat_du, 64, UD)
706 #define VEXTH(NAME, BIT, E1, E2) \
707 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
709 int i; \
710 VReg *Vd = (VReg *)vd; \
711 VReg *Vj = (VReg *)vj; \
713 for (i = 0; i < LSX_LEN/BIT; i++) { \
714 Vd->E1(i) = Vj->E2(i + LSX_LEN/BIT); \
718 void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc)
720 VReg *Vd = (VReg *)vd;
721 VReg *Vj = (VReg *)vj;
723 Vd->Q(0) = int128_makes64(Vj->D(1));
726 void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc)
728 VReg *Vd = (VReg *)vd;
729 VReg *Vj = (VReg *)vj;
731 Vd->Q(0) = int128_make64((uint64_t)Vj->D(1));
734 VEXTH(vexth_h_b, 16, H, B)
735 VEXTH(vexth_w_h, 32, W, H)
736 VEXTH(vexth_d_w, 64, D, W)
737 VEXTH(vexth_hu_bu, 16, UH, UB)
738 VEXTH(vexth_wu_hu, 32, UW, UH)
739 VEXTH(vexth_du_wu, 64, UD, UW)
741 #define DO_SIGNCOV(a, b) (a == 0 ? 0 : a < 0 ? -b : b)
743 DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV)
744 DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV)
745 DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV)
746 DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV)
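/*
 * The vmsk* helpers compress per-element predicates into a bit mask in
 * Vd->D(0).  do_vmskltz_{b,h,w,d} gather the sign bit of every element
 * of one 64-bit lane, and do_vmskez_b detects all-zero bytes with the
 * "(x & 0x7f..) + 0x7f.." carry trick; vmskgez_b and vmsknz_b then
 * invert the gathered mask.
 */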
748 static uint64_t do_vmskltz_b(int64_t val)
750 uint64_t m = 0x8080808080808080ULL;
751 uint64_t c = val & m;
752 c |= c << 7;
753 c |= c << 14;
754 c |= c << 28;
755 return c >> 56;
758 void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc)
760 uint16_t temp = 0;
761 VReg *Vd = (VReg *)vd;
762 VReg *Vj = (VReg *)vj;
764 temp = do_vmskltz_b(Vj->D(0));
765 temp |= (do_vmskltz_b(Vj->D(1)) << 8);
766 Vd->D(0) = temp;
767 Vd->D(1) = 0;
770 static uint64_t do_vmskltz_h(int64_t val)
772 uint64_t m = 0x8000800080008000ULL;
773 uint64_t c = val & m;
774 c |= c << 15;
775 c |= c << 30;
776 return c >> 60;
779 void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc)
781 uint16_t temp = 0;
782 VReg *Vd = (VReg *)vd;
783 VReg *Vj = (VReg *)vj;
785 temp = do_vmskltz_h(Vj->D(0));
786 temp |= (do_vmskltz_h(Vj->D(1)) << 4);
787 Vd->D(0) = temp;
788 Vd->D(1) = 0;
791 static uint64_t do_vmskltz_w(int64_t val)
793 uint64_t m = 0x8000000080000000ULL;
794 uint64_t c = val & m;
795 c |= c << 31;
796 return c >> 62;
799 void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc)
801 uint16_t temp = 0;
802 VReg *Vd = (VReg *)vd;
803 VReg *Vj = (VReg *)vj;
805 temp = do_vmskltz_w(Vj->D(0));
806 temp |= (do_vmskltz_w(Vj->D(1)) << 2);
807 Vd->D(0) = temp;
808 Vd->D(1) = 0;
811 static uint64_t do_vmskltz_d(int64_t val)
813 return (uint64_t)val >> 63;
815 void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc)
817 uint16_t temp = 0;
818 VReg *Vd = (VReg *)vd;
819 VReg *Vj = (VReg *)vj;
821 temp = do_vmskltz_d(Vj->D(0));
822 temp |= (do_vmskltz_d(Vj->D(1)) << 1);
823 Vd->D(0) = temp;
824 Vd->D(1) = 0;
827 void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc)
829 uint16_t temp = 0;
830 VReg *Vd = (VReg *)vd;
831 VReg *Vj = (VReg *)vj;
833 temp = do_vmskltz_b(Vj->D(0));
834 temp |= (do_vmskltz_b(Vj->D(1)) << 8);
835 Vd->D(0) = (uint16_t)(~temp);
836 Vd->D(1) = 0;
839 static uint64_t do_vmskez_b(uint64_t a)
841 uint64_t m = 0x7f7f7f7f7f7f7f7fULL;
842 uint64_t c = ~(((a & m) + m) | a | m);
843 c |= c << 7;
844 c |= c << 14;
845 c |= c << 28;
846 return c >> 56;
849 void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc)
851 uint16_t temp = 0;
852 VReg *Vd = (VReg *)vd;
853 VReg *Vj = (VReg *)vj;
855 temp = do_vmskez_b(Vj->D(0));
856 temp |= (do_vmskez_b(Vj->D(1)) << 8);
857 Vd->D(0) = (uint16_t)(~temp);
858 Vd->D(1) = 0;
861 void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t v)
863 int i;
864 VReg *Vd = (VReg *)vd;
865 VReg *Vj = (VReg *)vj;
867 for (i = 0; i < LSX_LEN/8; i++) {
868 Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm);
872 #define VSLLWIL(NAME, BIT, E1, E2) \
873 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
875 int i; \
876 VReg temp; \
877 VReg *Vd = (VReg *)vd; \
878 VReg *Vj = (VReg *)vj; \
879 typedef __typeof(temp.E1(0)) TD; \
881 temp.D(0) = 0; \
882 temp.D(1) = 0; \
883 for (i = 0; i < LSX_LEN/BIT; i++) { \
884 temp.E1(i) = (TD)Vj->E2(i) << (imm % BIT); \
886 *Vd = temp; \
889 void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc)
891 VReg *Vd = (VReg *)vd;
892 VReg *Vj = (VReg *)vj;
894 Vd->Q(0) = int128_makes64(Vj->D(0));
897 void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc)
899 VReg *Vd = (VReg *)vd;
900 VReg *Vj = (VReg *)vj;
902 Vd->Q(0) = int128_make64(Vj->D(0));
905 VSLLWIL(vsllwil_h_b, 16, H, B)
906 VSLLWIL(vsllwil_w_h, 32, W, H)
907 VSLLWIL(vsllwil_d_w, 64, D, W)
908 VSLLWIL(vsllwil_hu_bu, 16, UH, UB)
909 VSLLWIL(vsllwil_wu_hu, 32, UW, UH)
910 VSLLWIL(vsllwil_du_wu, 64, UD, UW)
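/*
 * do_vsrlr_* and do_vsrar_* implement rounding right shifts: the last
 * bit shifted out ((s1 >> (sh - 1)) & 1) is added back, so the result
 * is rounded to nearest instead of truncated.
 */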
912 #define do_vsrlr(E, T) \
913 static T do_vsrlr_ ##E(T s1, int sh) \
915 if (sh == 0) { \
916 return s1; \
917 } else { \
918 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \
922 do_vsrlr(B, uint8_t)
923 do_vsrlr(H, uint16_t)
924 do_vsrlr(W, uint32_t)
925 do_vsrlr(D, uint64_t)
927 #define VSRLR(NAME, BIT, T, E) \
928 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
930 int i; \
931 VReg *Vd = (VReg *)vd; \
932 VReg *Vj = (VReg *)vj; \
933 VReg *Vk = (VReg *)vk; \
935 for (i = 0; i < LSX_LEN/BIT; i++) { \
936 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
940 VSRLR(vsrlr_b, 8, uint8_t, B)
941 VSRLR(vsrlr_h, 16, uint16_t, H)
942 VSRLR(vsrlr_w, 32, uint32_t, W)
943 VSRLR(vsrlr_d, 64, uint64_t, D)
945 #define VSRLRI(NAME, BIT, E) \
946 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
948 int i; \
949 VReg *Vd = (VReg *)vd; \
950 VReg *Vj = (VReg *)vj; \
952 for (i = 0; i < LSX_LEN/BIT; i++) { \
953 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \
957 VSRLRI(vsrlri_b, 8, B)
958 VSRLRI(vsrlri_h, 16, H)
959 VSRLRI(vsrlri_w, 32, W)
960 VSRLRI(vsrlri_d, 64, D)
962 #define do_vsrar(E, T) \
963 static T do_vsrar_ ##E(T s1, int sh) \
965 if (sh == 0) { \
966 return s1; \
967 } else { \
968 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \
972 do_vsrar(B, int8_t)
973 do_vsrar(H, int16_t)
974 do_vsrar(W, int32_t)
975 do_vsrar(D, int64_t)
977 #define VSRAR(NAME, BIT, T, E) \
978 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
980 int i; \
981 VReg *Vd = (VReg *)vd; \
982 VReg *Vj = (VReg *)vj; \
983 VReg *Vk = (VReg *)vk; \
985 for (i = 0; i < LSX_LEN/BIT; i++) { \
986 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
990 VSRAR(vsrar_b, 8, uint8_t, B)
991 VSRAR(vsrar_h, 16, uint16_t, H)
992 VSRAR(vsrar_w, 32, uint32_t, W)
993 VSRAR(vsrar_d, 64, uint64_t, D)
995 #define VSRARI(NAME, BIT, E) \
996 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
998 int i; \
999 VReg *Vd = (VReg *)vd; \
1000 VReg *Vj = (VReg *)vj; \
1002 for (i = 0; i < LSX_LEN/BIT; i++) { \
1003 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \
1007 VSRARI(vsrari_b, 8, B)
1008 VSRARI(vsrari_h, 16, H)
1009 VSRARI(vsrari_w, 32, W)
1010 VSRARI(vsrari_d, 64, D)
1012 #define R_SHIFT(a, b) (a >> b)
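/*
 * Narrowing shift helpers (vsrln, vsran and friends) shift a wide source
 * element, store the truncated result in the low half of Vd, and clear
 * the upper 64 bits (Vd->D(1) = 0).
 */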
1014 #define VSRLN(NAME, BIT, T, E1, E2) \
1015 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1017 int i; \
1018 VReg *Vd = (VReg *)vd; \
1019 VReg *Vj = (VReg *)vj; \
1020 VReg *Vk = (VReg *)vk; \
1022 for (i = 0; i < LSX_LEN/BIT; i++) { \
1023 Vd->E1(i) = R_SHIFT((T)Vj->E2(i),((T)Vk->E2(i)) % BIT); \
1025 Vd->D(1) = 0; \
1028 VSRLN(vsrln_b_h, 16, uint16_t, B, H)
1029 VSRLN(vsrln_h_w, 32, uint32_t, H, W)
1030 VSRLN(vsrln_w_d, 64, uint64_t, W, D)
1032 #define VSRAN(NAME, BIT, T, E1, E2) \
1033 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1035 int i; \
1036 VReg *Vd = (VReg *)vd; \
1037 VReg *Vj = (VReg *)vj; \
1038 VReg *Vk = (VReg *)vk; \
1040 for (i = 0; i < LSX_LEN/BIT; i++) { \
1041 Vd->E1(i) = R_SHIFT(Vj->E2(i), ((T)Vk->E2(i)) % BIT); \
1043 Vd->D(1) = 0; \
1046 VSRAN(vsran_b_h, 16, uint16_t, B, H)
1047 VSRAN(vsran_h_w, 32, uint32_t, H, W)
1048 VSRAN(vsran_w_d, 64, uint64_t, W, D)
1050 #define VSRLNI(NAME, BIT, T, E1, E2) \
1051 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1053 int i, max; \
1054 VReg temp; \
1055 VReg *Vd = (VReg *)vd; \
1056 VReg *Vj = (VReg *)vj; \
1058 temp.D(0) = 0; \
1059 temp.D(1) = 0; \
1060 max = LSX_LEN/BIT; \
1061 for (i = 0; i < max; i++) { \
1062 temp.E1(i) = R_SHIFT((T)Vj->E2(i), imm); \
1063 temp.E1(i + max) = R_SHIFT((T)Vd->E2(i), imm); \
1065 *Vd = temp; \
1068 void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1070 VReg temp;
1071 VReg *Vd = (VReg *)vd;
1072 VReg *Vj = (VReg *)vj;
1074 temp.D(0) = 0;
1075 temp.D(1) = 0;
1076 temp.D(0) = int128_getlo(int128_urshift(Vj->Q(0), imm % 128));
1077 temp.D(1) = int128_getlo(int128_urshift(Vd->Q(0), imm % 128));
1078 *Vd = temp;
1081 VSRLNI(vsrlni_b_h, 16, uint16_t, B, H)
1082 VSRLNI(vsrlni_h_w, 32, uint32_t, H, W)
1083 VSRLNI(vsrlni_w_d, 64, uint64_t, W, D)
1085 #define VSRANI(NAME, BIT, E1, E2) \
1086 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1088 int i, max; \
1089 VReg temp; \
1090 VReg *Vd = (VReg *)vd; \
1091 VReg *Vj = (VReg *)vj; \
1093 temp.D(0) = 0; \
1094 temp.D(1) = 0; \
1095 max = LSX_LEN/BIT; \
1096 for (i = 0; i < max; i++) { \
1097 temp.E1(i) = R_SHIFT(Vj->E2(i), imm); \
1098 temp.E1(i + max) = R_SHIFT(Vd->E2(i), imm); \
1100 *Vd = temp; \
1103 void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1105 VReg temp;
1106 VReg *Vd = (VReg *)vd;
1107 VReg *Vj = (VReg *)vj;
1109 temp.D(0) = 0;
1110 temp.D(1) = 0;
1111 temp.D(0) = int128_getlo(int128_rshift(Vj->Q(0), imm % 128));
1112 temp.D(1) = int128_getlo(int128_rshift(Vd->Q(0), imm % 128));
1113 *Vd = temp;
1116 VSRANI(vsrani_b_h, 16, B, H)
1117 VSRANI(vsrani_h_w, 32, H, W)
1118 VSRANI(vsrani_w_d, 64, W, D)
1120 #define VSRLRN(NAME, BIT, T, E1, E2) \
1121 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1123 int i; \
1124 VReg *Vd = (VReg *)vd; \
1125 VReg *Vj = (VReg *)vj; \
1126 VReg *Vk = (VReg *)vk; \
1128 for (i = 0; i < LSX_LEN/BIT; i++) { \
1129 Vd->E1(i) = do_vsrlr_ ## E2(Vj->E2(i), ((T)Vk->E2(i))%BIT); \
1131 Vd->D(1) = 0; \
1134 VSRLRN(vsrlrn_b_h, 16, uint16_t, B, H)
1135 VSRLRN(vsrlrn_h_w, 32, uint32_t, H, W)
1136 VSRLRN(vsrlrn_w_d, 64, uint64_t, W, D)
1138 #define VSRARN(NAME, BIT, T, E1, E2) \
1139 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1141 int i; \
1142 VReg *Vd = (VReg *)vd; \
1143 VReg *Vj = (VReg *)vj; \
1144 VReg *Vk = (VReg *)vk; \
1146 for (i = 0; i < LSX_LEN/BIT; i++) { \
1147 Vd->E1(i) = do_vsrar_ ## E2(Vj->E2(i), ((T)Vk->E2(i))%BIT); \
1149 Vd->D(1) = 0; \
1152 VSRARN(vsrarn_b_h, 16, uint8_t, B, H)
1153 VSRARN(vsrarn_h_w, 32, uint16_t, H, W)
1154 VSRARN(vsrarn_w_d, 64, uint32_t, W, D)
1156 #define VSRLRNI(NAME, BIT, E1, E2) \
1157 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1159 int i, max; \
1160 VReg temp; \
1161 VReg *Vd = (VReg *)vd; \
1162 VReg *Vj = (VReg *)vj; \
1164 temp.D(0) = 0; \
1165 temp.D(1) = 0; \
1166 max = LSX_LEN/BIT; \
1167 for (i = 0; i < max; i++) { \
1168 temp.E1(i) = do_vsrlr_ ## E2(Vj->E2(i), imm); \
1169 temp.E1(i + max) = do_vsrlr_ ## E2(Vd->E2(i), imm); \
1171 *Vd = temp; \
1174 void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1176 VReg temp;
1177 VReg *Vd = (VReg *)vd;
1178 VReg *Vj = (VReg *)vj;
1179 Int128 r1, r2;
1181 if (imm == 0) {
1182 temp.D(0) = int128_getlo(Vj->Q(0));
1183 temp.D(1) = int128_getlo(Vd->Q(0));
1184 } else {
1185 r1 = int128_and(int128_urshift(Vj->Q(0), (imm -1)), int128_one());
1186 r2 = int128_and(int128_urshift(Vd->Q(0), (imm -1)), int128_one());
1188 temp.D(0) = int128_getlo(int128_add(int128_urshift(Vj->Q(0), imm), r1));
1189 temp.D(1) = int128_getlo(int128_add(int128_urshift(Vd->Q(0), imm), r2));
1191 *Vd = temp;
1194 VSRLRNI(vsrlrni_b_h, 16, B, H)
1195 VSRLRNI(vsrlrni_h_w, 32, H, W)
1196 VSRLRNI(vsrlrni_w_d, 64, W, D)
1198 #define VSRARNI(NAME, BIT, E1, E2) \
1199 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1201 int i, max; \
1202 VReg temp; \
1203 VReg *Vd = (VReg *)vd; \
1204 VReg *Vj = (VReg *)vj; \
1206 temp.D(0) = 0; \
1207 temp.D(1) = 0; \
1208 max = LSX_LEN/BIT; \
1209 for (i = 0; i < max; i++) { \
1210 temp.E1(i) = do_vsrar_ ## E2(Vj->E2(i), imm); \
1211 temp.E1(i + max) = do_vsrar_ ## E2(Vd->E2(i), imm); \
1213 *Vd = temp; \
1216 void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1218 VReg temp;
1219 VReg *Vd = (VReg *)vd;
1220 VReg *Vj = (VReg *)vj;
1221 Int128 r1, r2;
1223 if (imm == 0) {
1224 temp.D(0) = int128_getlo(Vj->Q(0));
1225 temp.D(1) = int128_getlo(Vd->Q(0));
1226 } else {
1227 r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one());
1228 r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one());
1230 temp.D(0) = int128_getlo(int128_add(int128_rshift(Vj->Q(0), imm), r1));
1231 temp.D(1) = int128_getlo(int128_add(int128_rshift(Vd->Q(0), imm), r2));
1233 *Vd = temp;
1236 VSRARNI(vsrarni_b_h, 16, B, H)
1237 VSRARNI(vsrarni_h_w, 32, H, W)
1238 VSRARNI(vsrarni_w_d, 64, W, D)
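/*
 * do_ssrlns_*, do_ssrans_* and the *u variants below are the saturating
 * forms of the narrowing shifts: 'sa' is the shift amount and 'sh'
 * selects the saturation bound ((1 << sh) - 1), i.e. the maximum of the
 * narrower signed or unsigned element; signed variants also clamp
 * negative results to -(bound + 1).
 */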
1240 #define SSRLNS(NAME, T1, T2, T3) \
1241 static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
1243 T1 shft_res; \
1244 if (sa == 0) { \
1245 shft_res = e2; \
1246 } else { \
1247 shft_res = (((T1)e2) >> sa); \
1249 T3 mask; \
1250 mask = (1ull << sh) -1; \
1251 if (shft_res > mask) { \
1252 return mask; \
1253 } else { \
1254 return shft_res; \
1258 SSRLNS(B, uint16_t, int16_t, uint8_t)
1259 SSRLNS(H, uint32_t, int32_t, uint16_t)
1260 SSRLNS(W, uint64_t, int64_t, uint32_t)
1262 #define VSSRLN(NAME, BIT, T, E1, E2) \
1263 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1265 int i; \
1266 VReg *Vd = (VReg *)vd; \
1267 VReg *Vj = (VReg *)vj; \
1268 VReg *Vk = (VReg *)vk; \
1270 for (i = 0; i < LSX_LEN/BIT; i++) { \
1271 Vd->E1(i) = do_ssrlns_ ## E1(Vj->E2(i), (T)Vk->E2(i)% BIT, BIT/2 -1); \
1273 Vd->D(1) = 0; \
1276 VSSRLN(vssrln_b_h, 16, uint16_t, B, H)
1277 VSSRLN(vssrln_h_w, 32, uint32_t, H, W)
1278 VSSRLN(vssrln_w_d, 64, uint64_t, W, D)
1280 #define SSRANS(E, T1, T2) \
1281 static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
1283 T1 shft_res; \
1284 if (sa == 0) { \
1285 shft_res = e2; \
1286 } else { \
1287 shft_res = e2 >> sa; \
1289 T2 mask; \
1290 mask = (1ll << sh) -1; \
1291 if (shft_res > mask) { \
1292 return mask; \
1293 } else if (shft_res < -(mask +1)) { \
1294 return ~mask; \
1295 } else { \
1296 return shft_res; \
1300 SSRANS(B, int16_t, int8_t)
1301 SSRANS(H, int32_t, int16_t)
1302 SSRANS(W, int64_t, int32_t)
1304 #define VSSRAN(NAME, BIT, T, E1, E2) \
1305 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1307 int i; \
1308 VReg *Vd = (VReg *)vd; \
1309 VReg *Vj = (VReg *)vj; \
1310 VReg *Vk = (VReg *)vk; \
1312 for (i = 0; i < LSX_LEN/BIT; i++) { \
1313 Vd->E1(i) = do_ssrans_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \
1315 Vd->D(1) = 0; \
1318 VSSRAN(vssran_b_h, 16, uint16_t, B, H)
1319 VSSRAN(vssran_h_w, 32, uint32_t, H, W)
1320 VSSRAN(vssran_w_d, 64, uint64_t, W, D)
1322 #define SSRLNU(E, T1, T2, T3) \
1323 static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \
1325 T1 shft_res; \
1326 if (sa == 0) { \
1327 shft_res = e2; \
1328 } else { \
1329 shft_res = (((T1)e2) >> sa); \
1331 T2 mask; \
1332 mask = (1ull << sh) -1; \
1333 if (shft_res > mask) { \
1334 return mask; \
1335 } else { \
1336 return shft_res; \
1340 SSRLNU(B, uint16_t, uint8_t, int16_t)
1341 SSRLNU(H, uint32_t, uint16_t, int32_t)
1342 SSRLNU(W, uint64_t, uint32_t, int64_t)
1344 #define VSSRLNU(NAME, BIT, T, E1, E2) \
1345 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1347 int i; \
1348 VReg *Vd = (VReg *)vd; \
1349 VReg *Vj = (VReg *)vj; \
1350 VReg *Vk = (VReg *)vk; \
1352 for (i = 0; i < LSX_LEN/BIT; i++) { \
1353 Vd->E1(i) = do_ssrlnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \
1355 Vd->D(1) = 0; \
1358 VSSRLNU(vssrln_bu_h, 16, uint16_t, B, H)
1359 VSSRLNU(vssrln_hu_w, 32, uint32_t, H, W)
1360 VSSRLNU(vssrln_wu_d, 64, uint64_t, W, D)
1362 #define SSRANU(E, T1, T2, T3) \
1363 static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \
1365 T1 shft_res; \
1366 if (sa == 0) { \
1367 shft_res = e2; \
1368 } else { \
1369 shft_res = e2 >> sa; \
1371 if (e2 < 0) { \
1372 shft_res = 0; \
1374 T2 mask; \
1375 mask = (1ull << sh) -1; \
1376 if (shft_res > mask) { \
1377 return mask; \
1378 } else { \
1379 return shft_res; \
1383 SSRANU(B, uint16_t, uint8_t, int16_t)
1384 SSRANU(H, uint32_t, uint16_t, int32_t)
1385 SSRANU(W, uint64_t, uint32_t, int64_t)
1387 #define VSSRANU(NAME, BIT, T, E1, E2) \
1388 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1390 int i; \
1391 VReg *Vd = (VReg *)vd; \
1392 VReg *Vj = (VReg *)vj; \
1393 VReg *Vk = (VReg *)vk; \
1395 for (i = 0; i < LSX_LEN/BIT; i++) { \
1396 Vd->E1(i) = do_ssranu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \
1398 Vd->D(1) = 0; \
1401 VSSRANU(vssran_bu_h, 16, uint16_t, B, H)
1402 VSSRANU(vssran_hu_w, 32, uint32_t, H, W)
1403 VSSRANU(vssran_wu_d, 64, uint64_t, W, D)
1405 #define VSSRLNI(NAME, BIT, E1, E2) \
1406 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1408 int i; \
1409 VReg temp; \
1410 VReg *Vd = (VReg *)vd; \
1411 VReg *Vj = (VReg *)vj; \
1413 for (i = 0; i < LSX_LEN/BIT; i++) { \
1414 temp.E1(i) = do_ssrlns_ ## E1(Vj->E2(i), imm, BIT/2 -1); \
1415 temp.E1(i + LSX_LEN/BIT) = do_ssrlns_ ## E1(Vd->E2(i), imm, BIT/2 -1);\
1417 *Vd = temp; \
1420 void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1422 Int128 shft_res1, shft_res2, mask;
1423 VReg *Vd = (VReg *)vd;
1424 VReg *Vj = (VReg *)vj;
1426 if (imm == 0) {
1427 shft_res1 = Vj->Q(0);
1428 shft_res2 = Vd->Q(0);
1429 } else {
1430 shft_res1 = int128_urshift(Vj->Q(0), imm);
1431 shft_res2 = int128_urshift(Vd->Q(0), imm);
1433 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1435 if (int128_ult(mask, shft_res1)) {
1436 Vd->D(0) = int128_getlo(mask);
1437 }else {
1438 Vd->D(0) = int128_getlo(shft_res1);
1441 if (int128_ult(mask, shft_res2)) {
1442 Vd->D(1) = int128_getlo(mask);
1443 }else {
1444 Vd->D(1) = int128_getlo(shft_res2);
1448 VSSRLNI(vssrlni_b_h, 16, B, H)
1449 VSSRLNI(vssrlni_h_w, 32, H, W)
1450 VSSRLNI(vssrlni_w_d, 64, W, D)
1452 #define VSSRANI(NAME, BIT, E1, E2) \
1453 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1455 int i; \
1456 VReg temp; \
1457 VReg *Vd = (VReg *)vd; \
1458 VReg *Vj = (VReg *)vj; \
1460 for (i = 0; i < LSX_LEN/BIT; i++) { \
1461 temp.E1(i) = do_ssrans_ ## E1(Vj->E2(i), imm, BIT/2 -1); \
1462 temp.E1(i + LSX_LEN/BIT) = do_ssrans_ ## E1(Vd->E2(i), imm, BIT/2 -1); \
1464 *Vd = temp; \
1467 void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1469 Int128 shft_res1, shft_res2, mask, min;
1470 VReg *Vd = (VReg *)vd;
1471 VReg *Vj = (VReg *)vj;
1473 if (imm == 0) {
1474 shft_res1 = Vj->Q(0);
1475 shft_res2 = Vd->Q(0);
1476 } else {
1477 shft_res1 = int128_rshift(Vj->Q(0), imm);
1478 shft_res2 = int128_rshift(Vd->Q(0), imm);
1480 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1481 min = int128_lshift(int128_one(), 63);
1483 if (int128_gt(shft_res1, mask)) {
1484 Vd->D(0) = int128_getlo(mask);
1485 } else if (int128_lt(shft_res1, int128_neg(min))) {
1486 Vd->D(0) = int128_getlo(min);
1487 } else {
1488 Vd->D(0) = int128_getlo(shft_res1);
1491 if (int128_gt(shft_res2, mask)) {
1492 Vd->D(1) = int128_getlo(mask);
1493 } else if (int128_lt(shft_res2, int128_neg(min))) {
1494 Vd->D(1) = int128_getlo(min);
1495 } else {
1496 Vd->D(1) = int128_getlo(shft_res2);
1500 VSSRANI(vssrani_b_h, 16, B, H)
1501 VSSRANI(vssrani_h_w, 32, H, W)
1502 VSSRANI(vssrani_w_d, 64, W, D)
1504 #define VSSRLNUI(NAME, BIT, E1, E2) \
1505 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1507 int i; \
1508 VReg temp; \
1509 VReg *Vd = (VReg *)vd; \
1510 VReg *Vj = (VReg *)vj; \
1512 for (i = 0; i < LSX_LEN/BIT; i++) { \
1513 temp.E1(i) = do_ssrlnu_ ## E1(Vj->E2(i), imm, BIT/2); \
1514 temp.E1(i + LSX_LEN/BIT) = do_ssrlnu_ ## E1(Vd->E2(i), imm, BIT/2); \
1516 *Vd = temp; \
1519 void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1521 Int128 shft_res1, shft_res2, mask;
1522 VReg *Vd = (VReg *)vd;
1523 VReg *Vj = (VReg *)vj;
1525 if (imm == 0) {
1526 shft_res1 = Vj->Q(0);
1527 shft_res2 = Vd->Q(0);
1528 } else {
1529 shft_res1 = int128_urshift(Vj->Q(0), imm);
1530 shft_res2 = int128_urshift(Vd->Q(0), imm);
1532 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1534 if (int128_ult(mask, shft_res1)) {
1535 Vd->D(0) = int128_getlo(mask);
1536 }else {
1537 Vd->D(0) = int128_getlo(shft_res1);
1540 if (int128_ult(mask, shft_res2)) {
1541 Vd->D(1) = int128_getlo(mask);
1542 }else {
1543 Vd->D(1) = int128_getlo(shft_res2);
1547 VSSRLNUI(vssrlni_bu_h, 16, B, H)
1548 VSSRLNUI(vssrlni_hu_w, 32, H, W)
1549 VSSRLNUI(vssrlni_wu_d, 64, W, D)
1551 #define VSSRANUI(NAME, BIT, E1, E2) \
1552 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1554 int i; \
1555 VReg temp; \
1556 VReg *Vd = (VReg *)vd; \
1557 VReg *Vj = (VReg *)vj; \
1559 for (i = 0; i < LSX_LEN/BIT; i++) { \
1560 temp.E1(i) = do_ssranu_ ## E1(Vj->E2(i), imm, BIT/2); \
1561 temp.E1(i + LSX_LEN/BIT) = do_ssranu_ ## E1(Vd->E2(i), imm, BIT/2); \
1563 *Vd = temp; \
1566 void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1568 Int128 shft_res1, shft_res2, mask;
1569 VReg *Vd = (VReg *)vd;
1570 VReg *Vj = (VReg *)vj;
1572 if (imm == 0) {
1573 shft_res1 = Vj->Q(0);
1574 shft_res2 = Vd->Q(0);
1575 } else {
1576 shft_res1 = int128_rshift(Vj->Q(0), imm);
1577 shft_res2 = int128_rshift(Vd->Q(0), imm);
1580 if (int128_lt(Vj->Q(0), int128_zero())) {
1581 shft_res1 = int128_zero();
1584 if (int128_lt(Vd->Q(0), int128_zero())) {
1585 shft_res2 = int128_zero();
1588 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1590 if (int128_ult(mask, shft_res1)) {
1591 Vd->D(0) = int128_getlo(mask);
1592 }else {
1593 Vd->D(0) = int128_getlo(shft_res1);
1596 if (int128_ult(mask, shft_res2)) {
1597 Vd->D(1) = int128_getlo(mask);
1598 }else {
1599 Vd->D(1) = int128_getlo(shft_res2);
1603 VSSRANUI(vssrani_bu_h, 16, B, H)
1604 VSSRANUI(vssrani_hu_w, 32, H, W)
1605 VSSRANUI(vssrani_wu_d, 64, W, D)
1607 #define SSRLRNS(E1, E2, T1, T2, T3) \
1608 static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \
1610 T1 shft_res; \
1612 shft_res = do_vsrlr_ ## E2(e2, sa); \
1613 T1 mask; \
1614 mask = (1ull << sh) -1; \
1615 if (shft_res > mask) { \
1616 return mask; \
1617 } else { \
1618 return shft_res; \
1622 SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
1623 SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
1624 SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1626 #define VSSRLRN(NAME, BIT, T, E1, E2) \
1627 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1629 int i; \
1630 VReg *Vd = (VReg *)vd; \
1631 VReg *Vj = (VReg *)vj; \
1632 VReg *Vk = (VReg *)vk; \
1634 for (i = 0; i < LSX_LEN/BIT; i++) { \
1635 Vd->E1(i) = do_ssrlrns_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \
1637 Vd->D(1) = 0; \
1640 VSSRLRN(vssrlrn_b_h, 16, uint16_t, B, H)
1641 VSSRLRN(vssrlrn_h_w, 32, uint32_t, H, W)
1642 VSSRLRN(vssrlrn_w_d, 64, uint64_t, W, D)
1644 #define SSRARNS(E1, E2, T1, T2) \
1645 static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \
1647 T1 shft_res; \
1649 shft_res = do_vsrar_ ## E2(e2, sa); \
1650 T2 mask; \
1651 mask = (1ll << sh) -1; \
1652 if (shft_res > mask) { \
1653 return mask; \
1654 } else if (shft_res < -(mask +1)) { \
1655 return ~mask; \
1656 } else { \
1657 return shft_res; \
1661 SSRARNS(B, H, int16_t, int8_t)
1662 SSRARNS(H, W, int32_t, int16_t)
1663 SSRARNS(W, D, int64_t, int32_t)
1665 #define VSSRARN(NAME, BIT, T, E1, E2) \
1666 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1668 int i; \
1669 VReg *Vd = (VReg *)vd; \
1670 VReg *Vj = (VReg *)vj; \
1671 VReg *Vk = (VReg *)vk; \
1673 for (i = 0; i < LSX_LEN/BIT; i++) { \
1674 Vd->E1(i) = do_ssrarns_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \
1676 Vd->D(1) = 0; \
1679 VSSRARN(vssrarn_b_h, 16, uint16_t, B, H)
1680 VSSRARN(vssrarn_h_w, 32, uint32_t, H, W)
1681 VSSRARN(vssrarn_w_d, 64, uint64_t, W, D)
1683 #define SSRLRNU(E1, E2, T1, T2, T3) \
1684 static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \
1686 T1 shft_res; \
1688 shft_res = do_vsrlr_ ## E2(e2, sa); \
1690 T2 mask; \
1691 mask = (1ull << sh) -1; \
1692 if (shft_res > mask) { \
1693 return mask; \
1694 } else { \
1695 return shft_res; \
1699 SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
1700 SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
1701 SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1703 #define VSSRLRNU(NAME, BIT, T, E1, E2) \
1704 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1706 int i; \
1707 VReg *Vd = (VReg *)vd; \
1708 VReg *Vj = (VReg *)vj; \
1709 VReg *Vk = (VReg *)vk; \
1711 for (i = 0; i < LSX_LEN/BIT; i++) { \
1712 Vd->E1(i) = do_ssrlrnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \
1714 Vd->D(1) = 0; \
1717 VSSRLRNU(vssrlrn_bu_h, 16, uint16_t, B, H)
1718 VSSRLRNU(vssrlrn_hu_w, 32, uint32_t, H, W)
1719 VSSRLRNU(vssrlrn_wu_d, 64, uint64_t, W, D)
1721 #define SSRARNU(E1, E2, T1, T2, T3) \
1722 static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \
1724 T1 shft_res; \
1726 if (e2 < 0) { \
1727 shft_res = 0; \
1728 } else { \
1729 shft_res = do_vsrar_ ## E2(e2, sa); \
1731 T2 mask; \
1732 mask = (1ull << sh) -1; \
1733 if (shft_res > mask) { \
1734 return mask; \
1735 } else { \
1736 return shft_res; \
1740 SSRARNU(B, H, uint16_t, uint8_t, int16_t)
1741 SSRARNU(H, W, uint32_t, uint16_t, int32_t)
1742 SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1744 #define VSSRARNU(NAME, BIT, T, E1, E2) \
1745 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1747 int i; \
1748 VReg *Vd = (VReg *)vd; \
1749 VReg *Vj = (VReg *)vj; \
1750 VReg *Vk = (VReg *)vk; \
1752 for (i = 0; i < LSX_LEN/BIT; i++) { \
1753 Vd->E1(i) = do_ssrarnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \
1755 Vd->D(1) = 0; \
1758 VSSRARNU(vssrarn_bu_h, 16, uint16_t, B, H)
1759 VSSRARNU(vssrarn_hu_w, 32, uint32_t, H, W)
1760 VSSRARNU(vssrarn_wu_d, 64, uint64_t, W, D)
1762 #define VSSRLRNI(NAME, BIT, E1, E2) \
1763 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1765 int i; \
1766 VReg temp; \
1767 VReg *Vd = (VReg *)vd; \
1768 VReg *Vj = (VReg *)vj; \
1770 for (i = 0; i < LSX_LEN/BIT; i++) { \
1771 temp.E1(i) = do_ssrlrns_ ## E1(Vj->E2(i), imm, BIT/2 -1); \
1772 temp.E1(i + LSX_LEN/BIT) = do_ssrlrns_ ## E1(Vd->E2(i), imm, BIT/2 -1);\
1774 *Vd = temp; \
1777 #define VSSRLRNI_Q(NAME, sh) \
1778 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1780 Int128 shft_res1, shft_res2, mask, r1, r2; \
1781 VReg *Vd = (VReg *)vd; \
1782 VReg *Vj = (VReg *)vj; \
1784 if (imm == 0) { \
1785 shft_res1 = Vj->Q(0); \
1786 shft_res2 = Vd->Q(0); \
1787 } else { \
1788 r1 = int128_and(int128_urshift(Vj->Q(0), (imm -1)), int128_one()); \
1789 r2 = int128_and(int128_urshift(Vd->Q(0), (imm -1)), int128_one()); \
1791 shft_res1 = (int128_add(int128_urshift(Vj->Q(0), imm), r1)); \
1792 shft_res2 = (int128_add(int128_urshift(Vd->Q(0), imm), r2)); \
1795 mask = int128_sub(int128_lshift(int128_one(), sh), int128_one()); \
1797 if (int128_ult(mask, shft_res1)) { \
1798 Vd->D(0) = int128_getlo(mask); \
1799 }else { \
1800 Vd->D(0) = int128_getlo(shft_res1); \
1803 if (int128_ult(mask, shft_res2)) { \
1804 Vd->D(1) = int128_getlo(mask); \
1805 }else { \
1806 Vd->D(1) = int128_getlo(shft_res2); \
1810 VSSRLRNI(vssrlrni_b_h, 16, B, H)
1811 VSSRLRNI(vssrlrni_h_w, 32, H, W)
1812 VSSRLRNI(vssrlrni_w_d, 64, W, D)
1813 VSSRLRNI_Q(vssrlrni_d_q, 63)
1815 #define VSSRARNI(NAME, BIT, E1, E2) \
1816 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1818 int i; \
1819 VReg temp; \
1820 VReg *Vd = (VReg *)vd; \
1821 VReg *Vj = (VReg *)vj; \
1823 for (i = 0; i < LSX_LEN/BIT; i++) { \
1824 temp.E1(i) = do_ssrarns_ ## E1(Vj->E2(i), imm, BIT/2 -1); \
1825 temp.E1(i + LSX_LEN/BIT) = do_ssrarns_ ## E1(Vd->E2(i), imm, BIT/2 -1); \
1827 *Vd = temp; \
1830 void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1832 Int128 shft_res1, shft_res2, mask1, mask2, r1, r2;
1833 VReg *Vd = (VReg *)vd;
1834 VReg *Vj = (VReg *)vj;
1836 if (imm == 0) {
1837 shft_res1 = Vj->Q(0);
1838 shft_res2 = Vd->Q(0);
1839 } else {
1840 r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one());
1841 r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one());
1843 shft_res1 = int128_add(int128_rshift(Vj->Q(0), imm), r1);
1844 shft_res2 = int128_add(int128_rshift(Vd->Q(0), imm), r2);
1847 mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1848 mask2 = int128_lshift(int128_one(), 63);
1850 if (int128_gt(shft_res1, mask1)) {
1851 Vd->D(0) = int128_getlo(mask1);
1852 } else if (int128_lt(shft_res1, int128_neg(mask2))) {
1853 Vd->D(0) = int128_getlo(mask2);
1854 } else {
1855 Vd->D(0) = int128_getlo(shft_res1);
1858 if (int128_gt(shft_res2, mask1)) {
1859 Vd->D(1) = int128_getlo(mask1);
1860 } else if (int128_lt(shft_res2, int128_neg(mask2))) {
1861 Vd->D(1) = int128_getlo(mask2);
1862 } else {
1863 Vd->D(1) = int128_getlo(shft_res2);
1867 VSSRARNI(vssrarni_b_h, 16, B, H)
1868 VSSRARNI(vssrarni_h_w, 32, H, W)
1869 VSSRARNI(vssrarni_w_d, 64, W, D)
1871 #define VSSRLRNUI(NAME, BIT, E1, E2) \
1872 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1874 int i; \
1875 VReg temp; \
1876 VReg *Vd = (VReg *)vd; \
1877 VReg *Vj = (VReg *)vj; \
1879 for (i = 0; i < LSX_LEN/BIT; i++) { \
1880 temp.E1(i) = do_ssrlrnu_ ## E1(Vj->E2(i), imm, BIT/2); \
1881 temp.E1(i + LSX_LEN/BIT) = do_ssrlrnu_ ## E1(Vd->E2(i), imm, BIT/2); \
1883 *Vd = temp; \
1886 VSSRLRNUI(vssrlrni_bu_h, 16, B, H)
1887 VSSRLRNUI(vssrlrni_hu_w, 32, H, W)
1888 VSSRLRNUI(vssrlrni_wu_d, 64, W, D)
1889 VSSRLRNI_Q(vssrlrni_du_q, 64)
1891 #define VSSRARNUI(NAME, BIT, E1, E2) \
1892 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1894 int i; \
1895 VReg temp; \
1896 VReg *Vd = (VReg *)vd; \
1897 VReg *Vj = (VReg *)vj; \
1899 for (i = 0; i < LSX_LEN/BIT; i++) { \
1900 temp.E1(i) = do_ssrarnu_ ## E1(Vj->E2(i), imm, BIT/2); \
1901 temp.E1(i + LSX_LEN/BIT) = do_ssrarnu_ ## E1(Vd->E2(i), imm, BIT/2); \
1903 *Vd = temp; \
1906 void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1908 Int128 shft_res1, shft_res2, mask1, mask2, r1, r2;
1909 VReg *Vd = (VReg *)vd;
1910 VReg *Vj = (VReg *)vj;
1912 if (imm == 0) {
1913 shft_res1 = Vj->Q(0);
1914 shft_res2 = Vd->Q(0);
1915 } else {
1916 r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one());
1917 r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one());
1919 shft_res1 = int128_add(int128_rshift(Vj->Q(0), imm), r1);
1920 shft_res2 = int128_add(int128_rshift(Vd->Q(0), imm), r2);
1923 if (int128_lt(Vj->Q(0), int128_zero())) {
1924 shft_res1 = int128_zero();
1926 if (int128_lt(Vd->Q(0), int128_zero())) {
1927 shft_res2 = int128_zero();
1930 mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1931 mask2 = int128_lshift(int128_one(), 64);
1933 if (int128_gt(shft_res1, mask1)) {
1934 Vd->D(0) = int128_getlo(mask1);
1935 } else if (int128_lt(shft_res1, int128_neg(mask2))) {
1936 Vd->D(0) = int128_getlo(mask2);
1937 } else {
1938 Vd->D(0) = int128_getlo(shft_res1);
1941 if (int128_gt(shft_res2, mask1)) {
1942 Vd->D(1) = int128_getlo(mask1);
1943 } else if (int128_lt(shft_res2, int128_neg(mask2))) {
1944 Vd->D(1) = int128_getlo(mask2);
1945 } else {
1946 Vd->D(1) = int128_getlo(shft_res2);
1950 VSSRARNUI(vssrarni_bu_h, 16, B, H)
1951 VSSRARNUI(vssrarni_hu_w, 32, H, W)
1952 VSSRARNUI(vssrarni_wu_d, 64, W, D)
1954 #define DO_2OP(NAME, BIT, E, DO_OP) \
1955 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
1957 int i; \
1958 VReg *Vd = (VReg *)vd; \
1959 VReg *Vj = (VReg *)vj; \
1961 for (i = 0; i < LSX_LEN/BIT; i++) \
1963 Vd->E(i) = DO_OP(Vj->E(i)); \
1967 #define DO_CLO_B(N) (clz32(~N & 0xff) - 24)
1968 #define DO_CLO_H(N) (clz32(~N & 0xffff) - 16)
1969 #define DO_CLO_W(N) (clz32(~N))
1970 #define DO_CLO_D(N) (clz64(~N))
1971 #define DO_CLZ_B(N) (clz32(N) - 24)
1972 #define DO_CLZ_H(N) (clz32(N) - 16)
1973 #define DO_CLZ_W(N) (clz32(N))
1974 #define DO_CLZ_D(N) (clz64(N))
1976 DO_2OP(vclo_b, 8, UB, DO_CLO_B)
1977 DO_2OP(vclo_h, 16, UH, DO_CLO_H)
1978 DO_2OP(vclo_w, 32, UW, DO_CLO_W)
1979 DO_2OP(vclo_d, 64, UD, DO_CLO_D)
1980 DO_2OP(vclz_b, 8, UB, DO_CLZ_B)
1981 DO_2OP(vclz_h, 16, UH, DO_CLZ_H)
1982 DO_2OP(vclz_w, 32, UW, DO_CLZ_W)
1983 DO_2OP(vclz_d, 64, UD, DO_CLZ_D)
1985 #define VPCNT(NAME, BIT, E, FN) \
1986 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
1988 int i; \
1989 VReg *Vd = (VReg *)vd; \
1990 VReg *Vj = (VReg *)vj; \
1992 for (i = 0; i < LSX_LEN/BIT; i++) \
1994 Vd->E(i) = FN(Vj->E(i)); \
1998 VPCNT(vpcnt_b, 8, UB, ctpop8)
1999 VPCNT(vpcnt_h, 16, UH, ctpop16)
2000 VPCNT(vpcnt_w, 32, UW, ctpop32)
2001 VPCNT(vpcnt_d, 64, UD, ctpop64)
2003 #define DO_BITCLR(a, bit) (a & ~(1ull << bit))
2004 #define DO_BITSET(a, bit) (a | 1ull << bit)
2005 #define DO_BITREV(a, bit) (a ^ (1ull << bit))
2007 #define DO_BIT(NAME, BIT, E, DO_OP) \
2008 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \
2010 int i; \
2011 VReg *Vd = (VReg *)vd; \
2012 VReg *Vj = (VReg *)vj; \
2013 VReg *Vk = (VReg *)vk; \
2015 for (i = 0; i < LSX_LEN/BIT; i++) { \
2016 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \
2020 DO_BIT(vbitclr_b, 8, UB, DO_BITCLR)
2021 DO_BIT(vbitclr_h, 16, UH, DO_BITCLR)
2022 DO_BIT(vbitclr_w, 32, UW, DO_BITCLR)
2023 DO_BIT(vbitclr_d, 64, UD, DO_BITCLR)
2024 DO_BIT(vbitset_b, 8, UB, DO_BITSET)
2025 DO_BIT(vbitset_h, 16, UH, DO_BITSET)
2026 DO_BIT(vbitset_w, 32, UW, DO_BITSET)
2027 DO_BIT(vbitset_d, 64, UD, DO_BITSET)
2028 DO_BIT(vbitrev_b, 8, UB, DO_BITREV)
2029 DO_BIT(vbitrev_h, 16, UH, DO_BITREV)
2030 DO_BIT(vbitrev_w, 32, UW, DO_BITREV)
2031 DO_BIT(vbitrev_d, 64, UD, DO_BITREV)
2033 #define DO_BITI(NAME, BIT, E, DO_OP) \
2034 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t v) \
2036 int i; \
2037 VReg *Vd = (VReg *)vd; \
2038 VReg *Vj = (VReg *)vj; \
2040 for (i = 0; i < LSX_LEN/BIT; i++) { \
2041 Vd->E(i) = DO_OP(Vj->E(i), imm); \
2045 DO_BITI(vbitclri_b, 8, UB, DO_BITCLR)
2046 DO_BITI(vbitclri_h, 16, UH, DO_BITCLR)
2047 DO_BITI(vbitclri_w, 32, UW, DO_BITCLR)
2048 DO_BITI(vbitclri_d, 64, UD, DO_BITCLR)
2049 DO_BITI(vbitseti_b, 8, UB, DO_BITSET)
2050 DO_BITI(vbitseti_h, 16, UH, DO_BITSET)
2051 DO_BITI(vbitseti_w, 32, UW, DO_BITSET)
2052 DO_BITI(vbitseti_d, 64, UD, DO_BITSET)
2053 DO_BITI(vbitrevi_b, 8, UB, DO_BITREV)
2054 DO_BITI(vbitrevi_h, 16, UH, DO_BITREV)
2055 DO_BITI(vbitrevi_w, 32, UW, DO_BITREV)
2056 DO_BITI(vbitrevi_d, 64, UD, DO_BITREV)
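/*
 * vfrstp/vfrstpi scan Vj for the first negative element; its index (or
 * the element count if none is found) is written to the Vd element
 * selected by Vk->E(0) & MASK or by the immediate.
 */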
2058 #define VFRSTP(NAME, BIT, MASK, E) \
2059 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2061 int i, m; \
2062 VReg *Vd = (VReg *)vd; \
2063 VReg *Vj = (VReg *)vj; \
2064 VReg *Vk = (VReg *)vk; \
2066 for (i = 0; i < LSX_LEN/BIT; i++) { \
2067 if (Vj->E(i) < 0) { \
2068 break; \
2071 m = Vk->E(0) & MASK; \
2072 Vd->E(m) = i; \
2075 VFRSTP(vfrstp_b, 8, 0xf, B)
2076 VFRSTP(vfrstp_h, 16, 0x7, H)
2078 #define VFRSTPI(NAME, BIT, E) \
2079 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2081 int i, m; \
2082 VReg *Vd = (VReg *)vd; \
2083 VReg *Vj = (VReg *)vj; \
2085 for (i = 0; i < LSX_LEN/BIT; i++) { \
2086 if (Vj->E(i) < 0) { \
2087 break; \
2090 m = imm % (LSX_LEN/BIT); \
2091 Vd->E(m) = i; \
2094 VFRSTPI(vfrstpi_b, 8, B)
2095 VFRSTPI(vfrstpi_h, 16, H)
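/*
 * Floating-point helpers: vec_clear_cause() resets the FCSR0 cause bits
 * before a vector op, and vec_update_fcsr0() folds the accumulated
 * softfloat exception flags back into FCSR0 after each element,
 * raising EXCCODE_FPE when the corresponding enable bit is set.
 */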
2097 static void vec_update_fcsr0_mask(CPULoongArchState *env,
2098 uintptr_t pc, int mask)
2100 int flags = get_float_exception_flags(&env->fp_status);
2102 set_float_exception_flags(0, &env->fp_status);
2104 flags &= ~mask;
2106 if (flags) {
2107 flags = ieee_ex_to_loongarch(flags);
2108 UPDATE_FP_CAUSE(env->fcsr0, flags);
2111 if (GET_FP_ENABLES(env->fcsr0) & flags) {
2112 do_raise_exception(env, EXCCODE_FPE, pc);
2113 } else {
2114 UPDATE_FP_FLAGS(env->fcsr0, flags);
2118 static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc)
2120 vec_update_fcsr0_mask(env, pc, 0);
2123 static inline void vec_clear_cause(CPULoongArchState *env)
2125 SET_FP_CAUSE(env->fcsr0, 0);
2128 #define DO_3OP_F(NAME, BIT, E, FN) \
2129 void HELPER(NAME)(void *vd, void *vj, void *vk, \
2130 CPULoongArchState *env, uint32_t desc) \
2132 int i; \
2133 VReg *Vd = (VReg *)vd; \
2134 VReg *Vj = (VReg *)vj; \
2135 VReg *Vk = (VReg *)vk; \
2137 vec_clear_cause(env); \
2138 for (i = 0; i < LSX_LEN/BIT; i++) { \
2139 Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2140 vec_update_fcsr0(env, GETPC()); \
2144 DO_3OP_F(vfadd_s, 32, UW, float32_add)
2145 DO_3OP_F(vfadd_d, 64, UD, float64_add)
2146 DO_3OP_F(vfsub_s, 32, UW, float32_sub)
2147 DO_3OP_F(vfsub_d, 64, UD, float64_sub)
2148 DO_3OP_F(vfmul_s, 32, UW, float32_mul)
2149 DO_3OP_F(vfmul_d, 64, UD, float64_mul)
2150 DO_3OP_F(vfdiv_s, 32, UW, float32_div)
2151 DO_3OP_F(vfdiv_d, 64, UD, float64_div)
2152 DO_3OP_F(vfmax_s, 32, UW, float32_maxnum)
2153 DO_3OP_F(vfmax_d, 64, UD, float64_maxnum)
2154 DO_3OP_F(vfmin_s, 32, UW, float32_minnum)
2155 DO_3OP_F(vfmin_d, 64, UD, float64_minnum)
2156 DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag)
2157 DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag)
2158 DO_3OP_F(vfmina_s, 32, UW, float32_minnummag)
2159 DO_3OP_F(vfmina_d, 64, UD, float64_minnummag)
2161 #define DO_4OP_F(NAME, BIT, E, FN, flags) \
2162 void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \
2163 CPULoongArchState *env, uint32_t desc) \
2165 int i; \
2166 VReg *Vd = (VReg *)vd; \
2167 VReg *Vj = (VReg *)vj; \
2168 VReg *Vk = (VReg *)vk; \
2169 VReg *Va = (VReg *)va; \
2171 vec_clear_cause(env); \
2172 for (i = 0; i < LSX_LEN/BIT; i++) { \
2173 Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \
2174 vec_update_fcsr0(env, GETPC()); \
2178 DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
2179 DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
2180 DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
2181 DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
2182 DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
2183 DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
2184 DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
2185 float_muladd_negate_c | float_muladd_negate_result)
2186 DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
2187 float_muladd_negate_c | float_muladd_negate_result)
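/*
 * The fused multiply-add variants all reuse float*_muladd: vfmsub
 * negates the addend (negate_c), vfnmadd negates the final result
 * (negate_result), and vfnmsub negates both.
 */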
2189 #define DO_2OP_F(NAME, BIT, E, FN) \
2190 void HELPER(NAME)(void *vd, void *vj, \
2191 CPULoongArchState *env, uint32_t desc) \
2193 int i; \
2194 VReg *Vd = (VReg *)vd; \
2195 VReg *Vj = (VReg *)vj; \
2197 vec_clear_cause(env); \
2198 for (i = 0; i < LSX_LEN/BIT; i++) { \
2199 Vd->E(i) = FN(env, Vj->E(i)); \
2203 #define FLOGB(BIT, T) \
2204 static T do_flogb_## BIT(CPULoongArchState *env, T fj) \
2206 T fp, fd; \
2207 float_status *status = &env->fp_status; \
2208 FloatRoundMode old_mode = get_float_rounding_mode(status); \
2210 set_float_rounding_mode(float_round_down, status); \
2211 fp = float ## BIT ##_log2(fj, status); \
2212 fd = float ## BIT ##_round_to_int(fp, status); \
2213 set_float_rounding_mode(old_mode, status); \
2214 vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \
2215 return fd; \
2218 FLOGB(32, uint32_t)
2219 FLOGB(64, uint64_t)
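/*
 * do_flogb_* forces the rounding mode to round-down, takes log2 and then
 * rounds that result to an integer, which is effectively floor(log2(x))
 * returned in FP format; the inexact flag raised along the way is
 * deliberately not reported to FCSR0.
 */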
2221 #define FCLASS(NAME, BIT, E, FN) \
2222 void HELPER(NAME)(void *vd, void *vj, \
2223 CPULoongArchState *env, uint32_t desc) \
2225 int i; \
2226 VReg *Vd = (VReg *)vd; \
2227 VReg *Vj = (VReg *)vj; \
2229 for (i = 0; i < LSX_LEN/BIT; i++) { \
2230 Vd->E(i) = FN(env, Vj->E(i)); \
2234 FCLASS(vfclass_s, 32, UW, helper_fclass_s)
2235 FCLASS(vfclass_d, 64, UD, helper_fclass_d)
2237 #define FSQRT(BIT, T) \
2238 static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \
2240 T fd; \
2241 fd = float ## BIT ##_sqrt(fj, &env->fp_status); \
2242 vec_update_fcsr0(env, GETPC()); \
2243 return fd; \
2246 FSQRT(32, uint32_t)
2247 FSQRT(64, uint64_t)
2249 #define FRECIP(BIT, T) \
2250 static T do_frecip_## BIT(CPULoongArchState *env, T fj) \
2252 T fd; \
2253 fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \
2254 vec_update_fcsr0(env, GETPC()); \
2255 return fd; \
2258 FRECIP(32, uint32_t)
2259 FRECIP(64, uint64_t)
2261 #define FRSQRT(BIT, T) \
2262 static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \
2264 T fd, fp; \
2265 fp = float ## BIT ##_sqrt(fj, &env->fp_status); \
2266 fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \
2267 vec_update_fcsr0(env, GETPC()); \
2268 return fd; \
2271 FRSQRT(32, uint32_t)
2272 FRSQRT(64, uint64_t)
2274 DO_2OP_F(vflogb_s, 32, UW, do_flogb_32)
2275 DO_2OP_F(vflogb_d, 64, UD, do_flogb_64)
2276 DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32)
2277 DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64)
2278 DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32)
2279 DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64)
2280 DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32)
2281 DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64)
2283 static uint32_t float16_cvt_float32(uint16_t h, float_status *status)
2285 return float16_to_float32(h, true, status);
2287 static uint64_t float32_cvt_float64(uint32_t s, float_status *status)
2289 return float32_to_float64(s, status);
2292 static uint16_t float32_cvt_float16(uint32_t s, float_status *status)
2294 return float32_to_float16(s, true, status);
2296 static uint32_t float64_cvt_float32(uint64_t d, float_status *status)
2298 return float64_to_float32(d, status);
2301 void HELPER(vfcvtl_s_h)(void *vd, void *vj,
2302 CPULoongArchState *env, uint32_t desc)
2304 int i;
2305 VReg temp;
2306 VReg *Vd = (VReg *)vd;
2307 VReg *Vj = (VReg *)vj;
2309 vec_clear_cause(env);
2310 for (i = 0; i < LSX_LEN/32; i++) {
2311 temp.UW(i) = float16_cvt_float32(Vj->UH(i), &env->fp_status);
2312 vec_update_fcsr0(env, GETPC());
2314 *Vd = temp;
2317 void HELPER(vfcvtl_d_s)(void *vd, void *vj,
2318 CPULoongArchState *env, uint32_t desc)
2320 int i;
2321 VReg temp;
2322 VReg *Vd = (VReg *)vd;
2323 VReg *Vj = (VReg *)vj;
2325 vec_clear_cause(env);
2326 for (i = 0; i < LSX_LEN/64; i++) {
2327 temp.UD(i) = float32_cvt_float64(Vj->UW(i), &env->fp_status);
2328 vec_update_fcsr0(env, GETPC());
2330 *Vd = temp;
2333 void HELPER(vfcvth_s_h)(void *vd, void *vj,
2334 CPULoongArchState *env, uint32_t desc)
2336 int i;
2337 VReg temp;
2338 VReg *Vd = (VReg *)vd;
2339 VReg *Vj = (VReg *)vj;
2341 vec_clear_cause(env);
2342 for (i = 0; i < LSX_LEN/32; i++) {
2343 temp.UW(i) = float16_cvt_float32(Vj->UH(i + 4), &env->fp_status);
2344 vec_update_fcsr0(env, GETPC());
2346 *Vd = temp;
2349 void HELPER(vfcvth_d_s)(void *vd, void *vj,
2350 CPULoongArchState *env, uint32_t desc)
2352 int i;
2353 VReg temp;
2354 VReg *Vd = (VReg *)vd;
2355 VReg *Vj = (VReg *)vj;
2357 vec_clear_cause(env);
2358 for (i = 0; i < LSX_LEN/64; i++) {
2359 temp.UD(i) = float32_cvt_float64(Vj->UW(i + 2), &env->fp_status);
2360 vec_update_fcsr0(env, GETPC());
2362 *Vd = temp;
2365 void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
2366 CPULoongArchState *env, uint32_t desc)
2368 int i;
2369 VReg temp;
2370 VReg *Vd = (VReg *)vd;
2371 VReg *Vj = (VReg *)vj;
2372 VReg *Vk = (VReg *)vk;
2374 vec_clear_cause(env);
2375     for (i = 0; i < LSX_LEN/32; i++) {
2376 temp.UH(i + 4) = float32_cvt_float16(Vj->UW(i), &env->fp_status);
2377 temp.UH(i) = float32_cvt_float16(Vk->UW(i), &env->fp_status);
2378 vec_update_fcsr0(env, GETPC());
2380 *Vd = temp;
2383 void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
2384 CPULoongArchState *env, uint32_t desc)
2386 int i;
2387 VReg temp;
2388 VReg *Vd = (VReg *)vd;
2389 VReg *Vj = (VReg *)vj;
2390 VReg *Vk = (VReg *)vk;
2392 vec_clear_cause(env);
2393     for (i = 0; i < LSX_LEN/64; i++) {
2394 temp.UW(i + 2) = float64_cvt_float32(Vj->UD(i), &env->fp_status);
2395 temp.UW(i) = float64_cvt_float32(Vk->UD(i), &env->fp_status);
2396 vec_update_fcsr0(env, GETPC());
2398 *Vd = temp;
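/*
 * Conversion layout: the "l" widening forms (vfcvtl_*) read the low half
 * of the source elements and the "h" forms the high half, while the
 * narrowing vfcvt_h_s/vfcvt_s_d pack the converted Vj elements into the
 * high half of Vd and the Vk elements into the low half.
 */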
2401 void HELPER(vfrint_s)(void *vd, void *vj,
2402 CPULoongArchState *env, uint32_t desc)
2404 int i;
2405 VReg *Vd = (VReg *)vd;
2406 VReg *Vj = (VReg *)vj;
2408 vec_clear_cause(env);
2409 for (i = 0; i < 4; i++) {
2410 Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
2411 vec_update_fcsr0(env, GETPC());
2415 void HELPER(vfrint_d)(void *vd, void *vj,
2416 CPULoongArchState *env, uint32_t desc)
2418 int i;
2419 VReg *Vd = (VReg *)vd;
2420 VReg *Vj = (VReg *)vj;
2422 vec_clear_cause(env);
2423 for (i = 0; i < 2; i++) {
2424 Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status);
2425 vec_update_fcsr0(env, GETPC());
2429 #define FCVT_2OP(NAME, BIT, E, MODE) \
2430 void HELPER(NAME)(void *vd, void *vj, \
2431 CPULoongArchState *env, uint32_t desc) \
2433 int i; \
2434 VReg *Vd = (VReg *)vd; \
2435 VReg *Vj = (VReg *)vj; \
2437 vec_clear_cause(env); \
2438 for (i = 0; i < LSX_LEN/BIT; i++) { \
2439 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2440 set_float_rounding_mode(MODE, &env->fp_status); \
2441 Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \
2442 set_float_rounding_mode(old_mode, &env->fp_status); \
2443 vec_update_fcsr0(env, GETPC()); \
2447 FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even)
2448 FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even)
2449 FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero)
2450 FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero)
2451 FCVT_2OP(vfrintrp_s, 32, UW, float_round_up)
2452 FCVT_2OP(vfrintrp_d, 64, UD, float_round_up)
2453 FCVT_2OP(vfrintrm_s, 32, UW, float_round_down)
2454 FCVT_2OP(vfrintrm_d, 64, UD, float_round_down)
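/*
 * FCVT_2OP temporarily overrides the rounding mode around
 * float*_round_to_int, giving the rne/rz/rp/rm variants; plain
 * vfrint_s/vfrint_d above round with the mode currently in fp_status.
 */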
2456 #define FTINT(NAME, FMT1, FMT2, T1, T2, MODE) \
2457 static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj) \
2459 T2 fd; \
2460 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2462 set_float_rounding_mode(MODE, &env->fp_status); \
2463 fd = do_## FMT1 ##_to_## FMT2(env, fj); \
2464 set_float_rounding_mode(old_mode, &env->fp_status); \
2465 return fd; \
2468 #define DO_FTINT(FMT1, FMT2, T1, T2) \
2469 static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \
2471 T2 fd; \
2473 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \
2474 if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \
2475 if (FMT1 ##_is_any_nan(fj)) { \
2476 fd = 0; \
2479 vec_update_fcsr0(env, GETPC()); \
2480 return fd; \
2483 DO_FTINT(float32, int32, uint32_t, uint32_t)
2484 DO_FTINT(float64, int64, uint64_t, uint64_t)
2485 DO_FTINT(float32, uint32, uint32_t, uint32_t)
2486 DO_FTINT(float64, uint64, uint64_t, uint64_t)
2487 DO_FTINT(float64, int32, uint64_t, uint32_t)
2488 DO_FTINT(float32, int64, uint32_t, uint64_t)
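/*
 * do_<float>_to_<int> wraps the softfloat conversion and, when the
 * conversion raised the invalid flag on a NaN input, forces the result
 * to 0.  The FTINT() wrappers additionally pin the rounding mode for
 * the rne/rz/rp/rm instruction forms.
 */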
2490 FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even)
2491 FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even)
2492 FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up)
2493 FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up)
2494 FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero)
2495 FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero)
2496 FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down)
2497 FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down)
2499 DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s)
2500 DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d)
2501 DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s)
2502 DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d)
2503 DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s)
2504 DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
2505 DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
2506 DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
2507 DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
2508 DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)
2510 FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
2511 FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)
2513 DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
2514 DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
2515 DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
2516 DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)
2518 FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
2519 FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
2520 FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
2521 FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)
2523 #define FTINT_W_D(NAME, FN) \
2524 void HELPER(NAME)(void *vd, void *vj, void *vk, \
2525 CPULoongArchState *env, uint32_t desc) \
2527 int i; \
2528 VReg temp; \
2529 VReg *Vd = (VReg *)vd; \
2530 VReg *Vj = (VReg *)vj; \
2531 VReg *Vk = (VReg *)vk; \
2533 vec_clear_cause(env); \
2534 for (i = 0; i < 2; i++) { \
2535 temp.W(i + 2) = FN(env, Vj->UD(i)); \
2536 temp.W(i) = FN(env, Vk->UD(i)); \
2538 *Vd = temp; \
2541 FTINT_W_D(vftint_w_d, do_float64_to_int32)
2542 FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
2543 FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
2544 FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
2545 FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)
2547 FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2548 FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2549 FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2550 FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2551 FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2552 FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2553 FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2554 FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2556 #define FTINTL_L_S(NAME, FN) \
2557 void HELPER(NAME)(void *vd, void *vj, \
2558 CPULoongArchState *env, uint32_t desc) \
2560 int i; \
2561 VReg temp; \
2562 VReg *Vd = (VReg *)vd; \
2563 VReg *Vj = (VReg *)vj; \
2565 vec_clear_cause(env); \
2566 for (i = 0; i < 2; i++) { \
2567 temp.D(i) = FN(env, Vj->UW(i)); \
2569 *Vd = temp; \
2572 FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
2573 FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
2574 FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
2575 FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
2576 FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)
2578 #define FTINTH_L_S(NAME, FN) \
2579 void HELPER(NAME)(void *vd, void *vj, \
2580 CPULoongArchState *env, uint32_t desc) \
2582 int i; \
2583 VReg temp; \
2584 VReg *Vd = (VReg *)vd; \
2585 VReg *Vj = (VReg *)vj; \
2587 vec_clear_cause(env); \
2588 for (i = 0; i < 2; i++) { \
2589 temp.D(i) = FN(env, Vj->UW(i + 2)); \
2591 *Vd = temp; \
2594 FTINTH_L_S(vftinth_l_s, do_float32_to_int64)
2595 FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s)
2596 FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s)
2597 FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s)
2598 FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s)
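/*
 * Narrowing and widening float-to-int forms: the *_w_d helpers place the
 * int32 results for Vj in the high two words of Vd and those for Vk in
 * the low two, while the vftint*l_l_s / vftint*h_l_s helpers widen the
 * low, respectively high, pair of single-precision elements to int64.
 */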
2600 #define FFINT(NAME, FMT1, FMT2, T1, T2) \
2601 static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \
2603 T2 fd; \
2605 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \
2606 vec_update_fcsr0(env, GETPC()); \
2607 return fd; \
2610 FFINT(s_w, int32, float32, int32_t, uint32_t)
2611 FFINT(d_l, int64, float64, int64_t, uint64_t)
2612 FFINT(s_wu, uint32, float32, uint32_t, uint32_t)
2613 FFINT(d_lu, uint64, float64, uint64_t, uint64_t)
2615 DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w)
2616 DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l)
2617 DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu)
2618 DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu)
2620 void HELPER(vffintl_d_w)(void *vd, void *vj,
2621 CPULoongArchState *env, uint32_t desc)
2623 int i;
2624 VReg temp;
2625 VReg *Vd = (VReg *)vd;
2626 VReg *Vj = (VReg *)vj;
2628 vec_clear_cause(env);
2629 for (i = 0; i < 2; i++) {
2630 temp.D(i) = int32_to_float64(Vj->W(i), &env->fp_status);
2631 vec_update_fcsr0(env, GETPC());
2633 *Vd = temp;
2636 void HELPER(vffinth_d_w)(void *vd, void *vj,
2637 CPULoongArchState *env, uint32_t desc)
2639 int i;
2640 VReg temp;
2641 VReg *Vd = (VReg *)vd;
2642 VReg *Vj = (VReg *)vj;
2644 vec_clear_cause(env);
2645 for (i = 0; i < 2; i++) {
2646 temp.D(i) = int32_to_float64(Vj->W(i + 2), &env->fp_status);
2647 vec_update_fcsr0(env, GETPC());
2649 *Vd = temp;
2652 void HELPER(vffint_s_l)(void *vd, void *vj, void *vk,
2653 CPULoongArchState *env, uint32_t desc)
2655 int i;
2656 VReg temp;
2657 VReg *Vd = (VReg *)vd;
2658 VReg *Vj = (VReg *)vj;
2659 VReg *Vk = (VReg *)vk;
2661 vec_clear_cause(env);
2662 for (i = 0; i < 2; i++) {
2663 temp.W(i + 2) = int64_to_float32(Vj->D(i), &env->fp_status);
2664 temp.W(i) = int64_to_float32(Vk->D(i), &env->fp_status);
2665 vec_update_fcsr0(env, GETPC());
2667 *Vd = temp;
2670 #define VSEQ(a, b) (a == b ? -1 : 0)
2671 #define VSLE(a, b) (a <= b ? -1 : 0)
2672 #define VSLT(a, b) (a < b ? -1 : 0)
2674 #define VCMPI(NAME, BIT, E, DO_OP) \
2675 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t v) \
2677 int i; \
2678 VReg *Vd = (VReg *)vd; \
2679 VReg *Vj = (VReg *)vj; \
2680 typedef __typeof(Vd->E(0)) TD; \
2682 for (i = 0; i < LSX_LEN/BIT; i++) { \
2683 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \
2687 VCMPI(vseqi_b, 8, B, VSEQ)
2688 VCMPI(vseqi_h, 16, H, VSEQ)
2689 VCMPI(vseqi_w, 32, W, VSEQ)
2690 VCMPI(vseqi_d, 64, D, VSEQ)
2691 VCMPI(vslei_b, 8, B, VSLE)
2692 VCMPI(vslei_h, 16, H, VSLE)
2693 VCMPI(vslei_w, 32, W, VSLE)
2694 VCMPI(vslei_d, 64, D, VSLE)
2695 VCMPI(vslei_bu, 8, UB, VSLE)
2696 VCMPI(vslei_hu, 16, UH, VSLE)
2697 VCMPI(vslei_wu, 32, UW, VSLE)
2698 VCMPI(vslei_du, 64, UD, VSLE)
2699 VCMPI(vslti_b, 8, B, VSLT)
2700 VCMPI(vslti_h, 16, H, VSLT)
2701 VCMPI(vslti_w, 32, W, VSLT)
2702 VCMPI(vslti_d, 64, D, VSLT)
2703 VCMPI(vslti_bu, 8, UB, VSLT)
2704 VCMPI(vslti_hu, 16, UH, VSLT)
2705 VCMPI(vslti_wu, 32, UW, VSLT)
2706 VCMPI(vslti_du, 64, UD, VSLT)
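/*
 * VCMPI compares every element of Vj with the immediate, which is first
 * cast to the element type, and writes all-ones for a true result and
 * zero otherwise (VSEQ/VSLE/VSLT evaluate to -1 or 0).
 */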
2708 static uint64_t vfcmp_common(CPULoongArchState *env,
2709 FloatRelation cmp, uint32_t flags)
2711 uint64_t ret = 0;
2713 switch (cmp) {
2714 case float_relation_less:
2715 ret = (flags & FCMP_LT);
2716 break;
2717 case float_relation_equal:
2718 ret = (flags & FCMP_EQ);
2719 break;
2720 case float_relation_greater:
2721 ret = (flags & FCMP_GT);
2722 break;
2723 case float_relation_unordered:
2724 ret = (flags & FCMP_UN);
2725 break;
2726 default:
2727 g_assert_not_reached();
2730 if (ret) {
2731 ret = -1;
2734 return ret;
2737 #define VFCMP(NAME, BIT, E, FN) \
2738 void HELPER(NAME)(CPULoongArchState *env, \
2739 uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \
2741 int i; \
2742 VReg t; \
2743 VReg *Vd = &(env->fpr[vd].vreg); \
2744 VReg *Vj = &(env->fpr[vj].vreg); \
2745 VReg *Vk = &(env->fpr[vk].vreg); \
2747 vec_clear_cause(env); \
2748     for (i = 0; i < LSX_LEN/BIT; i++) {                     \
2749 FloatRelation cmp; \
2750 cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2751 t.E(i) = vfcmp_common(env, cmp, flags); \
2752 vec_update_fcsr0(env, GETPC()); \
2754 *Vd = t; \
2757 VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet)
2758 VFCMP(vfcmp_s_s, 32, UW, float32_compare)
2759 VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet)
2760 VFCMP(vfcmp_s_d, 64, UD, float64_compare)
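/*
 * VFCMP receives register numbers rather than pointers; vfcmp_c_* uses
 * the quiet softfloat compare and vfcmp_s_* the signaling one, and
 * vfcmp_common() turns the resulting FloatRelation into all-ones when it
 * matches one of the requested FCMP_{LT,EQ,GT,UN} flags.
 */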
2762 void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t v)
2764 int i;
2765 VReg *Vd = (VReg *)vd;
2766 VReg *Vj = (VReg *)vj;
2768 for (i = 0; i < 16; i++) {
2769 Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm);
2773 /* Copied from target/arm/tcg/sve_helper.c. */
2774 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
2776 uint64_t bits = 8 << esz;
2777 uint64_t ones = dup_const(esz, 1);
2778 uint64_t signs = ones << (bits - 1);
2779 uint64_t cmp0, cmp1;
2781 cmp1 = dup_const(esz, n);
2782 cmp0 = cmp1 ^ m0;
2783 cmp1 = cmp1 ^ m1;
2784 cmp0 = (cmp0 - ones) & ~cmp0;
2785 cmp1 = (cmp1 - ones) & ~cmp1;
2786 return (cmp0 | cmp1) & signs;
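/*
 * do_match2() uses the usual zero-lane detection trick: (x - ones) & ~x
 * sets the per-lane sign bit for every lane of x that is zero, so the
 * return value is non-zero iff some element of the 128-bit value
 * {m0, m1} equals n.  SETANYEQZ uses n == 0 and SETALLNEZ is simply its
 * negation.
 */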
2789 #define SETANYEQZ(NAME, MO) \
2790 void HELPER(NAME)(CPULoongArchState *env, uint32_t cd, uint32_t vj) \
2792 VReg *Vj = &(env->fpr[vj].vreg); \
2794 env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \
2796 SETANYEQZ(vsetanyeqz_b, MO_8)
2797 SETANYEQZ(vsetanyeqz_h, MO_16)
2798 SETANYEQZ(vsetanyeqz_w, MO_32)
2799 SETANYEQZ(vsetanyeqz_d, MO_64)
2801 #define SETALLNEZ(NAME, MO) \
2802 void HELPER(NAME)(CPULoongArchState *env, uint32_t cd, uint32_t vj) \
2804 VReg *Vj = &(env->fpr[vj].vreg); \
2806     env->cf[cd & 0x7] = !do_match2(0, Vj->D(0), Vj->D(1), MO);   \
2808 SETALLNEZ(vsetallnez_b, MO_8)
2809 SETALLNEZ(vsetallnez_h, MO_16)
2810 SETALLNEZ(vsetallnez_w, MO_32)
2811 SETALLNEZ(vsetallnez_d, MO_64)
2813 #define VPACKEV(NAME, BIT, E) \
2814 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2816 int i; \
2817 VReg temp; \
2818 VReg *Vd = (VReg *)vd; \
2819 VReg *Vj = (VReg *)vj; \
2820 VReg *Vk = (VReg *)vk; \
2822 for (i = 0; i < LSX_LEN/BIT; i++) { \
2823 temp.E(2 * i + 1) = Vj->E(2 * i); \
2824         temp.E(2 * i) = Vk->E(2 * i);                 \
2826 *Vd = temp; \
2829 VPACKEV(vpackev_b, 16, B)
2830 VPACKEV(vpackev_h, 32, H)
2831 VPACKEV(vpackev_w, 64, W)
2832 VPACKEV(vpackev_d, 128, D)
2834 #define VPACKOD(NAME, BIT, E) \
2835 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2837 int i; \
2838 VReg temp; \
2839 VReg *Vd = (VReg *)vd; \
2840 VReg *Vj = (VReg *)vj; \
2841 VReg *Vk = (VReg *)vk; \
2843 for (i = 0; i < LSX_LEN/BIT; i++) { \
2844 temp.E(2 * i + 1) = Vj->E(2 * i + 1); \
2845 temp.E(2 * i) = Vk->E(2 * i + 1); \
2847 *Vd = temp; \
2850 VPACKOD(vpackod_b, 16, B)
2851 VPACKOD(vpackod_h, 32, H)
2852 VPACKOD(vpackod_w, 64, W)
2853 VPACKOD(vpackod_d, 128, D)
2855 #define VPICKEV(NAME, BIT, E) \
2856 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2858 int i; \
2859 VReg temp; \
2860 VReg *Vd = (VReg *)vd; \
2861 VReg *Vj = (VReg *)vj; \
2862 VReg *Vk = (VReg *)vk; \
2864 for (i = 0; i < LSX_LEN/BIT; i++) { \
2865 temp.E(i + LSX_LEN/BIT) = Vj->E(2 * i); \
2866 temp.E(i) = Vk->E(2 * i); \
2868 *Vd = temp; \
2871 VPICKEV(vpickev_b, 16, B)
2872 VPICKEV(vpickev_h, 32, H)
2873 VPICKEV(vpickev_w, 64, W)
2874 VPICKEV(vpickev_d, 128, D)
2876 #define VPICKOD(NAME, BIT, E) \
2877 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2879 int i; \
2880 VReg temp; \
2881 VReg *Vd = (VReg *)vd; \
2882 VReg *Vj = (VReg *)vj; \
2883 VReg *Vk = (VReg *)vk; \
2885 for (i = 0; i < LSX_LEN/BIT; i++) { \
2886 temp.E(i + LSX_LEN/BIT) = Vj->E(2 * i + 1); \
2887 temp.E(i) = Vk->E(2 * i + 1); \
2889 *Vd = temp; \
2892 VPICKOD(vpickod_b, 16, B)
2893 VPICKOD(vpickod_h, 32, H)
2894 VPICKOD(vpickod_w, 64, W)
2895 VPICKOD(vpickod_d, 128, D)
2897 #define VILVL(NAME, BIT, E) \
2898 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2900 int i; \
2901 VReg temp; \
2902 VReg *Vd = (VReg *)vd; \
2903 VReg *Vj = (VReg *)vj; \
2904 VReg *Vk = (VReg *)vk; \
2906 for (i = 0; i < LSX_LEN/BIT; i++) { \
2907 temp.E(2 * i + 1) = Vj->E(i); \
2908 temp.E(2 * i) = Vk->E(i); \
2910 *Vd = temp; \
2913 VILVL(vilvl_b, 16, B)
2914 VILVL(vilvl_h, 32, H)
2915 VILVL(vilvl_w, 64, W)
2916 VILVL(vilvl_d, 128, D)
2918 #define VILVH(NAME, BIT, E) \
2919 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2921 int i; \
2922 VReg temp; \
2923 VReg *Vd = (VReg *)vd; \
2924 VReg *Vj = (VReg *)vj; \
2925 VReg *Vk = (VReg *)vk; \
2927 for (i = 0; i < LSX_LEN/BIT; i++) { \
2928 temp.E(2 * i + 1) = Vj->E(i + LSX_LEN/BIT); \
2929 temp.E(2 * i) = Vk->E(i + LSX_LEN/BIT); \
2931 *Vd = temp; \
2934 VILVH(vilvh_b, 16, B)
2935 VILVH(vilvh_h, 32, H)
2936 VILVH(vilvh_w, 64, W)
2937 VILVH(vilvh_d, 128, D)
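/*
 * For the pack/pick/interleave families BIT is twice the element width,
 * so LSX_LEN/BIT counts result pairs: vpackev/vpackod interleave the
 * even/odd elements of Vk and Vj, vpickev/vpickod gather them with Vk
 * filling the low half of Vd and Vj the high half, and vilvl/vilvh
 * interleave the low/high halves of the two sources.
 */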
2939 void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
2941 int i, m;
2942 VReg temp;
2943 VReg *Vd = (VReg *)vd;
2944 VReg *Vj = (VReg *)vj;
2945 VReg *Vk = (VReg *)vk;
2946 VReg *Va = (VReg *)va;
2948 m = LSX_LEN/8;
2949     for (i = 0; i < m; i++) {
2950 uint64_t k = (uint8_t)Va->B(i) % (2 * m);
2951 temp.B(i) = k < m ? Vk->B(k) : Vj->B(k - m);
2953 *Vd = temp;
2956 #define VSHUF(NAME, BIT, E) \
2957 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2959 int i, m; \
2960 VReg temp; \
2961 VReg *Vd = (VReg *)vd; \
2962 VReg *Vj = (VReg *)vj; \
2963 VReg *Vk = (VReg *)vk; \
2965 m = LSX_LEN/BIT; \
2966 for (i = 0; i < m; i++) { \
2967         uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m);           \
2968 temp.E(i) = k < m ? Vk->E(k) : Vj->E(k - m); \
2970 *Vd = temp; \
2973 VSHUF(vshuf_h, 16, H)
2974 VSHUF(vshuf_w, 32, W)
2975 VSHUF(vshuf_d, 64, D)
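/*
 * vshuf treats Vd (or Va for vshuf_b) as a vector of indices into the
 * concatenation of Vk and Vj: index values below the element count m
 * select from Vk, values in [m, 2m) select from Vj.
 */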
2977 #define VSHUF4I(NAME, BIT, E) \
2978 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2980 int i; \
2981 VReg temp; \
2982 VReg *Vd = (VReg *)vd; \
2983 VReg *Vj = (VReg *)vj; \
2985 for (i = 0; i < LSX_LEN/BIT; i++) { \
2986 temp.E(i) = Vj->E(((i) & 0xfc) + (((imm) >> \
2987 (2 * ((i) & 0x03))) & 0x03)); \
2989 *Vd = temp; \
2992 VSHUF4I(vshuf4i_b, 8, B)
2993 VSHUF4I(vshuf4i_h, 16, H)
2994 VSHUF4I(vshuf4i_w, 32, W)
2996 void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2998 VReg *Vd = (VReg *)vd;
2999 VReg *Vj = (VReg *)vj;
3001 VReg temp;
3002 temp.D(0) = (imm & 2 ? Vj : Vd)->D(imm & 1);
3003 temp.D(1) = (imm & 8 ? Vj : Vd)->D((imm >> 2) & 1);
3004 *Vd = temp;
3007 void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3009 VReg temp;
3010 VReg *Vd = (VReg *)vd;
3011 VReg *Vj = (VReg *)vj;
3013 temp.W(0) = Vj->W(imm & 0x3);
3014 temp.W(1) = Vj->W((imm >> 2) & 0x3);
3015 temp.W(2) = Vd->W((imm >> 4) & 0x3);
3016 temp.W(3) = Vd->W((imm >> 6) & 0x3);
3017 *Vd = temp;
3020 #define VEXTRINS(NAME, BIT, E, MASK) \
3021 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3023 int ins, extr; \
3024 VReg *Vd = (VReg *)vd; \
3025 VReg *Vj = (VReg *)vj; \
3027 ins = (imm >> 4) & MASK; \
3028 extr = imm & MASK; \
3029 Vd->E(ins) = Vj->E(extr); \
3032 VEXTRINS(vextrins_b, 8, B, 0xf)
3033 VEXTRINS(vextrins_h, 16, H, 0x7)
3034 VEXTRINS(vextrins_w, 32, W, 0x3)
3035 VEXTRINS(vextrins_d, 64, D, 0x1)