s390x/tcg: Implement VECTOR SHIFT RIGHT ARITHMETIC
[qemu/ar7.git] / target / s390x / vec_int_helper.c
blob 67e9f2b0eda3a887fc6fad6da5eef1fd9e8260fe
1 /*
2 * QEMU TCG support -- s390x vector integer instruction support
4 * Copyright (C) 2019 Red Hat Inc
6 * Authors:
7 * David Hildenbrand <david@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include "qemu-common.h"
14 #include "cpu.h"
15 #include "vec.h"
16 #include "exec/helper-proto.h"
17 #include "tcg/tcg-gvec-desc.h"
19 static bool s390_vec_is_zero(const S390Vector *v)
21 return !v->doubleword[0] && !v->doubleword[1];
24 static void s390_vec_xor(S390Vector *res, const S390Vector *a,
25 const S390Vector *b)
27 res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0];
28 res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1];
31 static void s390_vec_shl(S390Vector *d, const S390Vector *a, uint64_t count)
33 uint64_t tmp;
35 g_assert(count < 128);
36 if (count == 0) {
37 d->doubleword[0] = a->doubleword[0];
38 d->doubleword[1] = a->doubleword[1];
39 } else if (count == 64) {
40 d->doubleword[0] = a->doubleword[1];
41 d->doubleword[1] = 0;
42 } else if (count < 64) {
43 tmp = extract64(a->doubleword[1], 64 - count, count);
44 d->doubleword[1] = a->doubleword[1] << count;
45 d->doubleword[0] = (a->doubleword[0] << count) | tmp;
46 } else {
47 d->doubleword[0] = a->doubleword[1] << (count - 64);
48 d->doubleword[1] = 0;
52 static void s390_vec_sar(S390Vector *d, const S390Vector *a, uint64_t count)
54 uint64_t tmp;
56 if (count == 0) {
57 d->doubleword[0] = a->doubleword[0];
58 d->doubleword[1] = a->doubleword[1];
59 } else if (count == 64) {
60 d->doubleword[1] = a->doubleword[0];
61 d->doubleword[0] = 0;
62 } else if (count < 64) {
63 tmp = a->doubleword[1] >> count;
64 d->doubleword[1] = deposit64(tmp, 64 - count, count, a->doubleword[0]);
65 d->doubleword[0] = (int64_t)a->doubleword[0] >> count;
66 } else {
67 d->doubleword[1] = (int64_t)a->doubleword[0] >> (count - 64);
68 d->doubleword[0] = 0;
72 static void s390_vec_shr(S390Vector *d, const S390Vector *a, uint64_t count)
74 uint64_t tmp;
76 g_assert(count < 128);
77 if (count == 0) {
78 d->doubleword[0] = a->doubleword[0];
79 d->doubleword[1] = a->doubleword[1];
80 } else if (count == 64) {
81 d->doubleword[1] = a->doubleword[0];
82 d->doubleword[0] = 0;
83 } else if (count < 64) {
84 tmp = a->doubleword[1] >> count;
85 d->doubleword[1] = deposit64(tmp, 64 - count, count, a->doubleword[0]);
86 d->doubleword[0] = a->doubleword[0] >> count;
87 } else {
88 d->doubleword[1] = a->doubleword[0] >> (count - 64);
89 d->doubleword[0] = 0;
92 #define DEF_VAVG(BITS) \
93 void HELPER(gvec_vavg##BITS)(void *v1, const void *v2, const void *v3, \
94 uint32_t desc) \
95 { \
96 int i; \
98 for (i = 0; i < (128 / BITS); i++) { \
99 const int32_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, i); \
100 const int32_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, i); \
102 s390_vec_write_element##BITS(v1, i, (a + b + 1) >> 1); \
105 DEF_VAVG(8)
106 DEF_VAVG(16)
108 #define DEF_VAVGL(BITS) \
109 void HELPER(gvec_vavgl##BITS)(void *v1, const void *v2, const void *v3, \
110 uint32_t desc) \
112 int i; \
114 for (i = 0; i < (128 / BITS); i++) { \
115 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
116 const uint##BITS##_t b = s390_vec_read_element##BITS(v3, i); \
118 s390_vec_write_element##BITS(v1, i, (a + b + 1) >> 1); \
121 DEF_VAVGL(8)
122 DEF_VAVGL(16)
124 #define DEF_VCLZ(BITS) \
125 void HELPER(gvec_vclz##BITS)(void *v1, const void *v2, uint32_t desc) \
127 int i; \
129 for (i = 0; i < (128 / BITS); i++) { \
130 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
132 s390_vec_write_element##BITS(v1, i, clz32(a) - 32 + BITS); \
135 DEF_VCLZ(8)
136 DEF_VCLZ(16)
138 #define DEF_VCTZ(BITS) \
139 void HELPER(gvec_vctz##BITS)(void *v1, const void *v2, uint32_t desc) \
141 int i; \
143 for (i = 0; i < (128 / BITS); i++) { \
144 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
146 s390_vec_write_element##BITS(v1, i, a ? ctz32(a) : BITS); \
149 DEF_VCTZ(8)
150 DEF_VCTZ(16)
/*
 * Carry-less multiplication: the schoolbook shift-and-add algorithm with
 * XOR in place of addition. Operands and result use the TBITS-wide type.
 */
#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \
static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \
                                             uint##TBITS##_t b) \
{ \
    uint##TBITS##_t acc = 0; \
    for (; b != 0; a <<= 1, b >>= 1) { \
        if ((b & 0x1) != 0) { \
            acc ^= a; \
        } \
    } \
    return acc; \
}
DEF_GALOIS_MULTIPLY(8, 16)
DEF_GALOIS_MULTIPLY(16, 32)
DEF_GALOIS_MULTIPLY(32, 64)
172 static S390Vector galois_multiply64(uint64_t a, uint64_t b)
174 S390Vector res = {};
175 S390Vector va = {
176 .doubleword[1] = a,
178 S390Vector vb = {
179 .doubleword[1] = b,
182 while (!s390_vec_is_zero(&vb)) {
183 if (vb.doubleword[1] & 0x1) {
184 s390_vec_xor(&res, &res, &va);
186 s390_vec_shl(&va, &va, 1);
187 s390_vec_shr(&vb, &vb, 1);
189 return res;
192 #define DEF_VGFM(BITS, TBITS) \
193 void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
194 uint32_t desc) \
196 int i; \
198 for (i = 0; i < (128 / TBITS); i++) { \
199 uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
200 uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
201 uint##TBITS##_t d = galois_multiply##BITS(a, b); \
203 a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
204 b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
205 d = d ^ galois_multiply32(a, b); \
206 s390_vec_write_element##TBITS(v1, i, d); \
209 DEF_VGFM(8, 16)
210 DEF_VGFM(16, 32)
211 DEF_VGFM(32, 64)
213 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
214 uint32_t desc)
216 S390Vector tmp1, tmp2;
217 uint64_t a, b;
219 a = s390_vec_read_element64(v2, 0);
220 b = s390_vec_read_element64(v3, 0);
221 tmp1 = galois_multiply64(a, b);
222 a = s390_vec_read_element64(v2, 1);
223 b = s390_vec_read_element64(v3, 1);
224 tmp2 = galois_multiply64(a, b);
225 s390_vec_xor(v1, &tmp1, &tmp2);
228 #define DEF_VGFMA(BITS, TBITS) \
229 void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \
230 const void *v4, uint32_t desc) \
232 int i; \
234 for (i = 0; i < (128 / TBITS); i++) { \
235 uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
236 uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
237 uint##TBITS##_t d = galois_multiply##BITS(a, b); \
239 a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
240 b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
241 d = d ^ galois_multiply32(a, b); \
242 d = d ^ s390_vec_read_element##TBITS(v4, i); \
243 s390_vec_write_element##TBITS(v1, i, d); \
246 DEF_VGFMA(8, 16)
247 DEF_VGFMA(16, 32)
248 DEF_VGFMA(32, 64)
250 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
251 const void *v4, uint32_t desc)
253 S390Vector tmp1, tmp2;
254 uint64_t a, b;
256 a = s390_vec_read_element64(v2, 0);
257 b = s390_vec_read_element64(v3, 0);
258 tmp1 = galois_multiply64(a, b);
259 a = s390_vec_read_element64(v2, 1);
260 b = s390_vec_read_element64(v3, 1);
261 tmp2 = galois_multiply64(a, b);
262 s390_vec_xor(&tmp1, &tmp1, &tmp2);
263 s390_vec_xor(v1, &tmp1, v4);
266 #define DEF_VMAL(BITS) \
267 void HELPER(gvec_vmal##BITS)(void *v1, const void *v2, const void *v3, \
268 const void *v4, uint32_t desc) \
270 int i; \
272 for (i = 0; i < (128 / BITS); i++) { \
273 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
274 const uint##BITS##_t b = s390_vec_read_element##BITS(v3, i); \
275 const uint##BITS##_t c = s390_vec_read_element##BITS(v4, i); \
277 s390_vec_write_element##BITS(v1, i, a * b + c); \
280 DEF_VMAL(8)
281 DEF_VMAL(16)
283 #define DEF_VMAH(BITS) \
284 void HELPER(gvec_vmah##BITS)(void *v1, const void *v2, const void *v3, \
285 const void *v4, uint32_t desc) \
287 int i; \
289 for (i = 0; i < (128 / BITS); i++) { \
290 const int32_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, i); \
291 const int32_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, i); \
292 const int32_t c = (int##BITS##_t)s390_vec_read_element##BITS(v4, i); \
294 s390_vec_write_element##BITS(v1, i, (a * b + c) >> BITS); \
297 DEF_VMAH(8)
298 DEF_VMAH(16)
300 #define DEF_VMALH(BITS) \
301 void HELPER(gvec_vmalh##BITS)(void *v1, const void *v2, const void *v3, \
302 const void *v4, uint32_t desc) \
304 int i; \
306 for (i = 0; i < (128 / BITS); i++) { \
307 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
308 const uint##BITS##_t b = s390_vec_read_element##BITS(v3, i); \
309 const uint##BITS##_t c = s390_vec_read_element##BITS(v4, i); \
311 s390_vec_write_element##BITS(v1, i, (a * b + c) >> BITS); \
314 DEF_VMALH(8)
315 DEF_VMALH(16)
317 #define DEF_VMAE(BITS, TBITS) \
318 void HELPER(gvec_vmae##BITS)(void *v1, const void *v2, const void *v3, \
319 const void *v4, uint32_t desc) \
321 int i, j; \
323 for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
324 int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
325 int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
326 int##TBITS##_t c = (int##BITS##_t)s390_vec_read_element##BITS(v4, j); \
328 s390_vec_write_element##TBITS(v1, i, a * b + c); \
331 DEF_VMAE(8, 16)
332 DEF_VMAE(16, 32)
333 DEF_VMAE(32, 64)
335 #define DEF_VMALE(BITS, TBITS) \
336 void HELPER(gvec_vmale##BITS)(void *v1, const void *v2, const void *v3, \
337 const void *v4, uint32_t desc) \
339 int i, j; \
341 for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
342 uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
343 uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
344 uint##TBITS##_t c = s390_vec_read_element##BITS(v4, j); \
346 s390_vec_write_element##TBITS(v1, i, a * b + c); \
349 DEF_VMALE(8, 16)
350 DEF_VMALE(16, 32)
351 DEF_VMALE(32, 64)
353 #define DEF_VMAO(BITS, TBITS) \
354 void HELPER(gvec_vmao##BITS)(void *v1, const void *v2, const void *v3, \
355 const void *v4, uint32_t desc) \
357 int i, j; \
359 for (i = 0, j = 1; i < (128 / TBITS); i++, j += 2) { \
360 int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
361 int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
362 int##TBITS##_t c = (int##BITS##_t)s390_vec_read_element##BITS(v4, j); \
364 s390_vec_write_element##TBITS(v1, i, a * b + c); \
367 DEF_VMAO(8, 16)
368 DEF_VMAO(16, 32)
369 DEF_VMAO(32, 64)
371 #define DEF_VMALO(BITS, TBITS) \
372 void HELPER(gvec_vmalo##BITS)(void *v1, const void *v2, const void *v3, \
373 const void *v4, uint32_t desc) \
375 int i, j; \
377 for (i = 0, j = 1; i < (128 / TBITS); i++, j += 2) { \
378 uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
379 uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
380 uint##TBITS##_t c = s390_vec_read_element##BITS(v4, j); \
382 s390_vec_write_element##TBITS(v1, i, a * b + c); \
385 DEF_VMALO(8, 16)
386 DEF_VMALO(16, 32)
387 DEF_VMALO(32, 64)
389 #define DEF_VMH(BITS) \
390 void HELPER(gvec_vmh##BITS)(void *v1, const void *v2, const void *v3, \
391 uint32_t desc) \
393 int i; \
395 for (i = 0; i < (128 / BITS); i++) { \
396 const int32_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, i); \
397 const int32_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, i); \
399 s390_vec_write_element##BITS(v1, i, (a * b) >> BITS); \
402 DEF_VMH(8)
403 DEF_VMH(16)
405 #define DEF_VMLH(BITS) \
406 void HELPER(gvec_vmlh##BITS)(void *v1, const void *v2, const void *v3, \
407 uint32_t desc) \
409 int i; \
411 for (i = 0; i < (128 / BITS); i++) { \
412 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
413 const uint##BITS##_t b = s390_vec_read_element##BITS(v3, i); \
415 s390_vec_write_element##BITS(v1, i, (a * b) >> BITS); \
418 DEF_VMLH(8)
419 DEF_VMLH(16)
421 #define DEF_VME(BITS, TBITS) \
422 void HELPER(gvec_vme##BITS)(void *v1, const void *v2, const void *v3, \
423 uint32_t desc) \
425 int i, j; \
427 for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
428 int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
429 int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
431 s390_vec_write_element##TBITS(v1, i, a * b); \
434 DEF_VME(8, 16)
435 DEF_VME(16, 32)
436 DEF_VME(32, 64)
438 #define DEF_VMLE(BITS, TBITS) \
439 void HELPER(gvec_vmle##BITS)(void *v1, const void *v2, const void *v3, \
440 uint32_t desc) \
442 int i, j; \
444 for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
445 const uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
446 const uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
448 s390_vec_write_element##TBITS(v1, i, a * b); \
451 DEF_VMLE(8, 16)
452 DEF_VMLE(16, 32)
453 DEF_VMLE(32, 64)
455 #define DEF_VMO(BITS, TBITS) \
456 void HELPER(gvec_vmo##BITS)(void *v1, const void *v2, const void *v3, \
457 uint32_t desc) \
459 int i, j; \
461 for (i = 0, j = 1; i < (128 / TBITS); i++, j += 2) { \
462 int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
463 int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
465 s390_vec_write_element##TBITS(v1, i, a * b); \
468 DEF_VMO(8, 16)
469 DEF_VMO(16, 32)
470 DEF_VMO(32, 64)
472 #define DEF_VMLO(BITS, TBITS) \
473 void HELPER(gvec_vmlo##BITS)(void *v1, const void *v2, const void *v3, \
474 uint32_t desc) \
476 int i, j; \
478 for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
479 const uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
480 const uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
482 s390_vec_write_element##TBITS(v1, i, a * b); \
485 DEF_VMLO(8, 16)
486 DEF_VMLO(16, 32)
487 DEF_VMLO(32, 64)
489 #define DEF_VPOPCT(BITS) \
490 void HELPER(gvec_vpopct##BITS)(void *v1, const void *v2, uint32_t desc) \
492 int i; \
494 for (i = 0; i < (128 / BITS); i++) { \
495 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
497 s390_vec_write_element##BITS(v1, i, ctpop32(a)); \
500 DEF_VPOPCT(8)
501 DEF_VPOPCT(16)
503 #define DEF_VERLLV(BITS) \
504 void HELPER(gvec_verllv##BITS)(void *v1, const void *v2, const void *v3, \
505 uint32_t desc) \
507 int i; \
509 for (i = 0; i < (128 / BITS); i++) { \
510 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
511 const uint##BITS##_t b = s390_vec_read_element##BITS(v3, i); \
513 s390_vec_write_element##BITS(v1, i, rol##BITS(a, b)); \
516 DEF_VERLLV(8)
517 DEF_VERLLV(16)
519 #define DEF_VERLL(BITS) \
520 void HELPER(gvec_verll##BITS)(void *v1, const void *v2, uint64_t count, \
521 uint32_t desc) \
523 int i; \
525 for (i = 0; i < (128 / BITS); i++) { \
526 const uint##BITS##_t a = s390_vec_read_element##BITS(v2, i); \
528 s390_vec_write_element##BITS(v1, i, rol##BITS(a, count)); \
531 DEF_VERLL(8)
532 DEF_VERLL(16)
534 #define DEF_VERIM(BITS) \
535 void HELPER(gvec_verim##BITS)(void *v1, const void *v2, const void *v3, \
536 uint32_t desc) \
538 const uint8_t count = simd_data(desc); \
539 int i; \
541 for (i = 0; i < (128 / BITS); i++) { \
542 const uint##BITS##_t a = s390_vec_read_element##BITS(v1, i); \
543 const uint##BITS##_t b = s390_vec_read_element##BITS(v2, i); \
544 const uint##BITS##_t mask = s390_vec_read_element##BITS(v3, i); \
545 const uint##BITS##_t d = (a & ~mask) | (rol##BITS(b, count) & mask); \
547 s390_vec_write_element##BITS(v1, i, d); \
550 DEF_VERIM(8)
551 DEF_VERIM(16)
553 void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
554 uint32_t desc)
556 s390_vec_shl(v1, v2, count);
559 void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
560 uint32_t desc)
562 s390_vec_sar(v1, v2, count);