target/arm: Decode aa32 armv8.3 3-same
qemu/ar7.git: target/arm/vec_helper.c
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
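
/* For example, H2(0) is 3 on a big-endian host: the 16-bit element that the
 * guest sees at index 0 of a 64-bit chunk sits at uint16_t index 3 in host
 * memory.  SET_QC() sets the cumulative saturation flag FPSCR.QC, which the
 * saturating helpers below use to record that a result was clamped.
 */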

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}
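
/* clear_tail() zeroes the destination bytes from opr_sz up to max_sz.  The
 * gvec framework can invoke a helper with a maximum (register) size larger
 * than the operation size; every helper below uses this to leave any such
 * tail of the destination zeroed.
 */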

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
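
/* Worked example of the saturating path (illustrative, not from the original
 * source): with src1 = src2 = INT16_MIN and src3 = 0, the product is
 * 0x40000000; adding the rounding constant and shifting right by 15 gives
 * 0x8000, which does not fit in int16_t, so the result saturates to 0x7fff
 * and QC is set.  This is the architectural SQRDMLAH overflow case of
 * multiplying -1.0 by -1.0 in Q15 fixed point.
 */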

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
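
/* All of the gvec_* helpers in this file follow the same pattern: the 32-bit
 * desc packs the operation size, the maximum (register) size and an optional
 * data field, decoded with simd_oprsz(), simd_maxsz() and SIMD_DATA_SHIFT
 * from tcg-gvec-desc.h; the helper loops over opr_sz worth of elements and
 * then clears the tail.
 */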

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}
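
/* Added note: the 32-bit forms use a 64-bit intermediate so that the doubled
 * product, the shifted accumulator and the rounding constant cannot overflow
 * before the saturation check; the 16-bit forms above rely on 32-bit
 * intermediates in the same way.
 */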

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
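
/* The gvec_fcadd* helpers below implement the ARMv8.3-A FCADD
 * (floating-point complex add with rotate) instruction.  Elements are
 * processed as (real, imaginary) pairs; the single data bit in desc selects
 * which element of each pair from m is negated, which is how the two
 * architectural rotations (#90 and #270) are expressed.
 */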

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
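
/* The gvec_fcmla* helpers below implement the ARMv8.3-A FCMLA
 * (floating-point complex multiply-accumulate with rotate) instruction.
 * Two data bits in desc encode the rotation: 'flip' chooses whether the real
 * or the imaginary element of each pair from n feeds both products, and the
 * negation bits choose the signs applied to the elements of m, giving the
 * four rotations #0, #90, #180 and #270.
 */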

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
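
/* The *_idx variants below handle the by-element form of FCMLA: instead of
 * pairing each complex element of n with the matching element of m, a single
 * (real, imaginary) pair taken from the start of the m operand is reused,
 * with its negations applied once outside the loop, for every pair of n.
 */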

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float16 e1 = m[H2(flip)];
    float16 e3 = m[H2(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e4 = e2;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float32 e1 = m[H4(flip)];
    float32 e3 = m[H4(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e4 = e2;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
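
/* As a rough sketch (not part of this file): the translator is expected to
 * expand these helpers through the generic vector infrastructure, roughly:
 *
 *     tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
 *                        vec_full_reg_offset(s, rn),
 *                        vec_full_reg_offset(s, rm), fpst,
 *                        is_q ? 16 : 8, vec_full_reg_size(s),
 *                        data, gen_helper_gvec_fcadds);
 *
 * with the rotation encoded in 'data'; the oprsz, maxsz and data arguments
 * are packed into the desc value that the helpers above decode.  The actual
 * call sites live in the translate code, so treat this as illustrative only.
 */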