/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif
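
/* For example, on a big-endian host H2(0) == 3: the 16-bit element with
 * logical index 0 lives in the last halfword of its 64-bit chunk.  On a
 * little-endian host the macros are the identity and no fixup occurs.  */
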
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

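/* Zero the destination bytes from opr_sz up to max_sz, so that an
 * operation narrower than the full vector register leaves a clean tail.  */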
static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
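
/* As a concrete check of the arithmetic above: with src1 = src2 = 0x4000
 * (0.5 in Q15) and src3 = 0, ret = (0x10000000 + 0x4000) >> 15 = 0x2000,
 * i.e. 0.25 in Q15.  With src1 = src2 = 0x8000 (-1.0), the product plus
 * rounding shifts to 0x8000, which does not fit in int16_t, so the result
 * saturates to 0x7fff and QC is set.  */
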
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}
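
/* The gvec helpers below share one pattern: desc encodes the operation
 * and maximum vector sizes (simd_oprsz / simd_maxsz), the loop applies
 * the scalar operation per element, and clear_tail zeroes the rest.  */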
void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
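
/* Floating-point complex add: the vectors hold (real, imaginary) pairs,
 * and the single desc bit selects whether the second operand is rotated
 * by 90 or 270 degrees before the add.  Rather than branch per element,
 * the rotation is applied by xor-ing the chosen sign bits into m.  */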
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
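
/* Floating-point complex multiply-accumulate: two desc bits encode the
 * rotation.  "flip" selects whether the real or imaginary part of n feeds
 * both partial products, and neg_real/neg_imag negate the m operands via
 * their sign bits; each float*_muladd then accumulates one partial product
 * of the complex multiply into the destination as a fused multiply-add.  */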
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
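
/* The _idx variants broadcast a single complex value taken from the start
 * of m across the whole vector; e1 and e3 are therefore loop-invariant
 * and can be negated once, outside the loop.  */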
void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float16 e1 = m[H2(flip)];
    float16 e3 = m[H2(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e4 = e2;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float32 e1 = m[H4(flip)];
    float32 e3 = m[H4(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e4 = e2;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}