/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"

/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */
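
/*
 * An illustrative sketch, not part of the runtime: with CONFIG_VECTOR16
 * the DUP macros are braced initializers that replicate a scalar into
 * every lane, and the usual C operators act lane-wise:
 *
 *     vec16 two = DUP8(2);     expands to { 2, 2, 2, 2, 2, 2, 2, 2 }
 *     x = x + two;             eight independent 16-bit additions
 *
 * Without CONFIG_VECTOR16, vec16 is plain uint16_t and DUP8(X) is X,
 * so the same source compiles down to one element per loop iteration.
 */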

static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
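
/*
 * The 32-bit descriptor is built by tcg-op-gvec.c: simd_oprsz() yields
 * the number of bytes the operation actually touches and simd_maxsz()
 * the full width of the destination, so clear_high() zeroes the tail
 * bytes in [oprsz, maxsz) that the operation itself did not write.
 */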

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
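
/*
 * A minimal sketch of how a helper like this is driven; the descriptor
 * here is illustrative, built with simd_desc() from tcg-gvec-desc.h:
 *
 *     uint32_t desc = simd_desc(16, 16, 0);    oprsz == maxsz == 16
 *     helper_gvec_add8(dst, src_a, src_b, desc);
 *
 * Because tcg-op-gvec.c guarantees that oprsz is a multiple of 16,
 * each iteration can load a full vec8 with no scalar tail loop.
 */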

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
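
/*
 * The sub, neg and logical helpers below instantiate the same loop
 * shape as the additions above; only the element type and the operator
 * change.
 */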

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}
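
/*
 * Note the c == 0 shortcut above: forcing oprsz to 0 makes the
 * clear_high() call zero the entire [0, maxsz) range, which is exactly
 * the result of duplicating zero.
 */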

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}
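
/*
 * The multiplications above splat the low bits across a 32-bit word:
 * 0x01010101 * 0xab == 0xabababab and 0x00010001 * 0xabcd == 0xabcdabcd,
 * so the 8- and 16-bit dups reduce to the 32-bit one.
 */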

void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
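
/*
 * As with TCG's scalar andc and orc, the complement applies to the
 * second operand: gvec_andc computes d = a & ~b and gvec_orc computes
 * d = a | ~b.
 */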