tcg: Add generic vector expanders
[qemu/kevin.git] / accel / tcg / tcg-runtime-gvec.c
blobe093922225218065a347f63e134ccfc3f5e3b8f6
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"
/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
/* 16-byte GCC generic vectors: one arithmetic op processes a whole
 * 16-byte chunk of the operand buffers at once.
 */
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

/* Replicate a scalar X into every lane of a vector initializer.  */
#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)  { X, X, X, X, X, X, X, X }
#define DUP4(X)  { X, X, X, X }
#define DUP2(X)  { X, X }
#else
/* Scalar fallback: the same loop bodies then operate one element
 * at a time, and DUPn collapses to the scalar itself.
 */
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X) X
#define DUP8(X)  X
#define DUP4(X)  X
#define DUP2(X)  X
#endif /* CONFIG_VECTOR16 */
/* Zero the tail of the destination: the bytes between the operation
 * size (oprsz) and the full register size (maxsz) encoded in desc.
 */
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
81 void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
83 intptr_t oprsz = simd_oprsz(desc);
84 intptr_t i;
86 for (i = 0; i < oprsz; i += sizeof(vec8)) {
87 *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
89 clear_high(d, oprsz, desc);
92 void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
94 intptr_t oprsz = simd_oprsz(desc);
95 intptr_t i;
97 for (i = 0; i < oprsz; i += sizeof(vec16)) {
98 *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
100 clear_high(d, oprsz, desc);
103 void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
105 intptr_t oprsz = simd_oprsz(desc);
106 intptr_t i;
108 for (i = 0; i < oprsz; i += sizeof(vec32)) {
109 *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
111 clear_high(d, oprsz, desc);
114 void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
116 intptr_t oprsz = simd_oprsz(desc);
117 intptr_t i;
119 for (i = 0; i < oprsz; i += sizeof(vec64)) {
120 *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
122 clear_high(d, oprsz, desc);
125 void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
127 intptr_t oprsz = simd_oprsz(desc);
128 intptr_t i;
130 for (i = 0; i < oprsz; i += sizeof(vec8)) {
131 *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
133 clear_high(d, oprsz, desc);
136 void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
138 intptr_t oprsz = simd_oprsz(desc);
139 intptr_t i;
141 for (i = 0; i < oprsz; i += sizeof(vec16)) {
142 *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
144 clear_high(d, oprsz, desc);
147 void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
149 intptr_t oprsz = simd_oprsz(desc);
150 intptr_t i;
152 for (i = 0; i < oprsz; i += sizeof(vec32)) {
153 *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
155 clear_high(d, oprsz, desc);
158 void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
160 intptr_t oprsz = simd_oprsz(desc);
161 intptr_t i;
163 for (i = 0; i < oprsz; i += sizeof(vec64)) {
164 *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
166 clear_high(d, oprsz, desc);
169 void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
171 intptr_t oprsz = simd_oprsz(desc);
172 intptr_t i;
174 for (i = 0; i < oprsz; i += sizeof(vec8)) {
175 *(vec8 *)(d + i) = -*(vec8 *)(a + i);
177 clear_high(d, oprsz, desc);
180 void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
182 intptr_t oprsz = simd_oprsz(desc);
183 intptr_t i;
185 for (i = 0; i < oprsz; i += sizeof(vec16)) {
186 *(vec16 *)(d + i) = -*(vec16 *)(a + i);
188 clear_high(d, oprsz, desc);
191 void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
193 intptr_t oprsz = simd_oprsz(desc);
194 intptr_t i;
196 for (i = 0; i < oprsz; i += sizeof(vec32)) {
197 *(vec32 *)(d + i) = -*(vec32 *)(a + i);
199 clear_high(d, oprsz, desc);
202 void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
204 intptr_t oprsz = simd_oprsz(desc);
205 intptr_t i;
207 for (i = 0; i < oprsz; i += sizeof(vec64)) {
208 *(vec64 *)(d + i) = -*(vec64 *)(a + i);
210 clear_high(d, oprsz, desc);
213 void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
215 intptr_t oprsz = simd_oprsz(desc);
217 memcpy(d, a, oprsz);
218 clear_high(d, oprsz, desc);
221 void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
223 intptr_t oprsz = simd_oprsz(desc);
224 intptr_t i;
226 if (c == 0) {
227 oprsz = 0;
228 } else {
229 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
230 *(uint64_t *)(d + i) = c;
233 clear_high(d, oprsz, desc);
236 void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
238 intptr_t oprsz = simd_oprsz(desc);
239 intptr_t i;
241 if (c == 0) {
242 oprsz = 0;
243 } else {
244 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
245 *(uint32_t *)(d + i) = c;
248 clear_high(d, oprsz, desc);
251 void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
253 HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
256 void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
258 HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
261 void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
263 intptr_t oprsz = simd_oprsz(desc);
264 intptr_t i;
266 for (i = 0; i < oprsz; i += sizeof(vec64)) {
267 *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
269 clear_high(d, oprsz, desc);
272 void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
274 intptr_t oprsz = simd_oprsz(desc);
275 intptr_t i;
277 for (i = 0; i < oprsz; i += sizeof(vec64)) {
278 *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
280 clear_high(d, oprsz, desc);
283 void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
285 intptr_t oprsz = simd_oprsz(desc);
286 intptr_t i;
288 for (i = 0; i < oprsz; i += sizeof(vec64)) {
289 *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
291 clear_high(d, oprsz, desc);
294 void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
296 intptr_t oprsz = simd_oprsz(desc);
297 intptr_t i;
299 for (i = 0; i < oprsz; i += sizeof(vec64)) {
300 *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
302 clear_high(d, oprsz, desc);
305 void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
307 intptr_t oprsz = simd_oprsz(desc);
308 intptr_t i;
310 for (i = 0; i < oprsz; i += sizeof(vec64)) {
311 *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
313 clear_high(d, oprsz, desc);
316 void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
318 intptr_t oprsz = simd_oprsz(desc);
319 intptr_t i;
321 for (i = 0; i < oprsz; i += sizeof(vec64)) {
322 *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
324 clear_high(d, oprsz, desc);