1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com
3 //*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved.
5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
7 //By downloading, copying, installing or using the software you agree to this license.
8 //If you do not agree to this license, do not download, install, copy or use the software.
10 // License Agreement
12 //Permission to use, copy, modify, and/or distribute this software for any
13 //purpose with or without fee is hereby granted, provided that the above
14 //copyright notice and this permission notice appear in all copies.
16 //THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
17 //REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
18 //AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
19 //INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
20 //LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
21 //OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
22 //PERFORMANCE OF THIS SOFTWARE.
24 //*****************************************************************************************
25 // This file is intended to simplify ARM->IA32 porting
26 // It establishes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
27 // and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files included below.
28 // The MMX instruction set is not used due to its performance overhead and the need to use the
29 // EMMS instruction (_mm_empty()) for MMX/x87 floating point state switching
30 //*****************************************************************************************
32 //!!!!!!! To use this file in a project that uses ARM NEON intrinsics, just keep arm_neon.h included and compile it as usual.
33 //!!!!!!! Please pay attention to USE_SSE4 below - you need to define it on the newest Intel platforms for
34 //!!!!!!! greater performance. This can be done with the -msse4.2 compiler switch.
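//!!!!!!! A minimal usage sketch (illustrative only; the file name and the kernel below are hypothetical):
//!!!!!!!
//!!!!!!!   /* my_kernel.c - unchanged NEON source */
//!!!!!!!   #include <arm_neon.h>
//!!!!!!!   float32x4_t scale4(float32x4_t v, float32x4_t s) { return vmulq_f32(v, s); }
//!!!!!!!
//!!!!!!! Compiled for x86 with, e.g., "gcc -O2 -msse4.2 -c my_kernel.c" so that __SSE4_2__ (and hence
//!!!!!!! USE_SSE4 below) is defined and the SSE4 code paths are used.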
36 #ifndef NEON2SSE_H
37 #define NEON2SSE_H
39 #ifndef USE_SSE4
40 #if defined(__SSE4_2__)
41 #define USE_SSE4
42 #endif
43 #endif
45 #include <xmmintrin.h> //SSE
46 #include <emmintrin.h> //SSE2
47 #include <pmmintrin.h> //SSE3
48 #include <tmmintrin.h> //SSSE3
49 #ifdef USE_SSE4
50 #include <smmintrin.h> //SSE4.1
51 #include <nmmintrin.h> //SSE4.2
52 #endif
55 //*************** functions and data attributes, compiler dependent *********************************
56 //***********************************************************************************
57 #ifdef __GNUC__
58 #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
59 #define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
60 #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
61 #if _GCC_VERSION < 40500
62 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
63 #else
64 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
65 #endif
66 #if defined(__x86_64__)
67 #define _NEON2SSE_64BIT __x86_64__
68 #endif
69 #else
70 #define _NEON2SSE_ALIGN_16 __declspec(align(16))
71 #define _NEON2SSE_INLINE __inline
72 #if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
73 #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
74 #if defined(_M_X64)
75 #define _NEON2SSE_64BIT _M_X64
76 #endif
77 #else
78 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
79 #endif
80 #endif
82 #if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
83 #define _NEON2SSE_64BIT_SSE4
84 #endif
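//A minimal sketch of how the attribute macros above are applied throughout this file; the helper
//name is hypothetical, not one of the real implementations defined later.
_NEON2SSE_INLINE __m128i _neon2sse_example_add_bytes(__m128i a, __m128i b)
{
    return _mm_add_epi8(a, b); //forced-inline under GCC, plain __inline elsewhere
}
//_NEON2SSE_ALIGN_16 marks 16-byte aligned constant tables, and _NEON2SSE_PERFORMANCE_WARNING wraps a
//function declarator so that callers get a deprecation-style warning for slow serial fallbacks.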
86 /*********************************************************************************************************************/
87 // data types conversion
88 /*********************************************************************************************************************/
89 #if defined(_MSC_VER) && (_MSC_VER < 1300)
90 typedef signed char int8_t;
91 typedef unsigned char uint8_t;
92 typedef signed short int16_t;
93 typedef unsigned short uint16_t;
94 typedef signed int int32_t;
95 typedef unsigned int uint32_t;
96 typedef signed long long int64_t;
97 typedef unsigned long long uint64_t;
98 #elif defined(_MSC_VER)
99 typedef signed __int8 int8_t;
100 typedef unsigned __int8 uint8_t;
101 typedef signed __int16 int16_t;
102 typedef unsigned __int16 uint16_t;
103 typedef signed __int32 int32_t;
104 typedef unsigned __int32 uint32_t;
106 typedef signed long long int64_t;
107 typedef unsigned long long uint64_t;
108 #else
109 #include <stdint.h>
110 #include <limits.h>
111 #endif
113 typedef union __m64_128 {
114 uint64_t m64_u64[1];
115 float m64_f32[2];
116 int8_t m64_i8[8];
117 int16_t m64_i16[4];
118 int32_t m64_i32[2];
119 int64_t m64_i64[1];
120 uint8_t m64_u8[8];
121 uint16_t m64_u16[4];
122 uint32_t m64_u32[2];
123 } __m64_128;
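//The union above stands in for a 64-bit NEON "D" register kept in memory; lanes are read and written
//through its member arrays. A tiny hypothetical accessor as an illustration:
_NEON2SSE_INLINE int16_t _neon2sse_example_get_lane_i16(__m64_128 v, int lane)
{
    return v.m64_i16[lane & 3]; //the four 16-bit lanes live in m64_i16[0..3]
}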
125 typedef __m64_128 int8x8_t;
126 typedef __m64_128 uint8x8_t;
127 typedef __m64_128 int16x4_t;
128 typedef __m64_128 uint16x4_t;
129 typedef __m64_128 int32x2_t;
130 typedef __m64_128 uint32x2_t;
131 typedef __m64_128 int64x1_t;
132 typedef __m64_128 uint64x1_t;
133 typedef __m64_128 poly8x8_t;
134 typedef __m64_128 poly16x4_t;
136 typedef __m64_128 float32x2_t;
137 typedef __m128 float32x4_t;
139 typedef __m128 float16x4_t; //not supported by IA, for compatibility
140 typedef __m128 float16x8_t; //not supported by IA, for compatibility
142 typedef __m128i int8x16_t;
143 typedef __m128i int16x8_t;
144 typedef __m128i int32x4_t;
145 typedef __m128i int64x2_t;
146 typedef __m128i uint8x16_t;
147 typedef __m128i uint16x8_t;
148 typedef __m128i uint32x4_t;
149 typedef __m128i uint64x2_t;
150 typedef __m128i poly8x16_t;
151 typedef __m128i poly16x8_t;
153 #if defined(_MSC_VER)
154 #define SINT_MIN (-2147483647 - 1) /* min signed int value */
155 #define SINT_MAX 2147483647 /* max signed int value */
156 #else
157 #define SINT_MIN INT_MIN /* min signed int value */
158 #define SINT_MAX INT_MAX /* max signed int value */
159 #endif
161 typedef float float32_t;
162 typedef float __fp16;
164 typedef uint8_t poly8_t;
165 typedef uint16_t poly16_t;
168 //MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
169 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for the functions that use these types.
170 struct int8x16x2_t {
171 int8x16_t val[2];
172 };
173 struct int16x8x2_t {
174 int16x8_t val[2];
175 };
176 struct int32x4x2_t {
177 int32x4_t val[2];
178 };
179 struct int64x2x2_t {
180 int64x2_t val[2];
181 };
182 //Unfortunately we are unable to merge the two 64-bit halves into one 128-bit register because the user must be able to access the val[n] members explicitly!!!
183 struct int8x8x2_t {
184 int8x8_t val[2];
185 };
186 struct int16x4x2_t {
187 int16x4_t val[2];
188 };
189 struct int32x2x2_t {
190 int32x2_t val[2];
191 };
192 struct int64x1x2_t {
193 int64x1_t val[2];
194 };
196 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
197 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
198 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
199 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
201 typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
202 typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
203 typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
204 typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
206 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
207 typedef struct int8x16x2_t uint8x16x2_t;
208 typedef struct int16x8x2_t uint16x8x2_t;
209 typedef struct int32x4x2_t uint32x4x2_t;
210 typedef struct int64x2x2_t uint64x2x2_t;
211 typedef struct int8x16x2_t poly8x16x2_t;
212 typedef struct int16x8x2_t poly16x8x2_t;
214 typedef struct int8x8x2_t uint8x8x2_t;
215 typedef struct int16x4x2_t uint16x4x2_t;
216 typedef struct int32x2x2_t uint32x2x2_t;
217 typedef struct int64x1x2_t uint64x1x2_t;
218 typedef struct int8x8x2_t poly8x8x2_t;
219 typedef struct int16x4x2_t poly16x4x2_t;
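//A short usage sketch for the two-register structures above: results of the de-interleaving loads are
//consumed through the val[] members (vld2q_u8 is assumed to be provided further down in this header;
//the snippet is illustrative only):
//
//  uint8x16x2_t planes = vld2q_u8(src); //de-interleave 32 bytes
//  uint8x16_t   even   = planes.val[0]; //elements 0,2,4,...
//  uint8x16_t   odd    = planes.val[1]; //elements 1,3,5,...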
221 //float
222 struct float32x4x2_t {
223 float32x4_t val[2];
224 };
225 struct float16x8x2_t {
226 float16x8_t val[2];
227 };
228 struct float32x2x2_t {
229 float32x2_t val[2];
230 };
232 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
233 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
234 typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
235 typedef float16x8x2_t float16x4x2_t;
238 struct int8x16x4_t {
239 int8x16_t val[4];
240 };
241 struct int16x8x4_t {
242 int16x8_t val[4];
243 };
244 struct int32x4x4_t {
245 int32x4_t val[4];
246 };
247 struct int64x2x4_t {
248 int64x2_t val[4];
249 };
251 struct int8x8x4_t {
252 int8x8_t val[4];
253 };
254 struct int16x4x4_t {
255 int16x4_t val[4];
256 };
257 struct int32x2x4_t {
258 int32x2_t val[4];
259 };
260 struct int64x1x4_t {
261 int64x1_t val[4];
262 };
264 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
265 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
266 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
267 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
269 typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
270 typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
271 typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
272 typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
274 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
275 typedef struct int8x8x4_t uint8x8x4_t;
276 typedef struct int16x4x4_t uint16x4x4_t;
277 typedef struct int32x2x4_t uint32x2x4_t;
278 typedef struct int64x1x4_t uint64x1x4_t;
279 typedef struct int8x8x4_t poly8x8x4_t;
280 typedef struct int16x4x4_t poly16x4x4_t;
282 typedef struct int8x16x4_t uint8x16x4_t;
283 typedef struct int16x8x4_t uint16x8x4_t;
284 typedef struct int32x4x4_t uint32x4x4_t;
285 typedef struct int64x2x4_t uint64x2x4_t;
286 typedef struct int8x16x4_t poly8x16x4_t;
287 typedef struct int16x8x4_t poly16x8x4_t;
289 struct float32x4x4_t {
290 float32x4_t val[4];
291 };
292 struct float16x8x4_t {
293 float16x8_t val[4];
294 };
295 struct float32x2x4_t {
296 float32x2_t val[4];
297 };
299 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
300 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
301 typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
302 typedef float16x8x4_t float16x4x4_t;
305 struct int16x8x3_t {
306 int16x8_t val[3];
307 };
308 struct int32x4x3_t {
309 int32x4_t val[3];
310 };
311 struct int64x2x3_t {
312 int64x2_t val[3];
313 };
314 struct int8x16x3_t {
315 int8x16_t val[3];
316 };
318 struct int16x4x3_t {
319 int16x4_t val[3];
320 };
321 struct int32x2x3_t {
322 int32x2_t val[3];
323 };
324 struct int64x1x3_t {
325 int64x1_t val[3];
326 };
327 struct int8x8x3_t {
328 int8x8_t val[3];
329 };
330 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
331 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
332 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
333 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
335 typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
336 typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
337 typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
338 typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
341 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
342 typedef struct int8x16x3_t uint8x16x3_t;
343 typedef struct int16x8x3_t uint16x8x3_t;
344 typedef struct int32x4x3_t uint32x4x3_t;
345 typedef struct int64x2x3_t uint64x2x3_t;
346 typedef struct int8x16x3_t poly8x16x3_t;
347 typedef struct int16x8x3_t poly16x8x3_t;
348 typedef struct int8x8x3_t uint8x8x3_t;
349 typedef struct int16x4x3_t uint16x4x3_t;
350 typedef struct int32x2x3_t uint32x2x3_t;
351 typedef struct int64x1x3_t uint64x1x3_t;
352 typedef struct int8x8x3_t poly8x8x3_t;
353 typedef struct int16x4x3_t poly16x4x3_t;
355 //float
356 struct float32x4x3_t {
357 float32x4_t val[3];
358 };
359 struct float32x2x3_t {
360 float32x2_t val[3];
361 };
362 struct float16x8x3_t {
363 float16x8_t val[3];
364 };
366 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
367 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
368 typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
369 typedef float16x8x3_t float16x4x3_t;
372 //****************************************************************************
373 //****** Porting auxiliary macros ********************************************
375 //** floating point related macros **
376 #define _M128i(a) _mm_castps_si128(a)
377 #define _M128(a) _mm_castsi128_ps(a)
378 //here the most effective implementation depends on the compiler and on whether the build is 32-bit or 64-bit
379 #if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
381 #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
382 #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
383 #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
384 #else
385 //for 32-bit gcc and Microsoft compiler builds
386 #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
387 #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
388 #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
389 #endif
390 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
392 #define return64(a) _M64(res64,a); return res64;
393 #define return64f(a) _M64f(res64,a); return res64;
395 #define _Ui64(a) (*(uint64_t*)&(a))
396 #define _UNSIGNED_T(a) u ## a
398 #define _SIGNBIT64 ((uint64_t)1 << 63)
399 #define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
400 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
402 #define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
403 #define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
405 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
406 #define __constrange(min,max) const
407 #define __transfersize(size)
408 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
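//A minimal sketch of the implementation pattern these macros enable for 64-bit ("D" register)
//operations: widen both operands to __m128i, apply the SSE operation, then copy the low 64 bits of
//the result back. The function name is hypothetical; the real d-form intrinsics defined later in this
//file follow the same idea.
_NEON2SSE_INLINE int8x8_t _neon2sse_example_add_d(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res = _mm_add_epi8(_pM128i(a), _pM128i(b)); //operate on the low 8 bytes only
    return64(res); //store the low 64 bits into res64 and return it
}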
411 //*************************************************************************
412 //*************************************************************************
413 //********* Functions declarations as declared in original arm_neon.h *****
414 //*************************************************************************
415 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
416 int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
417 int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
418 int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
419 int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
420 float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
421 uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
422 uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
423 uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
424 uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
425 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
426 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
427 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
428 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
429 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
430 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
431 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
432 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
433 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
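//The 128-bit (q) forms above map one-to-one onto SSE2; a hedged illustration with a hypothetical
//name (the real definitions appear further down in this file):
_NEON2SSE_INLINE int32x4_t _neon2sse_example_vaddq_s32(int32x4_t a, int32x4_t b)
{
    return _mm_add_epi32(a, b); //wrap-around lane-wise add, same semantics as VADD.I32
}
//vaddq_f32 maps to _mm_add_ps in the same way, and the 64-bit (d) forms reuse the _pM128i/return64
//pattern from the auxiliary macros above.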
434 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
435 int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
436 int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
437 int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
438 uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
439 uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
440 uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
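//A hedged sketch of one way the long add can be built from SSE2 (hypothetical name): zero-extend the
//eight unsigned bytes to 16 bits, then add without any risk of overflow.
_NEON2SSE_INLINE uint16x8_t _neon2sse_example_vaddl_u8(uint8x8_t a, uint8x8_t b)
{
    __m128i zero = _mm_setzero_si128();
    __m128i a16 = _mm_unpacklo_epi8(_pM128i(a), zero); //zero-extend lanes 0..7 of a
    __m128i b16 = _mm_unpacklo_epi8(_pM128i(b), zero); //zero-extend lanes 0..7 of b
    return _mm_add_epi16(a16, b16);
}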
441 //Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
442 int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
443 int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
444 int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
445 uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
446 uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
447 uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
448 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
449 int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
450 int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
451 int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
452 uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
453 uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
454 uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
455 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
456 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
457 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
458 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
459 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
460 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
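//Halving add has no direct SSE counterpart; a well-known sketch for the unsigned byte case corrects
//the rounding of _mm_avg_epu8, which computes (a+b+1)>>1 (hypothetical name):
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_vhaddq_u8(uint8x16_t a, uint8x16_t b)
{
    __m128i avg = _mm_avg_epu8(a, b); //(a+b+1)>>1
    __m128i odd = _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)); //1 where a+b is odd
    return _mm_sub_epi8(avg, odd); //(a+b)>>1
}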
461 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
462 int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
463 int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
464 int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
465 uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
466 uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
467 uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
468 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
469 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
470 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
471 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
472 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
473 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
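//The unsigned rounding halving add is exactly what _mm_avg_epu8/_mm_avg_epu16 compute; an
//illustration with a hypothetical name (signed forms can reuse it after biasing the inputs by 0x80):
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
{
    return _mm_avg_epu8(a, b); //(a+b+1)>>1 per unsigned byte lane
}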
474 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
475 int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
476 int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
477 int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
478 int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
479 uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
480 uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
481 uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
482 uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
483 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
484 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
485 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
486 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
487 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
488 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
489 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
490 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
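//The 8- and 16-bit saturating adds map directly onto the SSE2 saturating add instructions; a sketch
//with a hypothetical name (32- and 64-bit lanes have no such instruction and need extra work):
_NEON2SSE_INLINE int8x16_t _neon2sse_example_vqaddq_s8(int8x16_t a, int8x16_t b)
{
    return _mm_adds_epi8(a, b); //signed saturating add; _mm_adds_epu8/_mm_adds_epu16 cover u8/u16
}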
491 //Vector add high half: vaddhn -> Vr[i]:=(Va[i]+Vb[i]) >> (half the lane width), the high half of each sum is kept and narrowed
492 int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
493 int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
494 int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
495 uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
496 uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
497 uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
498 //Vector rounding add high half: vraddhn
499 int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
500 int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
501 int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
502 uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
503 uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
504 uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
505 //Multiplication
506 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
507 int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
508 int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
509 int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
510 float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
511 uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
512 uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
513 uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
514 poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
515 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
516 int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
517 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
518 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
519 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
520 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
521 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
522 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
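//The 16-bit multiply maps directly onto _mm_mullo_epi16; a sketch with a hypothetical name
//(32-bit lanes can use _mm_mullo_epi32 when USE_SSE4 is defined, 8-bit lanes need widening):
_NEON2SSE_INLINE int16x8_t _neon2sse_example_vmulq_s16(int16x8_t a, int16x8_t b)
{
    return _mm_mullo_epi16(a, b); //low 16 bits of each product, the same wrap-around as VMUL.I16
}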
523 //multiply lane
524 int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
525 int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
526 float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
527 uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
528 uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
529 int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
530 int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
531 float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
532 uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
533 uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
534 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
535 int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
536 int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
537 int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
538 float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
539 uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
540 uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
541 uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
542 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
543 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
544 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
545 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
546 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
547 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
548 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
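//Multiply-accumulate has no single SSE instruction, so it is composed from a multiply and an add; a
//sketch for the 16-bit case with a hypothetical name:
_NEON2SSE_INLINE int16x8_t _neon2sse_example_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)
{
    return _mm_add_epi16(a, _mm_mullo_epi16(b, c)); //a + b*c per 16-bit lane
}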
549 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
550 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
551 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
552 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
553 uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
554 uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
555 uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
556 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
557 int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
558 int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
559 int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
560 float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
561 uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
562 uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
563 uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
564 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
565 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
566 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
567 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
568 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
569 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
570 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
571 //Vector multiply subtract long
572 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
573 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
574 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
575 uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
576 uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
577 uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
578 //Vector saturating doubling multiply high
579 int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
580 int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
581 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
582 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
583 //Vector saturating rounding doubling multiply high
584 int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
585 int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
586 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
587 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
588 //Vector saturating doubling multiply accumulate long
589 int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
590 int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
591 //Vector saturating doubling multiply subtract long
592 int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
593 int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
594 //Vector long multiply
595 int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
596 int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
597 int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
598 uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
599 uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
600 uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
601 poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
602 //Vector saturating doubling long multiply
603 int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
604 int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
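//A hedged sketch of how the widening multiplies above can be assembled from SSE2 (hypothetical name):
//the low and high halves of each 16x16->32 product are produced separately and then interleaved.
_NEON2SSE_INLINE int32x4_t _neon2sse_example_vmull_s16(int16x4_t a, int16x4_t b)
{
    __m128i a128 = _pM128i(a), b128 = _pM128i(b);
    __m128i lo = _mm_mullo_epi16(a128, b128); //low 16 bits of each product
    __m128i hi = _mm_mulhi_epi16(a128, b128); //high 16 bits of each signed product
    return _mm_unpacklo_epi16(lo, hi); //interleave into four full 32-bit products
}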
605 //Subtraction
606 //Vector subtract
607 int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
608 int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
609 int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
610 int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
611 float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
612 uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
613 uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
614 uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
615 uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
616 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
617 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
618 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
619 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
620 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
621 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
622 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
623 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
624 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
625 //Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
626 int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
627 int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
628 int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
629 uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
630 uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
631 uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
632 //Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
633 int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
634 int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
635 int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
636 uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
637 uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
638 uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
639 //Vector saturating subtract
640 int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
641 int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
642 int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
643 int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
644 uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
645 uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
646 uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
647 uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
648 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
649 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
650 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
651 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
652 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
653 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
654 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
655 uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
656 //Vector halving subtract
657 int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
658 int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
659 int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
660 uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
661 uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
662 uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
663 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
664 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
665 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
666 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
667 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
668 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
669 //Vector subtract high half
670 int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
671 int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
672 int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
673 uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
674 uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
675 uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
676 //Vector rounding subtract high half
677 int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
678 int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
679 int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
680 uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
681 uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
682 uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
683 //Comparison
684 //Vector compare equal
685 uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
686 uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
687 uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
688 uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
689 uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
690 uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
691 uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
692 uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
693 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
694 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
695 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
696 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
697 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
698 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
699 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
700 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
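//Equality comparisons map directly onto the SSE compare instructions, which already return the
//NEON-style all-ones/all-zeros lane masks; a sketch with a hypothetical name:
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_vceqq_f32(float32x4_t a, float32x4_t b)
{
    return _M128i(_mm_cmpeq_ps(a, b)); //reinterpret the float mask as an integer vector
}
//the integer variants use _mm_cmpeq_epi8/_mm_cmpeq_epi16/_mm_cmpeq_epi32 in the same way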
701 //Vector compare greater-than or equal
702 uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
703 uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
704 uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
705 uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
706 uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
707 uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
708 uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
709 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
710 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
711 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
712 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
713 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
714 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
715 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
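//Signed greater-than-or-equal can be composed from the SSE2 greater-than and equality compares; a
//sketch with a hypothetical name (unsigned forms need an 0x80000000 bias or SSE4.1 min/max tricks):
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_vcgeq_s32(int32x4_t a, int32x4_t b)
{
    return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b)); //(a>b) | (a==b)
}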
716 //Vector compare less-than or equal
717 uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
718 uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
719 uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
720 uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
721 uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
722 uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
723 uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
724 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
725 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
726 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
727 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
728 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
729 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
730 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
731 //Vector compare greater-than
732 uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
733 uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
734 uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
735 uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
736 uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
737 uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
738 uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
739 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
740 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
741 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
742 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
743 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
744 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
745 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
746 //Vector compare less-than
747 uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
748 uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
749 uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
750 uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
751 uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
752 uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
753 uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
754 uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
755 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
756 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
757 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
758 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
759 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
760 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
761 //Vector compare absolute greater-than or equal
762 uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
763 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
764 //Vector compare absolute less-than or equal
765 uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
766 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
767 //Vector compare absolute greater-than
768 uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
769 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
770 //Vector compare absolute less-than
771 uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
772 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
773 //Vector test bits
774 uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
775 uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
776 uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
777 uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
778 uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
779 uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
780 uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
781 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
782 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
783 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
784 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
785 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
786 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
787 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
788 //Absolute difference
789 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
790 int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
791 int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
792 int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
793 uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
794 uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
795 uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
796 float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
797 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
798 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
799 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
800 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
801 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
802 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
803 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
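//For unsigned lanes the absolute difference is simply max minus min, which cannot underflow; a
//sketch with a hypothetical name (the float case masks off the sign bit of a-b instead):
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_vabdq_u8(uint8x16_t a, uint8x16_t b)
{
    return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b)); //|a-b| per unsigned byte lane
}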
804 //Absolute difference - long
805 int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
806 int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
807 int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
808 uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
809 uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
810 uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
811 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
812 int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
813 int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
814 int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
815 uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
816 uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
817 uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
818 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
819 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
820 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
821 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
822 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
823 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
824 //Absolute difference and accumulate - long
825 int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
826 int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
827 int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
828 uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
829 uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
830 uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
831 //Max/Min
832 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
833 int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
834 int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
835 int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
836 uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
837 uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
838 uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
839 float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
840 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
841 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
842 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
843 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
844 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
845 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
846 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
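//The float and the u8/s16 integer maxima have direct SSE2 counterparts; a sketch with a hypothetical
//name (s8, u16, s32 and u32 need the SSE4.1 _mm_max_epi8/_mm_max_epu16/_mm_max_epi32/_mm_max_epu32):
_NEON2SSE_INLINE float32x4_t _neon2sse_example_vmaxq_f32(float32x4_t a, float32x4_t b)
{
    return _mm_max_ps(a, b);
}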
847 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
848 int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
849 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
850 int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
851 uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
852 uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
853 uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
854 float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
855 int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
856 int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
857 int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
858 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
859 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
860 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
861 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
862 //Pairwise addition
863 //Pairwise add
864 int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
865 int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
866 int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
867 uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
868 uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
869 uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
870 float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
871 //Long pairwise add
872 int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
873 int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
874 int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
875 uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
876 uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
877 uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
878 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
879 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
880 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
881 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
882 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
883 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
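//Long pairwise addition of 16-bit lanes is exactly a multiply-add against a vector of ones; a sketch
//with a hypothetical name (the u8 case can use _mm_maddubs_epi16 from SSSE3 the same way):
_NEON2SSE_INLINE int32x4_t _neon2sse_example_vpaddlq_s16(int16x8_t a)
{
    return _mm_madd_epi16(a, _mm_set1_epi16(1)); //adjacent 16-bit pairs summed into 32-bit lanes
}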
884 //Long pairwise add and accumulate
885 int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
886 int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
887 int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
888 uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
889 uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
890 uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
891 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
892 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
893 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
894 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
895 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
896 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
897 //Folding maximum vpmax -> takes maximum of adjacent pairs
898 int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
899 int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
900 int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
901 uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
902 uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
903 uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
904 float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
905 //Folding minimum vpmin -> takes minimum of adjacent pairs
906 int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
907 int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
908 int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
909 uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
910 uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
911 uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
912 float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
913 //Reciprocal/Sqrt
914 float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
915 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
916 float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
917 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
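//VRECPS produces the Newton-Raphson correction term 2 - a*b used to refine _mm_rcp_ps estimates; a
//sketch with a hypothetical name (VRSQRTS computes (3 - a*b)/2 analogously for _mm_rsqrt_ps):
_NEON2SSE_INLINE float32x4_t _neon2sse_example_vrecpsq_f32(float32x4_t a, float32x4_t b)
{
    return _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, b));
}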
918 //Shifts by signed variable
919 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
920 int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
921 int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
922 int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
923 int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
924 uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
925 uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
926 uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
927 uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
928 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
929 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
930 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
931 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
932 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
933 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
934 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
935 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
936 //Vector saturating shift left: (negative values shift right)
937 int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
938 int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
939 int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
940 int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
941 uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
942 uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
943 uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
944 uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
945 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
946 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
947 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
948 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
949 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
950 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
951 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
952 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
953 //Vector rounding shift left: (negative values shift right)
954 int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
955 int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
956 int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
957 int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
958 uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
959 uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
960 uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
961 uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
962 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
963 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
964 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
965 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
966 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
967 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
968 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
969 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
970 //Vector saturating rounding shift left: (negative values shift right)
971 int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
972 int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
973 int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
974 int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
975 uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
976 uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
977 uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
978 uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
979 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
980 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
981 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
982 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
983 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
984 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
985 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
986 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
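//Example (illustrative sketch, not part of the original header): a per-lane variable shift where
//negative counts shift right. The helper name example_scale_by_exponents_s16 is hypothetical;
//only the vqshl_s16 prototype declared above is assumed.
static int16x4_t example_scale_by_exponents_s16(int16x4_t values, int16x4_t exponents)
{
    //each lane of 'values' is shifted by the signed count in the matching lane of 'exponents';
    //positive counts shift left with saturation to [-32768, 32767], negative counts shift right
    return vqshl_s16(values, exponents);
}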
987 //Shifts by a constant
988 //Vector shift right by constant
989 int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
990 int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
991 int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
992 int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
993 uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
994 uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
995 uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
996 uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
997 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
998 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
999 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
1000 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
1001 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
1002 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
1003 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
1004 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
1005 //Vector shift left by constant
1006 int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1007 int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1008 int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1009 int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1010 uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1011 uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1012 uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1013 uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1014 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1015 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1016 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1017 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1018 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1019 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1020 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1021 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
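//Example (illustrative sketch, not part of the original header): converting between integer and
//Q4 fixed point with the constant-shift intrinsics. The shift amounts must be compile-time
//constants inside the declared __constrange limits. Helper names are hypothetical.
static int16x4_t example_int_to_q4(int16x4_t a)
{
    return vshl_n_s16(a, 4); //multiply each lane by 16
}
static int16x4_t example_q4_to_int(int16x4_t a)
{
    return vshr_n_s16(a, 4); //arithmetic shift right, truncating toward minus infinity
}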
1022 //Vector rounding shift right by constant
1023 int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
1024 int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
1025 int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
1026 int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
1027 uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
1028 uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
1029 uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
1030 uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
1031 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
1032 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
1033 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
1034 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
1035 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
1036 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
1037 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
1038 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
1039 //Vector shift right by constant and accumulate
1040 int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
1041 int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
1042 int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
1043 int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
1044 uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
1045 uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
1046 uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
1047 uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
1048 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
1049 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
1050 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
1051 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
1052 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
1053 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
1054 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
1055 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
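//Example (illustrative sketch, not part of the original header): shift-right-and-accumulate adds
//a scaled-down term to a running sum, e.g. acc += x/4 per lane. The helper name is hypothetical;
//only the vsraq_n_u16 prototype above is assumed.
static uint16x8_t example_accumulate_quarter_u16(uint16x8_t acc, uint16x8_t x)
{
    return vsraq_n_u16(acc, x, 2); //acc[i] += x[i] >> 2 for all eight lanes
}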
1056 //Vector rounding shift right by constant and accumulate
1057 int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
1058 int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
1059 int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
1060 int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
1061 uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
1062 uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
1063 uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
1064 uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
1065 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
1066 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
1067 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
1068 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
1069 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
1070 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
1071 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
1072 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
1073 //Vector saturating shift left by constant
1074 int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
1075 int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
1076 int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
1077 int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
1078 uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
1079 uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
1080 uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
1081 uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
1082 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
1083 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
1084 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
1085 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
1086 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
1087 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
1088 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
1089 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
1090 //Vector signed->unsigned saturating shift left by constant
1091 uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
1092 uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
1093 uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
1094 uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
1095 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
1096 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
1097 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
1098 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
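//Example (illustrative sketch, not part of the original header): vqshlu with a zero shift count is
//a convenient way to clamp signed data to the unsigned range (negative lanes become 0).
//The helper name is hypothetical.
static uint8x8_t example_clamp_s8_to_u8(int8x8_t a)
{
    return vqshlu_n_s8(a, 0); //no shift, just signed->unsigned saturation per lane
}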
1099 //Vector narrowing shift right by constant
1100 int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1101 int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1102 int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1103 uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1104 uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1105 uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1106 //Vector signed->unsigned narrowing saturating shift right by constant
1107 uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
1108 uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
1109 uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
1110 //Vector signed->unsigned rounding narrowing saturating shift right by constant
1111 uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
1112 uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
1113 uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
1114 //Vector narrowing saturating shift right by constant
1115 int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
1116 int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
1117 int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
1118 uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
1119 uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
1120 uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
1121 //Vector rounding narrowing shift right by constant
1122 int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1123 int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1124 int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1125 uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1126 uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1127 uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1128 //Vector rounding narrowing saturating shift right by constant
1129 int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
1130 int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
1131 int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
1132 uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
1133 uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
1134 uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
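//Example (illustrative sketch, not part of the original header): narrowing a Q8 fixed-point 16-bit
//result back to 8-bit pixels with rounding and unsigned saturation. The helper name is
//hypothetical; only the vqrshrun_n_s16 prototype above is assumed.
static uint8x8_t example_q8_to_u8_pixels(int16x8_t q8)
{
    //adds the rounding constant (1 << 7), shifts right by 8 and saturates each lane to [0, 255]
    return vqrshrun_n_s16(q8, 8);
}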
1135 //Vector widening shift left by constant
1136 int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
1137 int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
1138 int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
1139 uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
1140 uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
1141 uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
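//Example (illustrative sketch, not part of the original header): widening 8-bit samples to 16 bits
//while pre-scaling them into Q4 fixed point in a single operation. The helper name is hypothetical.
static uint16x8_t example_widen_u8_to_q4(uint8x8_t a)
{
    return vshll_n_u8(a, 4); //each lane becomes (uint16_t)a[i] << 4
}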
1142 //Shifts with insert
1143 //Vector shift right and insert
1144 int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1145 int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1146 int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1147 int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1148 uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1149 uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1150 uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1151 uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1152 poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1153 poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1154 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1155 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1156 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1157 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1158 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1159 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1160 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1161 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1162 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1163 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1164 //Vector shift left and insert
1165 int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1166 int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1167 int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1168 int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1169 uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1170 uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1171 uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1172 uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1173 poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1174 poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1175 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1176 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1177 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1178 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1179 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1180 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1181 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1182 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1183 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1184 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
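//Example (illustrative sketch, not part of the original header): vsli keeps the low bits of the
//destination and inserts the shifted source above them, which makes it useful for packing bit
//fields. The helper name is hypothetical; only the vsli_n_u16 prototype above is assumed.
static uint16x4_t example_pack_two_bytes_u16(uint16x4_t low_bytes, uint16x4_t high_bytes)
{
    //result[i] = (high_bytes[i] << 8) | (low_bytes[i] & 0xFF), assuming each high_bytes lane fits in 8 bits
    return vsli_n_u16(low_bytes, high_bytes, 8);
}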
1185 //Loads of a single vector or lane. These intrinsics load a single vector, a single lane, or all lanes of a vector from memory.
1186 //Load a single vector from memory
1187 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1188 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1189 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1190 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1191 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1192 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1193 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1194 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1195 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
1196 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1197 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1198 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1199 uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
1200 uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
1201 uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
1202 uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1203 int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
1204 int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
1205 int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
1206 int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1207 float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
1208 float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
1209 poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
1210 poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
1211 //Load a single lane from memory
1212 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1213 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1214 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1215 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
1216 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1217 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1218 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
1219 float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1220 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1221 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
1222 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1223 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1224 uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1225 uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1226 uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1227 uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1228 int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
1229 int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1230 int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1231 float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1232 float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
1233 int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1234 poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1235 poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
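//Example (illustrative sketch, not part of the original header): patching a single element of a
//vector from memory while keeping the other lanes unchanged. The lane index must be a
//compile-time constant within the declared __constrange. The helper name is hypothetical.
static uint32x4_t example_replace_lane0_u32(const uint32_t * ptr, uint32x4_t vec)
{
    return vld1q_lane_u32(ptr, vec, 0); //loads *ptr into lane 0, lanes 1..3 are preserved
}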
1236 //Load all lanes of vector with same value from memory
1237 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1238 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1239 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1240 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1241 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1242 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1243 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1244 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1245 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1246 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1247 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1248 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1249 uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1250 uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1251 uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1252 uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1253 int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1254 int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1255 int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1256 int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1257 float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1258 float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1259 poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1260 poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
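//Example (illustrative sketch, not part of the original header): broadcasting one scalar
//coefficient to all four lanes, e.g. before a lane-wise multiply. The helper name is hypothetical;
//'coeff' must point to one readable float32_t, and only the vld1q_dup_f32 prototype above is assumed.
static float32x4_t example_broadcast_coeff(const float32_t * coeff)
{
    return vld1q_dup_f32(coeff); //all four lanes receive *coeff
}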
1261 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
1262 //Store a single vector into memory
1263 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
1264 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
1265 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
1266 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
1267 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
1268 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
1269 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
1270 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
1271 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
1272 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
1273 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
1274 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
1275 void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
1276 void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
1277 void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
1278 void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
1279 void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
1280 void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
1281 void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
1282 void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
1283 void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
1284 void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
1285 void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
1286 void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
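//Example (illustrative sketch, not part of the original header): a simple 16-bytes-at-a-time loop
//built from vld1q_u8 and vst1q_u8; any per-vector processing could be inserted between the load
//and the store. It assumes 'n' is a multiple of 16 and that both buffers are valid for 'n' bytes;
//the helper name is hypothetical.
static void example_copy_bytes(uint8_t * dst, const uint8_t * src, int n)
{
    int i;
    for (i = 0; i < n; i += 16)
    {
        uint8x16_t v = vld1q_u8(src + i); //16-byte load
        vst1q_u8(dst + i, v); //16-byte store
    }
}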
1287 //Store a lane of a vector into memory
1288 //Loads of an N-element structure
1289 //Load N-element structure from memory
1290 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1291 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1292 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1293 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1294 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1295 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1296 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
1297 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1298 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1299 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1300 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1301 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1302 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1303 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1304 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1305 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1306 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1307 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1308 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
1309 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1310 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1311 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1312 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1313 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1314 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1315 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1316 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1317 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1318 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1319 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1320 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1321 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1322 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1323 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1324 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1325 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1326 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1327 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1328 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1329 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1330 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1331 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1332 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1333 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1334 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1335 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1336 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1337 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1338 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1339 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1340 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1341 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1342 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1343 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1344 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1345 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1346 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1347 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1348 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1349 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1350 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1351 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1352 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1353 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1354 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1355 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
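//Example (illustrative sketch, not part of the original header): vld3 de-interleaves packed RGB
//data, so eight pixels (24 bytes) are split into separate R, G and B vectors. The helper name is
//hypothetical; 'rgb' must point to at least 24 readable bytes, and the .val[] member layout of
//uint8x8x3_t follows the usual ARM definition.
static uint8x8_t example_extract_green(const uint8_t * rgb)
{
    uint8x8x3_t planes = vld3_u8(rgb); //planes.val[0]=R, planes.val[1]=G, planes.val[2]=B
    return planes.val[1];
}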
1356 //Load all lanes of N-element structure with same value from memory
1357 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1358 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1359 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1360 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1361 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1362 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1363 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1364 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1365 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1366 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1367 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1368 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1369 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1370 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1371 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1372 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1373 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1374 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1375 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1376 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1377 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1378 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1379 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1380 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1381 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1382 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1383 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1384 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1385 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1386 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1387 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1388 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1389 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1390 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1391 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1392 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1393 //Load a single lane of N-element structure from memory
1394 //The functions below take the structure argument by pointer to work around the MSVC error C2719 ("'src': formal parameter with __declspec(align('16')) won't be aligned"); see the usage sketch after this group.
1395 uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1396 uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1397 int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1398 int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1399 float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1400 float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1401 poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1402 uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1403 uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1404 uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1405 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1406 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
1407 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
1408 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1409 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1410 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1411 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1412 uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1413 uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1414 int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1415 int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1416 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1417 float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1418 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1419 uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1420 uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1421 uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1422 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1423 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1424 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1425 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1426 float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1427 poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1428 poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1429 uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1430 uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1431 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1432 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1433 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1434 float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1435 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1436 uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1437 uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1438 uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1439 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1440 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1441 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1442 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1443 float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1444 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1445 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
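//Example (illustrative sketch, not part of the original header): calling one of the *_ptr
//workaround forms declared above. The structure is passed by address instead of by value, as
//explained before this group. The helper name is hypothetical.
static uint16x8x2_t example_load_lane0_u16x2(const uint16_t * ptr, uint16x8x2_t src)
{
    //reads two uint16_t values from 'ptr' into lane 0 of src.val[0] and src.val[1]
    src = vld2q_lane_u16_ptr(ptr, &src, 0);
    return src;
}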
1446 //Store N-element structure to memory
1447 void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1448 void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1449 void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1450 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1451 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1452 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1453 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1454 void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1455 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1456 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1457 void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
1458 void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1459 void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1460 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
1461 void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
1462 void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1463 void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1464 void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
1465 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1466 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1467 void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
1468 void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1469 void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1470 void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1471 void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1472 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1473 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1474 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1475 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1476 void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1477 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1478 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1479 void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
1480 void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1481 void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
1482 void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
1483 void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
1484 void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1485 void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
1486 void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
1487 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1488 void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
1489 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
1490 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1491 void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1492 void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1493 void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1494 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1495 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1496 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1497 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1498 void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1499 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1500 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1501 void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
1502 void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1503 void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
1504 void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
1505 void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
1506 void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1507 void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
1508 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
1509 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1510 void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
1511 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
1512 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
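//Example (illustrative sketch, not part of the original header): interleaving three separate
//planes back into packed RGB with one of the *_ptr store forms declared above. The helper name is
//hypothetical; 'rgb' must be writable for at least 24 bytes.
static void example_store_rgb(uint8_t * rgb, uint8x8_t r, uint8x8_t g, uint8x8_t b)
{
    uint8x8x3_t planes;
    planes.val[0] = r;
    planes.val[1] = g;
    planes.val[2] = b;
    vst3_u8_ptr(rgb, &planes); //writes r0,g0,b0, r1,g1,b1, ... (24 bytes)
}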
1513 //Store a single lane of N-element structure to memory
1514 void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1515 void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1516 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1517 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1518 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1519 void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
1520 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1521 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1522 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1523 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1524 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
1525 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1526 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1527 void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1528 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1529 void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1530 void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1531 void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1532 void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1533 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1534 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1535 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1536 void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
1537 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1538 void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1539 void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1540 void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1541 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
1542 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1543 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1544 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1545 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1546 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1547 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1548 void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1549 void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1550 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1551 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1552 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1553 void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1554 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1555 void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1556 void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1557 void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1558 void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
1559 void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1560 void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1561 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1562 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1563 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1564 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1565 //Extract lanes from a vector and put them into a register. These intrinsics extract a single lane (element) from a vector.
1566 uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1567 uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1568 uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1569 int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
1570 int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
1571 int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1572 poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1573 poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1574 float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1575 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1576 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1577 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1578 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
1579 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
1580 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1581 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1582 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1583 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1584 int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1585 uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1586 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
1587 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
1588 //Set a single lane of a vector to a literal value. These intrinsics set a single lane (element) within a vector and leave the other lanes unchanged; a usage sketch follows this group of prototypes.
1589 uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1590 uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1591 uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1592 int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1593 int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1594 int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1595 poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1596 poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1597 float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1598 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1599 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1600 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1601 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1602 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1603 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1604 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1605 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1606 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1607 int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1608 uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1609 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
1610 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
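//-- Editor's note: the block below is an illustrative usage sketch, not part of the original header.
//-- It shows the intended use of the vget_lane/vset_lane intrinsics declared above; the function name
//-- is invented for the example. It is guarded with #if 0 so it never affects compilation.
#if 0
static uint16x4_t neon2sse_lane_usage_sketch(uint16x4_t v)
{
    uint16_t x = vget_lane_u16(v, 2);    //read element 2 of v into a scalar
    return vset_lane_u16(x + 1, v, 0);   //return a copy of v with element 0 replaced, other lanes unchanged
}
#endif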
1611 //Initialize a vector from a literal bit pattern.
1612 int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
1613 int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
1614 int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
1615 float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
1616 float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
1617 uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
1618 uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
1619 uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
1620 uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
1621 poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
1622 poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
1623 int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
1624 //Set all lanes to the same value
1625 //Load all lanes of the vector with the same literal value (a usage sketch follows the vdup_lane group below)
1626 uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
1627 uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
1628 uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
1629 int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
1630 int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
1631 int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
1632 poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
1633 poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
1634 float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
1635 uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
1636 uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
1637 uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
1638 int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
1639 int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
1640 int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
1641 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
1642 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
1643 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
1644 int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
1645 uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
1646 int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
1647 uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
1648 uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
1649 uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
1650 uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
1651 int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
1652 int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
1653 int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
1654 poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
1655 poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
1656 float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
1657 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
1658 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
1659 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
1660 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
1661 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
1662 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
1663 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
1664 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
1665 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
1666 int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
1667 uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
1668 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
1669 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
1670 //Set all lanes of the vector to the value of one lane of the source vector
1671 uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1672 uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1673 uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1674 int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1675 int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1676 int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1677 poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1678 poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1679 float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1680 uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1681 uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1682 uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1683 int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1684 int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1685 int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1686 poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1687 poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1688 float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1689 int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1690 uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1691 int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
1692 uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
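//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vdup_n_* broadcasts a scalar into every lane, vdupq_lane_* broadcasts one lane of a
//-- 64-bit vector into all lanes of a 128-bit vector. Guarded with #if 0 so it is never compiled.
#if 0
static int16x8_t neon2sse_dup_usage_sketch(void)
{
    int16x4_t d = vdup_n_s16(3);     //d = {3, 3, 3, 3}
    return vdupq_lane_s16(d, 1);     //all 8 lanes set to element 1 of d, i.e. {3, 3, ..., 3}
}
#endif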
1693 //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
1694 int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
1695 int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
1696 int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
1697 int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
1698 float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
1699 float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
1700 uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
1701 uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
1702 uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
1703 uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
1704 poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
1705 poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
1706 //Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors; a usage sketch follows this group.
1707 int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
1708 int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
1709 int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
1710 int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
1711 float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
1712 float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
1713 uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
1714 uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
1715 uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
1716 uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
1717 poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
1718 poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
1719 int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
1720 int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
1721 int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
1722 int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
1723 float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
1724 float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
1725 uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
1726 uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
1727 uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
1728 uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
1729 poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
1730 poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
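//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vcombine_* places its first operand in the low half and its second in the high half;
//-- vget_low_*/vget_high_* recover the two halves. Guarded with #if 0 so it is never compiled.
#if 0
static uint8x16_t neon2sse_combine_usage_sketch(uint8x8_t lo, uint8x8_t hi)
{
    uint8x16_t q = vcombine_u8(lo, hi);   //q = {lo[0..7], hi[0..7]}
    uint8x8_t  l = vget_low_u8(q);        //l == lo
    uint8x8_t  h = vget_high_u8(q);       //h == hi
    return vcombine_u8(h, l);             //the same data with the two halves swapped
}
#endif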
1731 //Converting vectors. These intrinsics convert between integer, fixed-point and floating-point vectors; a usage sketch follows this group.
1732 //Convert from float
1733 int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
1734 uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
1735 int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
1736 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
1737 int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
1738 uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
1739 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
1740 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
1741 //Convert to float
1742 float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
1743 float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
1744 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
1745 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
1746 float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
1747 float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
1748 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
1749 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
1750 //Convert between floats
1751 float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
1752 float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
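//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vcvt_* converts lane by lane; the _n_ forms treat the integer side as fixed point with
//-- 'b' fractional bits, i.e. the value is scaled by 2^b (with saturation). Guarded with #if 0.
#if 0
static int32x2_t neon2sse_convert_usage_sketch(int32x2_t a, float32x4_t b)
{
    float32x2_t f  = vcvt_f32_s32(a);        //signed 32-bit integers -> single precision floats
    int32x4_t   q8 = vcvtq_n_s32_f32(b, 8);  //floats -> Q24.8 fixed point: roughly (int32_t)(b[i] * 256), saturated
    (void)q8;
    return vcvt_s32_f32(f);                  //floats -> signed integers, rounding toward zero
}
#endif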
1753 //Vector narrow integer
1754 int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
1755 int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
1756 int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
1757 uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
1758 uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
1759 uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
1760 //Vector long move
1761 int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
1762 int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
1763 int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
1764 uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
1765 uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
1766 uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
1767 //Vector saturating narrow integer
1768 int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
1769 int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
1770 int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
1771 uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
1772 uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
1773 uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
1774 //Vector saturating narrow integer signed->unsigned
1775 uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
1776 uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
1777 uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
1778 //Table look up. Out-of-range indices produce zero in the result; a usage sketch follows the extended table look up group below.
1779 uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1780 int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
1781 poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1782 uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
1783 int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
1784 poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
1785 uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
1786 int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
1787 poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
1788 uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1789 int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1790 poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1791 //Extended table look up intrinsics
1792 uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1793 int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
1794 poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1795 uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1796 int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1797 poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1798 uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1799 int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1800 poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1801 uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1802 int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1803 poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
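//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vtbl1_u8 writes 0 for out-of-range indices, while vtbx1_u8 keeps the corresponding lane
//-- of its first ("default") argument. Guarded with #if 0 so it is never compiled.
#if 0
static uint8x8_t neon2sse_tbl_usage_sketch(uint8x8_t table, uint8x8_t index, uint8x8_t def)
{
    uint8x8_t r = vtbl1_u8(table, index);    //r[i] = table[index[i]] if index[i] < 8, else 0
    (void)r;
    return vtbx1_u8(def, table, index);      //same lookup, but out-of-range lanes keep def[i]
}
#endif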
1804 //Operations with a scalar value
1805 //Vector multiply accumulate with scalar
1806 int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1807 int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1808 uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1809 uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1810 float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
1811 int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
1812 int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
1813 uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
1814 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
1815 float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
1816 //Vector widening multiply accumulate with scalar
1817 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
1818 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
1819 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
1820 uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
1821 //Vector widening saturating doubling multiply accumulate with scalar
1822 int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
1823 int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
1824 //Vector multiply subtract with scalar
1825 int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1826 int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1827 uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1828 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1829 float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
1830 int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
1831 int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
1832 uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
1833 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
1834 float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
1835 //Vector widening multiply subtract with scalar
1836 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
1837 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
1838 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
1839 uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
1840 //Vector widening saturating doubling multiply subtract with scalar
1841 int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
1842 int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
1843 //Vector multiply by scalar
1844 int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
1845 int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
1846 float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
1847 uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
1848 uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
1849 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
1850 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
1851 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
1852 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
1853 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
1854 //Vector long multiply with scalar
1855 int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
1856 int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
1857 uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
1858 uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
1859 //Vector long multiply by scalar
1860 int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
1861 int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
1862 uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
1863 uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
1864 //Vector saturating doubling long multiply with scalar
1865 int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
1866 int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
1867 //Vector saturating doubling long multiply by scalar
1868 int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
1869 int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
1870 //Vector saturating doubling multiply high with scalar
1871 int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
1872 int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
1873 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
1874 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
1875 //Vector saturating doubling multiply high by scalar
1876 int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
1877 int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
1878 int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
1879 int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
1880 //Vector saturating rounding doubling multiply high with scalar
1881 int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
1882 int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
1883 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
1884 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
1885 //Vector rounding saturating doubling multiply high by scalar
1886 int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
1887 int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
1888 int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
1889 int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
1890 //Vector multiply accumulate with scalar
1891 int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
1892 int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
1893 uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
1894 uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
1895 float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
1896 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
1897 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
1898 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
1899 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
1900 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
1901 //Vector widening multiply accumulate with scalar
1902 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
1903 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
1904 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
1905 uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
1906 //Vector widening saturating doubling multiply accumulate with scalar
1907 int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
1908 int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
1909 //Vector multiply subtract with scalar
1910 int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
1911 int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
1912 uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
1913 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
1914 float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
1915 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
1916 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
1917 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
1918 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
1919 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
1920 //Vector widening multiply subtract with scalar
1921 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
1922 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
1923 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
1924 uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
1925 //Vector widening saturating doubling multiply subtract with scalar
1926 int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
1927 int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
1928 //Vector extract. VEXT extracts a contiguous block of elements from the concatenation of the two source vectors; a usage sketch follows this group.
1929 int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1930 uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1931 poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1932 int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1933 uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1934 poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1935 int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1936 uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1937 int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1938 uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1939 float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1940 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1941 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1942 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1943 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1944 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1945 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1946 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1947 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1948 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1949 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1950 float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
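//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vext_u8(a, b, c) returns an 8-byte window starting at byte c of the concatenation (a, b),
//-- i.e. a sliding window over the two sources. Guarded with #if 0 so it is never compiled.
#if 0
static uint8x8_t neon2sse_ext_usage_sketch(uint8x8_t a, uint8x8_t b)
{
    return vext_u8(a, b, 3);    //result = {a[3], a[4], a[5], a[6], a[7], b[0], b[1], b[2]}
}
#endif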
1951 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1952 int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1953 int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1954 int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1955 uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1956 uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1957 uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
1958 poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
1959 poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
1960 float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
1961 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
1962 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
1963 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
1964 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
1965 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
1966 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
1967 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
1968 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
1969 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
1970 int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
1971 int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
1972 uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
1973 uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
1974 poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
1975 poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
1976 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
1977 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
1978 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
1979 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
1980 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
1981 poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
1982 int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
1983 uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
1984 poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
1985 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
1986 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
1987 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
1988 //Other single operand arithmetic
1989 //Absolute: Vd[i] = |Va[i]|
1990 int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
1991 int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
1992 int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
1993 float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
1994 int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
1995 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
1996 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
1997 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
1998 //Saturating absolute: Vd[i] = sat(|Va[i]|)
1999 int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2000 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2001 int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2002 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2003 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2004 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2005 //Negate: Vd[i] = - Va[i]
2006 int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
2007 int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
2008 int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
2009 float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
2010 int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
2011 int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
2012 int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
2013 float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2014 //Saturating Negate: Vd[i] = sat(- Va[i])
2015 int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
2016 int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
2017 int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
2018 int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
2019 int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
2020 int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2021 //Count leading sign bits
2022 int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2023 int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2024 int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2025 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2026 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2027 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2028 //Count leading zeros
2029 int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2030 int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2031 int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2032 uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2033 uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2034 uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2035 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2036 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2037 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2038 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2039 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2040 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2041 //Count number of set bits
2042 uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2043 int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2044 poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2045 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2046 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2047 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
2048 //Reciprocal estimate
2049 float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2050 uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2051 float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2052 uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2053 //Reciprocal square root estimate
2054 float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2055 uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2056 float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2057 uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2058 //Logical operations
2059 //Bitwise not
2060 int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2061 int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2062 int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2063 uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2064 uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2065 uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2066 poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2067 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2068 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2069 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2070 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2071 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2072 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2073 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2074 //Bitwise and
2075 int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2076 int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2077 int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2078 int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2079 uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2080 uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2081 uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2082 uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2083 int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2084 int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2085 int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2086 int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2087 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2088 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2089 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2090 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2091 //Bitwise or
2092 int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2093 int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2094 int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2095 int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2096 uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2097 uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2098 uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2099 uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2100 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2101 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2102 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2103 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2104 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2105 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2106 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2107 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2108 //Bitwise exclusive or (EOR or XOR)
2109 int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2110 int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2111 int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2112 int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2113 uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2114 uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2115 uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2116 uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2117 int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2118 int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2119 int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2120 int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2121 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2122 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2123 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2124 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2125 //Bit Clear
2126 int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2127 int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2128 int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2129 int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2130 uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2131 uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2132 uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2133 uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2134 int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2135 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2136 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2137 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2138 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2139 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2140 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2141 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2142 //Bitwise OR complement
2143 int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2144 int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2145 int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2146 int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2147 uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2148 uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2149 uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2150 uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2151 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2152 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2153 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2154 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2155 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2156 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2157 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2158 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2159 //Bitwise Select: Vd = (Va & Vb) | (~Va & Vc); a usage sketch follows this group.
2160 int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2161 int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2162 int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2163 int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2164 uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2165 uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2166 uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2167 uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2168 float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2169 poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2170 poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2171 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2172 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2173 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2174 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2175 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2176 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2177 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2178 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2179 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2180 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2181 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
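//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. The first operand of vbsl_* is a bit mask: result bits come from the second operand
//-- where the mask bit is 1 and from the third operand where it is 0. Guarded with #if 0.
#if 0
static uint8x8_t neon2sse_bsl_usage_sketch(uint8x8_t mask, uint8x8_t b, uint8x8_t c)
{
    return vbsl_u8(mask, b, c);    //result = (mask & b) | (~mask & c), computed bitwise
}
#endif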
2182 //Transposition operations (a combined usage sketch follows the de-interleave group below)
2183 //Transpose elements
2184 int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2185 int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2186 int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2187 uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2188 uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2189 uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2190 float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2191 poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2192 poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2193 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2194 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2195 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2196 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2197 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2198 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2199 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2200 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2201 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2202 //Interleave elements
2203 int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2204 int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2205 int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2206 uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2207 uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2208 uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2209 float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2210 poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2211 poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2212 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2213 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2214 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2215 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2216 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2217 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2218 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2219 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2220 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2221 //De-Interleave elements
2222 int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2223 int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2224 int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2225 uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2226 uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2227 uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2228 float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2229 poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2230 poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2231 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2232 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2233 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2234 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2235 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2236 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2237 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2238 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2239 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
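//-- Editor's note: illustrative usage sketch (not part of the original header); the function name is
//-- invented. vzip interleaves two vectors element by element; vuzp is its inverse and separates the
//-- even- and odd-indexed elements again. Guarded with #if 0 so it is never compiled.
#if 0
static void neon2sse_zip_usage_sketch(uint8x8_t a, uint8x8_t b)
{
    uint8x8x2_t z = vzip_u8(a, b);                 //z.val[0] = {a0,b0,a1,b1,a2,b2,a3,b3}, z.val[1] = {a4,b4,...,a7,b7}
    uint8x8x2_t u = vuzp_u8(z.val[0], z.val[1]);   //de-interleave: u.val[0] == a, u.val[1] == b
    (void)u;
}
#endif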
2242 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2243 // The following macros solve the problem of the "immediate parameter requirement" of some x86 intrinsics. While a release build does not strictly need them,
2244 //a debug build does: without them the code fails to compile with the "Intrinsic parameter must be an immediate value" error. A usage sketch follows this block.
2246 #if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
2248 #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2250 #define _MM_EXTRACT_EPI16 _mm_extract_epi16
2251 #define _MM_INSERT_EPI16 _mm_insert_epi16
2252 #ifdef USE_SSE4
2253 #define _MM_EXTRACT_EPI8 _mm_extract_epi8
2254 #define _MM_EXTRACT_EPI32 _mm_extract_epi32
2255 #define _MM_EXTRACT_PS _mm_extract_ps
2257 #define _MM_INSERT_EPI8 _mm_insert_epi8
2258 #define _MM_INSERT_EPI32 _mm_insert_epi32
2259 #define _MM_INSERT_PS _mm_insert_ps
2260 #ifdef _NEON2SSE_64BIT
2261 #define _MM_INSERT_EPI64 _mm_insert_epi64
2262 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
2263 #endif
2264 #endif //SSE4
2265 #else
2266 #define _NEON2SSE_COMMA ,
2267 #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2268 switch(LANE) \
2270 case 0: return NAME(a b, 0); \
2271 case 1: return NAME(a b, 1); \
2272 case 2: return NAME(a b, 2); \
2273 case 3: return NAME(a b, 3); \
2274 case 4: return NAME(a b, 4); \
2275 case 5: return NAME(a b, 5); \
2276 case 6: return NAME(a b, 6); \
2277 case 7: return NAME(a b, 7); \
2278 case 8: return NAME(a b, 8); \
2279 case 9: return NAME(a b, 9); \
2280 case 10: return NAME(a b, 10); \
2281 case 11: return NAME(a b, 11); \
2282 case 12: return NAME(a b, 12); \
2283 case 13: return NAME(a b, 13); \
2284 case 14: return NAME(a b, 14); \
2285 case 15: return NAME(a b, 15); \
2286 default: return NAME(a b, 0); \
2289 #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2290 switch(LANE) \
2292 case 0: return NAME(vec p,0); \
2293 case 1: return NAME(vec p,1); \
2294 case 2: return NAME(vec p,2); \
2295 case 3: return NAME(vec p,3); \
2296 case 4: return NAME(vec p,4); \
2297 case 5: return NAME(vec p,5); \
2298 case 6: return NAME(vec p,6); \
2299 case 7: return NAME(vec p,7); \
2300 default: return NAME(vec p,0); \
2303 #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2304 switch(LANE) \
2306 case case0: return NAME(vec p,case0); \
2307 case case1: return NAME(vec p,case1); \
2308 case case2: return NAME(vec p,case2); \
2309 case case3: return NAME(vec p,case3); \
2310 default: return NAME(vec p,case0); \
2313 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2315 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2318 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2320 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2323 _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2325 _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
2328 #ifdef USE_SSE4
2329 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2331 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2334 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2336 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2339 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2341 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2344 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2346 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2349 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2351 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2354 #ifdef _NEON2SSE_64BIT
2355 //the special case of functions available only for SSE4 in 64-bit builds.
2356 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
2358 switch(LANE) {
2359 case 0:
2360 return _mm_insert_epi64(vec, p, 0);
2361 case 1:
2362 return _mm_insert_epi64(vec, p, 1);
2363 default:
2364 return _mm_insert_epi64(vec, p, 0);
2368 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2370 if (LANE ==0) return _mm_extract_epi64(val, 0);
2371 else return _mm_extract_epi64(val, 1);
2373 #endif
2375 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2377 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2380 #endif //USE_SSE4
2382 #endif //#ifdef NDEBUG
2384 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2385 // Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
2386 // or for some specific commonly used operations implementation missing in SSE
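//For example (an illustrative sketch with a hypothetical helper name): _MM_CVTEPU8_EPI16 maps to the SSE4.1
//_mm_cvtepu8_epi16 when USE_SSE4 is defined and to the SSE2 unpack-with-zero emulation below otherwise,
//so the same caller source works on both instruction sets.
#if 0
static __m128i widen_low_u8_to_u16(__m128i v)
{
    return _MM_CVTEPU8_EPI16(v); //zero-extend the low 8 bytes to eight 16-bit lanes
}
#endif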
2387 #ifdef USE_SSE4
2388 #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
2389 #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2390 #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
2392 #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
2393 #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2394 #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
2396 #define _MM_MAX_EPI8 _mm_max_epi8
2397 #define _MM_MAX_EPI32 _mm_max_epi32
2398 #define _MM_MAX_EPU16 _mm_max_epu16
2399 #define _MM_MAX_EPU32 _mm_max_epu32
2401 #define _MM_MIN_EPI8 _mm_min_epi8
2402 #define _MM_MIN_EPI32 _mm_min_epi32
2403 #define _MM_MIN_EPU16 _mm_min_epu16
2404 #define _MM_MIN_EPU32 _mm_min_epu32
2406 #define _MM_BLENDV_EPI8 _mm_blendv_epi8
2407 #define _MM_PACKUS_EPI32 _mm_packus_epi32
2408 #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2410 #define _MM_MULLO_EPI32 _mm_mullo_epi32
2411 #define _MM_MUL_EPI32 _mm_mul_epi32
2413 #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2414 #else //no SSE4 !!!!!!
2415 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2417 __m128i zero = _mm_setzero_si128();
2418 return _mm_unpacklo_epi8(a, zero);
2421 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2423 __m128i zero = _mm_setzero_si128();
2424 return _mm_unpacklo_epi16(a, zero);
2427 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2429 __m128i zero = _mm_setzero_si128();
2430 return _mm_unpacklo_epi32(a, zero);
2433 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2435 __m128i zero = _mm_setzero_si128();
2436 __m128i sign = _mm_cmpgt_epi8(zero, a);
2437 return _mm_unpacklo_epi8(a, sign);
2440 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2442 __m128i zero = _mm_setzero_si128();
2443 __m128i sign = _mm_cmpgt_epi16(zero, a);
2444 return _mm_unpacklo_epi16(a, sign);
2447 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2449 __m128i zero = _mm_setzero_si128();
2450 __m128i sign = _mm_cmpgt_epi32(zero, a);
2451 return _mm_unpacklo_epi32(a, sign);
2454 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2456 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2457 _mm_store_si128((__m128i*)tmp, vec);
2458 return tmp[LANE];
2461 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2463 _NEON2SSE_ALIGN_16 int8_t tmp[16];
2464 _mm_store_si128((__m128i*)tmp, vec);
2465 return (int)tmp[LANE];
2468 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2470 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2471 _mm_store_si128((__m128i*)tmp, _M128i(vec));
2472 return tmp[LANE];
2475 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2477 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2478 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2479 __m128i vec_masked, p_masked;
2480 pvec[LANE] = p;
2481 mask[LANE] = 0x0;
2482 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2483 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2484 return _mm_or_si128(vec_masked, p_masked);
2487 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2489 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2490 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2491 __m128i vec_masked, p_masked;
2492 pvec[LANE] = (int8_t)p;
2493 mask[LANE] = 0x0;
2494 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2495 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2496 return _mm_or_si128(vec_masked, p_masked);
2499 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2501 _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2502 __m128 tmp, vec_masked, p_masked;
2503 mask[LANE >> 4] = 0x0; //here LANE is not the actual lane but the _mm_insert_ps immediate (lane*16), hence the shift
2504 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2505 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2506 tmp = _mm_or_ps(vec_masked, p_masked);
2507 return tmp;
2510 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2512 __m128i cmp, resa, resb;
2513 cmp = _mm_cmpgt_epi8 (a, b);
2514 resa = _mm_and_si128 (cmp, a);
2515 resb = _mm_andnot_si128 (cmp,b);
2516 return _mm_or_si128(resa, resb);
2519 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2521 __m128i cmp, resa, resb;
2522 cmp = _mm_cmpgt_epi32(a, b);
2523 resa = _mm_and_si128 (cmp, a);
2524 resb = _mm_andnot_si128 (cmp,b);
2525 return _mm_or_si128(resa, resb);
2528 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2530 __m128i c8000, b_s, a_s, cmp;
2531 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2532 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2533 b_s = _mm_sub_epi16 (b, c8000);
2534 a_s = _mm_sub_epi16 (a, c8000);
2535 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2536 a_s = _mm_and_si128 (cmp,a);
2537 b_s = _mm_andnot_si128 (cmp,b);
2538 return _mm_or_si128(a_s, b_s);
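//Why the 0x8000 / 0x80000000 bias used in the unsigned MAX/MIN emulations here lets a signed compare act as an unsigned
//one - a scalar sketch (illustrative only; assumes two's complement wrap-around on the narrowing cast):
#if 0
static int unsigned_gt_u16(uint16_t a, uint16_t b)
{
    //subtracting (or XORing) 0x8000 maps [0..0xffff] monotonically onto [-32768..32767],
    //so the signed comparison of the biased values gives the unsigned ordering of the originals
    return (int16_t)(a ^ 0x8000) > (int16_t)(b ^ 0x8000);
}
#endif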
2541 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2543 __m128i c80000000, b_s, a_s, cmp;
2544 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2545 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2546 b_s = _mm_sub_epi32 (b, c80000000);
2547 a_s = _mm_sub_epi32 (a, c80000000);
2548 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2549 a_s = _mm_and_si128 (cmp,a);
2550 b_s = _mm_andnot_si128 (cmp,b);
2551 return _mm_or_si128(a_s, b_s);
2554 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2556 __m128i cmp, resa, resb;
2557 cmp = _mm_cmpgt_epi8 (b, a);
2558 resa = _mm_and_si128 (cmp, a);
2559 resb = _mm_andnot_si128 (cmp,b);
2560 return _mm_or_si128(resa, resb);
2563 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2565 __m128i cmp, resa, resb;
2566 cmp = _mm_cmpgt_epi32(b, a);
2567 resa = _mm_and_si128 (cmp, a);
2568 resb = _mm_andnot_si128 (cmp,b);
2569 return _mm_or_si128(resa, resb);
2572 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2574 __m128i c8000, b_s, a_s, cmp;
2575 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2576 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2577 b_s = _mm_sub_epi16 (b, c8000);
2578 a_s = _mm_sub_epi16 (a, c8000);
2579 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2580 a_s = _mm_and_si128 (cmp,a);
2581 b_s = _mm_andnot_si128 (cmp,b);
2582 return _mm_or_si128(a_s, b_s);
2585 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2587 __m128i c80000000, b_s, a_s, cmp;
2588 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2589 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2590 b_s = _mm_sub_epi32 (b, c80000000);
2591 a_s = _mm_sub_epi32 (a, c80000000);
2592 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2593 a_s = _mm_and_si128 (cmp,a);
2594 b_s = _mm_andnot_si128 (cmp,b);
2595 return _mm_or_si128(a_s, b_s);
2598 _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see below
2600 //it assumes every mask byte is either 0xff or 0 (as in all use cases below), while the original _mm_blendv_epi8 looks only at the MSB of each mask byte.
2601 __m128i a_masked, b_masked;
2602 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2603 a_masked = _mm_andnot_si128 (mask,a);
2604 return _mm_or_si128(a_masked, b_masked);
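//Worked single-byte example of the caveat above (illustrative values): for a mask byte m the formula is (m & b) | (~m & a)
//  m = 0xff -> b, m = 0x00 -> a : same result as the real _mm_blendv_epi8
//  m = 0x80 -> (b & 0x80) | (a & 0x7f) : the real _mm_blendv_epi8 would return b, so the results differ
//all masks produced in this file come from _mm_cmpeq/_mm_cmpgt, i.e. are 0x00 or 0xff per byte, so the shortcut is safe here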
2607 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2609 _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
2610 __m128i a16, b16, res, reshi,cmp, zero;
2611 zero = _mm_setzero_si128();
2612 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2613 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2614 res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2615 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2616 cmp = _mm_cmpgt_epi16(zero, reshi); //if the high half is negative the 32-bit source was negative, so the result should be zero
2617 res = _mm_andnot_si128(cmp,res); //zero out the lanes flagged above, leave the rest unchanged
2618 cmp = _mm_cmpgt_epi16(reshi,zero); //if the high half is positive the source does not fit into 16 bits
2619 return _mm_or_si128(res, cmp); //saturate those lanes to 0xffff
2622 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2624 _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
2625 __m128i a16, res, reshi,cmp, zero;
2626 zero = _mm_setzero_si128();
2627 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2628 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2629 cmp = _mm_cmpgt_epi16(zero, reshi); //if the high half is negative the 32-bit source was negative, so the result should be zero
2630 res = _mm_andnot_si128(cmp, a16); //zero out the lanes flagged above, leave the rest unchanged
2631 cmp = _mm_cmpgt_epi16(reshi,zero); //if the high half is positive the source does not fit into 16 bits
2632 return _mm_or_si128(res, cmp); //saturate those lanes to 0xffff
2636 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
2638 _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
2639 int64_t res64;
2640 int i;
2641 _mm_store_si128((__m128i*)atmp, a);
2642 _mm_store_si128((__m128i*)btmp, b);
2643 for (i = 0; i<4; i++) {
2644 res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid signed 32-bit overflow
2645 res[i] = (int)(res64 & 0xffffffff);
2647 return _mm_load_si128((__m128i*)res);
2650 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2652 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
2653 sign = _mm_xor_si128 (a, b);
2654 sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
2655 zero = _mm_setzero_si128();
2656 a_neg = _mm_abs_epi32 (a); //|a|
2657 b_neg = _mm_abs_epi32 (b); //|b|
2658 mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses lanes 0 and 2 of the absolute values, the multiplication gives a 64 bit result
2659 mul_us_neg = _mm_sub_epi64(zero, mul_us);
2660 mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2661 mul_us = _mm_andnot_si128(sign, mul_us);
2662 return _mm_or_si128 (mul_us, mul_us_neg);
2665 _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2667 __m128i res;
2668 res = _mm_cmpeq_epi32 (a, b);
2669 return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2671 #endif //SSE4
2673 //the special case of functions for 32-bit builds or when SSE4 is not available
2674 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
2676 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2677 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2678 __m128i vec_masked, p_masked;
2679 pvec[LANE] = p;
2680 mask[LANE] = 0x0;
2681 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2682 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2683 return _mm_or_si128(vec_masked, p_masked);
2686 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2688 _NEON2SSE_ALIGN_16 int64_t tmp[2];
2689 _mm_store_si128((__m128i*)tmp, val);
2690 return tmp[LANE];
2693 #ifndef _NEON2SSE_64BIT_SSE4
2694 #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2695 #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2696 #endif
2698 int32x4_t vqd_s32(int32x4_t a); //Saturating doubling for signed ints
2699 _NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
2701 //Overflow happens only if a and sum have the opposite signs
2702 __m128i c7fffffff, res, res_sat, res_xor_a;
2703 c7fffffff = _mm_set1_epi32(0x7fffffff);
2704 res = _mm_slli_epi32 (a, 1); // res = a*2
2705 res_sat = _mm_srli_epi32(a, 31);
2706 res_sat = _mm_add_epi32(res_sat, c7fffffff);
2707 res_xor_a = _mm_xor_si128(res, a);
2708 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
2709 res_sat = _mm_and_si128(res_xor_a, res_sat);
2710 res = _mm_andnot_si128(res_xor_a, res);
2711 return _mm_or_si128(res, res_sat);
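//Scalar reference for the saturating doubling above (an illustrative sketch, hypothetical helper name;
//assumes two's complement wrap-around, which is what _mm_slli_epi32 produces):
#if 0
static int32_t qd_s32_ref(int32_t a)
{
    int32_t res = (int32_t)((uint32_t)a << 1); //wrap-around doubling
    if ((res ^ a) < 0) //the sign changed, so the doubling overflowed
        return (a >= 0) ? (int32_t)0x7fffffff : (int32_t)0x80000000;
    return res;
}
#endif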
2715 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2716 //*************************************************************************
2717 //*************************************************************************
2718 //***************** Functions redefinition/implementation starts here *****
2719 //*************************************************************************
2720 //*************************************************************************
2721 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2723 /*If a unified intrinsics solution is necessary, define your SSE intrinsics wrappers here as in the following sample:
2724 #ifdef ARM
2725 #define vector_addq_s32 _mm_add_epi32
2726 #else //if we have IA
2727 #define vector_addq_s32 vadd_s32
2728 #endif
2730 ********************************************************************************************
2731 Functions below are organised in the following way:
2733 Each NEON intrinsic function is handled in one of the following ways:
2734 1. it has a full x86 SSE equivalent - the x86 version then simply follows the NEON declaration under the corresponding #define statement
2735 2. it is implemented with more than one x86 intrinsic - it is then shaped as an inline C function with a return statement
2736 3. it is a reference to another NEON function that returns the same result and is implemented in x86 as above - it is then shaped as a matching NEON function definition
2737 4. for about 5% of the functions, because the corresponding x86 SIMD support is unavailable or inefficient,
2738 a serial implementation is provided together with the corresponding compiler warning. If these functions are on your app's critical path,
2739 please consider removing them from your code.
2742 //***********************************************************************
2743 //************************ Vector add *****************************
2744 //***********************************************************************
2745 int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2746 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2748 int8x8_t res64;
2749 return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2753 int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2754 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2756 int16x4_t res64;
2757 return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2761 int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2762 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2764 int32x2_t res64;
2765 return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2769 int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
2770 _NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
2772 int64x1_t res64;
2773 res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2774 return res64;
2778 float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2779 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2781 __m128 res;
2782 __m64_128 res64;
2783 res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2784 _M64f(res64, res);
2785 return res64;
2788 uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2789 #define vadd_u8 vadd_s8
2791 uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2792 #define vadd_u16 vadd_s16
2794 uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2795 #define vadd_u32 vadd_s32
2797 uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
2798 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
2800 uint64x1_t res64;
2801 res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2802 return res64;
2806 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2807 #define vaddq_s8 _mm_add_epi8
2809 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2810 #define vaddq_s16 _mm_add_epi16
2812 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2813 #define vaddq_s32 _mm_add_epi32
2815 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2816 #define vaddq_s64 _mm_add_epi64
2818 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2819 #define vaddq_f32 _mm_add_ps
2821 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2822 #define vaddq_u8 _mm_add_epi8
2824 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2825 #define vaddq_u16 _mm_add_epi16
2827 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2828 #define vaddq_u32 _mm_add_epi32
2830 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2831 #define vaddq_u64 _mm_add_epi64
2833 //**************************** Vector long add *****************************:
2834 //***********************************************************************
2835 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
2836 int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2837 _NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2839 __m128i a16, b16;
2840 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2841 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2842 return _mm_add_epi16 (a16, b16);
2845 int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2846 _NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2848 __m128i a32, b32;
2849 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2850 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2851 return _mm_add_epi32 (a32, b32);
2854 int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2855 _NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2857 //may not be optimal
2858 __m128i a64, b64;
2859 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2860 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2861 return _mm_add_epi64 ( a64, b64);
2864 uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2865 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2867 __m128i a16, b16;
2868 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2869 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2870 return _mm_add_epi16 (a16, b16);
2873 uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
2874 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
2876 __m128i a32, b32;
2877 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2878 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2879 return _mm_add_epi32 (a32, b32);
2882 uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2883 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2885 //may not be optimal
2886 __m128i a64, b64;
2887 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2888 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2889 return _mm_add_epi64 (a64, b64);
2892 //*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2893 //*************** *********************************************************************
2894 int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2895 _NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2897 __m128i b16;
2898 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2899 return _mm_add_epi16 (a, b16);
2902 int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
2903 _NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2905 __m128i b32;
2906 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2907 return _mm_add_epi32 (a, b32);
2910 int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
2911 _NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2913 __m128i b64;
2914 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2915 return _mm_add_epi64 (a, b64);
2918 uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
2919 _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2921 __m128i b16;
2922 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2923 return _mm_add_epi16 (a, b16);
2926 uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
2927 _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
2929 __m128i b32;
2930 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2931 return _mm_add_epi32 (a, b32);
2934 uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
2935 _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2937 __m128i b64;
2938 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2939 return _mm_add_epi64 (a, b64);
2942 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
2943 //*************************************************************************************************************************
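//Several of the q-form implementations below rely on the bitwise identity a + b == 2*(a & b) + (a ^ b)
//(carries plus carry-less sum), hence (a + b) >> 1 == (a & b) + ((a ^ b) >> 1) with no intermediate overflow.
//A scalar sketch (illustrative only; the hypothetical helper assumes an arithmetic >> for negative values):
#if 0
static int8_t hadd_s8_ref(int8_t a, int8_t b)
{
    return (int8_t)((a & b) + ((a ^ b) >> 1)); //truncating halving add without widening
}
#endif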
2944 int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
2945 _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
2947 int8x8_t res64;
2948 return64(vhaddq_s8(_pM128i(a), _pM128i(b))); //use the signed q-form here; the unsigned one would mis-handle negative inputs
2952 int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
2953 _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
2955 int16x4_t res64;
2956 return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
2960 int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
2961 _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
2963 int32x2_t res64;
2964 return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
2968 uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
2969 _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
2971 uint8x8_t res64;
2972 return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
2976 uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0
2977 _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
2979 uint16x4_t res64;
2980 return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
2984 uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
2985 _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
2987 uint32x2_t res64;
2988 return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
2992 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
2993 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
2995 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
2996 __m128i tmp1, tmp2;
2997 tmp1 = _mm_and_si128(a,b);
2998 tmp2 = _mm_xor_si128(a,b);
2999 tmp2 = vshrq_n_s8(tmp2,1);
3000 return _mm_add_epi8(tmp1,tmp2);
3003 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
3004 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3006 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3007 __m128i tmp1, tmp2;
3008 tmp1 = _mm_and_si128(a,b);
3009 tmp2 = _mm_xor_si128(a,b);
3010 tmp2 = _mm_srai_epi16(tmp2,1);
3011 return _mm_add_epi16(tmp1,tmp2);
3014 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
3015 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3017 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3018 __m128i tmp1, tmp2;
3019 tmp1 = _mm_and_si128(a,b);
3020 tmp2 = _mm_xor_si128(a,b);
3021 tmp2 = _mm_srai_epi32(tmp2,1);
3022 return _mm_add_epi32(tmp1,tmp2);
3025 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
3026 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3028 __m128i c1, sum, res;
3029 c1 = _mm_set1_epi8(1);
3030 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
3031 res = _mm_xor_si128(a, b); //for rounding compensation
3032 res = _mm_and_si128(res,c1); //for rounding compensation
3033 return _mm_sub_epi8 (sum, res); //actual rounding compensation
3036 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
3037 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
3039 __m128i sum, res;
3040 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3041 res = _mm_xor_si128(a, b); //for rounding compensation
3042 res = _mm_slli_epi16 (res,15); //shift left then back right to
3043 res = _mm_srli_epi16 (res,15); //get 1 or zero
3044 return _mm_sub_epi16 (sum, res); //actual rounding compensation
3047 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
3048 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3050 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3051 __m128i tmp1, tmp2;
3052 tmp1 = _mm_and_si128(a,b);
3053 tmp2 = _mm_xor_si128(a,b);
3054 tmp2 = _mm_srli_epi32(tmp2,1);
3055 return _mm_add_epi32(tmp1,tmp2);
3058 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
3059 //*****************************************************************************************************************************
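//vrhaddq_s32 and vrhaddq_u32 below use the identity (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1),
//which again avoids an overflowing intermediate sum. A scalar sketch (illustrative only, hypothetical helper name;
//assumes an arithmetic >> for negative values, matching _mm_srai_epi32):
#if 0
static int32_t rhadd_s32_ref(int32_t a, int32_t b)
{
    return (a >> 1) + (b >> 1) + ((a | b) & 1); //rounding halving add without widening
}
#endif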
3060 int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
3061 _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
3063 int8x8_t res64;
3064 return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3068 int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
3069 _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
3071 int16x4_t res64;
3072 return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3076 int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
3077 _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
3079 int32x2_t res64;
3080 return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3084 uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
3085 _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3087 uint8x8_t res64;
3088 return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3092 uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
3093 _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3095 uint16x4_t res64;
3096 return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3100 uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
3101 _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
3103 uint32x2_t res64;
3104 return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3108 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
3109 _NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3111 //no signed average in x86 SIMD, go to unsigned
3112 __m128i c128, au, bu, sum;
3113 c128 = _mm_set1_epi8(0x80); //-128
3114 au = _mm_sub_epi8(a, c128); //add 128
3115 bu = _mm_sub_epi8(b, c128); //add 128
3116 sum = _mm_avg_epu8(au, bu);
3117 return _mm_add_epi8 (sum, c128); //sub 128
3120 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
3121 _NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3123 //no signed average in x86 SIMD, go to unsigned
3124 __m128i cx8000, au, bu, sum;
3125 cx8000 = _mm_set1_epi16(0x8000); // - 32768
3126 au = _mm_sub_epi16(a, cx8000); //add 32768
3127 bu = _mm_sub_epi16(b, cx8000); //add 32768
3128 sum = _mm_avg_epu16(au, bu);
3129 return _mm_add_epi16 (sum, cx8000); //sub 32768
3132 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
3133 _NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
3135 //need to avoid overflow
3136 __m128i a2, b2, res, sum;
3137 a2 = _mm_srai_epi32(a,1); //a2=a/2;
3138 b2 = _mm_srai_epi32(b,1); // b2=b/2;
3139 res = _mm_or_si128(a,b); //for rounding
3140 res = _mm_slli_epi32 (res,31); //shift left then back right to
3141 res = _mm_srli_epi32 (res,31); //get 1 or zero
3142 sum = _mm_add_epi32(a2,b2);
3143 return _mm_add_epi32(sum,res);
3146 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3147 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3149 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
3150 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3153 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
3154 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3156 //need to avoid overflow
3157 __m128i a2, b2, res, sum;
3158 a2 = _mm_srli_epi32(a,1); //a2=a/2;
3159 b2 = _mm_srli_epi32(b,1); // b2=b/2;
3160 res = _mm_or_si128(a,b); //for rounding
3161 res = _mm_slli_epi32 (res,31); //shift left then back right to
3162 res = _mm_srli_epi32 (res,31); //get 1 or zero
3163 sum = _mm_add_epi32(a2,b2);
3164 return _mm_add_epi32(sum,res);
3167 //****************** VQADD: Vector saturating add ************************
3168 //************************************************************************
3169 int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
3170 _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3172 int8x8_t res64;
3173 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3177 int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
3178 _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3180 int16x4_t res64;
3181 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3185 int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
3186 _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
3188 int32x2_t res64;
3189 return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3193 int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
3194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3196 int64x1_t res;
3197 uint64_t a64, b64;
3198 a64 = a.m64_u64[0];
3199 b64 = b.m64_u64[0];
3200 res.m64_u64[0] = a64 + b64;
3201 a64 = (a64 >> 63) + (~_SIGNBIT64);
3202 if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3203 res.m64_u64[0] = a64;
3205 return res;
3208 uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
3209 _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3211 uint8x8_t res64;
3212 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3216 uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
3217 _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3219 uint16x4_t res64;
3220 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3224 uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
3225 _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
3227 uint32x2_t res64;
3228 return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3232 uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
3233 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3235 _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3236 uint64x1_t res;
3237 a64 = a.m64_u64[0];
3238 b64 = b.m64_u64[0];
3239 res.m64_u64[0] = a64 + b64;
3240 if (res.m64_u64[0] < a64) {
3241 res.m64_u64[0] = ~(uint64_t)0;
3243 return res;
3246 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3247 #define vqaddq_s8 _mm_adds_epi8
3249 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3250 #define vqaddq_s16 _mm_adds_epi16
3252 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
3253 _NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
3255 //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3256 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3257 c7fffffff = _mm_set1_epi32(0x7fffffff);
3258 res = _mm_add_epi32(a, b);
3259 res_sat = _mm_srli_epi32(a, 31);
3260 res_sat = _mm_add_epi32(res_sat, c7fffffff);
3261 res_xor_a = _mm_xor_si128(res, a);
3262 b_xor_a_ = _mm_xor_si128(b, a);
3263 res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
3264 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
3265 res_sat = _mm_and_si128(res_xor_a, res_sat);
3266 res = _mm_andnot_si128(res_xor_a, res);
3267 return _mm_or_si128(res, res_sat);
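//Scalar reference for the saturation logic above (an illustrative sketch, hypothetical helper name):
#if 0
static int32_t qadd_s32_ref(int32_t a, int32_t b)
{
    int32_t sum = (int32_t)((uint32_t)a + (uint32_t)b); //wrap-around sum, as _mm_add_epi32 produces
    if (((a ^ b) >= 0) && ((sum ^ a) < 0)) //same-sign inputs whose sum flipped sign -> overflow
        return (a >= 0) ? (int32_t)0x7fffffff : (int32_t)0x80000000;
    return sum;
}
#endif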
3270 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
3271 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3273 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3274 _mm_store_si128((__m128i*)atmp, a);
3275 _mm_store_si128((__m128i*)btmp, b);
3276 res[0] = atmp[0] + btmp[0];
3277 res[1] = atmp[1] + btmp[1];
3279 atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3280 atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
3282 if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3283 res[0] = atmp[0];
3285 if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3286 res[1] = atmp[1];
3288 return _mm_load_si128((__m128i*)res);
3291 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3292 #define vqaddq_u8 _mm_adds_epu8
3294 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
3295 #define vqaddq_u16 _mm_adds_epu16
3297 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
3298 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3300 __m128i c80000000, cmp, subsum, suba, sum;
3301 c80000000 = _mm_set1_epi32 (0x80000000);
3302 sum = _mm_add_epi32 (a, b);
3303 subsum = _mm_sub_epi32 (sum, c80000000);
3304 suba = _mm_sub_epi32 (a, c80000000);
3305 cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
3306 return _mm_or_si128 (sum, cmp); //saturation
3309 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3310 #ifdef USE_SSE4
3311 _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3313 __m128i c80000000, sum, cmp, suba, subsum;
3314 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3315 sum = _mm_add_epi64 (a, b);
3316 subsum = _mm_sub_epi64 (sum, c80000000);
3317 suba = _mm_sub_epi64 (a, c80000000);
3318 cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3319 return _mm_or_si128 (sum, cmp); //saturation
3321 #else
3322 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3324 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3325 _mm_store_si128((__m128i*)atmp, a);
3326 _mm_store_si128((__m128i*)btmp, b);
3327 res[0] = atmp[0] + btmp[0];
3328 res[1] = atmp[1] + btmp[1];
3329 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3330 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3331 return _mm_load_si128((__m128i*)(res));
3333 #endif
3336 //******************* Vector add high half (truncated) ******************
3337 //************************************************************************
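//vaddhn keeps the high half of the wrap-around sum, e.g. for 16-bit lanes the result is (a + b) >> 8 narrowed to 8 bits.
//A scalar sketch (illustrative only, hypothetical helper name; assumes two's complement wrap-around on the cast):
#if 0
static int8_t addhn_s16_ref(int16_t a, int16_t b)
{
    return (int8_t)((int16_t)(a + b) >> 8); //16-bit wrap-around sum, then take bits 15..8
}
#endif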
3338 int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
3339 _NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3341 int8x8_t res64;
3342 __m128i sum;
3343 sum = _mm_add_epi16 (a, b);
3344 sum = _mm_srai_epi16 (sum, 8);
3345 sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3346 return64(sum);
3349 int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
3350 _NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3352 int16x4_t res64;
3353 __m128i sum;
3354 sum = _mm_add_epi32 (a, b);
3355 sum = _mm_srai_epi32(sum, 16);
3356 sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3357 return64(sum);
3360 int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
3361 _NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
3363 int32x2_t res64;
3364 __m128i sum;
3365 sum = _mm_add_epi64 (a, b);
3366 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
3367 return64(sum);
3370 uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
3371 _NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3373 uint8x8_t res64;
3374 __m128i sum;
3375 sum = _mm_add_epi16 (a, b);
3376 sum = _mm_srli_epi16 (sum, 8);
3377 sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3378 return64(sum);
3381 uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
3382 _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3384 uint16x4_t res64;
3385 __m128i sum;
3386 sum = _mm_add_epi32 (a, b);
3387 sum = _mm_srli_epi32 (sum, 16);
3388 sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3389 return64(sum);
3392 uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3393 #define vaddhn_u64 vaddhn_s64
3395 //*********** Vector rounding add high half: vraddhn_<type> ******************.
3396 //***************************************************************************
3397 int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
3398 _NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3400 int8x8_t res64;
3401 __m128i sum, mask1;
3402 sum = _mm_add_epi16 (a, b);
3403 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3404 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3405 sum = _mm_srai_epi16 (sum, 8); //get high half
3406 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3407 sum = _mm_packs_epi16 (sum, sum);
3408 return64(sum);
3411 int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
3412 _NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
3414 //SIMD may not be optimal, serial may be faster
3415 int16x4_t res64;
3416 __m128i sum, mask1;
3417 sum = _mm_add_epi32 (a, b);
3418 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3419 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3420 sum = _mm_srai_epi32 (sum, 16); //get high half
3421 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3422 sum = _mm_packs_epi32 (sum, sum);
3423 return64(sum);
3426 int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
3427 _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
3429 //SIMD may not be optimal, serial may be faster
3430 int32x2_t res64;
3431 __m128i sum, mask1;
3432 sum = _mm_add_epi64 (a, b);
3433 mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
3434 mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
3435 sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
3436 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
3437 return64(sum);
3440 uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
3441 _NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3443 uint8x8_t res64;
3444 __m128i sum, mask1;
3445 sum = _mm_add_epi16 (a, b);
3446 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3447 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3448 sum = _mm_srai_epi16 (sum, 8); //get high half
3449 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3450 sum = _mm_packus_epi16 (sum, sum);
3451 return64(sum);
3454 uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
3455 _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
3457 //SIMD may not be optimal, serial may be faster
3458 uint16x4_t res64;
3459 __m128i sum, mask1;
3460 sum = _mm_add_epi32 (a, b);
3461 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3462 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3463 sum = _mm_srai_epi32 (sum, 16); //get high half
3464 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3465 sum = _MM_PACKUS1_EPI32 (sum);
3466 return64(sum);
3469 uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3470 #define vraddhn_u64 vraddhn_s64
3472 //**********************************************************************************
3473 //********* Multiplication *************************************
3474 //**************************************************************************************
3476 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
3477 //Since we do not widen the result, these functions are equivalent to "multiply low" in x86
3478 int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
3479 _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3481 // no 8 bit simd multiply, need to go to 16 bits in SSE
3482 int8x8_t res64;
3483 __m128i a128, b128, res;
3484 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
3485 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3486 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3487 res = _mm_mullo_epi16 (a128, b128);
3488 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3489 return64(res);
3492 int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
3493 #define vmul_s16 vmul_u16
3495 int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
3496 #define vmul_s32 vmul_u32
3498 float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
3499 _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3501 float32x4_t tmp;
3502 __m64_128 res64;
3503 tmp = _mm_mul_ps(_pM128(a),_pM128(b));
3504 _M64f(res64, tmp); //use low 64 bits
3505 return res64;
3508 uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
3509 _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3511 // no 8 bit simd multiply, need to go to 16 bits in SSE
3512 uint8x8_t res64;
3513 __m128i mask, a128, b128, res;
3514 mask = _mm_set1_epi16(0xff);
3515 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3516 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3517 res = _mm_mullo_epi16 (a128, b128);
3518 res = _mm_and_si128(res, mask); //to avoid saturation
3519 res = _mm_packus_epi16 (res,res); //use only low 64 bits
3520 return64(res);
3523 uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
3524 _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3526 uint16x4_t res64;
3527 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3531 uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
3532 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3534 uint32x2_t res;
3535 res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3536 res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3537 return res;
3540 poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
3541 _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3543 //may be optimized
3544 poly8x8_t res64;
3545 __m128i a64, b64, c1, res, tmp, bmasked;
3546 int i;
3547 a64 = _pM128i(a);
3548 b64 = _pM128i(b);
3549 c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3550 c1 = vshrq_n_u8(c1,7); //0x1
3551 bmasked = _mm_and_si128(b64, c1); //0x1
3552 res = vmulq_u8(a64, bmasked);
3553 for(i = 1; i<8; i++) {
3554 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3555 bmasked = _mm_and_si128(b64, c1); //0x1
3556 tmp = vmulq_u8(a64, bmasked);
3557 res = _mm_xor_si128(res, tmp);
3559 return64 (res);
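//The loop above performs a polynomial (carry-less) multiplication over GF(2): partial products are combined with XOR
//rather than ADD and only the low 8 bits are kept. A scalar reference (illustrative sketch, hypothetical helper name):
#if 0
static uint8_t mul_p8_ref(uint8_t a, uint8_t b)
{
    uint8_t res = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if (b & (1 << i)) res ^= (uint8_t)(a << i); //XOR in the shifted partial product, low 8 bits only
    }
    return res;
}
#endif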
3562 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
3563 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
3565 // no 8 bit simd multiply, need to go to 16 bits
3566 //solution may not be optimal
3567 __m128i a16, b16, r16_1, r16_2;
3568 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
3569 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3570 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3571 r16_1 = _mm_mullo_epi16 (a16, b16);
3572 //swap hi and low part of a and b to process the remaining data
3573 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3574 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3575 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3576 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3578 r16_2 = _mm_mullo_epi16 (a16, b16);
3579 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3580 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3582 return _mm_unpacklo_epi64(r16_1, r16_2);
3585 int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3586 #define vmulq_s16 _mm_mullo_epi16
3588 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3589 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3591 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3592 #define vmulq_f32 _mm_mul_ps
3594 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
3595 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
3597 // no 8 bit simd multiply, need to go to 16 bits
3598 //solution may not be optimal
3599 __m128i maskff, a16, b16, r16_1, r16_2;
3600 maskff = _mm_set1_epi16(0xff);
3601 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3602 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3603 r16_1 = _mm_mullo_epi16 (a16, b16);
3604 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3605 //swap hi and low part of a and b to process the remaining data
3606 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3607 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3608 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3609 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3611 r16_2 = _mm_mullo_epi16 (a16, b16);
3612 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3613 return _mm_packus_epi16 (r16_1, r16_2);
3616 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3617 #define vmulq_u16 _mm_mullo_epi16
3619 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3620 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3622 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
3623 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3625 //may be optimized
3626 __m128i c1, res, tmp, bmasked;
3627 int i;
3628 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3629 c1 = vshrq_n_u8(c1,7); //0x1
3630 bmasked = _mm_and_si128(b, c1); //0x1
3631 res = vmulq_u8(a, bmasked);
3632 for(i = 1; i<8; i++) {
3633 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3634 bmasked = _mm_and_si128(b, c1); //0x1
3635 tmp = vmulq_u8(a, bmasked);
3636 res = _mm_xor_si128(res, tmp);
3638 return res;
3641 //************************* Vector long multiply ***********************************
3642 //****************************************************************************
3643 int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
3644 _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3646 //no 8 bit simd multiply, need to go to 16 bits
3647 __m128i a16, b16;
3648 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3649 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3650 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3653 int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
3654 _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3656 #ifdef USE_SSE4
3657 __m128i a16, b16;
3658 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3659 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3660 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3661 #else
3662 __m128i low, hi, a128,b128;
3663 a128 = _pM128i(a);
3664 b128 = _pM128i(b);
3665 low = _mm_mullo_epi16(a128,b128);
3666 hi = _mm_mulhi_epi16(a128,b128);
3667 return _mm_unpacklo_epi16(low,hi);
3668 #endif
3671 int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
3672 _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3674 __m128i ab, ba, a128, b128;
3675 a128 = _pM128i(a);
3676 b128 = _pM128i(b);
3677 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3678 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
3679 return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
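//Illustrative lane trace of the interleave trick above (hypothetical values a0,a1,b0,b1):
//  a128 = {a0, a1, x, x}, b128 = {b0, b1, x, x}                (32-bit lanes)
//  ab   = {a0, b0, a1, b1}   after _mm_unpacklo_epi32(a128, b128)
//  ba   = {b0, a0, b1, a1}   after _mm_unpacklo_epi32(b128, a128)
//  _MM_MUL_EPI32(ab, ba) multiplies lanes 0 and 2 only, giving the two 64-bit products {a0*b0, a1*b1}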
3682 uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
3683 _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3685 //no 8 bit simd multiply, need to go to 16 bits
3686 __m128i a16, b16;
3687 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3688 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3689 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3692 uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
3693 _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
3695 #ifdef USE_SSE4
3696 __m128i a16, b16;
3697 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3698 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3699 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3700 #else
3701 __m128i a128,b128,low, hi;
3702 a128 = _pM128i(a);
3703 b128 = _pM128i(b);
3704 low = _mm_mullo_epi16(a128,b128);
3705 hi = _mm_mulhi_epu16(a128,b128);
3706 return _mm_unpacklo_epi16(low,hi);
3707 #endif
3710 uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
3711 _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
3713 //may not be optimal compared with the serial implementation
3714 __m128i ab, ba, a128, b128;
3715 a128 = _pM128i(a);
3716 b128 = _pM128i(b);
3717 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3718 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
3719 return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
3722 poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
3723 _NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3725 //may be optimized
3726 __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3727 int i;
3728 a128 = _pM128i(a);
3729 b128 = _pM128i(b);
3730 c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3731 c1 = vshrq_n_u8(c1,7); //0x1
3732 bmasked = _mm_and_si128(b128, c1); //0x1
3734 a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3735 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3736 res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3737 for(i = 1; i<8; i++) {
3738 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3739 bmasked = _mm_and_si128(b128, c1); //0x1
3740 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3741 tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3742 res = _mm_xor_si128(res, tmp);
3744 return res;
3747 //****************Vector saturating doubling long multiply **************************
3748 //*****************************************************************
3749 int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
3750 _NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
3752 //the serial solution may be faster due to saturation
3753 __m128i res;
3754 res = vmull_s16(a, b);
3755 return vqd_s32(res);
3758 int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
3759 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
3761 //the serial solution may be faster due to saturation
3762 __m128i res;
3763 res = vmull_s32(a,b);
3764 return vqaddq_s64(res,res); //slow serial function!!!!
3767 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
3768 //******************************************************************************************
3769 int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
3770 _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3772 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3773 int8x8_t res64;
3774 __m128i b128, c128, res;
3775 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
3776 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3777 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3778 res = _mm_mullo_epi16 (c128, b128);
3779 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3780 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3781 return64(res);
3784 int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
3785 _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
3787 int16x4_t res64;
3788 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3792 int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
3793 _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3795 int32x2_t res64;
3796 __m128i res;
3797 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3798 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3799 return64(res);
3802 float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
3803 _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3805 //FMA is not used yet, so multiply and add separately:
3806 __m128 res;
3807 __m64_128 res64;
3808 res = _mm_mul_ps (_pM128(c), _pM128(b));
3809 res = _mm_add_ps (_pM128(a), res);
3810 _M64f(res64, res);
3811 return res64;
3814 uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
3815 _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3817 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3818 uint8x8_t res64;
3819 __m128i mask, b128, c128, res;
3820 mask = _mm_set1_epi16(0xff);
3821 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3822 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3823 res = _mm_mullo_epi16 (c128, b128);
3824 res = _mm_and_si128(res, mask); //to avoid saturation
3825 res = _mm_packus_epi16 (res, res);
3826 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3827 return64(res);
3830 uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3831 #define vmla_u16 vmla_s16
3833 uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3834 #define vmla_u32 vmla_s32
3836 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
3837 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
3839 //the solution may not be optimal
3840 // no 8-bit x86 SIMD multiply, so widen to 16 bits
3841 __m128i b16, c16, r16_1, a_2,r16_2;
3842 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
3843 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3844 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3845 r16_1 = _mm_mullo_epi16 (b16, c16);
3846 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3847 r16_1 = _mm_add_epi8 (r16_1, a);
3848 //swap hi and low part of a, b and c to process the remaining data
3849 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3850 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3851 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3852 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3853 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3855 r16_2 = _mm_mullo_epi16 (b16, c16);
3856 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3857 r16_2 = _mm_add_epi8(r16_2, a_2);
3858 return _mm_unpacklo_epi64(r16_1,r16_2);
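//Illustrative usage sketch: each lane computes a + b*c with ordinary 8-bit wraparound. The
//NEON2SSE_USAGE_EXAMPLES guard macro and the helper name are assumptions for illustration only.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vmlaq_s8(void)
{
    int8x16_t a = _mm_set1_epi8(10);
    int8x16_t b = _mm_set1_epi8(3);
    int8x16_t c = _mm_set1_epi8(7);
    _NEON2SSE_ALIGN_16 int8_t out[16];
    _mm_store_si128((__m128i*)out, vmlaq_s8(a, b, c)); //each lane: 10 + 3*7 = 31
    return out[0] == 31;
}
#endif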
3861 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
3862 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3864 __m128i res;
3865 res = _mm_mullo_epi16 (c, b);
3866 return _mm_add_epi16 (res, a);
3869 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
3870 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3872 __m128i res;
3873 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
3874 return _mm_add_epi32 (res, a);
3877 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
3878 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3880 //FMA is not used yet, so multiply and add separately:
3881 __m128 res;
3882 res = _mm_mul_ps (c, b);
3883 return _mm_add_ps (a, res);
3886 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
3887 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
3889 //the solution may not be optimal
3890 // no 8-bit x86 SIMD multiply, so widen to 16 bits
3891 __m128i b16, c16, r16_1, a_2, r16_2;
3892 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
3893 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3894 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3895 r16_1 = _mm_mullo_epi16 (b16, c16);
3896 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3897 r16_1 = _mm_add_epi8 (r16_1, a);
3898 //swap hi and low part of a, b and c to process the remaining data
3899 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3900 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3901 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3902 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3903 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3905 r16_2 = _mm_mullo_epi16 (b16, c16);
3906 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3907 r16_2 = _mm_add_epi8(r16_2, a_2);
3908 return _mm_unpacklo_epi64(r16_1,r16_2);
3911 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3912 #define vmlaq_u16 vmlaq_s16
3914 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3915 #define vmlaq_u32 vmlaq_s32
3917 //********************** Vector widening multiply accumulate (long multiply accumulate):
3918 // vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
3919 //********************************************************************************************
3920 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
3921 _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3923 int16x8_t res;
3924 res = vmull_s8(b, c);
3925 return _mm_add_epi16 (res, a);
3928 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
3929 _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
3931 //may not be optimal compared with the serial implementation
3932 int32x4_t res;
3933 res = vmull_s16(b, c);
3934 return _mm_add_epi32 (res, a);
3937 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
3938 _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
3940 //may not be optimal compared with the serial implementation
3941 int64x2_t res;
3942 res = vmull_s32( b, c);
3943 return _mm_add_epi64 (res, a);
3946 uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
3947 _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3949 uint16x8_t res;
3950 res = vmull_u8(b, c);
3951 return _mm_add_epi16 (res, a);
3954 uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
3955 _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
3957 //may not be optimal compared with the serial implementation
3958 uint32x4_t res;
3959 res = vmull_u16(b, c);
3960 return _mm_add_epi32 (res, a);
3963 uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
3964 _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
3966 //may not be optimal compared with the serial implementation
3967 uint64x2_t res;
3968 res = vmull_u32( b,c);
3969 return _mm_add_epi64 (res, a);
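//Illustrative sketch of the widening accumulate: products are formed at twice the input width, so
//300*300 does not overflow as it would in the non-widening vmlaq_s16. The guard macro and helper
//name are assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vmlal_s16(void)
{
    int16x4_t b, c;
    int32x4_t acc, r;
    _NEON2SSE_ALIGN_16 int32_t out[4];
    int i;
    for (i = 0; i < 4; i++) { b.m64_i16[i] = 300; c.m64_i16[i] = 300; }
    acc = _mm_set1_epi32(1000);
    r = vmlal_s16(acc, b, c); //each lane: 1000 + 300*300 = 91000
    _mm_store_si128((__m128i*)out, r);
    return out[0] == 91000;
}
#endif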
3972 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
3973 //********************************************************************************************
3974 int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
3975 _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
3977 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
3978 int8x8_t res64;
3979 __m128i res;
3980 res64 = vmul_s8(b,c);
3981 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
3982 return64(res);
3985 int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
3986 _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
3988 int16x4_t res64;
3989 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3993 int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
3994 _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
3996 int32x2_t res64;
3997 __m128i res;
3998 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
3999 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4000 return64(res);
4003 float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
4004 _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4006 __m128 res;
4007 __m64_128 res64;
4008 res = _mm_mul_ps (_pM128(c), _pM128(b));
4009 res = _mm_sub_ps (_pM128(a), res);
4010 _M64f(res64, res);
4011 return res64;
4014 uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
4015 _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4017 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4018 uint8x8_t res64;
4019 __m128i res;
4020 res64 = vmul_u8(b,c);
4021 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4022 return64(res);
4025 uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4026 #define vmls_u16 vmls_s16
4028 uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4029 #define vmls_u32 vmls_s32
4032 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
4033 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
4035 //the solution may not be optimal
4036 // no 8-bit x86 SIMD multiply, so widen to 16 bits
4037 __m128i b16, c16, r16_1, a_2, r16_2;
4038 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
4039 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4040 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4041 r16_1 = _mm_mullo_epi16 (b16, c16);
4042 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4043 r16_1 = _mm_sub_epi8 (a, r16_1);
4044 //swap hi and low part of a, b, c to process the remaining data
4045 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4046 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4047 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4048 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4049 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4051 r16_2 = _mm_mullo_epi16 (b16, c16);
4052 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4053 r16_2 = _mm_sub_epi8 (a_2, r16_2);
4054 return _mm_unpacklo_epi64(r16_1,r16_2);
4057 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
4058 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4060 __m128i res;
4061 res = _mm_mullo_epi16 (c, b);
4062 return _mm_sub_epi16 (a, res);
4065 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
4066 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4068 __m128i res;
4069 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4070 return _mm_sub_epi32 (a, res);
4073 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
4074 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4076 __m128 res;
4077 res = _mm_mul_ps (c, b);
4078 return _mm_sub_ps (a, res);
4081 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
4082 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
4084 //the solution may not be optimal
4085 // no 8-bit x86 SIMD multiply, so widen to 16 bits
4086 __m128i b16, c16, r16_1, a_2, r16_2;
4087 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
4088 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4089 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4090 r16_1 = _mm_mullo_epi16 (b16, c16);
4091 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4092 r16_1 = _mm_sub_epi8 (a, r16_1);
4093 //swap hi and low part of a, b and c to process the remaining data
4094 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4095 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4096 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4097 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4098 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4100 r16_2 = _mm_mullo_epi16 (b16, c16);
4101 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4102 r16_2 = _mm_sub_epi8(a_2, r16_2);
4103 return _mm_unpacklo_epi64(r16_1,r16_2);
4106 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4107 #define vmlsq_u16 vmlsq_s16
4109 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4110 #define vmlsq_u32 vmlsq_s32
4112 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
4113 //*************************************************************************************************************
4114 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
4115 _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4117 int16x8_t res;
4118 res = vmull_s8(b, c);
4119 return _mm_sub_epi16 (a, res);
4122 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
4123 _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
4125 //may not be optimal compared with the serial implementation
4126 int32x4_t res;
4127 res = vmull_s16(b, c);
4128 return _mm_sub_epi32 (a, res);
4131 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
4132 _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
4134 //may not be optimal compared with the serial implementation
4135 int64x2_t res;
4136 res = vmull_s32( b,c);
4137 return _mm_sub_epi64 (a, res);
4140 uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
4141 _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4143 uint16x8_t res;
4144 res = vmull_u8(b, c);
4145 return _mm_sub_epi16 (a, res);
4148 uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
4149 _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
4151 //may not be optimal compared with the serial implementation
4152 uint32x4_t res;
4153 res = vmull_u16(b, c);
4154 return _mm_sub_epi32 (a, res);
4157 uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
4158 _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
4160 //may not be optimal compared with the serial implementation
4161 uint64x2_t res;
4162 res = vmull_u32( b,c);
4163 return _mm_sub_epi64 (a, res);
4166 //****** Vector saturating doubling multiply high **********************
4167 //*************************************************************************
4168 int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
4169 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4171 int16x4_t res;
4172 int32_t a32, b32, i;
4173 for (i = 0; i<4; i++) {
4174 a32 = (int32_t) a.m64_i16[i];
4175 b32 = (int32_t) b.m64_i16[i];
4176 a32 = (a32 * b32) >> 15;
4177 res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4179 return res;
4182 int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
4183 _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
4185 //may not be optimal compared with a serial solution
4186 int32x2_t res64;
4187 __m128i mask;
4188 _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4189 int64x2_t mul;
4190 mul = vmull_s32(a,b);
4191 mul = _mm_slli_epi64(mul,1); //double the result
4192 //at this point start treating 2 64-bit numbers as 4 32-bit
4193 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4194 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4195 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4196 return64(mul);
4199 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
4200 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4202 __m128i res, res_lo, mask;
4203 _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4204 res = _mm_mulhi_epi16 (a, b);
4205 res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4206 res_lo = _mm_mullo_epi16 (a, b);
4207 res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4208 res = _mm_add_epi16(res, res_lo); //combine results
4209 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4210 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
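//Illustrative Q15 fixed-point usage sketch: vqdmulhq_s16 returns the high half of 2*a*b (the Q15
//product), saturating only for INT16_MIN*INT16_MIN. The guard macro and helper name are
//assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vqdmulhq_s16(void)
{
    int16x8_t half = _mm_set1_epi16(16384); //0.5 in Q15
    _NEON2SSE_ALIGN_16 int16_t out[8];
    _mm_store_si128((__m128i*)out, vqdmulhq_s16(half, half)); //0.5 * 0.5 -> 8192 (0.25 in Q15)
    return out[0] == 8192;
}
#endif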
4213 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
4214 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4216 // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4217 __m128i ab, ba, mask, mul, mul1;
4218 _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4219 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4220 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4221 mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives a 64-bit result
4222 mul = _mm_slli_epi64(mul,1); //double the result
4223 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4224 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4225 mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives a 64-bit result
4226 mul1 = _mm_slli_epi64(mul1,1); //double the result
4227 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4228 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4229 mul = _mm_unpacklo_epi64(mul, mul1);
4230 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4231 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4234 //********* Vector saturating rounding doubling multiply high ****************
4235 //****************************************************************************
4236 //If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and operation order
4237 int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
4238 _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
4240 int16x4_t res64;
4241 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4244 int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
4245 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4247 //may not be optimal compared with a serial solution
4248 int32x2_t res64;
4249 _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4250 __m128i res_sat, mask, mask1;
4251 int64x2_t mul;
4252 mul = vmull_s32(a,b);
4253 res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4254 mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4255 mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4256 mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4257 //at this point start treating 2 64-bit numbers as 4 32-bit
4258 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4259 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4260 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4261 return64(mul);
4264 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
4265 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4267 __m128i mask, res;
4268 _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4269 res = _mm_mulhrs_epi16 (a, b);
4270 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4271 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
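//Illustrative corner case: _mm_mulhrs_epi16 alone returns -32768 for INT16_MIN*INT16_MIN, and the
//cmpeq/xor above corrects it to the NEON-saturated +32767. The guard macro and helper name are
//assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vqrdmulhq_s16(void)
{
    int16x8_t m = _mm_set1_epi16(-32768);
    _NEON2SSE_ALIGN_16 int16_t out[8];
    _mm_store_si128((__m128i*)out, vqrdmulhq_s16(m, m)); //saturates to 32767 in every lane
    return out[0] == 32767;
}
#endif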
4274 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
4275 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4277 // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4278 __m128i ab, ba, mask, mul, mul1, mask1;
4279 _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4280 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4281 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4282 mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives a 64-bit result
4283 mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4284 mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4285 mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4286 mul = _mm_add_epi32 (mul, mask1); //actual rounding
4288 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4289 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4290 mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives a 64-bit result
4291 mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4292 mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4293 mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4294 mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4295 //at this point start treating 2 64-bit numbers as 4 32-bit
4296 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4297 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4298 mul = _mm_unpacklo_epi64(mul, mul1);
4299 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4300 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4303 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4304 //*************************************************************************************************************************
4305 int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
4306 _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4308 //not an optimal SIMD solution; serial may be faster
4309 __m128i res32;
4310 res32 = vmull_s16(b, c);
4311 res32 = vqd_s32(res32); //doubling & saturation; without saturation we could use _mm_slli_epi32 (res32, 1);
4312 return vqaddq_s32(res32, a); //saturation
4315 int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
4316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4318 __m128i res64;
4319 res64 = vmull_s32(b,c);
4320 res64 = vqaddq_s64(res64, res64); //doubling & saturation; without saturation we could use _mm_slli_epi64 (res64, 1);
4321 return vqaddq_s64(res64, a); //saturation
4324 //************************************************************************************
4325 //****************** Vector subtract ***********************************************
4326 //************************************************************************************
4327 int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
4328 _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4330 int8x8_t res64;
4331 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4335 int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
4336 _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4338 int16x4_t res64;
4339 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4343 int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
4344 _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4346 int32x2_t res64;
4347 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4351 int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
4352 _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
4354 int64x1_t res64;
4355 res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4356 return res64;
4360 float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
4361 _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4363 float32x2_t res;
4364 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4365 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4366 return res;
4369 uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4370 #define vsub_u8 vsub_s8
4372 uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4373 #define vsub_u16 vsub_s16
4375 uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4376 #define vsub_u32 vsub_s32
4379 uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
4380 _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
4382 uint64x1_t res64;
4383 res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4384 return res64;
4388 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4389 #define vsubq_s8 _mm_sub_epi8
4391 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4392 #define vsubq_s16 _mm_sub_epi16
4394 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4395 #define vsubq_s32 _mm_sub_epi32
4397 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4398 #define vsubq_s64 _mm_sub_epi64
4400 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4401 #define vsubq_f32 _mm_sub_ps
4403 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4404 #define vsubq_u8 _mm_sub_epi8
4406 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4407 #define vsubq_u16 _mm_sub_epi16
4409 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4410 #define vsubq_u32 _mm_sub_epi32
4412 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4413 #define vsubq_u64 _mm_sub_epi64
4415 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
4416 //***********************************************************************************
4417 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
4418 int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
4419 _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4421 __m128i a16, b16;
4422 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4423 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4424 return _mm_sub_epi16 (a16, b16);
4427 int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
4428 _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4430 __m128i a32, b32;
4431 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4432 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4433 return _mm_sub_epi32 (a32, b32);
4436 int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
4437 _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4439 //may not be optimal
4440 __m128i a64, b64;
4441 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4442 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4443 return _mm_sub_epi64 (a64, b64);
4446 uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
4447 _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4449 __m128i a16, b16;
4450 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4451 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4452 return _mm_sub_epi16 (a16, b16);
4455 uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
4456 _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4458 __m128i a32, b32;
4459 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4460 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4461 return _mm_sub_epi32 (a32, b32);
4464 uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
4465 _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4467 //may not be optimal
4468 __m128i a64, b64;
4469 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4470 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4471 return _mm_sub_epi64 (a64, b64);
4474 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4475 //*****************************************************************************************************
4476 int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
4477 _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4479 __m128i b16;
4480 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4481 return _mm_sub_epi16 (a, b16);
4484 int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
4485 _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4487 __m128i b32;
4488 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4489 return _mm_sub_epi32 (a, b32);
4492 int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
4493 _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4495 __m128i b64;
4496 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4497 return _mm_sub_epi64 (a, b64);
4500 uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
4501 _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4503 __m128i b16;
4504 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4505 return _mm_sub_epi16 (a, b16);
4508 uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
4509 _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4511 __m128i b32;
4512 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4513 return _mm_sub_epi32 (a, b32);
4516 uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
4517 _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4519 __m128i b64;
4520 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4521 return _mm_sub_epi64 (a, b64);
4524 //************************Vector saturating subtract *********************************
4525 //*************************************************************************************
4526 int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
4527 _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4529 int8x8_t res64;
4530 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4534 int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
4535 _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4537 int16x4_t res64;
4538 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4542 int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
4543 _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
4545 int32x2_t res64;
4546 return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4550 int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
4551 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4553 uint64x1_t res;
4554 uint64_t a64,b64;
4555 a64 = a.m64_u64[0];
4556 b64 = b.m64_u64[0];
4557 res.m64_u64[0] = a64 - b64;
4559 a64 = (a64 >> 63) + (~_SIGNBIT64);
4560 if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4561 res.m64_u64[0] = a64;
4563 return res;
4566 uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
4567 _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4569 uint8x8_t res64;
4570 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4574 uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
4575 _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4577 uint16x4_t res64;
4578 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4582 uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
4583 _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
4585 uint32x2_t res64;
4586 return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4590 uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
4591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4593 uint64x1_t res;
4594 uint64_t a64, b64;
4595 a64 = _Ui64(a);
4596 b64 = _Ui64(b);
4597 if (a64 > b64) {
4598 res.m64_u64[0] = a64 - b64;
4599 } else {
4600 res.m64_u64[0] = 0;
4602 return res;
4605 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4606 #define vqsubq_s8 _mm_subs_epi8
4608 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4609 #define vqsubq_s16 _mm_subs_epi16
4611 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
4612 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4614 //no corresponding x86 SIMD solution, so special tricks are necessary. Overflow is possible only if a and b have opposite signs and the difference has the opposite sign to a
4615 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4616 c7fffffff = _mm_set1_epi32(0x7fffffff);
4617 res = _mm_sub_epi32(a, b);
4618 res_sat = _mm_srli_epi32(a, 31);
4619 res_sat = _mm_add_epi32(res_sat, c7fffffff);
4620 res_xor_a = _mm_xor_si128(res, a);
4621 b_xor_a = _mm_xor_si128(b, a);
4622 res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
4623 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: lanes become all ones where overflow occurred, all zeros otherwise
4624 res_sat = _mm_and_si128(res_xor_a, res_sat);
4625 res = _mm_andnot_si128(res_xor_a, res);
4626 return _mm_or_si128(res, res_sat);
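//A scalar reference sketch of the saturation rule the bit tricks above implement: compute the
//difference at wider precision and clamp to the int32 range. The guard macro and helper name are
//assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int32_t neon2sse_qsub_s32_ref(int32_t a, int32_t b)
{
    int64_t r = (int64_t)a - (int64_t)b;
    if (r > (int64_t)0x7fffffff) return 0x7fffffff;       //clamp to INT32_MAX
    if (r < (int64_t)(-0x7fffffff - 1)) return (-0x7fffffff - 1); //clamp to INT32_MIN
    return (int32_t)r;
}
#endif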
4629 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
4630 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4632 _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4633 _NEON2SSE_ALIGN_16 uint64_t res[2];
4634 _mm_store_si128((__m128i*)atmp, a);
4635 _mm_store_si128((__m128i*)btmp, b);
4636 res[0] = atmp[0] - btmp[0];
4637 res[1] = atmp[1] - btmp[1];
4638 if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4639 res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4641 if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4642 res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4644 return _mm_load_si128((__m128i*)res);
4647 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4648 #define vqsubq_u8 _mm_subs_epu8
4650 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4651 #define vqsubq_u16 _mm_subs_epu16
4653 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
4654 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4656 __m128i min, mask, sub;
4657 min = _MM_MIN_EPU32(a, b); //SSE4.1
4658 mask = _mm_cmpeq_epi32 (min, b);
4659 sub = _mm_sub_epi32 (a, b);
4660 return _mm_and_si128 ( sub, mask);
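//Illustrative usage sketch: unsigned saturating subtraction clamps at zero instead of wrapping.
//The guard macro and helper name are assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vqsubq_u32(void)
{
    _NEON2SSE_ALIGN_16 uint32_t out[4];
    _mm_store_si128((__m128i*)out, vqsubq_u32(_mm_set1_epi32(5), _mm_set1_epi32(7)));
    return out[0] == 0; //5 - 7 saturates to 0, not 0xfffffffe
}
#endif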
4663 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
4664 #ifdef USE_SSE4
4665 _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4667 __m128i c80000000, subb, suba, cmp, sub;
4668 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
4669 sub = _mm_sub_epi64 (a, b);
4670 suba = _mm_sub_epi64 (a, c80000000);
4671 subb = _mm_sub_epi64 (b, c80000000);
4672 cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned 64-bit comparison; bias to signed and use the SSE4.2 _mm_cmpgt_epi64
4673 return _mm_and_si128 (sub, cmp); //saturation
4675 #else
4676 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4678 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4679 _mm_store_si128((__m128i*)atmp, a);
4680 _mm_store_si128((__m128i*)btmp, b);
4681 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
4682 res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
4683 return _mm_load_si128((__m128i*)(res));
4685 #endif
4687 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
4688 //****************************************************************
4689 int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
4690 _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4692 //no 8-bit shift available and internal overflow is possible, so widen to 16 bits
4693 int8x8_t res64;
4694 __m128i r16;
4695 int8x8_t r;
4696 r = vsub_s8 (a, b);
4697 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4698 r16 = _mm_srai_epi16 (r16, 1); //SSE2
4699 r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
4700 return64(r16);
4703 int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
4704 _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
4706 int16x4_t res64;
4707 return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4712 int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
4713 _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
4715 int32x2_t res64;
4716 return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4720 uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
4721 _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
4723 uint8x8_t res64;
4724 return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4727 uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
4728 _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
4730 uint16x4_t res64;
4731 return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4734 uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
4735 _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
4737 uint32x2_t res64;
4738 return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4741 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
4742 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4744 //need to deal with the possibility of internal overflow
4745 __m128i c128, au,bu;
4746 c128 = _mm_set1_epi8 (128);
4747 au = _mm_add_epi8( a, c128);
4748 bu = _mm_add_epi8( b, c128);
4749 return vhsubq_u8(au,bu);
4752 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
4753 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4755 //need to deal with the possibility of internal overflow
4756 __m128i c8000, au,bu;
4757 c8000 = _mm_set1_epi16(0x8000);
4758 au = _mm_add_epi16( a, c8000);
4759 bu = _mm_add_epi16( b, c8000);
4760 return vhsubq_u16(au,bu);
4763 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
4764 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4766 //need to deal with the possibility of internal overflow
4767 __m128i a2, b2,r, b_1;
4768 a2 = _mm_srai_epi32 (a,1);
4769 b2 = _mm_srai_epi32 (b,1);
4770 r = _mm_sub_epi32 (a2, b2);
4771 b_1 = _mm_andnot_si128(a, b); //(~a) & b
4772 b_1 = _mm_slli_epi32 (b_1,31);
4773 b_1 = _mm_srli_epi32 (b_1,31); //isolate bit 0: 1 only when a is even and b is odd (truncation correction)
4774 return _mm_sub_epi32(r,b_1);
4777 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
4778 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4780 __m128i avg;
4781 avg = _mm_avg_epu8 (a, b);
4782 return _mm_sub_epi8(a, avg);
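//Why the average trick above works: _mm_avg_epu8 computes (a+b+1)>>1, and a - ((a+b+1)>>1) equals
//floor((a-b)/2) taken modulo 256, which is exactly what VHSUB.U8 produces. A minimal check follows;
//the guard macro and helper name are assumptions for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vhsubq_u8(void)
{
    _NEON2SSE_ALIGN_16 uint8_t out[16];
    _mm_store_si128((__m128i*)out, vhsubq_u8(_mm_set1_epi8(7), _mm_set1_epi8(5)));
    return out[0] == 1; //(7 - 5) >> 1 = 1
}
#endif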
4785 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
4786 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4788 __m128i avg;
4789 avg = _mm_avg_epu16 (a, b);
4790 return _mm_sub_epi16(a, avg);
4793 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
4794 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4796 //need to deal with the possibility of internal overflow
4797 __m128i a2, b2,r, b_1;
4798 a2 = _mm_srli_epi32 (a,1);
4799 b2 = _mm_srli_epi32 (b,1);
4800 r = _mm_sub_epi32 (a2, b2);
4801 b_1 = _mm_andnot_si128(a, b); //(~a) & b
4802 b_1 = _mm_slli_epi32 (b_1,31);
4803 b_1 = _mm_srli_epi32 (b_1,31); //isolate bit 0: 1 only when a is even and b is odd (truncation correction)
4804 return _mm_sub_epi32(r,b_1);
4807 //******* Vector subtract high half (truncated) ** ************
4808 //************************************************************
4809 int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
4810 _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4812 int8x8_t res64;
4813 __m128i sum, sum8;
4814 sum = _mm_sub_epi16 (a, b);
4815 sum8 = _mm_srai_epi16 (sum, 8);
4816 sum8 = _mm_packs_epi16(sum8,sum8);
4817 return64(sum8);
4820 int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
4821 _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4823 int16x4_t res64;
4824 __m128i sum, sum16;
4825 sum = _mm_sub_epi32 (a, b);
4826 sum16 = _mm_srai_epi32 (sum, 16);
4827 sum16 = _mm_packs_epi32(sum16,sum16);
4828 return64(sum16);
4831 int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
4832 _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4834 int32x2_t res64;
4835 __m128i sub;
4836 sub = _mm_sub_epi64 (a, b);
4837 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
4838 return64(sub);
4841 uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
4842 _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4844 uint8x8_t res64;
4845 __m128i sum, sum8;
4846 sum = _mm_sub_epi16 (a, b);
4847 sum8 = _mm_srli_epi16 (sum, 8);
4848 sum8 = _mm_packus_epi16(sum8,sum8);
4849 return64(sum8);
4852 uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
4853 _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4855 uint16x4_t res64;
4856 __m128i sum, sum16;
4857 sum = _mm_sub_epi32 (a, b);
4858 sum16 = _mm_srli_epi32 (sum, 16);
4859 sum16 = _MM_PACKUS1_EPI32(sum16);
4860 return64(sum16);
4863 uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4864 #define vsubhn_u64 vsubhn_s64
4866 //************ Vector rounding subtract high half *********************
4867 //*********************************************************************
4868 int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
4869 _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4871 int8x8_t res64;
4872 __m128i sub, mask1;
4873 sub = _mm_sub_epi16 (a, b);
4874 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4875 mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
4876 sub = _mm_srai_epi16 (sub, 8); //get high half
4877 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4878 sub = _mm_packs_epi16 (sub, sub);
4879 return64(sub);
4882 int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
4883 _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4885 //SIMD may not be optimal; serial may be faster
4886 int16x4_t res64;
4887 __m128i sub, mask1;
4888 sub = _mm_sub_epi32 (a, b);
4889 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4890 mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
4891 sub = _mm_srai_epi32 (sub, 16); //get high half
4892 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4893 sub = _mm_packs_epi32 (sub, sub);
4894 return64(sub);
4897 int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
4898 _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4900 //SIMD may not be optimal; serial may be faster
4901 int32x2_t res64;
4902 __m128i sub, mask1;
4903 sub = _mm_sub_epi64 (a, b);
4904 mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
4905 mask1 = _mm_srli_epi64(mask1,32); //get the 31st bit, 1 or zero
4906 sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
4907 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
4908 return64(sub);
4911 uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
4912 _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4914 uint8x8_t res64;
4915 __m128i sub, mask1;
4916 sub = _mm_sub_epi16 (a, b);
4917 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4918 mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
4919 sub = _mm_srai_epi16 (sub, 8); //get high half
4920 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4921 sub = _mm_packus_epi16 (sub, sub);
4922 return64(sub);
4925 uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
4926 _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4928 //SIMD may not be optimal; serial may be faster
4929 uint16x4_t res64;
4930 __m128i sub, mask1;
4931 sub = _mm_sub_epi32 (a, b);
4932 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4933 mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
4934 sub = _mm_srai_epi32 (sub, 16); //get high half
4935 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4936 sub = _MM_PACKUS1_EPI32 (sub);
4937 return64(sub);
4940 uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4941 #define vrsubhn_u64 vrsubhn_s64
4943 //*********** Vector saturating doubling multiply subtract long ********************
4944 //************************************************************************************
4945 int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
4946 _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4948 //not an optimal SIMD solution; serial may be faster
4949 __m128i res32, mask;
4950 int32x4_t res;
4951 _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4952 res = vmull_s16(b, c);
4953 res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
4954 mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
4955 res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
4956 return vqsubq_s32(a, res32); //saturation
4959 int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
4960 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
4962 __m128i res64, mask;
4963 int64x2_t res;
4964 _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
4965 res = vmull_s32(b, c);
4966 res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
4967 mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
4968 res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
4969 return vqsubq_s64(a, res64); //saturation
4972 //****************** COMPARISON ***************************************
4973 //******************* Vector compare equal *************************************
4974 //****************************************************************************
4975 uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
4976 _NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
4978 uint8x8_t res64;
4979 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
4983 uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
4984 _NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
4986 uint16x4_t res64;
4987 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
4991 uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
4992 _NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
4994 uint32x2_t res64;
4995 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
4999 uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
5000 _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5002 uint32x2_t res64;
5003 __m128 res;
5004 res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5005 return64f(res);
5008 uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
5009 _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5011 uint8x8_t res64;
5012 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5016 uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
5017 _NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5019 uint16x4_t res64;
5020 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5024 uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
5025 _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5027 uint32x2_t res64;
5028 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5032 uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5033 #define vceq_p8 vceq_u8
5036 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5037 #define vceqq_s8 _mm_cmpeq_epi8
5039 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5040 #define vceqq_s16 _mm_cmpeq_epi16
5042 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5043 #define vceqq_s32 _mm_cmpeq_epi32
5045 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
5046 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5048 __m128 res;
5049 res = _mm_cmpeq_ps(a,b);
5050 return _M128i(res);
5053 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5054 #define vceqq_u8 _mm_cmpeq_epi8
5056 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5057 #define vceqq_u16 _mm_cmpeq_epi16
5059 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5060 #define vceqq_u32 _mm_cmpeq_epi32
5062 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5063 #define vceqq_p8 _mm_cmpeq_epi8
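//Illustrative sketch: both NEON and SSE comparisons return per-lane masks of all ones / all zeros,
//so the results can feed bitwise selects directly. The guard macro and helper name are assumptions
//for illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_example_vceqq_s32(void)
{
    int32x4_t v = _mm_set_epi32(4, 3, 2, 1); //lanes 0..3 hold 1,2,3,4
    uint32x4_t m = vceqq_s32(v, _mm_set1_epi32(3));
    _NEON2SSE_ALIGN_16 uint32_t out[4];
    _mm_store_si128((__m128i*)out, m);
    return out[2] == 0xffffffff && out[0] == 0; //only the lane equal to 3 is all ones
}
#endif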
5065 //******************Vector compare greater-than or equal*************************
5066 //*******************************************************************************
5067 //IA SIMD has no greater-than-or-equal comparison for integers,
5068 // only greater-than is available, so we need the following tricks
5070 uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
5071 _NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b)
5073 uint8x8_t res64;
5074 return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5078 uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
5079 _NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b)
5081 uint16x4_t res64;
5082 return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5086 uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
5087 _NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b)
5089 uint32x2_t res64;
5090 return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5094 uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
5095 _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5097 uint32x2_t res64;
5098 __m128 res;
5099 res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
5100 return64f(res);
5103 uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5104 _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
5106 uint8x8_t res64;
5107 return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5111 uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5112 _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
5114 uint16x4_t res64;
5115 return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5119 uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5120 _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
5122 //serial solution looks faster
5123 uint32x2_t res64;
5124 return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5129 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5130 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5132 __m128i m1, m2;
5133 m1 = _mm_cmpgt_epi8 ( a, b);
5134 m2 = _mm_cmpeq_epi8 ( a, b);
5135 return _mm_or_si128 ( m1, m2);
5138 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5139 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5141 __m128i m1, m2;
5142 m1 = _mm_cmpgt_epi16 ( a, b);
5143 m2 = _mm_cmpeq_epi16 ( a, b);
5144 return _mm_or_si128 ( m1,m2);
5147 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5148 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5150 __m128i m1, m2;
5151 m1 = _mm_cmpgt_epi32 (a, b);
5152 m2 = _mm_cmpeq_epi32 (a, b);
5153 return _mm_or_si128 (m1, m2);
5156 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5157 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5159 __m128 res;
5160 res = _mm_cmpge_ps(a,b); //all 4 entries are compared
5161 return *(__m128i*)&res;
5164 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5165 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5167 //no unsigned byte comparison, only signed is available, so we need a trick
5168 #ifdef USE_SSE4
5169 __m128i cmp;
5170 cmp = _mm_max_epu8(a, b);
5171 return _mm_cmpeq_epi8(cmp, a); //a>=b
5172 #else
5173 __m128i c128, as, bs, m1, m2;
5174 c128 = _mm_set1_epi8 (128);
5175 as = _mm_sub_epi8( a, c128);
5176 bs = _mm_sub_epi8( b, c128);
5177 m1 = _mm_cmpgt_epi8( as, bs);
5178 m2 = _mm_cmpeq_epi8 (as, bs);
5179 return _mm_or_si128 ( m1, m2);
5180 #endif
5183 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5184 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5186 //no unsigned 16-bit comparison, only signed is available, so we need a trick
5187 #ifdef USE_SSE4
5188 __m128i cmp;
5189 cmp = _mm_max_epu16(a, b);
5190 return _mm_cmpeq_epi16(cmp, a); //a>=b
5191 #else
5192 __m128i c8000, as, bs, m1, m2;
5193 c8000 = _mm_set1_epi16 (0x8000);
5194 as = _mm_sub_epi16(a,c8000);
5195 bs = _mm_sub_epi16(b,c8000);
5196 m1 = _mm_cmpgt_epi16(as, bs);
5197 m2 = _mm_cmpeq_epi16 (as, bs);
5198 return _mm_or_si128 ( m1, m2);
5199 #endif
5202 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5203 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5205 //no unsigned 32-bit comparison, only signed is available, so we need a trick
5206 #ifdef USE_SSE4
5207 __m128i cmp;
5208 cmp = _mm_max_epu32(a, b);
5209 return _mm_cmpeq_epi32(cmp, a); //a>=b
5210 #else
5211 //serial solution may be faster
5212 __m128i c80000000, as, bs, m1, m2;
5213 c80000000 = _mm_set1_epi32 (0x80000000);
5214 as = _mm_sub_epi32(a,c80000000);
5215 bs = _mm_sub_epi32(b,c80000000);
5216 m1 = _mm_cmpgt_epi32 (as, bs);
5217 m2 = _mm_cmpeq_epi32 (as, bs);
5218 return _mm_or_si128 ( m1, m2);
5219 #endif
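//A scalar sketch of the bias trick used in the non-SSE4 paths above: subtracting (or XOR-ing)
//0x80000000 flips the sign bit and maps unsigned order onto signed order, after which the signed
//compare gives the unsigned result. The guard macro and helper name are assumptions for
//illustration only, not part of the original header.
#ifdef NEON2SSE_USAGE_EXAMPLES
static int neon2sse_uge32_ref(uint32_t a, uint32_t b)
{
    int32_t as = (int32_t)(a ^ 0x80000000u); //flip the sign bit
    int32_t bs = (int32_t)(b ^ 0x80000000u);
    return as >= bs; //same truth value as a >= b on the original unsigned inputs
}
#endif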
5222 //**********************Vector compare less-than or equal******************************
5223 //***************************************************************************************
5224 //IA SIMD has no less-than-or-equal comparison for integers, so we need the tricks below
5226 uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
5227 _NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b)
5229 uint8x8_t res64;
5230 return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5234 uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
5235 _NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b)
5237 uint16x4_t res64;
5238 return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5242 uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
5243 _NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b)
5245 uint32x2_t res64;
5246 return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5250 uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
5251 _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5253 uint32x2_t res64;
5254 __m128 res;
5255 res = _mm_cmple_ps(_pM128(a),_pM128(b));
5256 return64f(res);
5259 uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5260 #define vcle_u8(a,b) vcge_u8(b,a)
5263 uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5264 #define vcle_u16(a,b) vcge_u16(b,a)
5267 uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5268 #define vcle_u32(a,b) vcge_u32(b,a)
5270 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5271 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5273 __m128i c1, res;
5274 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5275 res = _mm_cmpgt_epi8 ( a, b);
5276 return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
5279 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5280 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5282 __m128i c1, res;
5283 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5284 res = _mm_cmpgt_epi16 ( a, b);
5285 return _mm_andnot_si128 (res, c1);
5288 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5289 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5291 __m128i c1, res;
5292 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5293 res = _mm_cmpgt_epi32 ( a, b);
5294 return _mm_andnot_si128 (res, c1);
5297 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5298 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5300 __m128 res;
5301 res = _mm_cmple_ps(a,b);
5302 return *(__m128i*)&res;
5305 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5306 #ifdef USE_SSE4
5307 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5309 //no unsigned byte comparison in SSE, only signed is available, so we need the trick below
5310 __m128i cmp;
5311 cmp = _mm_min_epu8(a, b);
5312 return _mm_cmpeq_epi8(cmp, a); //a<=b
5314 #else
5315 #define vcleq_u8(a,b) vcgeq_u8(b,a)
5316 #endif
5319 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5320 #ifdef USE_SSE4
5321 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5323 //no unsigned short comparison in SSE, only signed is available, so we need the trick below
5324 __m128i cmp;
5325 cmp = _mm_min_epu16(a, b);
5326 return _mm_cmpeq_epi16(cmp, a); //a<=b
5328 #else
5329 #define vcleq_u16(a,b) vcgeq_u16(b,a)
5330 #endif
5333 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5334 #ifdef USE_SSE4
5335 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5337 //no unsigned int comparison in SSE, only signed is available, so we need the trick below
5338 __m128i cmp;
5339 cmp = _mm_min_epu32(a, b);
5340 return _mm_cmpeq_epi32(cmp, a); //a<=b
5342 #else
5343 //this solution may not be optimal compared with a serial one
5344 #define vcleq_u32(a,b) vcgeq_u32(b,a)
5345 #endif
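//Illustrative note: the SSE4.1 branches rely on the identity  min(a,b) == a  <=>  a <= b;
//e.g. for bytes a = 5, b = 7 the minimum is 5 == a, so the lane becomes 0xff,
//while for a = 9, b = 7 the minimum is 7 != 9 and the lane stays 0x00.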
5348 //****** Vector compare greater-than ******************************************
5349 //**************************************************************************
5350 uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5351 _NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5353 uint8x8_t res64;
5354 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5358 uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5359 _NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5361 uint16x4_t res64;
5362 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5366 uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5367 _NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5369 uint32x2_t res64;
5370 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5374 uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5375 _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5377 uint32x2_t res64;
5378 __m128 res;
5379 res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
5380 return64f(res);
5383 uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5384 _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
5386 uint8x8_t res64;
5387 return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5391 uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5392 _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
5394 uint16x4_t res64;
5395 return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5399 uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5400 _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
5402 uint32x2_t res64;
5403 return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5407 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5408 #define vcgtq_s8 _mm_cmpgt_epi8
5410 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5411 #define vcgtq_s16 _mm_cmpgt_epi16
5413 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5414 #define vcgtq_s32 _mm_cmpgt_epi32
5416 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5417 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5419 __m128 res;
5420 res = _mm_cmpgt_ps(a,b); //compares all 4 lanes
5421 return *(__m128i*)&res;
5424 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5425 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5427 //no unsigned byte comparison in SSE, only signed is available, so we need the trick below
5428 __m128i c128, as, bs;
5429 c128 = _mm_set1_epi8 (128);
5430 as = _mm_sub_epi8(a,c128);
5431 bs = _mm_sub_epi8(b,c128);
5432 return _mm_cmpgt_epi8 (as, bs);
5435 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5436 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5438 //no unsigned short comparison in SSE, only signed is available, so we need the trick below
5439 __m128i c8000, as, bs;
5440 c8000 = _mm_set1_epi16 (0x8000);
5441 as = _mm_sub_epi16(a,c8000);
5442 bs = _mm_sub_epi16(b,c8000);
5443 return _mm_cmpgt_epi16 ( as, bs);
5446 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5447 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5449 //no unsigned int comparison in SSE, only signed is available, so we need the trick below
5450 __m128i c80000000, as, bs;
5451 c80000000 = _mm_set1_epi32 (0x80000000);
5452 as = _mm_sub_epi32(a,c80000000);
5453 bs = _mm_sub_epi32(b,c80000000);
5454 return _mm_cmpgt_epi32 ( as, bs);
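//Usage sketch (illustrative, hypothetical values): comparing unsigned bytes with the mapping above:
//  uint8x16_t va = _mm_set1_epi8((int8_t)0xF0);  //every lane holds 240
//  uint8x16_t vb = _mm_set1_epi8(0x10);          //every lane holds 16
//  uint8x16_t gt = vcgtq_u8(va, vb);             //every lane becomes 0xff, since 240 > 16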
5457 //********************* Vector compare less-than **************************
5458 //*************************************************************************
5459 uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5460 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5463 uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5464 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5467 uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5468 #define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
5471 uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5472 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5474 uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5475 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5477 uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5478 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5480 uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5481 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5483 uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5484 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5486 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5487 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5489 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5490 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5492 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5493 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5495 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5496 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5498 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5499 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5501 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5502 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
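//Illustrative note: a < b is simply rewritten as b > a, so e.g. vcltq_s32(x, y) expands to
//_mm_cmpgt_epi32(y, x); the argument swap itself adds no extra instructions.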
5504 //*****************Vector compare absolute greater-than or equal ************
5505 //***************************************************************************
5506 uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5507 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5509 uint32x2_t res64;
5510 __m128i c7fffffff;
5511 __m128 a0, b0;
5512 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5513 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5514 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5515 a0 = _mm_cmpge_ps ( a0, b0);
5516 return64f(a0);
5519 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5520 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5522 __m128i c7fffffff;
5523 __m128 a0, b0;
5524 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5525 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5526 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5527 a0 = _mm_cmpge_ps ( a0, b0);
5528 return (*(__m128i*)&a0);
5531 //********Vector compare absolute less-than or equal ******************
5532 //********************************************************************
5533 uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5534 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5536 uint32x2_t res64;
5537 __m128i c7fffffff;
5538 __m128 a0, b0;
5539 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5540 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5541 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5542 a0 = _mm_cmple_ps (a0, b0);
5543 return64f(a0);
5546 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5547 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5549 __m128i c7fffffff;
5550 __m128 a0, b0;
5551 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5552 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5553 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5554 a0 = _mm_cmple_ps (a0, b0);
5555 return (*(__m128i*)&a0);
5558 //******** Vector compare absolute greater-than ******************
5559 //******************************************************************
5560 uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5561 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5563 uint32x2_t res64;
5564 __m128i c7fffffff;
5565 __m128 a0, b0;
5566 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5567 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5568 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5569 a0 = _mm_cmpgt_ps (a0, b0);
5570 return64f(a0);
5573 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5574 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5576 __m128i c7fffffff;
5577 __m128 a0, b0;
5578 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5579 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5580 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5581 a0 = _mm_cmpgt_ps (a0, b0);
5582 return (*(__m128i*)&a0);
5585 //***************Vector compare absolute less-than ***********************
5586 //*************************************************************************
5587 uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5588 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5590 uint32x2_t res64;
5591 __m128i c7fffffff;
5592 __m128 a0, b0;
5593 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5594 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5595 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5596 a0 = _mm_cmplt_ps (a0, b0);
5597 return64f(a0);
5600 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5601 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5603 __m128i c7fffffff;
5604 __m128 a0, b0;
5605 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5606 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5607 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5608 a0 = _mm_cmplt_ps (a0, b0);
5609 return (*(__m128i*)&a0);
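//Illustrative note: ANDing a float lane with 0x7fffffff clears the IEEE-754 sign bit, so e.g.
//-3.5f (bit pattern 0xC0600000) becomes 3.5f (0x40600000); the absolute compares above are then
//ordinary _mm_cmp*_ps comparisons on |a| and |b|.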
5612 //*************************Vector test bits************************************
5613 //*****************************************************************************
5614 /*VTST (Vector Test Bits) bitwise ANDs each element in a vector
5615 with the corresponding element of a second vector. If the result is not zero, the
5616 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
5617 all zeros. */
5619 uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
5620 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
5622 uint8x8_t res64;
5623 return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5627 uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
5628 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
5630 uint16x4_t res64;
5631 return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5635 uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
5636 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
5638 uint32x2_t res64;
5639 return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5643 uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
5644 #define vtst_u8 vtst_s8
5646 uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
5647 #define vtst_u16 vtst_s16
5649 uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
5650 #define vtst_u32 vtst_s32
5653 uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5654 #define vtst_p8 vtst_u8
5656 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
5657 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5659 __m128i zero, one, res;
5660 zero = _mm_setzero_si128 ();
5661 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5662 res = _mm_and_si128 (a, b);
5663 res = _mm_cmpeq_epi8 (res, zero);
5664 return _mm_xor_si128(res, one); //invert result
5667 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
5668 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5670 __m128i zero, one, res;
5671 zero = _mm_setzero_si128 ();
5672 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5673 res = _mm_and_si128 (a, b);
5674 res = _mm_cmpeq_epi16 (res, zero);
5675 return _mm_xor_si128(res, one); //invert result
5678 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
5679 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5681 __m128i zero, one, res;
5682 zero = _mm_setzero_si128 ();
5683 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5684 res = _mm_and_si128 (a, b);
5685 res = _mm_cmpeq_epi32 (res, zero);
5686 return _mm_xor_si128(res, one); //invert result
5689 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5690 #define vtstq_u8 vtstq_s8
5692 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5693 #define vtstq_u16 vtstq_s16
5695 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5696 #define vtstq_u32 vtstq_s32
5698 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5699 #define vtstq_p8 vtstq_u8
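//Worked example (illustrative): for 8-bit lanes a = 0x0C and b = 0x03, a & b = 0x00 and the
//result lane is 0x00; for a = 0x0C and b = 0x04, a & b = 0x04 != 0 and the result lane is 0xff.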
5701 //****************** Absolute difference ********************
5702 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5703 //************************************************************
5704 int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
5705 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
5707 int8x8_t res64;
5708 return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5711 int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
5712 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
5714 int16x4_t res64;
5715 return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5718 int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
5719 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
5721 int32x2_t res64;
5722 return64(vabdq_s32(_pM128i(a), _pM128i(b)));
5725 uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
5726 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
5728 uint8x8_t res64;
5729 return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5732 uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
5733 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
5735 uint16x4_t res64;
5736 return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5739 uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
5740 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
5742 uint32x2_t res64;
5743 return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5746 float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
5747 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5749 float32x4_t res;
5750 __m64_128 res64;
5751 res = vabdq_f32(_pM128(a), _pM128(b));
5752 _M64f(res64, res);
5753 return res64;
5756 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
5757 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5759 __m128i res;
5760 res = _mm_sub_epi8 (a, b);
5761 return _mm_abs_epi8 (res);
5764 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
5765 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5767 __m128i res;
5768 res = _mm_sub_epi16 (a,b);
5769 return _mm_abs_epi16 (res);
5772 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
5773 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5775 __m128i res;
5776 res = _mm_sub_epi32 (a,b);
5777 return _mm_abs_epi32 (res);
5780 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
5781 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5783 __m128i cmp, difab, difba;
5784 cmp = vcgtq_u8(a,b);
5785 difab = _mm_sub_epi8(a,b);
5786 difba = _mm_sub_epi8 (b,a);
5787 difab = _mm_and_si128(cmp, difab);
5788 difba = _mm_andnot_si128(cmp, difba);
5789 return _mm_or_si128(difab, difba);
5792 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
5793 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5795 __m128i cmp, difab, difba;
5796 cmp = vcgtq_u16(a,b);
5797 difab = _mm_sub_epi16(a,b);
5798 difba = _mm_sub_epi16 (b,a);
5799 difab = _mm_and_si128(cmp, difab);
5800 difba = _mm_andnot_si128(cmp, difba);
5801 return _mm_or_si128(difab, difba);
5804 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
5805 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5807 __m128i cmp, difab, difba;
5808 cmp = vcgtq_u32(a,b);
5809 difab = _mm_sub_epi32(a,b);
5810 difba = _mm_sub_epi32 (b,a);
5811 difab = _mm_and_si128(cmp, difab);
5812 difba = _mm_andnot_si128(cmp, difba);
5813 return _mm_or_si128(difab, difba);
5816 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
5817 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5819 __m128i c1;
5820 __m128 res;
5821 c1 = _mm_set1_epi32(0x7fffffff);
5822 res = _mm_sub_ps (a, b);
5823 return _mm_and_ps (res, *(__m128*)&c1);
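//Illustrative alternative (a sketch, not used above): for unsigned 8-bit lanes the same absolute
//difference can be built from saturating subtracts, avoiding the compare/blend sequence:
//  __m128i d1 = _mm_subs_epu8(a, b);    // max(a - b, 0)
//  __m128i d2 = _mm_subs_epu8(b, a);    // max(b - a, 0)
//  __m128i abd = _mm_or_si128(d1, d2);  // |a - b|, since one of d1/d2 is always zero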
5826 //************ Absolute difference - long **************************
5827 //********************************************************************
5828 int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
5829 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5831 __m128i a16, b16;
5832 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5833 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5834 return vabdq_s16(a16, b16);
5838 int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
5839 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5841 __m128i a32, b32;
5842 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5843 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5844 return vabdq_s32(a32, b32);
5847 int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
5848 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5850 //no optimal SIMD solution, serial looks faster
5851 _NEON2SSE_ALIGN_16 int64_t res[2];
5852 if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5853 else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5854 if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5855 else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5856 return _mm_load_si128((__m128i*)res);
5859 uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
5860 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5862 __m128i res;
5863 res = vsubl_u8(a,b);
5864 return _mm_abs_epi16(res);
5867 uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
5868 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5870 __m128i res;
5871 res = vsubl_u16(a,b);
5872 return _mm_abs_epi32(res);
5875 uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
5876 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5878 _NEON2SSE_ALIGN_16 uint64_t res[2];
5879 if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5880 else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5881 if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5882 else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5883 return _mm_load_si128((__m128i*)res);
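//Illustrative note: widening before the subtraction is what makes the long forms exact; e.g. for
//int8_t a = 127 and b = -128 the difference 255 does not fit in 8 bits, but after
//_MM_CVTEPI8_EPI16 the 16-bit subtraction and _mm_abs_epi16 yield 255 without overflow.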
5886 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5887 //*********************************************************************************************
5888 int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
5889 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
5891 int8x8_t res64;
5892 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5895 int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
5896 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
5898 int16x4_t res64;
5899 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5902 int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
5903 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
5905 int32x2_t res64;
5906 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5909 uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
5910 #define vaba_u8 vaba_s8
5913 uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
5914 #define vaba_u16 vaba_s16
5916 uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
5917 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
5919 uint32x2_t res64;
5920 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5923 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
5924 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5926 int8x16_t sub;
5927 sub = vabdq_s8(b, c);
5928 return vaddq_s8( a, sub);
5931 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
5932 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5934 int16x8_t sub;
5935 sub = vabdq_s16(b, c);
5936 return vaddq_s16( a, sub);
5939 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
5940 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
5942 int32x4_t sub;
5943 sub = vabdq_s32(b, c);
5944 return vaddq_s32( a, sub);
5947 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
5948 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
5950 uint8x16_t sub;
5951 sub = vabdq_u8(b, c);
5952 return vaddq_u8( a, sub);
5955 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
5956 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
5958 uint16x8_t sub;
5959 sub = vabdq_u16(b, c);
5960 return vaddq_u16( a, sub);
5963 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
5964 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
5966 uint32x4_t sub;
5967 sub = vabdq_u32(b, c);
5968 return vaddq_u32( a, sub);
5971 //************** Absolute difference and accumulate - long ********************************
5972 //*************************************************************************************
5973 int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
5974 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
5976 __m128i b16, c16, res;
5977 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5978 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
5979 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
5980 return _mm_add_epi16 (a, res);
5983 int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
5984 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
5986 __m128i b32, c32, res;
5987 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
5988 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
5989 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
5990 return _mm_add_epi32 (a, res);
5993 int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
5994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5996 __m128i res;
5997 res = vabdl_s32(b,c);
5998 return _mm_add_epi64(a, res);
6001 uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
6002 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6004 __m128i b16, c16, res;
6005 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6006 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6007 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6008 return _mm_add_epi16 (a, res);
6011 uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
6012 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6014 __m128i b32, c32, res;
6015 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6016 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6017 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6018 return _mm_add_epi32 (a, res);
6021 uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
6022 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6024 __m128i res;
6025 res = vabdl_u32(b,c);
6026 return _mm_add_epi64(a, res);
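//Usage sketch (illustrative, hypothetical variable names): accumulating absolute differences
//across rows, as in a sum-of-absolute-differences loop:
//  uint16x8_t acc = _mm_setzero_si128();     //eight 16-bit accumulators
//  acc = vabal_u8(acc, row_a, row_b);        //row_a, row_b are uint8x8_t values;
//                                            //acc[i] += |row_a[i] - row_b[i]|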
6029 //***********************************************************************************
6030 //**************** Maximum and minimum operations **********************************
6031 //***********************************************************************************
6032 //************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
6033 //***********************************************************************************
6034 int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
6035 _NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
6037 int8x8_t res64;
6038 __m128i res;
6039 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6040 return64(res);
6043 int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
6044 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6046 int16x4_t res64;
6047 return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6050 int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
6051 _NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
6053 int32x2_t res64;
6054 __m128i res;
6055 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6056 return64(res);
6059 uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
6060 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6062 uint8x8_t res64;
6063 return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6067 uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
6068 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6070 uint16x4_t res64;
6071 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6075 uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
6076 _NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
6078 uint32x2_t res64;
6079 __m128i res;
6080 res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
6081 return64(res);
6084 float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
6085 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6087 //serial solution looks faster than SIMD one
6088 float32x2_t res;
6089 res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6090 res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6091 return res;
6094 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6095 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6097 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6098 #define vmaxq_s16 _mm_max_epi16
6100 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6101 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6103 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6104 #define vmaxq_u8 _mm_max_epu8
6106 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6107 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6109 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6110 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6113 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6114 #define vmaxq_f32 _mm_max_ps
6116 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6117 //***********************************************************************************************************
6118 int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
6119 _NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
6121 int8x8_t res64;
6122 __m128i res;
6123 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6124 return64(res);
6127 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
6128 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6130 int16x4_t res64;
6131 return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6135 int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
6136 _NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
6138 int32x2_t res64;
6139 __m128i res;
6140 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6141 return64(res);
6144 uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
6145 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6147 uint8x8_t res64;
6148 return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6152 uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
6153 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6155 uint16x4_t res64;
6156 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6160 uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
6161 _NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
6163 uint32x2_t res64;
6164 __m128i res;
6165 res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
6166 return64(res);
6169 float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
6170 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6172 //serial solution looks faster than SIMD one
6173 float32x2_t res;
6174 res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6175 res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6176 return res;
6179 int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6180 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6182 int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6183 #define vminq_s16 _mm_min_epi16
6185 int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6186 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6188 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6189 #define vminq_u8 _mm_min_epu8
6191 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6192 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6194 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6195 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6197 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6198 #define vminq_f32 _mm_min_ps
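//Usage sketch (illustrative): clamping a float32x4_t value x to [0, 1] with the q-form macros:
//  float32x4_t lo = _mm_set1_ps(0.0f), hi = _mm_set1_ps(1.0f);
//  float32x4_t clamped = vminq_f32(vmaxq_f32(x, lo), hi);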
6200 //************* Pairwise addition operations. **************************************
6201 //************************************************************************************
6202 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
6203 int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
6204 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6206 //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6207 int8x8_t res64;
6208 __m128i a16, b16, res;
6209 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
6210 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6211 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6212 res = _mm_hadd_epi16 (a16, b16);
6213 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6214 return64(res);
6217 int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
6218 _NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
6220 int16x4_t res64;
6221 __m128i hadd128;
6222 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6223 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6224 return64(hadd128);
6228 int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
6229 _NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
6231 int32x2_t res64;
6232 __m128i hadd128;
6233 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6234 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6235 return64(hadd128);
6239 uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
6240 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6242 // no 8 bit hadd in IA32, need to go to 16 bit and then pack
6243 uint8x8_t res64;
6244 // no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit, so it works
6245 __m128i mask8, a16, b16, res;
6246 mask8 = _mm_set1_epi16(0xff);
6247 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6248 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6249 res = _mm_hadd_epi16 (a16, b16);
6250 res = _mm_and_si128(res, mask8); //to avoid saturation
6251 res = _mm_packus_epi16 (res,res); //use low 64 bits
6252 return64(res);
6255 uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
6256 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6258 // this solution may not be optimal; serial execution may be faster
6259 // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
6260 uint16x4_t res64;
6261 __m128i c32767, cfffe, as, bs, res;
6262 c32767 = _mm_set1_epi16 (32767);
6263 cfffe = _mm_set1_epi16 (0xfffe);
6264 as = _mm_sub_epi16 (_pM128i(a), c32767);
6265 bs = _mm_sub_epi16 (_pM128i(b), c32767);
6266 res = _mm_hadd_epi16 (as, bs);
6267 res = _mm_add_epi16 (res, cfffe);
6268 res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6269 return64(res);
6272 uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
6273 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6275 //hadd doesn't work for unsigned values
6276 uint32x2_t res64;
6277 __m128i ab, ab_sh, res;
6278 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6279 ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6280 res = _mm_add_epi32(ab, ab_sh);
6281 res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6282 return64(res);
6285 float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
6286 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6288 __m128 hadd128;
6289 __m64_128 res64;
6290 hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6291 hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6292 _M64f(res64, hadd128);
6293 return res64;
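//Worked example (illustrative): vpadd_s16({1,2,3,4}, {5,6,7,8}) = {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15};
//the _mm_shuffle_epi32 after _mm_hadd_epi16 above is what gathers the a-sums and b-sums
//into the low 64 bits of the result.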
6297 //************************** Long pairwise add **********************************
6298 //*********************************************************************************
6299 //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
6300 // and places the final results in the destination vector.
6302 int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
6303 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6305 //no 8 bit hadd in IA32, need to go to 16 bit anyway
6306 __m128i a16;
6307 int16x4_t res64;
6308 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6309 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6310 return64(a16);
6313 int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
6314 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6316 // this solution may not be optimal; serial execution may be faster
6317 int32x2_t res64;
6318 __m128i r32_1;
6319 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6320 r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6321 return64(r32_1);
6324 int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
6325 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6327 int64x1_t res;
6328 res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6329 return res;
6332 uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
6333 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6335 // no 8 bit hadd in IA32, need to go to 16 bit
6336 // no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit, so it works
6337 uint16x4_t res64;
6338 __m128i a16;
6339 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6340 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6341 return64(a16);
6344 uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
6345 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6347 //serial solution looks faster than a SIMD one
6348 uint32x2_t res;
6349 res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6350 res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6351 return res;
6354 uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
6355 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6357 uint64x1_t res;
6358 res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6359 return res;
6362 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
6363 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6365 //no 8 bit hadd in IA32, need to go to 16 bit
6366 __m128i r16_1, r16_2;
6367 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
6369 //swap hi and low halves of a to process the remaining data
6369 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6370 r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6371 return _mm_hadd_epi16 (r16_1, r16_2);
6374 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
6375 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6377 //widen to 32 bit first so that the pairwise sums of 16-bit lanes cannot overflow
6378 __m128i r32_1, r32_2;
6379 r32_1 = _MM_CVTEPI16_EPI32(a);
6381 //swap hi and low halves of a to process the remaining data
6381 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6382 r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6383 return _mm_hadd_epi32 (r32_1, r32_2);
6386 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
6387 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
6389 _NEON2SSE_ALIGN_16 int32_t atmp[4];
6390 _NEON2SSE_ALIGN_16 int64_t res[2];
6391 _mm_store_si128((__m128i*)atmp, a);
6392 res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
6393 res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
6394 return _mm_load_si128((__m128i*)res);
6397 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
6398 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6400 //no 8 bit hadd in IA32, need to go to 16 bit
6401 __m128i r16_1, r16_2;
6402 r16_1 = _MM_CVTEPU8_EPI16(a);
6404 //swap hi and low halves of a to process the remaining data
6404 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6405 r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
6406 return _mm_hadd_epi16 (r16_1, r16_2);
6409 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6410 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6412 //serial solution looks faster than a SIMD one
6413 _NEON2SSE_ALIGN_16 uint16_t atmp[8];
6414 _NEON2SSE_ALIGN_16 uint32_t res[4];
6415 _mm_store_si128((__m128i*)atmp, a);
6416 res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
6417 res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
6418 res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
6419 res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
6420 return _mm_load_si128((__m128i*)res);
6423 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6424 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6426 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6427 _NEON2SSE_ALIGN_16 uint64_t res[2];
6428 _mm_store_si128((__m128i*)atmp, a);
6429 res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
6430 res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
6431 return _mm_load_si128((__m128i*)res);
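//Worked example (illustrative): vpaddl_u8({1,2,3,4,5,6,7,8}) widens and adds adjacent pairs,
//giving the uint16x4_t value {3, 7, 11, 15}; no lane can overflow because the sum of two 8-bit
//values always fits in 16 bits.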
6434 //************************ Long pairwise add and accumulate **************************
6435 //****************************************************************************************
6436 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6437 // and accumulates the values of the results into the elements of the destination (wide) vector
6438 int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
6439 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
6441 int16x4_t res64;
6442 return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6445 int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
6446 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
6448 int32x2_t res64;
6449 return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6453 int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6454 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6456 int64x1_t res;
6457 res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6458 return res;
6461 uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
6462 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
6464 uint16x4_t res64;
6465 return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6469 uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
6470 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
6472 uint32x2_t res64;
6473 return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6476 uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6477 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6479 uint64x1_t res;
6480 res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6481 return res;
6484 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6485 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6487 int16x8_t pad;
6488 pad = vpaddlq_s8(b);
6489 return _mm_add_epi16 (a, pad);
6492 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6493 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6495 int32x4_t pad;
6496 pad = vpaddlq_s16(b);
6497 return _mm_add_epi32(a, pad);
6500 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6501 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6503 int64x2_t pad;
6504 pad = vpaddlq_s32(b);
6505 return _mm_add_epi64 (a, pad);
6508 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6509 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6511 uint16x8_t pad;
6512 pad = vpaddlq_u8(b);
6513 return _mm_add_epi16 (a, pad);
6516 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
6517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6519 uint32x4_t pad;
6520 pad = vpaddlq_u16(b);
6521 return _mm_add_epi32(a, pad);
6522 } //no optimal SIMD solution, serial is faster
6524 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6525 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6527 //no optimal SIMD solution, serial is faster
6528 uint64x2_t pad;
6529 pad = vpaddlq_u32(b);
6530 return _mm_add_epi64(a, pad);
6531 } //no optimal SIMD solution, serial is faster
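//Illustrative note: VPADAL is just VPADDL followed by an ordinary add, which is exactly how the
//q-forms above are built; e.g. vpadalq_u8(acc, b) leaves acc[i] + (b[2*i] + b[2*i+1]) in lane i.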
6533 //********** Folding maximum *************************************
6534 //*******************************************************************
6535 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6536 //and copies the larger of each pair into the corresponding element in the destination
6537 // no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
6538 int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6539 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6541 int8x8_t res64;
6542 __m128i ab, ab1, max;
6543 _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6544 _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6545 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6546 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //swap horizontal pairs for vertical max finding
6547 max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6548 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6549 return64(max); //we need 64 bits only
6552 int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6553 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6555 //this solution may not be optimal compared with a serial one
6556 int16x4_t res64;
6557 __m128i ab, ab1, max;
6558 _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is treated as one 16-bit number
6559 _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6560 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6561 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //swap horizontal pairs for vertical max finding, use the 8-bit shuffle with the corresponding mask
6562 max = _mm_max_epi16 (ab, ab1);
6563 max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
6564 return64(max);
6567 int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6568 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6570 //serial solution looks faster than SIMD one
6571 int32x2_t res;
6572 res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6573 res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6574 return res;
6577 uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6578 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6580 uint8x8_t res64;
6581 __m128i ab, ab1, max;
6582 _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6583 _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6584 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
6585 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //swap horizontal pairs for vertical max finding
6586 max = _mm_max_epu8 (ab, ab1); // SSE4.1
6587 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6588 return64(max);
6591 uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
6592 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
6594 //this solution may not be optimal compared with a serial one
6595 uint16x4_t res64;
6596 __m128i ab, ab1, max;
6597 _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is treated as one 16-bit number
6598 _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6599 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6600 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //swap horizontal pairs for vertical max finding, use the 8-bit shuffle with the corresponding mask
6601 max = _MM_MAX_EPU16 (ab, ab1);
6602 max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
6603 return64(max);
6606 uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6607 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6609 //serial solution looks faster than SIMD one
6610 uint32x2_t res;
6611 res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6612 res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6613 return res;
6614 } //serial solution looks faster than a SIMD one
6616 float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6617 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6619 //serial solution looks faster than SIMD one
6620 float32x2_t res;
6621 res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6622 res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6623 return res;
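//Illustrative note on the shuffle trick above: mask*_sab swaps each horizontal pair, so a single
//vertical max of {a0,a1,a2,a3,...} and {a1,a0,a3,a2,...} leaves max(a0,a1) in both lanes of the
//pair; the second shuffle then keeps one copy per pair, e.g. for bytes {1,9,3,4,...} the kept
//lanes start with 9, 4, ...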
6626 // ***************** Folding minimum ****************************
6627 // **************************************************************
6628 //vpmin -> takes minimum of adjacent pairs
6629 int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6630 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6632 int8x8_t res64;
6633 __m128i ab, ab1, min;
6634 _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6635 _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6636 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6637 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //swap horizontal pairs for vertical min finding
6638 min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6639 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6640 return64(min);
6643 int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6644 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6646 //this solution may not be optimal compared with a serial one
6647 int16x4_t res64;
6648 __m128i ab, ab1, min;
6649 _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is treated as one 16-bit number
6650 _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6651 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6652 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //swap horizontal pairs for vertical min finding, use the 8-bit shuffle with the corresponding mask
6653 min = _mm_min_epi16 (ab, ab1);
6654 min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
6655 return64(min);
6658 int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6659 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6661 //serial solution looks faster than SIMD one
6662 int32x2_t res;
6663 res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6664 res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6665 return res;
6668 uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6669 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6671 uint8x8_t res64;
6672 __m128i ab, ab1, min;
6673 _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6674 _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6675 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6676 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //swap horizontal pairs for vertical min finding
6677 min = _mm_min_epu8 (ab, ab1); // SSE4.1
6678 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6679 return64(min);
6682 uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
6683 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
6685 //this solution may not be optimal compared with a serial one
6686 uint16x4_t res64;
6687 __m128i ab, ab1, min;
6688 _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is treated as one 16-bit number
6689 _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6690 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6691 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //swap horizontal pairs for vertical min finding, use the 8-bit shuffle with the corresponding mask
6692 min = _MM_MIN_EPU16 (ab, ab1);
6693 min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
6694 return64(min);
6697 uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6698 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6700 //serial solution looks faster than SIMD one
6701 uint32x2_t res;
6702 res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6703 res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6704 return res;
6707 float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6708 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6710 //serial solution looks faster than SIMD one
6711 float32x2_t res;
6712 res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6713 res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6714 return res;
6717 //***************************************************************
6718 //*********** Reciprocal/Sqrt ************************************
6719 //***************************************************************
6720 //****************** Reciprocal estimate *******************************
6721 //the ARM NEON and x86 SIMD results may be slightly different
6722 float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6723 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6725 float32x4_t res;
6726 __m64_128 res64;
6727 res = _mm_rcp_ps(_pM128(a));
6728 _M64f(res64, res);
6729 return res64;
6732 uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6733 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6735 //Input is a fixed point number!!! No reciprocal for integers is available in IA32 SIMD
6736 uint32x2_t res;
6737 float resf, r;
6738 int i, q, s;
6739 for (i =0; i<2; i++){
6740 if((a.m64_u32[i] & 0x80000000) == 0) {
6741 res.m64_u32[i] = 0xffffffff;
6742 }else{
6743 resf = (float) (a.m64_u32[i] * (0.5f / ((uint32_t)1 << 31)));
6744 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6745 r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
6746 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6747 r = (float)s / 256.0;
6748 res.m64_u32[i] = r * ((uint32_t)1 << 31);
6751 return res;
6754 float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6755 #define vrecpeq_f32 _mm_rcp_ps
6758 uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6759 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6761 //Input is fixed point number!!!
6762 //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6763 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6764 _NEON2SSE_ALIGN_16 uint32_t res[4];
6765 _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6766 float resf, r;
6767 int i, q, s;
6768 __m128i res128, mask, zero;
6769 _mm_store_si128((__m128i*)atmp, a);
6770 zero = _mm_setzero_si128();
6771 for (i =0; i<4; i++){
6772 resf = (atmp[i] * (0.5f / ((uint32_t)1 << 31))); // 2.3283064365386963E-10 ~(0.5f / ((uint32_t)1 << 31))
6773 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6774 r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
6775 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6776 r = (float)s / 256.0;
6777 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6779 res128 = _mm_load_si128((__m128i*)res);
6780 mask = _mm_and_si128(a, *(__m128i*)c80000000);
6781 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
6782 return _mm_or_si128(res128, mask);
6785 //**********Reciprocal square root estimate ****************
6786 //**********************************************************
6787 //no reciprocal square root for integers is available in IA32 SIMD, nor an unsigned int to float conversion of the 4 lanes, so a serial solution looks faster
6788 //but the particular implementation of vrsqrte_u32 may vary across ARM compilers
6789 //the ARM NEON and x86 SIMD results may be slightly different
6790 float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6791 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6793 float32x4_t res;
6794 __m64_128 res64;
6795 res = _mm_rsqrt_ps(_pM128(a));
6796 _M64f(res64, res);
6797 return res64;
6800 uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6801 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6803 //Input is fixed point number!!!
6804 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6805 uint32x2_t res;
6806 __m128 tmp;
6807 float r, resf, coeff;
6808 int i, q0, s;
6809 for (i =0; i<2; i++){
6810 if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
6811 res.m64_u32[i] = 0xffffffff;
6812 }else{
6813 resf = (float) (a.m64_u32[i] * (0.5f / ((uint32_t)1 << 31)));
6814 coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6815 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6816 r = ((float)q0 + 0.5) / coeff;
6817 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6818 _mm_store_ss(&r, tmp);
6819 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6820 r = (float)s / 256.0;
6821 res.m64_u32[i] = r * (((uint32_t)1) << 31);
6824 return res;
6827 float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6828 #define vrsqrteq_f32 _mm_rsqrt_ps
6830 uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6831 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6833 //Input is fixed point number!!!
6834 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6835 _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
6836 _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
6837 _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
6838 __m128 tmp;
6839 __m128i res128, mask, zero;
6840 float r, resf, coeff;
6841 int i, q0, s;
6842 _mm_store_si128((__m128i*)atmp, a);
6843 zero = _mm_setzero_si128();
6844 for (i =0; i<4; i++){
6845 resf = (float) (atmp[i] * (0.5f / ((uint32_t)1 << 31)));
6846 coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6847 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6848 r = ((float)q0 + 0.5) / coeff;
6849 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6850 _mm_store_ss(&r, tmp);
6851 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6852 r = (float)s / 256.0;
6853 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6855 res128 = _mm_load_si128((__m128i*)res);
6856 mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
6857 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
6858 return _mm_or_si128(res128, mask);
6860 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6861 //******************************************************************************************
6862 //******VRECPS (Vector Reciprocal Step) ***************************************************
6863 //multiplies the elements of one vector by the corresponding elements of another vector,
6864 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
6866 float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6867 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
6869 float32x4_t res;
6870 __m64_128 res64;
6871 res = vrecpsq_f32(_pM128(a), _pM128(b));
6872 _M64f(res64, res);
6873 return res64;
6876 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
6877 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
6879 __m128 f2, mul;
6880 f2 = _mm_set1_ps(2.);
6881 mul = _mm_mul_ps(a,b);
6882 return _mm_sub_ps(f2,mul);
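//Illustrative usage sketch (not part of the original header): one Newton-Raphson refinement of a
//reciprocal built from vrecpe_f32 and vrecps_f32. The helper name below is made up for this example
//and it assumes vmul_f32 is defined earlier in this file.
_NEON2SSE_INLINE float32x2_t neon2sse_example_reciprocal_f32(float32x2_t x)
{
    float32x2_t est = vrecpe_f32(x); //rough estimate of 1/x
    est = vmul_f32(est, vrecps_f32(x, est)); //est = est * (2 - x*est), one refinement step
    return est;
}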
6885 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
6886 //multiplies the elements of one vector by the corresponding elements of another vector,
6887 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
6889 float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
6890 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
6892 float32x2_t res;
6893 res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
6894 res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
6895 return res;
6898 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
6899 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
6901 __m128 f3, f05, mul;
6902 f3 = _mm_set1_ps(3.);
6903 f05 = _mm_set1_ps(0.5);
6904 mul = _mm_mul_ps(a,b);
6905 f3 = _mm_sub_ps(f3,mul);
6906 return _mm_mul_ps (f3, f05);
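//Illustrative usage sketch (not part of the original header): one Newton-Raphson refinement of
//1/sqrt(x) built from vrsqrte_f32 and vrsqrts_f32. The helper name below is made up for this example
//and it assumes vmul_f32 is defined earlier in this file.
_NEON2SSE_INLINE float32x2_t neon2sse_example_rsqrt_f32(float32x2_t x)
{
    float32x2_t est = vrsqrte_f32(x); //rough estimate of 1/sqrt(x)
    est = vmul_f32(est, vrsqrts_f32(vmul_f32(x, est), est)); //est = est * (3 - (x*est)*est) / 2
    return est;
}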
6908 //********************************************************************************************
6909 //***************************** Shifts by signed variable ***********************************
6910 //********************************************************************************************
6911 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
6912 //********************************************************************************************
6913 //No such operations in IA32 SIMD unfortunately, only shifts by a constant are available, so a serial solution is needed
6914 //helper macro; it matches the ARM implementation for big shifts
6915 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
6916 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
6917 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
6918 for (i = 0; i<LEN; i++) { \
6919 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
6920 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
6921 return _mm_load_si128((__m128i*)res);
6923 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
6924 int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
6925 for (i = 0; i<LEN; i++) { \
6926 if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
6927 else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
6928 return res;
6930 int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
6931 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6933 SERIAL_SHIFT_64(8, i, 8)
6936 int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
6937 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6939 SERIAL_SHIFT_64(16, i, 4)
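//Illustrative usage sketch (not part of the original header), showing the per-lane semantics the
//SERIAL_SHIFT macros implement: positive counts shift left, negative counts shift right, and counts
//whose magnitude reaches the lane size give 0 in this implementation. The helper name is made up.
_NEON2SSE_INLINE int16x4_t neon2sse_example_vshl_s16(void)
{
    int16x4_t a, b;
    a.m64_i16[0] = 3;   b.m64_i16[0] = 4;  //3 << 4 -> 48
    a.m64_i16[1] = -16; b.m64_i16[1] = -2; //-16 >> 2 -> -4
    a.m64_i16[2] = 7;   b.m64_i16[2] = -1; //7 >> 1 -> 3 (truncating, no rounding)
    a.m64_i16[3] = 5;   b.m64_i16[3] = 16; //|count| reaches the lane size -> 0 here
    return vshl_s16(a, b);
}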
6942 int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
6943 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6945 SERIAL_SHIFT_64(32, i, 2)
6948 int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
6949 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6951 SERIAL_SHIFT_64(64, i, 1)
6954 uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
6955 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6957 SERIAL_SHIFT_64(8, u, 8)
6960 uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
6961 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6963 SERIAL_SHIFT_64(16, u, 4)
6966 uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
6967 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6969 SERIAL_SHIFT_64(32, u, 2)
6972 uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
6973 _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro we need to have special processing for large numbers
6975 SERIAL_SHIFT_64(64, u, 1)
6978 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
6979 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6981 SERIAL_SHIFT(int8_t, int8_t, 16, 16)
6984 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
6985 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6987 SERIAL_SHIFT(int16_t, int16_t, 8, 8)
6990 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
6991 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6993 SERIAL_SHIFT(int32_t, int32_t, 4, 4)
6996 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
6997 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6999 SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7002 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7003 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7005 SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7008 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
7009 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7011 SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7014 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7015 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7017 SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7020 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7021 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7023 SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7027 //*********** Vector saturating shift left: (negative values shift right) **********************
7028 //********************************************************************************************
7029 //No such operations are available in IA32 SIMD yet, only shifts by a constant, so a serial solution is needed
7030 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7031 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7032 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7033 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7034 for (i = 0; i<LEN; i++) { \
7035 if (atmp[i] ==0) res[i] = 0; \
7036 else{ \
7037 if(btmp[i] <=0) res[i] = atmp[i] >> (-btmp[i]); \
7038 else{ \
7039 if (btmp[i]>lanesize_1) { \
7040 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7041 }else{ \
7042 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7043 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7044 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7045 else res[i] = atmp[i] << btmp[i]; }}}} \
7046 return _mm_load_si128((__m128i*)res);
7048 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7049 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7050 TYPE lanesize = (sizeof(TYPE) << 3); \
7051 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7052 for (i = 0; i<LEN; i++) { \
7053 if (atmp[i] ==0) {res[i] = 0; \
7054 }else{ \
7055 if(btmp[i] <= 0) res[i] = atmp[i] >> (-btmp[i]); \
7056 else{ \
7057 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7058 else{ \
7059 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7060 res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7061 return _mm_load_si128((__m128i*)res);
7063 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7064 int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7065 int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7066 for (i = 0; i<LEN; i++) { \
7067 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7068 else{ \
7069 if(b.m64_i ## TYPE[i] <=0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7070 else{ \
7071 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7072 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7073 }else{ \
7074 limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7075 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7076 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7077 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7078 return res;
7080 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7081 int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7082 int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7083 for (i = 0; i<LEN; i++) { \
7084 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7085 }else{ \
7086 if(b.m64_i ## TYPE[i] <= 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7087 else{ \
7088 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7089 else{ \
7090 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7091 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7092 return res;
7094 int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7095 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7097 SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
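//Illustrative usage sketch (not part of the original header): vqshl saturates left shifts that
//would overflow the lane, so for signed bytes the result is clamped to 127 / -128. The helper
//name is made up for this example.
_NEON2SSE_INLINE int8x8_t neon2sse_example_vqshl_s8(void)
{
    int8x8_t a, b;
    int i;
    for (i = 0; i < 8; i++) { a.m64_i8[i] = 100; b.m64_i8[i] = 2; } //100 << 2 does not fit in int8
    return vqshl_s8(a, b); //every lane saturates to 127 instead of wrapping around
}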
7100 int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7103 SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7106 int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7107 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7109 SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7112 int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7113 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7115 SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7118 uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7119 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7121 SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7124 uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
7125 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7127 SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7130 uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7133 SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7136 uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7137 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7139 SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7142 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7143 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7145 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7148 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7149 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7151 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7154 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7155 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7157 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7160 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7161 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7163 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7166 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7167 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7169 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7172 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
7173 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7175 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7178 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7181 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7184 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7185 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7187 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7191 //******** Vector rounding shift left: (negative values shift right) **********
7192 //****************************************************************************
7193 //No such operations are available in IA32 SIMD yet, only shifts by a constant, so a serial solution is needed
7194 //rounding makes sense for right shifts only.
7195 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7196 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7197 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7198 for (i = 0; i<LEN; i++) { \
7199 if( btmp[i] >= 0) { \
7200 if(btmp[i] >= lanesize) res[i] = 0; \
7201 else res[i] = (atmp[i] << btmp[i]); \
7202 }else{ \
7203 res[i] = (btmp[i] < -lanesize) ? 0 : \
7204 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7205 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
7206 return _mm_load_si128((__m128i*)res);
7209 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7210 int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7211 for (i = 0; i<LEN; i++) { \
7212 if( b.m64_i ## TYPE[i] >= 0) { \
7213 if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7214 else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7215 }else{ \
7216 res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7217 (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7218 (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
7219 return res;
7222 int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7223 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7225 SERIAL_ROUNDING_SHIFT_64(8,i,8)
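//Illustrative usage sketch (not part of the original header): with a negative count vrshl performs
//a rounding right shift, so 7 >> 2 yields 2 (1.75 rounded) where the plain vshl_s8 would yield 1.
//The helper name is made up for this example.
_NEON2SSE_INLINE int8x8_t neon2sse_example_vrshl_s8(void)
{
    int8x8_t a, b;
    int i;
    for (i = 0; i < 8; i++) { a.m64_i8[i] = 7; b.m64_i8[i] = -2; } //7 / 4 = 1.75 -> 2
    return vrshl_s8(a, b);
}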
7228 int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7229 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7231 SERIAL_ROUNDING_SHIFT_64(16,i,4)
7234 int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7235 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7237 SERIAL_ROUNDING_SHIFT_64(32,i,2)
7240 int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7243 SERIAL_ROUNDING_SHIFT_64(64,i,1)
7246 uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7247 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7249 SERIAL_ROUNDING_SHIFT_64(8,u,8)
7252 uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
7253 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7255 SERIAL_ROUNDING_SHIFT_64(16,u,4)
7258 uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7259 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7261 SERIAL_ROUNDING_SHIFT_64(32,u,2)
7264 uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7265 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7267 SERIAL_ROUNDING_SHIFT_64(64,u,1)
7270 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7271 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7273 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7276 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7279 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7282 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7283 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7285 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7288 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7289 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7291 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7294 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7295 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7297 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7300 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
7301 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7303 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7306 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7307 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7309 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7312 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7313 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7315 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7319 //********** Vector saturating rounding shift left: (negative values shift right) ****************
7320 //*************************************************************************************************
7321 //No such operations in IA32 SIMD unfortunately, only shifts by a constant are available, so a serial solution is needed
7322 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
7323 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7324 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7325 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7326 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7327 for (i = 0; i<LEN; i++) { \
7328 if (atmp[i] ==0) res[i] = 0; \
7329 else{ \
7330 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7331 else{ \
7332 if (btmp[i]>lanesize_1) { \
7333 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7334 }else if (btmp[i]==0){ res[i] = atmp[i]; /*zero shift needs neither rounding nor saturation*/ }else{ \
7335 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7336 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7337 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7338 else res[i] = atmp[i] << btmp[i]; }}}} \
7339 return _mm_load_si128((__m128i*)res);
7341 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7342 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7343 int lanesize = (sizeof(TYPE) << 3); \
7344 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7345 for (i = 0; i<LEN; i++) { \
7346 if (atmp[i] ==0) {res[i] = 0; \
7347 }else{ \
7348 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7349 else{ \
7350 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7351 else if (btmp[i]==0) res[i] = atmp[i]; /*zero shift needs no saturation*/ else{ \
7352 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7353 res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7354 return _mm_load_si128((__m128i*)res);
7356 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7357 __m64_128 res; int ## TYPE ## _t limit; int i; \
7358 int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7359 for (i = 0; i<LEN; i++) { \
7360 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7361 else{ \
7362 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7363 else{ \
7364 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7365 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7366 }else if (b.m64_i ## TYPE[i]==0){ res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; /*zero shift needs neither rounding nor saturation*/ }else{ \
7367 limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7368 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7369 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7370 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7371 return res;
7373 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7374 __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7375 int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7376 for (i = 0; i<LEN; i++) { \
7377 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7378 }else{ \
7379 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7380 else{ \
7381 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7382 else if (b.m64_i ## TYPE[i]==0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; /*zero shift needs no saturation*/ else{ \
7383 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7384 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7385 return res;
7387 int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7388 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7390 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
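//Illustrative usage sketch (not part of the original header): vqrshl combines both behaviours,
//saturating the positive (left) shifts and rounding the negative (right) shifts per lane.
//The helper name is made up for this example.
_NEON2SSE_INLINE int8x8_t neon2sse_example_vqrshl_s8(void)
{
    int8x8_t a, b;
    int i;
    for (i = 0; i < 4; i++) { a.m64_i8[i] = 100; b.m64_i8[i] = 2;  } //left shift overflows -> 127
    for (i = 4; i < 8; i++) { a.m64_i8[i] = 7;   b.m64_i8[i] = -2; } //right shift rounds -> 2
    return vqrshl_s8(a, b);
}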
7393 int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7394 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7396 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7399 int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7400 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7402 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7405 int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7406 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7408 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7411 uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7412 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7414 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7417 uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
7418 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7420 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7423 uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7424 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7426 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7429 uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7430 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7432 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7435 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7436 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7438 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7441 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7442 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7444 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7447 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7448 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7450 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7453 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7454 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7456 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7459 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7460 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7462 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7465 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
7466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7468 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7471 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7472 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7474 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7477 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7478 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7480 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7483 // *********************************************************************************
7484 // ***************************** Shifts by a constant *****************************
7485 // *********************************************************************************
7486 //**************** Vector shift right by constant*************************************
7487 //************************************************************************************
7488 int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7489 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7491 //no 8 bit shift available, go to 16 bit
7492 int8x8_t res64;
7493 __m128i r;
7494 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7495 r = _mm_srai_epi16 (r, b); //SSE2
7496 r = _mm_packs_epi16 (r,r); //we need 64 bits only
7497 return64(r);
7500 int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7501 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7503 int16x4_t res64;
7504 return64(_mm_srai_epi16(_pM128i(a), b));
7508 int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7509 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7511 int32x2_t res64;
7512 return64(_mm_srai_epi32(_pM128i(a), b));
7515 int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7516 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7518 //no arithmetic shift for 64bit values, serial solution used
7519 int64x1_t res;
7520 if(b>=64) res.m64_i64[0] = 0;
7521 else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7522 return res;
7525 uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7526 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7528 //no 8 bit shift available, go to 16 bit
7529 uint8x8_t res64;
7530 __m128i r;
7531 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7532 r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
7533 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7534 return64(r);
7537 uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
7538 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7540 uint16x4_t res64;
7541 return64(_mm_srli_epi16(_pM128i(a), b));
7545 uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7546 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7548 uint32x2_t res64;
7549 return64(_mm_srli_epi32(_pM128i(a), b));
7553 uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7554 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7556 uint64x1_t res64;
7557 return64(_mm_srli_epi64(_pM128i(a), b));
7561 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7562 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7564 //no 8 bit shift available, go to 16 bit trick
7565 __m128i zero, mask0, a_sign, r, a_sign_mask;
7566 _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
7567 zero = _mm_setzero_si128();
7568 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7569 a_sign = _mm_cmpgt_epi8 (zero, a); //0xff where a<0, zero otherwise
7570 r = _mm_srai_epi16 (a, b);
7571 a_sign_mask = _mm_and_si128 (mask0, a_sign);
7572 r = _mm_andnot_si128 (mask0, r);
7573 return _mm_or_si128 (r, a_sign_mask);
7576 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7577 #define vshrq_n_s16 _mm_srai_epi16
7579 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7580 #define vshrq_n_s32 _mm_srai_epi32
7582 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7583 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7585 //SIMD implementation may not be optimal due to the absence of a 64 bit arithmetic shift in x86 SIMD
7586 __m128i c1, signmask,a0, res64;
7587 _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7588 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7589 signmask = _mm_slli_epi64 (c1, (64 - b));
7590 a0 = _mm_or_si128(a, *(__m128i*)mask); //set the sign bit; a0 == a only if the sign bit of a was already set
7591 a0 = _MM_CMPEQ_EPI64 (a, a0);
7592 signmask = _mm_and_si128(a0, signmask);
7593 res64 = _mm_srli_epi64 (a, b);
7594 return _mm_or_si128(res64, signmask);
7597 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7598 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7600 //no 8 bit shift available, need the special trick
7601 __m128i mask0, r;
7602 _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
7603 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7604 r = _mm_srli_epi16 ( a, b);
7605 return _mm_and_si128 (r, mask0);
7608 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
7609 #define vshrq_n_u16 _mm_srli_epi16
7611 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7612 #define vshrq_n_u32 _mm_srli_epi32
7614 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7615 #define vshrq_n_u64 _mm_srli_epi64
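//Illustrative usage sketch (not part of the original header): for a negative input the signed form
//keeps the sign (arithmetic shift) while the unsigned form shifts zeroes in. The helper name is
//made up for this example.
_NEON2SSE_INLINE int16x8_t neon2sse_example_vshrq_n_16(void)
{
    int16x8_t  s = _mm_set1_epi16(-32);
    uint16x8_t u = _mm_set1_epi16(-32); //same bit pattern, 0xffe0
    s = vshrq_n_s16(s, 2); //-32 >> 2 = -8 in every lane
    u = vshrq_n_u16(u, 2); //0xffe0 >> 2 = 0x3ff8 (16376) in every lane
    return _mm_add_epi16(s, u); //combined only so that both results are used
}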
7617 //*************************** Vector shift left by constant *************************
7618 //*********************************************************************************
7619 int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7620 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7622 //no 8 bit shift available, go to 16 bit
7623 int8x8_t res64;
7624 __m128i r;
7625 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
7626 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7627 r = _mm_slli_epi16 (r, b); //SSE2
7628 r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7629 return64(r);
7632 int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7633 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
7635 int16x4_t res64;
7636 return64(_mm_slli_epi16(_pM128i(a), b));
7640 int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7641 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
7643 int32x2_t res64;
7644 return64(_mm_slli_epi32(_pM128i(a), b));
7648 int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7649 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
7651 int64x1_t res64;
7652 return64(_mm_slli_epi64(_pM128i(a), b));
7656 uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7657 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7659 //no 8 bit shift available, go to 16 bit
7660 uint8x8_t res64;
7661 __m128i mask8;
7662 __m128i r;
7663 mask8 = _mm_set1_epi16(0xff);
7664 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7665 r = _mm_slli_epi16 (r, b); //SSE2
7666 r = _mm_and_si128(r, mask8); //to avoid saturation
7667 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7668 return64(r);
7671 uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7672 #define vshl_n_u16 vshl_n_s16
7675 uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7676 #define vshl_n_u32 vshl_n_s32
7678 uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7679 #define vshl_n_u64 vshl_n_s64
7681 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7682 #define vshlq_n_s8 vshlq_n_u8
7684 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7685 #define vshlq_n_s16 _mm_slli_epi16
7687 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7688 #define vshlq_n_s32 _mm_slli_epi32
7690 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7691 #define vshlq_n_s64 _mm_slli_epi64
7693 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7694 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7696 //no 8 bit shift available, need the special trick
7697 __m128i mask0, r;
7698 _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
7699 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7700 r = _mm_slli_epi16 ( a, b);
7701 return _mm_and_si128 (r, mask0);
7704 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7705 #define vshlq_n_u16 vshlq_n_s16
7707 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7708 #define vshlq_n_u32 vshlq_n_s32
7710 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7711 #define vshlq_n_u64 vshlq_n_s64
7713 //************* Vector rounding shift right by constant ******************
7714 //*************************************************************************
7715 //No corresponding x86 intrinsics exist, need to do some tricks
7716 int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7717 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7719 //no 8 bit shift available, go to 16 bit
7720 int8x8_t res64;
7721 __m128i r, maskb;
7722 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7723 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7724 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7725 r = _mm_srai_epi16 (r, b);
7726 r = _mm_add_epi16 (r, maskb); //actual rounding
7727 r = _mm_packs_epi16 (r,r); //we need 64 bits only
7728 return64(r);
7731 int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7732 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7734 int16x4_t res64;
7735 return64(vrshrq_n_s16(_pM128i(a), b));
7739 int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7740 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7742 int32x2_t res64;
7743 return64(vrshrq_n_s32(_pM128i(a), b));
7747 int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7748 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7750 //serial solution is faster
7751 int64x1_t res;
7752 int64_t a_i64 = *( int64_t*)&a;
7753 if(b==64) {
7754 res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
7755 } else {
7756 int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7757 res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7759 return res;
7762 uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7763 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7765 //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
7766 uint8x8_t res64;
7767 __m128i r, maskb;
7768 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7769 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7770 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7771 r = _mm_srli_epi16 (r, b);
7772 r = _mm_add_epi16 (r, maskb); //actual rounding
7773 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7774 return64(r);
7777 uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
7778 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7780 uint16x4_t res64;
7781 return64(vrshrq_n_u16(_pM128i(a), b));
7785 uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7786 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7788 uint32x2_t res64;
7789 return64(vrshrq_n_u32(_pM128i(a), b));
7793 uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7794 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7796 uint64x1_t res64;
7797 return64(vrshrq_n_u64(_pM128i(a), b));
7800 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7801 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7803 //no 8 bit shift available, go to 16 bit trick
7804 __m128i r, mask1, maskb;
7805 _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
7806 r = vshrq_n_s8 (a, b);
7807 mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte, needed for rounding
7808 maskb = _mm_and_si128(a, mask1); //get the rounding bit or 0 for each byte
7809 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7810 return _mm_add_epi8(r, maskb); //actual rounding
7813 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7814 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7816 __m128i maskb, r;
7817 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7818 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7819 r = _mm_srai_epi16 (a, b);
7820 return _mm_add_epi16 (r, maskb); //actual rounding
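//Illustrative usage sketch (not part of the original header): the maskb trick above adds back the
//last bit shifted out, e.g. 7 >> 2 with rounding gives (7 >> 2) + 1 = 2 because bit 1 of 7 is set.
//The helper name is made up for this example.
_NEON2SSE_INLINE int16x8_t neon2sse_example_vrshrq_n_s16(void)
{
    int16x8_t a = _mm_set1_epi16(7);
    return vrshrq_n_s16(a, 2); //7 / 4 = 1.75 -> 2 per lane, while vshrq_n_s16 would give 1
}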
7823 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7824 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7826 __m128i maskb, r;
7827 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7828 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7829 r = _mm_srai_epi32(a, b);
7830 return _mm_add_epi32 (r, maskb); //actual rounding
7833 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7834 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7836 //solution may not be optimal compared with a serial one
7837 __m128i maskb;
7838 int64x2_t r;
7839 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7840 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7841 r = vshrq_n_s64(a, b);
7842 return _mm_add_epi64 (r, maskb); //actual rounding
7845 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7846 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7848 //no 8 bit shift available, go to 16 bit trick
7849 __m128i r, mask1, maskb;
7850 _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
7851 r = vshrq_n_u8 (a, b);
7852 mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte, needed for rounding
7853 maskb = _mm_and_si128(a, mask1); //get the rounding bit or 0 for each byte
7854 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7855 return _mm_add_epi8(r, maskb); //actual rounding
7858 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
7859 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.U16 q0,q0,#16
7861 __m128i maskb, r;
7862 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7863 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7864 r = _mm_srli_epi16 (a, b);
7865 return _mm_add_epi16 (r, maskb); //actual rounding
7868 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
7869 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.U32 q0,q0,#32
7871 __m128i maskb, r;
7872 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7873 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7874 r = _mm_srli_epi32(a, b);
7875 return _mm_add_epi32 (r, maskb); //actual rounding
7878 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
7879 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
7881 //solution may not be optimal compared with a serial one
7882 __m128i maskb, r;
7883 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7884 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7885 r = _mm_srli_epi64(a, b);
7886 return _mm_add_epi64 (r, maskb); //actual rounding
7889 //************* Vector shift right by constant and accumulate *********
7890 //*********************************************************************
7891 int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
7892 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
7894 int8x8_t shift;
7895 shift = vshr_n_s8(b, c);
7896 return vadd_s8( a, shift);
7899 int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
7900 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
7902 int16x4_t shift;
7903 shift = vshr_n_s16( b, c);
7904 return vadd_s16(a, shift);
7907 int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
7908 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
7910 //may not be optimal compared with the serial execution
7911 int32x2_t shift;
7912 shift = vshr_n_s32(b, c);
7913 return vadd_s32( a, shift);
7916 int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
7917 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
7919 //may not be optimal compared with a serial solution
7920 int64x1_t shift;
7921 shift = vshr_n_s64(b, c);
7922 return vadd_s64( a, shift);
7925 uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
7926 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
7928 uint8x8_t shift;
7929 shift = vshr_n_u8(b, c);
7930 return vadd_u8(a, shift);
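//Illustrative usage sketch (not part of the original header): vsra simply adds b >> c to the
//accumulator a, e.g. 10 + (200 >> 3) = 35 per lane. The helper name is made up for this example.
_NEON2SSE_INLINE uint8x8_t neon2sse_example_vsra_n_u8(void)
{
    uint8x8_t acc, x;
    int i;
    for (i = 0; i < 8; i++) { acc.m64_u8[i] = 10; x.m64_u8[i] = 200; }
    return vsra_n_u8(acc, x, 3); //each lane becomes 10 + 25 = 35
}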
7933 uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
7934 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.U16 d0,d0,#16
7936 uint16x4_t shift;
7937 shift = vshr_n_u16(b, c);
7938 return vadd_u16(a,shift);
7941 uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
7942 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
7944 //may not be optimal compared with the serial execution
7945 uint32x2_t shift;
7946 shift = vshr_n_u32(b, c);
7947 return vadd_u32( a, shift);
7950 uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
7951 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
7953 //may not be optimal compared with the serial execution
7954 uint64x1_t shift;
7955 shift = vshr_n_u64(b, c);
7956 return vadd_u64(a, shift);
7959 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
7960 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
7962 int8x16_t shift;
7963 shift = vshrq_n_s8(b, c);
7964 return vaddq_s8(a, shift);
7967 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
7968 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
7970 int16x8_t shift;
7971 shift = vshrq_n_s16(b, c);
7972 return vaddq_s16(a, shift);
7975 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
7976 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
7978 int32x4_t shift;
7979 shift = vshrq_n_s32(b, c);
7980 return vaddq_s32(a, shift);
7983 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
7984 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
7986 int64x2_t shift;
7987 shift = vshrq_n_s64(b, c);
7988 return vaddq_s64( a, shift);
7991 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
7992 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
7994 uint8x16_t shift;
7995 shift = vshrq_n_u8(b, c);
7996 return vaddq_u8(a, shift);
7999 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
8000 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
8002 uint16x8_t shift;
8003 shift = vshrq_n_u16(b, c);
8004 return vaddq_u16(a, shift);
8007 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8008 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8010 uint32x4_t shift;
8011 shift = vshrq_n_u32(b, c);
8012 return vaddq_u32(a, shift);
8015 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8016 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8018 uint64x2_t shift;
8019 shift = vshrq_n_u64(b, c);
8020 return vaddq_u64(a, shift);
8023 //************* Vector rounding shift right by constant and accumulate ****************************
8024 //************************************************************************************************
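//Illustration only (not compiled): a minimal scalar sketch of the rounding used below - the last bit shifted out,
//(b >> (c-1)) & 1, is added back to the truncated shift before accumulation. The helper name is hypothetical and
//an arithmetic right shift of signed values is assumed, as everywhere in this header.
#if 0
static void vrsra_n_s8_reference(const int8_t * a, const int8_t * b, int c, int8_t * d) //1 <= c <= 8
{
    int i;
    for (i = 0; i < 8; i++) d[i] = (int8_t)(a[i] + ((b[i] >> c) + ((b[i] >> (c - 1)) & 1))); //rounded shift, then accumulate
}
#endif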
8025 int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8026 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8028 int8x8_t shift;
8029 shift = vrshr_n_s8(b, c);
8030 return vadd_s8( a, shift);
8033 int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8034 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8036 int16x4_t shift;
8037 shift = vrshr_n_s16( b, c);
8038 return vadd_s16(a, shift);
8041 int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8042 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8044 //may be not optimal compared with the serial execution
8045 int32x2_t shift;
8046 shift = vrshr_n_s32(b, c);
8047 return vadd_s32( a, shift);
8050 int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8051 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8053 int64x1_t shift;
8054 shift = vrshr_n_s64(b, c);
8055 return vadd_s64( a, shift);
8058 uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8059 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8061 uint8x8_t shift;
8062 shift = vrshr_n_u8(b, c);
8063 return vadd_u8(a, shift);
8066 uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
8067 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
8069 uint16x4_t shift;
8070 shift = vrshr_n_u16(b, c);
8071 return vadd_u16(a,shift);
8074 uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8075 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8077 //may be not optimal compared with the serial execution
8078 uint32x2_t shift;
8079 shift = vrshr_n_u32(b, c);
8080 return vadd_u32( a, shift);
8083 uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8084 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8086 //may be not optimal compared with the serial execution
8087 uint64x1_t shift;
8088 shift = vrshr_n_u64(b, c);
8089 return vadd_u64( a, shift);
8092 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8093 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8095 int8x16_t shift;
8096 shift = vrshrq_n_s8(b, c);
8097 return vaddq_s8(a, shift);
8100 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8101 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8103 int16x8_t shift;
8104 shift = vrshrq_n_s16(b, c);
8105 return vaddq_s16(a, shift);
8108 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8109 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8111 int32x4_t shift;
8112 shift = vrshrq_n_s32(b, c);
8113 return vaddq_s32(a, shift);
8116 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8117 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8119 int64x2_t shift;
8120 shift = vrshrq_n_s64(b, c);
8121 return vaddq_s64(a, shift);
8124 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8125 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8127 uint8x16_t shift;
8128 shift = vrshrq_n_u8(b, c);
8129 return vaddq_u8(a, shift);
8132 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
8133 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
8135 uint16x8_t shift;
8136 shift = vrshrq_n_u16(b, c);
8137 return vaddq_u16(a, shift);
8140 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8141 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8143 uint32x4_t shift;
8144 shift = vrshrq_n_u32(b, c);
8145 return vaddq_u32(a, shift);
8148 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8149 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8151 uint64x2_t shift;
8152 shift = vrshrq_n_u64(b, c);
8153 return vaddq_u64(a, shift);
8156 //**********************Vector saturating shift left by constant *****************************
8157 //********************************************************************************************
8158 //we don't check const ranges assuming they are met
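//Illustration only (not compiled): a scalar sketch of the saturating left shift the SIMD code below emulates
//by widening and re-packing; the helper name is hypothetical.
#if 0
static int8_t vqshl_n_s8_reference(int8_t a, int b) //0 <= b <= 7
{
    int v = (int)a << b;                 //widen so the shift cannot overflow
    if (v > 127) return (int8_t)127;     //positive saturation
    if (v < -128) return (int8_t)(-128); //negative saturation
    return (int8_t)v;
}
#endif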
8159 int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8160 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8162 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8163 int8x8_t res64;
8164 __m128i a128, r128;
8165 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8166 r128 = _mm_slli_epi16 (a128, b);
8167 r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8168 return64(r128);
8171 int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8172 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8174 // go to 32 bit to get the auto saturation (in packs function)
8175 int16x4_t res64;
8176 __m128i a128, r128;
8177 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8178 r128 = _mm_slli_epi32 (a128, b); //shift_res
8179 r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8180 return64(r128);
8183 int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8184 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
8186 //serial execution may be faster
8187 int32x2_t res64;
8188 return64(vqshlq_n_s32 (_pM128i(a), b));
8192 int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8193 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8195 // no effective SIMD solution here
8196 int64x1_t res;
8197 int64_t bmask;
8198 int64_t a_i64 = *( int64_t*)&a;
8199 bmask = ( int64_t)1 << (63 - b); //positive
8200 if (a_i64 >= bmask) {
8201 res.m64_i64[0] = ~(_SIGNBIT64);
8202 } else {
8203 res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
8205 return res;
8209 uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8210 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8212 //no 8 bit shift available in IA32 SIMD, go to 16 bit
8213 uint8x8_t res64;
8214 __m128i a128, r128;
8215 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8216 r128 = _mm_slli_epi16 (a128, b); //shift_res
8217 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8218 return64(r128);
8221 uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
8222 _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
8224 // go to 32 bit to get the auto saturation (in packus function)
8225 uint16x4_t res64;
8226 __m128i a128, r128;
8227 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8228 r128 = _mm_slli_epi32 (a128, b); //shift_res
8229 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16
8230 return64(r128);
8233 uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8234 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
8236 uint32x2_t res64;
8237 return64(vqshlq_n_u32(_pM128i(a), b));
8240 uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8243 // no effective SIMD solution here
8244 uint64x1_t res;
8245 uint64_t bmask;
8246 uint64_t a_i64 = *(uint64_t*)&a;
8247 bmask = ( uint64_t)1 << (64 - b);
8248 res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8249 return res;
8252 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8253 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8255 // go to 16 bit to get the auto saturation (in packs function)
8256 __m128i a128, r128_1, r128_2;
8257 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8258 r128_1 = _mm_slli_epi16 (a128, b);
8259 //swap hi and low part of a128 to process the remaining data
8260 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8261 a128 = _MM_CVTEPI8_EPI16 (a128);
8262 r128_2 = _mm_slli_epi16 (a128, b);
8263 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8266 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8267 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8269 // manual saturation solution looks LESS optimal than 32 bits conversion one
8270 // go to 32 bit to get the auto saturation (in packs function)
8271 __m128i a128, r128_1, r128_2;
8272 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8273 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8274 //swap hi and low part of a128 to process the remaining data
8275 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8276 a128 = _MM_CVTEPI16_EPI32 (a128);
8277 r128_2 = _mm_slli_epi32 (a128, b);
8278 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8281 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8282 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8284 // no 64 bit saturation option available, special tricks necessary
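//e.g. for b==3: maskA below becomes 0x0FFFFFFF, the largest value whose left shift by 3 still fits in a signed 32-bit int,
//so any lane greater than it must saturate to 0x7FFFFFFF; negative lanes are handled symmetrically further down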
8285 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8286 c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
8287 maskA = _mm_srli_epi32(c1, b + 1); //largest positive value that does not saturate: (b+1) leading zeros, then (31-b) ones
8288 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
8289 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8290 shift_res = _mm_slli_epi32 (a, b);
8291 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8292 //result with positive numbers saturated
8293 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8294 //treat negative numbers
8295 maskA = _mm_slli_epi32(c1, 31 - b); //most negative value that does not saturate: (b+1) leading ones, then (31-b) zeros
8296 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
8297 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8298 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8299 return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8302 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8303 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8305 // no effective SIMD solution here
8306 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8307 int64_t bmask;
8308 int i;
8309 bmask = ( int64_t)1 << (63 - b); //positive
8310 _mm_store_si128((__m128i*)atmp, a);
8311 for (i = 0; i<2; i++) {
8312 if (atmp[i] >= bmask) {
8313 res[i] = ~(_SIGNBIT64);
8314 } else {
8315 res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
8318 return _mm_load_si128((__m128i*)res);
8321 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8322 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8324 // go to 16 bit to get the auto saturation (in packs function)
8325 __m128i a128, r128_1, r128_2;
8326 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8327 r128_1 = _mm_slli_epi16 (a128, b);
8328 //swap hi and low part of a128 to process the remaining data
8329 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8330 a128 = _MM_CVTEPU8_EPI16 (a128);
8331 r128_2 = _mm_slli_epi16 (a128, b);
8332 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8335 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
8336 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
8338 // manual saturation solution looks more optimal than 32 bits conversion one
8339 __m128i cb, c8000, a_signed, saturation_mask, shift_res;
8340 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8341 c8000 = _mm_set1_epi16 (0x8000);
8342 //no unsigned shorts comparison in SSE, only signed available, so need the trick
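//subtracting 0x8000 maps the unsigned range [0, 0xFFFF] onto the signed range [-0x8000, 0x7FFF] monotonically,
//so a signed compare of the biased values orders the original unsigned values correctly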
8343 a_signed = _mm_sub_epi16(a, c8000); //go to signed
8344 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8345 shift_res = _mm_slli_epi16 (a, b);
8346 return _mm_or_si128 (shift_res, saturation_mask);
8349 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8350 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8352 // manual saturation solution, no 64 bit saturation option, the serial version may be faster
8353 __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
8354 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
8355 c80000000 = _mm_set1_epi32 (0x80000000);
8356 //no unsigned ints comparison in SSE, only signed available, so need the trick
8357 a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8358 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8359 shift_res = _mm_slli_epi32 (a, b);
8360 return _mm_or_si128 (shift_res, saturation_mask);
8363 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8366 // no effective SIMD solution here
8367 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8368 uint64_t bmask;
8369 int i;
8370 bmask = ( uint64_t)1 << (64 - b);
8371 _mm_store_si128((__m128i*)atmp, a);
8372 for (i = 0; i<2; i++) {
8373 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8375 return _mm_load_si128((__m128i*)res);
8378 //**************Vector signed->unsigned saturating shift left by constant *************
8379 //*************************************************************************************
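//Illustration only (not compiled): a scalar sketch of the signed->unsigned saturating left shift implemented below;
//negative inputs saturate to zero. The helper name is hypothetical.
#if 0
static uint8_t vqshlu_n_s8_reference(int8_t a, int b) //0 <= b <= 7
{
    int v;
    if (a <= 0) return 0;                           //non-positive inputs give 0
    v = (int)a << b;                                //widen so the shift cannot overflow
    return (v > 0xff) ? (uint8_t)0xff : (uint8_t)v; //positive saturation to the unsigned maximum
}
#endif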
8380 uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8381 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8383 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8384 uint8x8_t res64;
8385 __m128i a128, r128;
8386 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8387 r128 = _mm_slli_epi16 (a128, b);
8388 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8389 return64(r128);
8392 uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8393 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8395 uint16x4_t res64;
8396 __m128i a128, r128;
8397 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8398 r128 = _mm_slli_epi32 (a128, b); //shift_res
8399 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8400 return64(r128);
8403 uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
8404 _NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
8406 uint32x2_t res64;
8407 return64( vqshluq_n_s32(_pM128i(a), b));
8410 uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8413 uint64x1_t res;
8414 uint64_t limit;
8415 if (a.m64_i64[0]<=0) {
8416 res.m64_u64[0] = 0;
8417 } else {
8418 limit = (uint64_t) 1 << (64 - b);
8419 res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
8421 return res;
8424 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8425 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8427 __m128i a128, r128_1, r128_2;
8428 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8429 r128_1 = _mm_slli_epi16 (a128, b);
8430 //swap hi and low part of a128 to process the remaining data
8431 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8432 a128 = _MM_CVTEPI8_EPI16 (a128);
8433 r128_2 = _mm_slli_epi16 (a128, b);
8434 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8437 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8438 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8440 // manual saturation solution looks LESS optimal than 32 bits conversion one
8441 __m128i a128, r128_1, r128_2;
8442 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8443 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8444 //swap hi and low part of a128 to process the remaining data
8445 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8446 a128 = _MM_CVTEPI16_EPI32 (a128);
8447 r128_2 = _mm_slli_epi32 (a128, b);
8448 return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
8451 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8452 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8454 //solution may be not optimal compared with the serial one
8455 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
8456 zero = _mm_setzero_si128();
8457 maskA = _mm_cmpeq_epi32(a, a);
8458 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8459 //saturate negative numbers to zero
8460 maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
8461 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
8462 //saturate positive to 0xffffffff
8463 a_masked = _mm_and_si128 (a0, maskA);
8464 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8465 a_shift = _mm_slli_epi32 (a0, b);
8466 return _mm_or_si128 (a_shift, a_masked); //actual saturation
8469 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8470 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8472 // no effective SIMD solution here, serial execution looks faster
8473 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8474 _NEON2SSE_ALIGN_16 uint64_t res[2];
8475 uint64_t limit;
8476 int i;
8477 _mm_store_si128((__m128i*)atmp, a);
8478 for (i = 0; i<2; i++) {
8479 if (atmp[i]<=0) {
8480 res[i] = 0;
8481 } else {
8482 limit = (uint64_t) 1 << (64 - b);
8483 res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
8486 return _mm_load_si128((__m128i*)res);
8489 //************** Vector narrowing shift right by constant **************
8490 //**********************************************************************
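//Illustration only (not compiled): a scalar sketch of the (non-saturating) narrowing shift right below -
//shift in the wide type, then keep only the low half of each lane; the helper name is hypothetical.
#if 0
static void vshrn_n_s16_reference(const int16_t * a, int b, int8_t * d) //1 <= b <= 8
{
    int i;
    for (i = 0; i < 8; i++) d[i] = (int8_t)(a[i] >> b); //plain truncation, no saturation - hence the byte shuffle instead of packs below
}
#endif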
8491 int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8492 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8494 int8x8_t res64;
8495 __m128i r16;
8496 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
8497 r16 = vshrq_n_s16(a,b);
8498 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8499 return64(r16);
8502 int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8503 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8505 int16x4_t res64;
8506 __m128i r32;
8507 _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
8508 r32 = vshrq_n_s32(a,b);
8509 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8510 return64(r32);
8513 int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8514 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8516 int32x2_t res64;
8517 __m128i r64;
8518 r64 = vshrq_n_s64(a,b);
8519 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8520 return64(r64);
8523 uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8524 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8526 uint8x8_t res64;
8527 __m128i mask, r16;
8528 mask = _mm_set1_epi16(0xff);
8529 r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8530 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8531 r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8532 return64(r16);
8535 uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8536 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8538 uint16x4_t res64;
8539 __m128i mask, r32;
8540 mask = _mm_set1_epi32(0xffff);
8541 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8542 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8543 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8544 return64(r32);
8547 uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8548 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8550 uint32x2_t res64;
8551 __m128i r64;
8552 r64 = vshrq_n_u64(a,b);
8553 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8554 return64(r64);
8557 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
8558 //*********************************************************************************************
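//Illustration only (not compiled): a scalar sketch of the signed->unsigned narrowing saturating shift right below,
//matching what _mm_packus_epi16 does after the shift; the helper name is hypothetical.
#if 0
static uint8_t vqshrun_n_s16_reference(int16_t a, int b) //1 <= b <= 8
{
    int v = a >> b;                                 //arithmetic shift of the wide signed value
    if (v < 0) return 0;                            //negative results saturate to 0
    return (v > 0xff) ? (uint8_t)0xff : (uint8_t)v; //too-large results saturate to 0xff
}
#endif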
8559 uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8560 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8562 uint8x8_t res64;
8563 __m128i r16;
8564 r16 = vshrq_n_s16(a,b);
8565 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8566 return64(r16);
8569 uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8570 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8572 uint16x4_t res64;
8573 __m128i r32;
8574 r32 = vshrq_n_s32(a,b);
8575 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
8576 return64(r32);
8579 uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8580 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8582 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8583 uint32x2_t res;
8584 int64_t res64;
8585 _mm_store_si128((__m128i*)atmp, a);
8586 if (atmp[0] < 0) {
8587 res.m64_u32[0] = 0;
8588 } else {
8589 res64 = (atmp[0] >> b);
8590 res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8592 if (atmp[1] < 0) {
8593 res.m64_u32[1] = 0;
8594 } else {
8595 res64 = (atmp[1] >> b);
8596 res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8598 return res;
8601 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
8602 uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8603 _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8605 //solution may be not optimal compared with the serial one
8606 __m128i r16;
8607 uint8x8_t res64;
8608 r16 = vrshrq_n_s16(a,b);
8609 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8610 return64(r16);
8613 uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8614 _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8616 //solution may be not optimal compared with the serial one
8617 __m128i r32;
8618 uint16x4_t res64;
8619 r32 = vrshrq_n_s32(a,b);
8620 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
8621 return64(r32);
8624 uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8625 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8627 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8628 uint32x2_t res;
8629 int64_t res64;
8630 _mm_store_si128((__m128i*)atmp, a);
8631 if (atmp[0] < 0) {
8632 res.m64_u32[0] = 0;
8633 } else {
8634 res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8635 res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
8637 if (atmp[1] < 0) {
8638 res.m64_u32[1] = 0;
8639 } else {
8640 res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8641 res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
8643 return res;
8646 //***** Vector narrowing saturating shift right by constant ******
8647 //*****************************************************************
8648 int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8649 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8651 int8x8_t res64;
8652 __m128i r16;
8653 r16 = vshrq_n_s16(a,b);
8654 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8655 return64(r16);
8658 int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8659 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8661 int16x4_t res64;
8662 __m128i r32;
8663 r32 = vshrq_n_s32(a,b);
8664 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8665 return64(r32);
8668 int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8669 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8671 //no optimal SIMD solution found
8672 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8673 int32x2_t res;
8674 _mm_store_si128((__m128i*)atmp, a);
8675 res64[0] = (atmp[0] >> b);
8676 res64[1] = (atmp[1] >> b);
8677 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8678 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8679 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8680 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8681 res.m64_i32[0] = (int32_t)res64[0];
8682 res.m64_i32[1] = (int32_t)res64[1];
8683 return res;
8686 uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
8687 _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
8689 uint8x8_t res64;
8690 __m128i r16;
8691 r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8692 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8693 return64(r16);
8696 uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8697 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8699 uint16x4_t res64;
8700 __m128i r32;
8701 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8702 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8703 return64(r32);
8706 uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8707 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8709 //serial solution may be faster
8710 uint32x2_t res64;
8711 __m128i r64, res_hi, zero;
8712 zero = _mm_setzero_si128();
8713 r64 = vshrq_n_u64(a,b);
8714 res_hi = _mm_srli_epi64(r64, 32);
8715 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8716 r64 = _mm_or_si128(r64, res_hi);
8717 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8718 return64(r64);
8722 //********* Vector rounding narrowing shift right by constant *************************
8723 //****************************************************************************************
8724 int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8725 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8727 int8x8_t res64;
8728 __m128i r16;
8729 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
8730 r16 = vrshrq_n_s16(a,b);
8731 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8732 return64(r16);
8735 int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8736 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8738 int16x4_t res64;
8739 __m128i r32;
8740 _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
8741 r32 = vrshrq_n_s32(a,b);
8742 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8743 return64(r32);
8746 int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8747 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8749 int32x2_t res64;
8750 __m128i r64;
8751 r64 = vrshrq_n_s64(a,b);
8752 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8753 return64(r64);
8756 uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8757 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8759 uint8x8_t res64;
8760 __m128i mask, r16;
8761 mask = _mm_set1_epi16(0xff);
8762 r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8763 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8764 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8765 return64(r16);
8768 uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8769 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8771 uint16x4_t res64;
8772 __m128i mask, r32;
8773 mask = _mm_set1_epi32(0xffff);
8774 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8775 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8776 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8777 return64(r32);
8780 uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8781 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8783 uint32x2_t res64;
8784 __m128i r64;
8785 r64 = vrshrq_n_u64(a,b);
8786 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8787 return64(r64);
8790 //************* Vector rounding narrowing saturating shift right by constant ************
8791 //****************************************************************************************
8792 int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8793 _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8795 int8x8_t res64;
8796 __m128i r16;
8797 r16 = vrshrq_n_s16(a,b);
8798 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8799 return64(r16);
8802 int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8803 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8805 int16x4_t res64;
8806 __m128i r32;
8807 r32 = vrshrq_n_s32(a,b);
8808 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8809 return64(r32);
8812 int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8813 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8815 //no optimal SIMD solution found
8816 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8817 int32x2_t res;
8818 _mm_store_si128((__m128i*)atmp, a);
8819 maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8820 res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8821 maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8822 res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8823 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8824 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8825 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8826 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8827 res.m64_i32[0] = (int32_t)res64[0];
8828 res.m64_i32[1] = (int32_t)res64[1];
8829 return res;
8832 uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
8833 _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
8835 uint8x8_t res64;
8836 __m128i r16;
8837 r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8838 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8839 return64(r16);
8842 uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8843 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8845 uint16x4_t res64;
8846 __m128i r32;
8847 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8848 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8849 return64(r32);
8852 uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8853 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8855 //serial solution may be faster
8856 uint32x2_t res64;
8857 __m128i r64, res_hi, zero;
8858 zero = _mm_setzero_si128();
8859 r64 = vrshrq_n_u64(a,b);
8860 res_hi = _mm_srli_epi64(r64, 32);
8861 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8862 r64 = _mm_or_si128(r64, res_hi);
8863 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8864 return64(r64);
8867 //************** Vector widening shift left by constant ****************
8868 //************************************************************************
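//Illustration only (not compiled): a scalar sketch of the widening shift left below - sign- (or zero-) extend first,
//then shift in the wider type so no bits are lost; the helper name is hypothetical.
#if 0
static void vshll_n_s8_reference(const int8_t * a, int b, int16_t * d) //0 <= b <= 8
{
    int i;
    for (i = 0; i < 8; i++) d[i] = (int16_t)((int16_t)a[i] << b); //widen to 16 bits, then shift
}
#endif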
8869 int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8870 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8872 __m128i r;
8873 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8874 return _mm_slli_epi16 (r, b);
8877 int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
8878 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
8880 __m128i r;
8881 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
8882 return _mm_slli_epi32 (r, b);
8885 int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
8886 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
8888 __m128i r;
8889 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
8890 return _mm_slli_epi64 (r, b);
8893 uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
8894 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
8896 //no uint8 to uint16 conversion available, manual conversion used
8897 __m128i zero, r;
8898 zero = _mm_setzero_si128 ();
8899 r = _mm_unpacklo_epi8(_pM128i(a), zero);
8900 return _mm_slli_epi16 (r, b);
8903 uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
8904 _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
8906 //no uint16 to uint32 conversion available, manual conversion used
8907 __m128i zero, r;
8908 zero = _mm_setzero_si128 ();
8909 r = _mm_unpacklo_epi16(_pM128i(a), zero);
8910 return _mm_slli_epi32 (r, b);
8913 uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
8914 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
8916 //no uint32 to uint64 conversion available, manual conversion used
8917 __m128i zero, r;
8918 zero = _mm_setzero_si128 ();
8919 r = _mm_unpacklo_epi32(_pM128i(a), zero);
8920 return _mm_slli_epi64 (r, b);
8923 //************************************************************************************
8924 //**************************** Shifts with insert ************************************
8925 //************************************************************************************
8926 //takes each element in a vector, shifts it by an immediate value,
8927 //and inserts the result into the destination vector. Bits shifted out of each element are lost.
8929 //**************** Vector shift right and insert ************************************
8930 //Actually only the "c" leftmost bits of "a" remain after the shift;
8931 //all other bits are taken from b shifted right.
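//Illustration only (not compiled): a scalar sketch of the bit-level result per 8-bit lane, assuming 1 <= c <= 8;
//the helper name is hypothetical.
#if 0
static uint8_t vsri_n_u8_reference(uint8_t a, uint8_t b, int c)
{
    uint8_t keepA = (uint8_t)(0xffu << (8 - c));        //the c leftmost bits of a are kept
    return (uint8_t)((a & keepA) | ((unsigned)b >> c)); //the rest is filled with b shifted right (logically)
}
#endif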
8932 int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8933 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
8935 int8x8_t res64;
8936 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
8940 int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8941 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
8943 int16x4_t res64;
8944 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
8948 int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
8949 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
8951 int32x2_t res64;
8952 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
8956 int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
8957 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
8959 int64x1_t res;
8960 if (c ==64)
8961 res = a;
8962 else{
8963 res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
8965 return res;
8968 uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8969 #define vsri_n_u8 vsri_n_s8
8971 uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8972 #define vsri_n_u16 vsri_n_s16
8974 uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
8975 #define vsri_n_u32 vsri_n_s32
8978 uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
8979 #define vsri_n_u64 vsri_n_s64
8981 poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8982 #define vsri_n_p8 vsri_n_u8
8984 poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8985 #define vsri_n_p16 vsri_n_u16
8987 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
8988 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
8990 __m128i maskA, a_masked;
8991 uint8x16_t b_shift;
8992 _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
8993 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
8994 a_masked = _mm_and_si128 (a, maskA);
8995 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
8996 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
8999 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9000 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9002 //to keep only the "c" leftmost bits of a we shift right and then back left, zeroing its (16-c) rightmost bits
9003 uint16x8_t b_shift;
9004 uint16x8_t a_c;
9005 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9006 a_c = vshrq_n_u16( a, (16 - c));
9007 a_c = _mm_slli_epi16(a_c, (16 - c)); //the shift pair leaves the (16-c) rightmost bits of a zeroed
9008 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9011 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9012 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9014 //to keep only the "c" leftmost bits of a we shift right and then back left, zeroing its (32-c) rightmost bits
9015 uint32x4_t b_shift;
9016 uint32x4_t a_c;
9017 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9018 a_c = vshrq_n_u32( a, (32 - c));
9019 a_c = _mm_slli_epi32(a_c, (32 - c)); //the shift pair leaves the (32-c) rightmost bits of a zeroed
9020 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9023 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9024 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9026 //serial solution may be faster
9027 uint64x2_t b_shift;
9028 uint64x2_t a_c;
9029 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9030 a_c = _mm_srli_epi64(a, (64 - c));
9031 a_c = _mm_slli_epi64(a_c, (64 - c)); //the shift pair leaves the (64-c) rightmost bits of a zeroed
9032 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9035 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9036 #define vsriq_n_u8 vsriq_n_s8
9038 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9039 #define vsriq_n_u16 vsriq_n_s16
9041 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9042 #define vsriq_n_u32 vsriq_n_s32
9044 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9045 #define vsriq_n_u64 vsriq_n_s64
9047 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9048 #define vsriq_n_p8 vsriq_n_u8
9050 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9051 #define vsriq_n_p16 vsriq_n_u16
9053 //***** Vector shift left and insert *********************************************
9054 //*********************************************************************************
9055 //Actually only the "c" rightmost bits of "a" remain after the shift;
9056 //all other bits are taken from b shifted left. Trailing zeros are inserted into b by the shift, so we combine "a" and "b shifted".
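//Illustration only (not compiled): a scalar sketch of the bit-level result per 8-bit lane, assuming 0 <= c <= 7;
//the helper name is hypothetical.
#if 0
static uint8_t vsli_n_u8_reference(uint8_t a, uint8_t b, int c)
{
    uint8_t keepA = (uint8_t)((1u << c) - 1);           //the c rightmost bits of a are kept
    return (uint8_t)(((unsigned)b << c) | (a & keepA)); //the rest is filled with b shifted left
}
#endif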
9057 int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9058 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
9060 int8x8_t res64;
9061 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9065 int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9066 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
9068 int16x4_t res64;
9069 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9073 int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9074 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
9076 int32x2_t res64;
9077 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9080 int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9081 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9083 int64x1_t res;
9084 res.m64_i64[0] = (c == 0) ? b.m64_i64[0] : ((b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c))); //c==0 keeps nothing from a; otherwise treat a as unsigned so the shifts leave leading zeros
9085 return res;
9089 uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9090 #define vsli_n_u8 vsli_n_s8
9092 uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9093 #define vsli_n_u16 vsli_n_s16
9095 uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9096 #define vsli_n_u32 vsli_n_s32
9098 uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9099 #define vsli_n_u64 vsli_n_s64
9101 poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9102 #define vsli_n_p8 vsli_n_u8
9104 poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9105 #define vsli_n_p16 vsli_n_u16
9107 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9108 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9110 __m128i maskA, a_masked;
9111 int8x16_t b_shift;
9112 _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9113 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9114 b_shift = vshlq_n_s8( b, c);
9115 a_masked = _mm_and_si128 (a, maskA);
9116 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9119 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9120 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9122 //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing its (16-c) leftmost bits
9123 int16x8_t b_shift;
9124 int16x8_t a_c;
9125 b_shift = vshlq_n_s16( b, c);
9126 a_c = vshlq_n_s16( a, (16 - c));
9127 a_c = _mm_srli_epi16(a_c, (16 - c));
9128 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9131 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9132 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9134 //solution may be not optimal compared with the serial one
9135 //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing its (32-c) leftmost bits
9136 int32x4_t b_shift;
9137 int32x4_t a_c;
9138 b_shift = vshlq_n_s32( b, c);
9139 a_c = vshlq_n_s32( a, (32 - c));
9140 a_c = _mm_srli_epi32(a_c, (32 - c));
9141 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9144 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9145 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9147 //solution may be not optimal compared with the serial one
9148 //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing its (64-c) leftmost bits
9149 int64x2_t b_shift;
9150 int64x2_t a_c;
9151 b_shift = vshlq_n_s64( b, c);
9152 a_c = vshlq_n_s64( a, (64 - c));
9153 a_c = _mm_srli_epi64(a_c, (64 - c));
9154 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9157 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9158 #define vsliq_n_u8 vsliq_n_s8
9160 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9161 #define vsliq_n_u16 vsliq_n_s16
9163 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9164 #define vsliq_n_u32 vsliq_n_s32
9166 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9167 #define vsliq_n_u64 vsliq_n_s64
9169 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9170 #define vsliq_n_p8 vsliq_n_u8
9172 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9173 #define vsliq_n_p16 vsliq_n_u16
9175 // ***********************************************************************************************
9176 // ****************** Loads and stores of a single vector ***************************************
9177 // ***********************************************************************************************
9178 //Performs loads and stores of a single vector of some type.
9179 //******************************* Loads ********************************************************
9180 // ***********************************************************************************************
9181 //We assume ptr is NOT aligned in the general case and use _mm_loadu_si128 ((__m128i*) ptr).
9182 //On SSE3-capable systems _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
9183 //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
9184 //If the ptr is known to be aligned, _mm_load_si128 ((__m128i*) ptr) could be used instead.
9185 #define LOAD_SI128(ptr) \
9186 (( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)))
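//Illustration only (not compiled): typical use of LOAD_SI128; the helper name is hypothetical.
#if 0
static __m128i load_16_bytes_example(uint8_t const * p) //p does not have to be 16-byte aligned
{
    return LOAD_SI128(p); //dispatches to _mm_load_si128 or _mm_loadu_si128 at run time depending on the actual alignment
}
#endif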
9188 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9189 #define vld1q_u8 LOAD_SI128
9191 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9192 #define vld1q_u16 LOAD_SI128
9194 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9195 #define vld1q_u32 LOAD_SI128
9197 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9198 #define vld1q_u64 LOAD_SI128
9200 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9201 #define vld1q_s8 LOAD_SI128
9203 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9204 #define vld1q_s16 LOAD_SI128
9206 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9207 #define vld1q_s32 LOAD_SI128
9209 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9210 #define vld1q_s64 LOAD_SI128
9212 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9213 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
9214 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9215 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9216 __m128 f2;
9217 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9220 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9221 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9223 if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
9224 return _mm_load_ps(ptr);
9225 else
9226 return _mm_loadu_ps(ptr);
9229 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9230 #define vld1q_p8 LOAD_SI128
9232 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9233 #define vld1q_p16 LOAD_SI128
9235 uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9236 #define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9238 uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9239 #define vld1_u16 vld1_u8
9241 uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9242 #define vld1_u32 vld1_u8
9245 uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9246 #define vld1_u64 vld1_u8
9248 int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9249 #define vld1_s8 vld1_u8
9251 int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9252 #define vld1_s16 vld1_u16
9254 int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9255 #define vld1_s32 vld1_u32
9257 int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9258 #define vld1_s64 vld1_u64
9260 float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9261 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9263 float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9264 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9266 float32x2_t res;
9267 res.m64_f32[0] = *(ptr);
9268 res.m64_f32[1] = *(ptr + 1);
9269 return res;
9272 poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9273 #define vld1_p8 vld1_u8
9275 poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9276 #define vld1_p16 vld1_u16
9278 //***********************************************************************************************************
9279 //******* Lane load functions - insert the data at vector's given position (lane) *************************
9280 //***********************************************************************************************************
9281 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9282 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
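//Usage sketch (illustrative only, variable names hypothetical): overwrite a single lane of an existing
//vector with a value read from memory; the lane index must be a compile-time constant:
//  uint8_t tail = 0x2a;
//  uint8x16_t acc = vdupq_n_u8(0);
//  acc = vld1q_lane_u8(&tail, acc, 3);  //lane 3 becomes 0x2a, the other lanes keep 0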
9284 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9285 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9287 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9288 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9290 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9291 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9294 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9295 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9297 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9298 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9300 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9301 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9303 float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9304 //current IA SIMD doesn't support float16
9306 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9307 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9309 //we need to handle the case of ptr not being 16-byte aligned
9310 __m128 p;
9311 p = _mm_set1_ps(*(ptr));
9312 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
9315 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9316 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9318 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9319 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9321 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9322 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9324 uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9325 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9327 uint8x8_t res;
9328 res = vec;
9329 res.m64_u8[lane] = *(ptr);
9330 return res;
9333 uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9334 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9336 uint16x4_t res;
9337 res = vec;
9338 res.m64_u16[lane] = *(ptr);
9339 return res;
9342 uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9343 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9345 uint32x2_t res;
9346 res = vec;
9347 res.m64_u32[lane] = *(ptr);
9348 return res;
9351 uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9352 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9354 uint64x1_t res;
9355 res.m64_u64[0] = *(ptr);
9356 return res;
9360 int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9361 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9363 int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9364 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9366 int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9367 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9369 float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9370 //current IA SIMD doesn't support float16
9372 float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9373 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9375 float32x2_t res;
9376 res = vec;
9377 res.m64_f32[lane] = *(ptr);
9378 return res;
9381 int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9382 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9384 poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9385 #define vld1_lane_p8 vld1_lane_u8
9387 poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9388 #define vld1_lane_p16 vld1_lane_s16
9390 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
9391 // ******************************************************************************************************************
9392 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9393 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
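//Usage sketch (illustrative, names hypothetical): broadcast one scalar from memory to every lane,
//the x86 counterpart of VLD1.8 {d0[]}:
//  uint8_t key = 0x2a;
//  uint8x16_t all_key = vld1q_dup_u8(&key);  //all 16 lanes equal 0x2a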
9395 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9396 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9398 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9399 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9401 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9402 _NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9404 _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
9405 return LOAD_SI128(val);
9408 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9409 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9411 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9412 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9414 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9415 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9417 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9418 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9420 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9421 //current IA SIMD doesn't support float16, need to go to 32 bits
9423 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9424 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9426 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9427 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9429 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9430 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9432 uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9433 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9435 uint8x8_t res;
9436 int i;
9437 for(i = 0; i<8; i++) {
9438 res.m64_u8[i] = *(ptr);
9440 return res;
9443 uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9444 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9446 uint16x4_t res;
9447 int i;
9448 for(i = 0; i<4; i++) {
9449 res.m64_u16[i] = *(ptr);
9451 return res;
9454 uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9455 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9457 uint32x2_t res;
9458 res.m64_u32[0] = *(ptr);
9459 res.m64_u32[1] = *(ptr);
9460 return res;
9463 uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9464 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9466 uint64x1_t res;
9467 res.m64_u64[0] = *(ptr);
9468 return res;
9471 int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9472 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9475 int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9476 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9479 int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9480 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9483 int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9484 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9486 float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9487 //current IA SIMD doesn't support float16
9489 float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9490 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9492 float32x2_t res;
9493 res.m64_f32[0] = *(ptr);
9494 res.m64_f32[1] = res.m64_f32[0];
9495 return res; // use last 64bits only
9498 poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9499 #define vld1_dup_p8 vld1_dup_u8
9502 poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9503 #define vld1_dup_p16 vld1_dup_u16
9506 //*************************************************************************************
9507 //********************************* Store **********************************************
9508 //*************************************************************************************
9509 // If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
9510 //here we assume the case of a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
9511 #define STORE_SI128(ptr, val) \
9512 (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
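//Sketch of the cache-pollution hint above (illustrative only, destination name hypothetical): for a
//16-byte aligned destination a non-temporal store may be used instead of STORE_SI128:
//  _NEON2SSE_ALIGN_16 uint8_t dst[16];
//  _mm_stream_si128((__m128i*)dst, _mm_setzero_si128());  //bypasses the cache; pair with _mm_sfence() if ordering matters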
9514 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9515 #define vst1q_u8 STORE_SI128
9517 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9518 #define vst1q_u16 STORE_SI128
9520 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9521 #define vst1q_u32 STORE_SI128
9523 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9524 #define vst1q_u64 STORE_SI128
9526 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9527 #define vst1q_s8 STORE_SI128
9529 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9530 #define vst1q_s16 STORE_SI128
9532 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9533 #define vst1q_s32 STORE_SI128
9535 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9536 #define vst1q_s64 STORE_SI128
9538 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9539 // IA32 SIMD doesn't work with 16bit floats currently
9541 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9542 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9544 if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
9545 _mm_store_ps (ptr, val);
9546 else
9547 _mm_storeu_ps (ptr, val);
9550 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9551 #define vst1q_p8 vst1q_u8
9553 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9554 #define vst1q_p16 vst1q_u16
9556 void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9557 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9559 int i;
9560 for (i = 0; i<8; i++) {
9561 *(ptr + i) = ((uint8_t*)&val)[i];
9563 //_mm_storel_epi64((__m128i*)ptr, val);
9564 return;
9567 void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9568 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9570 int i;
9571 for (i = 0; i<4; i++) {
9572 *(ptr + i) = ((uint16_t*)&val)[i];
9574 //_mm_storel_epi64((__m128i*)ptr, val);
9575 return;
9578 void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9579 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9581 int i;
9582 for (i = 0; i<2; i++) {
9583 *(ptr + i) = ((uint32_t*)&val)[i];
9585 //_mm_storel_epi64((__m128i*)ptr, val);
9586 return;
9589 void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9590 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9592 *(ptr) = *((uint64_t*)&val);
9593 //_mm_storel_epi64((__m128i*)ptr, val);
9594 return;
9597 void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9598 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9600 void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9601 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9603 void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9604 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9606 void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9607 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9609 void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9610 //current IA SIMD doesn't support float16
9612 void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9613 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9615 *(ptr) = val.m64_f32[0];
9616 *(ptr + 1) = val.m64_f32[1];
9617 return;
9620 void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9621 #define vst1_p8 vst1_u8
9623 void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9624 #define vst1_p16 vst1_u16
9626 //***********Store a lane of a vector into memory (extract given lane) *********************
9627 //******************************************************************************************
9628 void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9629 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
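//Usage sketch (illustrative, names hypothetical): write a single lane straight to memory, mirroring VST1.8 {d0[0]}:
//  uint8_t out;
//  vst1q_lane_u8(&out, vdupq_n_u8(7), 0);  //out becomes 7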
9631 void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9632 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
9634 void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9635 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9637 void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9638 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9640 void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9641 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
9643 void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9644 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
9646 void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9647 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9649 void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9650 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9652 void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9653 //current IA SIMD doesn't support float16
9655 void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9656 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9658 int32_t ilane;
9659 ilane = _MM_EXTRACT_PS(val,lane);
9660 *(ptr) = *((float*)&ilane);
9663 void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9664 #define vst1q_lane_p8 vst1q_lane_u8
9666 void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9667 #define vst1q_lane_p16 vst1q_lane_s16
9669 void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9670 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9672 *(ptr) = val.m64_u8[lane];
9675 void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9676 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9678 *(ptr) = val.m64_u16[lane];
9681 void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9682 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9684 *(ptr) = val.m64_u32[lane];
9687 void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9688 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9690 *(ptr) = val.m64_u64[0];
9693 void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9694 #define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9696 void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9697 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9699 void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9700 #define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
9703 void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9704 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9707 void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9708 //current IA SIMD doesn't support float16
9710 void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9711 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9713 *(ptr) = val.m64_f32[lane];
9716 void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9717 #define vst1_lane_p8 vst1_lane_u8
9719 void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9720 #define vst1_lane_p16 vst1_lane_s16
9722 //***********************************************************************************************
9723 //**************** Loads and stores of an N-element structure **********************************
9724 //***********************************************************************************************
9725 //These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file.
9726 //We assume ptr is NOT aligned in the general case; for more details see "Loads and stores of a single vector functions"
9727 //****************** 2 elements load *********************************************
9728 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9729 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9731 uint8x16x2_t v;
9732 v.val[0] = vld1q_u8(ptr);
9733 v.val[1] = vld1q_u8((ptr + 16));
9734 v = vuzpq_s8(v.val[0], v.val[1]);
9735 return v;
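//Usage sketch (illustrative, buffer name hypothetical): de-interleave 16 {even, odd} byte pairs so that
//val[0] receives the even-indexed elements and val[1] the odd-indexed ones:
//  uint8_t interleaved[32]; /* ...filled elsewhere... */
//  uint8x16x2_t planes = vld2q_u8(interleaved);  //planes.val[0] = elements 0,2,4,..., planes.val[1] = 1,3,5,...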
9738 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9739 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9741 uint16x8x2_t v;
9742 v.val[0] = vld1q_u16( ptr);
9743 v.val[1] = vld1q_u16( (ptr + 8));
9744 v = vuzpq_s16(v.val[0], v.val[1]);
9745 return v;
9748 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9749 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9751 uint32x4x2_t v;
9752 v.val[0] = vld1q_u32 ( ptr);
9753 v.val[1] = vld1q_u32 ( (ptr + 4));
9754 v = vuzpq_s32(v.val[0], v.val[1]);
9755 return v;
9758 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9759 #define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9761 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9762 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9764 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9765 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9768 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9769 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
9771 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9772 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9774 float32x4x2_t v;
9775 v.val[0] = vld1q_f32 (ptr);
9776 v.val[1] = vld1q_f32 ((ptr + 4));
9777 v = vuzpq_f32(v.val[0], v.val[1]);
9778 return v;
9781 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9782 #define vld2q_p8 vld2q_u8
9784 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9785 #define vld2q_p16 vld2q_u16
9787 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9788 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9790 uint8x8x2_t v;
9791 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
9792 __m128i ld128;
9793 ld128 = vld1q_u8(ptr); //load both 64-bit halves as one 128-bit value
9794 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
9795 vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9796 return v;
9799 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9800 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9802 _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9803 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
9804 __m128i ld128;
9805 ld128 = vld1q_u16(ptr); //load both 64-bit halves as one 128-bit value
9806 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
9807 vst1q_u16((v.val), ld128);
9808 return v;
9811 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9812 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9814 _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9815 __m128i ld128;
9816 ld128 = vld1q_u32(ptr); //load both 64-bit halves as one 128-bit value
9817 ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
9818 vst1q_u32((v.val), ld128);
9819 return v;
9822 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9823 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9825 uint64x1x2_t v;
9826 v.val[0].m64_u64[0] = *(ptr);
9827 v.val[1].m64_u64[0] = *(ptr + 1);
9828 return v;
9831 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9832 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9834 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9835 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9837 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9838 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9840 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9841 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9843 float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9844 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
9846 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9847 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9849 float32x2x2_t v;
9850 v.val[0].m64_f32[0] = *(ptr);
9851 v.val[0].m64_f32[1] = *(ptr + 2);
9852 v.val[1].m64_f32[0] = *(ptr + 1);
9853 v.val[1].m64_f32[1] = *(ptr + 3);
9854 return v;
9857 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9858 #define vld2_p8 vld2_u8
9860 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9861 #define vld2_p16 vld2_u16
9863 //******************** Triplets ***************************************
9864 //*********************************************************************
9865 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
9866 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
9868 //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
9869 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
9870 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
9871 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
9872 uint8x16x3_t v;
9873 __m128i tmp0, tmp1,tmp2, tmp3;
9874 _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
9875 _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
9876 _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
9878 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
9879 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
9880 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
9882 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
9883 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
9884 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
9886 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
9887 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
9888 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
9889 tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
9890 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
9891 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
9893 tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
9894 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
9895 v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
9896 v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
9897 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
9898 v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
9899 v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
9900 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
9901 tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
9902 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
9904 tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
9905 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
9906 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
9907 v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
9908 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
9909 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
9910 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
9911 return v;
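//Usage sketch (illustrative, names hypothetical): de-interleave 16 packed RGB pixels into separate planes:
//  uint8_t rgb[48]; /* R0,G0,B0,R1,G1,B1,... */
//  uint8x16x3_t planes = vld3q_u8(rgb);  //val[0]=R, val[1]=G, val[2]=B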
9914 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9915 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
9917 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
9918 uint16x8x3_t v;
9919 __m128i tmp0, tmp1,tmp2, tmp3;
9920 _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
9921 _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
9922 _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
9924 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
9925 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
9926 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
9928 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
9929 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
9930 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
9932 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
9933 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
9934 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
9935 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
9936 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
9937 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
9939 tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
9940 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
9941 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
9942 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
9943 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
9944 v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
9945 v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
9946 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
9947 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
9948 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
9950 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
9951 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
9952 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
9953 v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
9954 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
9955 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
9956 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
9957 return v;
9960 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
9961 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
9963 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
9964 uint32x4x3_t v;
9965 __m128i tmp0, tmp1,tmp2, tmp3;
9966 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
9967 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
9968 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
9970 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
9971 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
9972 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
9974 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
9975 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
9976 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
9977 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
9978 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
9979 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
9980 return v;
9983 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
9984 #define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
9986 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9987 #define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
9989 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
9990 #define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
9992 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9993 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
9995 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
9996 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
9998 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
9999 float32x4x3_t v;
10000 __m128 tmp0, tmp1,tmp2, tmp3;
10001 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
10002 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10003 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10005 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10006 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10007 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10008 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10010 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10011 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10012 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10013 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10014 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10015 return v;
10018 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10019 #define vld3q_p8 vld3q_u8
10021 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10022 #define vld3q_p16 vld3q_u16
10024 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10025 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10027 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10028 uint8x8x3_t v;
10029 __m128i val0, val1, val2, tmp0, tmp1;
10030 _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10031 _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10032 val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10033 val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10035 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10036 tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10037 val0 = _mm_slli_si128(tmp0,10);
10038 val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10039 val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10040 val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10041 _M64(v.val[0], val0);
10042 val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
10043 val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10044 val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10045 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10046 val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10047 _M64(v.val[1], val1);
10049 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10050 val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10051 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10052 val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10053 _M64(v.val[2], val2);
10054 return v;
10057 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10058 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10060 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10061 uint16x4x3_t v;
10062 __m128i val0, val1, val2, tmp0, tmp1;
10063 _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10064 val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
10065 val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10067 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10068 tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
10069 val0 = _mm_slli_si128(tmp0,10);
10070 val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10071 val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10072 val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10073 val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10074 _M64(v.val[0], val0);
10076 val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
10077 val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10078 val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10079 val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10080 val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10081 _M64(v.val[1], val1);
10083 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10084 tmp1 = _mm_srli_si128(tmp1,4);
10085 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10086 val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10087 _M64(v.val[2], val2);
10088 return v;
10091 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10092 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10094 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10095 uint32x2x3_t v;
10096 __m128i val0, val1, val2;
10097 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10098 val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10100 val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10101 _M64(v.val[0], val0);
10102 val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
10103 val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10104 _M64(v.val[1], val1);
10105 val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
10106 _M64(v.val[2], val2);
10107 return v;
10109 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10110 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10112 uint64x1x3_t v;
10113 v.val[0].m64_u64[0] = *(ptr);
10114 v.val[1].m64_u64[0] = *(ptr + 1);
10115 v.val[2].m64_u64[0] = *(ptr + 2);
10116 return v;
10119 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10120 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10122 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10123 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10125 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10126 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10128 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10129 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10131 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10132 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10134 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10135 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10137 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10138 float32x2x3_t v;
10139 v.val[0].m64_f32[0] = *(ptr);
10140 v.val[0].m64_f32[1] = *(ptr + 3);
10142 v.val[1].m64_f32[0] = *(ptr + 1);
10143 v.val[1].m64_f32[1] = *(ptr + 4);
10145 v.val[2].m64_f32[0] = *(ptr + 2);
10146 v.val[2].m64_f32[1] = *(ptr + 5);
10147 return v;
10150 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10151 #define vld3_p8 vld3_u8
10153 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10154 #define vld3_p16 vld3_u16
10156 //*************** Quadruples load ********************************
10157 //*****************************************************************
10158 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10159 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10161 uint8x16x4_t v;
10162 __m128i tmp3, tmp2, tmp1, tmp0;
10164 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10165 v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10166 v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10167 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10169 tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10170 tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10171 tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10172 tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10174 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
10175 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10176 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10177 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10179 tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10180 tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10181 tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10182 tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10184 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10185 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10186 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10187 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10188 return v;
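//Usage sketch (illustrative, names hypothetical): de-interleave 16 packed RGBA pixels into separate planes:
//  uint8_t rgba[64]; /* R0,G0,B0,A0,R1,... */
//  uint8x16x4_t planes = vld4q_u8(rgba);  //val[0]=R, val[1]=G, val[2]=B, val[3]=A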
10191 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10192 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10194 uint16x8x4_t v;
10195 __m128i tmp3, tmp2, tmp1, tmp0;
10196 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
10197 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10198 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10199 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10200 v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10201 v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10202 v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10203 v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10204 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10205 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10206 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10207 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10208 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10209 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10210 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10211 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10212 return v;
10215 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10216 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10218 uint32x4x4_t v;
10219 __m128i tmp3, tmp2, tmp1, tmp0;
10220 v.val[0] = vld1q_u32 (ptr);
10221 v.val[1] = vld1q_u32 ((ptr + 4));
10222 v.val[2] = vld1q_u32 ((ptr + 8));
10223 v.val[3] = vld1q_u32 ((ptr + 12));
10224 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10225 tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10226 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10227 tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10228 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10229 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10230 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10231 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10232 return v;
10235 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10236 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10238 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10239 #define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10241 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10242 #define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10244 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10245 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10247 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10248 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10250 float32x4x4_t v;
10251 __m128 tmp3, tmp2, tmp1, tmp0;
10253 v.val[0] = vld1q_f32 ((float*) ptr);
10254 v.val[1] = vld1q_f32 ((float*) (ptr + 4));
10255 v.val[2] = vld1q_f32 ((float*) (ptr + 8));
10256 v.val[3] = vld1q_f32 ((float*) (ptr + 12));
10257 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10258 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10259 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10260 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10261 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10262 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10263 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10264 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10265 return v;
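//Note (added sketch, not from the original header): the unpack/movelh/movehl sequence above is the classic
//4x4 float transpose; an equivalent formulation using the xmmintrin helper macro would be:
//  __m128 r0 = vld1q_f32(ptr), r1 = vld1q_f32(ptr + 4), r2 = vld1q_f32(ptr + 8), r3 = vld1q_f32(ptr + 12);
//  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  //r0..r3 now hold val[0]..val[3]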
10268 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10269 #define vld4q_p8 vld4q_u8
10271 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10272 #define vld4q_p16 vld4q_s16
10274 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10275 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10277 uint8x8x4_t v;
10278 __m128i sh0, sh1;
10279 __m128i val0, val2;
10280 _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10282 val0 = vld1q_u8(( ptr)); //load the first and second 64-bit halves (future val[0], val[1])
10283 val2 = vld1q_u8(( ptr + 16)); //load the third and fourth 64-bit halves (future val[2], val[3])
10285 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10286 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10287 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10288 vst1q_u8(&v.val[0], val0 );
10289 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10290 vst1q_u8(&v.val[2], val2 );
10291 return v;
10294 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10295 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10297 uint16x4x4_t v;
10298 __m128i sh0, sh1;
10299 __m128i val0, val2;
10300 _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
10301 val0 = vld1q_u16 ( (ptr)); //load the first and second 64-bit halves (future val[0], val[1])
10302 val2 = vld1q_u16 ( (ptr + 8)); //load the third and fourth 64-bit halves (future val[2], val[3])
10303 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10304 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10305 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10306 vst1q_u16(&v.val[0], val0 );
10307 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10308 vst1q_u16(&v.val[2], val2 );
10309 return v;
10312 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10313 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10315 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10316 uint32x2x4_t v;
10317 __m128i val0, val01, val2;
10318 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10319 val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10320 val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10321 val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10322 vst1q_u32(&v.val[0], val01);
10323 vst1q_u32(&v.val[2], val2 );
10324 return v;
10327 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10328 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10330 uint64x1x4_t v;
10331 v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit value into val[0]
10332 v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit value into val[1]
10333 v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit value into val[2]
10334 v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit value into val[3]
10335 return v;
10338 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10339 #define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10341 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10342 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10344 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10345 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10347 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10348 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10350 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10351 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10353 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10354 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10356 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10357 float32x2x4_t res;
10358 res.val[0].m64_f32[0] = *(ptr);
10359 res.val[0].m64_f32[1] = *(ptr + 4);
10360 res.val[1].m64_f32[0] = *(ptr + 1);
10361 res.val[1].m64_f32[1] = *(ptr + 5);
10362 res.val[2].m64_f32[0] = *(ptr + 2);
10363 res.val[2].m64_f32[1] = *(ptr + 6);
10364 res.val[3].m64_f32[0] = *(ptr + 3);
10365 res.val[3].m64_f32[1] = *(ptr + 7);
10366 return res;
10369 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10370 #define vld4_p8 vld4_u8
10372 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10373 #define vld4_p16 vld4_u16
10375 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10376 //*******************************************************************************************************************
10377 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10378 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10380 uint8x8x2_t v;
10381 __m128i val0, val1;
10382 val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10383 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10384 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10385 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10386 vst1q_u8(v.val, val0);
10387 return v;
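//Usage sketch (illustrative, array name hypothetical): broadcast two adjacent scalars, ptr[0] into every
//lane of val[0] and ptr[1] into every lane of val[1]:
//  uint8_t pair[2] = {1, 2};
//  uint8x8x2_t dup = vld2_dup_u8(pair);  //val[0] = {1,1,1,1,1,1,1,1}, val[1] = {2,2,2,2,2,2,2,2}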
10390 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10391 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10393 uint16x4x2_t v;
10394 __m128i val0, val1;
10395 val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10396 val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10397 _M64(v.val[0], val0);
10398 val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10399 _M64(v.val[1], val1);
10400 return v;
10403 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10404 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10406 uint32x2x2_t v;
10407 __m128i val0;
10408 val0 = LOAD_SI128(ptr); //0,1,x,x
10409 val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10410 vst1q_u32(v.val, val0);
10411 return v;
10414 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10415 #define vld2_dup_u64 vld2_u64
10417 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10418 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10420 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10421 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10423 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10424 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10426 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10427 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10429 float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10430 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10432 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10433 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10435 float32x2x2_t v;
10436 v.val[0].m64_f32[0] = *(ptr); //0,0
10437 v.val[0].m64_f32[1] = *(ptr); //0,0
10438 v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10439 v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10440 return v;
10443 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10444 #define vld2_dup_p8 vld2_dup_u8
10446 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10447 #define vld2_dup_p16 vld2_dup_s16
10449 //************* Duplicate (or propagate) triplets: *******************
10450 //********************************************************************
10451 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
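//An illustrative sketch (buf is a hypothetical caller buffer, not part of this header):
//    uint16_t buf[3] = {1, 2, 3};
//    uint16x4x3_t d = vld3_dup_u16(buf); //d.val[0] = {1,1,1,1}, d.val[1] = {2,2,2,2}, d.val[2] = {3,3,3,3}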
10452 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10453 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10455 uint8x8x3_t v;
10456 __m128i val0, val1, val2;
10457 val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10458 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10459 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10460 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10461 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10462 vst1q_u8(v.val, val0);
10463 _M64(v.val[2], val2);
10464 return v;
10467 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10468 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10470 uint16x4x3_t v;
10471 __m128i val0, val1, val2;
10472 val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10473 val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10474 val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10475 val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10476 _M64(v.val[0], val0);
10477 _M64(v.val[1], val1);
10478 _M64(v.val[2], val2);
10479 return v;
10482 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10483 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10485 uint32x2x3_t v;
10486 __m128i val0, val1, val2;
10487 val2 = LOAD_SI128(ptr); //0,1,2,x
10488 val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10489 val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10490 val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10491 _M64(v.val[0], val0);
10492 _M64(v.val[1], val1);
10493 _M64(v.val[2], val2);
10494 return v;
10497 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10498 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10500 uint64x1x3_t v;
10501 v.val[0].m64_u64[0] = *(ptr);
10502 v.val[1].m64_u64[0] = *(ptr + 1);
10503 v.val[2].m64_u64[0] = *(ptr + 2);
10504 return v;
10507 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10508 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10510 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10511 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10513 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10514 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10516 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10517 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10520 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10521 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10523 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10524 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10526 float32x2x3_t v;
10527 int i;
10528 for (i = 0; i<3; i++) {
10529 v.val[i].m64_f32[0] = *(ptr + i);
10530 v.val[i].m64_f32[1] = *(ptr + i);
10532 return v;
10535 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10536 #define vld3_dup_p8 vld3_dup_u8
10538 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10539 #define vld3_dup_p16 vld3_dup_s16
10542 //************* Duplicate (or propagate) quadruples: *******************
10543 //***********************************************************************
10544 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
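//An illustrative sketch (buf is a hypothetical caller buffer, not part of this header):
//    uint32_t buf[4] = {1, 2, 3, 4};
//    uint32x2x4_t d = vld4_dup_u32(buf); //d.val[0] = {1,1}, d.val[1] = {2,2}, d.val[2] = {3,3}, d.val[3] = {4,4}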
10545 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10546 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10548 uint8x8x4_t v;
10549 __m128i val0, val1, val2;
10550 val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10551 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10552 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10553 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10554 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10555 vst1q_u8(&v.val[0], val0);
10556 vst1q_u8(&v.val[2], val2);
10557 return v;
10560 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10561 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10563 uint16x4x4_t v;
10564 __m128i val0, val1, val2, val3;
10565 val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10566 val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10567 val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10568 val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10569 val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10570 _M64(v.val[0], val0);
10571 _M64(v.val[1], val1);
10572 _M64(v.val[2], val2);
10573 _M64(v.val[3], val3);
10574 return v;
10577 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10578 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10580 uint32x2x4_t v;
10581 __m128i val0, val1, val2, val3;
10582 val3 = LOAD_SI128(ptr); //0,1,2,3
10583 val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10584 val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10585 val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
10586 val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10587 _M64(v.val[0], val0);
10588 _M64(v.val[1], val1);
10589 _M64(v.val[2], val2);
10590 _M64(v.val[3], val3);
10591 return v;
10594 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10595 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10597 uint64x1x4_t v;
10598 v.val[0].m64_u64[0] = *(ptr);
10599 v.val[1].m64_u64[0] = *(ptr + 1);
10600 v.val[2].m64_u64[0] = *(ptr + 2);
10601 v.val[3].m64_u64[0] = *(ptr + 3);
10602 return v;
10605 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10606 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10608 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10609 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10611 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10612 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10614 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10615 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10617 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10618 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10620 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10621 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10623 float32x2x4_t v;
10624 int i;
10625 for (i = 0; i<4; i++) {
10626 v.val[i].m64_f32[0] = *(ptr + i);
10627 v.val[i].m64_f32[1] = *(ptr + i);
10629 return v;
10632 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10633 #define vld4_dup_p8 vld4_dup_u8
10635 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10636 #define vld4_dup_p16 vld4_dup_u16
10639 //**********************************************************************************
10640 //*******************Lane loads for an N-element structures ***********************
10641 //**********************************************************************************
10642 //********************** Lane pairs ************************************************
10643 //does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
10644 //we assume src is 16-byte aligned
10646 //!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, resulting in a "formal parameter with __declspec(align('16')) won't be aligned" error
10647 //to work around this, all the functions below take xxxxxx_2t pointers, and the corresponding original functions are redefined as macros
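//An illustrative sketch of the workaround (buf and src are hypothetical caller variables, not part of this header):
//    uint16_t buf[2];
//    uint16x8x2_t src, r;
//    r = vld2q_lane_u16(buf, src, 3); //expands to vld2q_lane_u16_ptr(buf, &src, 3)
//so user code keeps the standard NEON signature while the structure is actually passed by pointer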
10649 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10650 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10652 uint16x8x2_t v;
10653 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10654 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10655 return v;
10657 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
10659 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10660 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10662 uint32x4x2_t v;
10663 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10664 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10665 return v;
10667 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10669 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10670 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10672 int16x8x2_t v;
10673 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10674 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10675 return v;
10677 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10679 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10680 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10682 int32x4x2_t v;
10683 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10684 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10685 return v;
10687 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10689 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10690 //current IA SIMD doesn't support float16
10692 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10693 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10695 float32x4x2_t v;
10696 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10697 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10698 return v;
10700 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10702 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10703 #define vld2q_lane_p16 vld2q_lane_u16
10705 //uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10706 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10708 uint8x8x2_t v;
10709 v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
10710 v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
10711 return v;
10713 #define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
10715 //uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10716 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
10718 uint16x4x2_t v;
10719 v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
10720 v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
10721 return v;
10723 #define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
10725 //uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10726 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
10728 uint32x2x2_t v;
10729 v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
10730 v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
10731 return v;
10733 #define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
10735 //int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10736 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10737 #define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
10739 //int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10740 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10741 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10743 //int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10744 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10745 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10747 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10748 //current IA SIMD doesn't support float16
10750 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10751 _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
10753 float32x2x2_t v;
10754 v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
10755 v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
10756 return v;
10758 #define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
10760 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10761 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10762 #define vld2_lane_p8 vld2_lane_u8
10764 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10765 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10766 #define vld2_lane_p16 vld2_lane_u16
10768 //*********** Lane triplets **********************
10769 //*************************************************
10770 //does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
10771 //we assume src is 16-byte aligned
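//An illustrative sketch (buf and src are hypothetical caller variables, not part of this header):
//    uint16_t buf[3] = {10, 20, 30};
//    uint16x4x3_t src, r;
//    r = vld3_lane_u16(buf, src, 2); //r is a copy of src with lane 2 of val[0]/val[1]/val[2] replaced by 10/20/30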
10773 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10774 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10776 uint16x8x3_t v;
10777 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10778 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10779 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10780 return v;
10782 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10784 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10785 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10787 uint32x4x3_t v;
10788 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10789 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10790 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10791 return v;
10793 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
10795 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10796 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10798 int16x8x3_t v;
10799 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10800 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10801 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10802 return v;
10804 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10806 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10807 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10809 int32x4x3_t v;
10810 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10811 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10812 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10813 return v;
10815 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10817 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10818 //current IA SIMD doesn't support float16
10819 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10822 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10823 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10825 float32x4x3_t v;
10826 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10827 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10828 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10829 return v;
10831 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10833 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10834 #define vld3q_lane_p16 vld3q_lane_u16
10836 //uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10837 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10839 uint8x8x3_t v;
10840 v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
10841 v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
10842 v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
10843 return v;
10845 #define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
10847 //uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10848 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10850 uint16x4x3_t v;
10851 v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
10852 v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
10853 v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
10854 return v;
10856 #define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
10858 //uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10859 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10861 //need to merge into 128 bit anyway
10862 uint32x2x3_t v;
10863 v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
10864 v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
10865 v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
10866 return v;
10868 #define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
10870 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10871 #define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
10873 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10874 #define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
10876 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10877 #define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
10879 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10880 //current IA SIMD doesn't support float16
10882 //float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10883 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10885 float32x2x3_t v;
10886 v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
10887 v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
10888 v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
10889 return v;
10891 #define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
10893 //poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10894 #define vld3_lane_p8 vld3_lane_u8
10896 //poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10897 #define vld3_lane_p16 vld3_lane_u16
10899 //******************* Lane Quadruples load ***************************
10900 //*********************************************************************
10901 //does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
10902 //we assume src is 16-byte aligned
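//An illustrative sketch (buf and src are hypothetical caller variables, not part of this header):
//    uint32_t buf[4] = {10, 20, 30, 40};
//    uint32x2x4_t src, r;
//    r = vld4_lane_u32(buf, src, 1); //r is a copy of src with lane 1 of val[0]..val[3] replaced by 10, 20, 30, 40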
10904 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10905 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
10907 uint16x8x4_t v;
10908 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10909 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10910 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10911 v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
10912 return v;
10914 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
10916 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10917 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
10919 uint32x4x4_t v;
10920 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10921 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10922 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10923 v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
10924 return v;
10926 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
10928 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10929 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10930 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
10932 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10933 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10934 #define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
10936 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10937 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10938 //current IA SIMD doesn't support float16
10940 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10941 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
10943 float32x4x4_t v;
10944 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10945 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10946 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10947 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
10948 return v;
10950 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
10952 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10953 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10954 #define vld4q_lane_p16 vld4q_lane_u16
10956 //uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
10957 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
10959 uint8x8x4_t v;
10960 v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
10961 v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
10962 v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
10963 v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
10964 return v;
10966 #define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
10968 //uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
10969 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
10971 uint16x4x4_t v;
10972 v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
10973 v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
10974 v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
10975 v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
10976 return v;
10978 #define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
10980 //uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
10981 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
10983 uint32x2x4_t v;
10984 v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
10985 v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
10986 v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
10987 v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
10988 return v;
10990 #define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
10992 //int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
10993 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
10994 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
10996 //int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
10997 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
10998 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11000 //int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11001 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
11002 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11004 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11005 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11006 //current IA SIMD doesn't support float16
11008 //float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11009 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
11011 //serial solution may be faster
11012 float32x2x4_t v;
11013 v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
11014 v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
11015 v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
11016 v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
11017 return v;
11019 #define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
11021 //poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11022 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
11023 #define vld4_lane_p8 vld4_lane_u8
11025 //poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11026 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
11027 #define vld4_lane_p16 vld4_lane_u16
11029 //******************* Store duplets *********************************************
11030 //********************************************************************************
11031 //here we assume the ptr is 16-byte aligned. If it is not, _mm_storeu_si128 must be used instead, as shown in the vst1q_u8 function
11032 //If necessary, modify all the store functions accordingly. See further comments for the "Store single" functions
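//An illustrative sketch (ptr and val are hypothetical caller variables, not part of this header):
//if val.val[0] = {a0,a1,a2,a3} and val.val[1] = {b0,b1,b2,b3}, then
//    vst2q_u32(ptr, val);
//writes the interleaved sequence a0,b0,a1,b1,a2,b2,a3,b3 to ptr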
11033 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11034 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
11036 uint8x16x2_t v;
11037 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11038 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11039 vst1q_u8 (ptr, v.val[0]);
11040 vst1q_u8 ((ptr + 16), v.val[1]);
11042 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11044 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11045 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
11047 uint16x8x2_t v;
11048 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11049 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11050 vst1q_u16 (ptr, v.val[0]);
11051 vst1q_u16 ((ptr + 8), v.val[1]);
11053 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11055 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11056 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
11058 uint32x4x2_t v;
11059 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11060 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11061 vst1q_u32 (ptr, v.val[0]);
11062 vst1q_u32 ((ptr + 4), v.val[1]);
11064 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
11066 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11067 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
11068 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11070 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11071 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
11072 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11074 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11075 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
11076 #define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
11078 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11079 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
11080 // IA32 SIMD doesn't work with 16bit floats currently
11082 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11083 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
11085 float32x4x2_t v;
11086 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11087 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11088 vst1q_f32 (ptr, v.val[0]);
11089 vst1q_f32 ((ptr + 4), v.val[1]);
11091 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11093 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11094 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
11095 #define vst2q_p8 vst2q_u8
11097 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11098 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
11099 #define vst2q_p16 vst2q_u16
11101 //void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11102 _NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
11104 __m128i v0;
11105 v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
11106 vst1q_u8 (ptr, v0);
11108 #define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
11110 //void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11111 _NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
11113 __m128i v0;
11114 v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
11115 vst1q_u16 (ptr, v0);
11117 #define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
11119 //void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11120 _NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
11122 __m128i v0;
11123 v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
11124 vst1q_u32 (ptr, v0);
11126 #define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
11129 //void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11130 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
11131 _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
11133 *(ptr) = val->val[0].m64_u64[0];
11134 *(ptr + 1) = val->val[1].m64_u64[0];
11136 #define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
11138 //void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11139 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11141 //void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11142 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11144 //void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11145 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11147 //void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11148 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11150 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11151 //current IA SIMD doesn't support float16
11153 //void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11154 _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
11156 *(ptr) = val->val[0].m64_f32[0];
11157 *(ptr + 1) = val->val[1].m64_f32[0];
11158 *(ptr + 2) = val->val[0].m64_f32[1];
11159 *(ptr + 3) = val->val[1].m64_f32[1];
11161 #define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
11163 //void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
11164 #define vst2_p8 vst2_u8
11166 //void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
11167 #define vst2_p16 vst2_u16
11169 //******************** Triplets store *****************************************
11170 //******************************************************************************
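//An illustrative sketch (ptr and val are hypothetical caller variables, not part of this header):
//if val.val[0] = {a0..a3}, val.val[1] = {b0..b3} and val.val[2] = {c0..c3}, then
//    vst3q_u32(ptr, val);
//writes a0,b0,c0, a1,b1,c1, a2,b2,c2, a3,b3,c3 to ptr; the 8- and 16-bit versions below build the same
//3-way interleave with shuffles and byte blends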
11171 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11172 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
11174 uint8x16x3_t v;
11175 __m128i v0,v1,v2, cff, bldmask;
11176 _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11177 _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11178 _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11179 _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11180 _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11181 _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11183 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11184 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
11185 v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
11186 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11187 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11188 cff = _mm_cmpeq_epi8(v0, v0); //all ff
11189 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11190 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11191 vst1q_u8(ptr, v.val[0]);
11192 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11193 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11194 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11195 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11196 vst1q_u8((ptr + 16), v.val[1]);
11197 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11198 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11199 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11200 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11201 vst1q_u8((ptr + 32), v.val[2]);
11203 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11205 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11206 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
11208 uint16x8x3_t v;
11209 __m128i v0,v1,v2, cff, bldmask;
11210 _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11211 _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11212 _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11213 _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11214 _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11215 _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11217 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11218 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11219 v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11220 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11221 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11222 cff = _mm_cmpeq_epi16(v0, v0); //all ff
11223 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11224 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11225 vst1q_u16(ptr, v.val[0]);
11226 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11227 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11228 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11229 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11230 vst1q_u16((ptr + 8), v.val[1]);
11231 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11232 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11233 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11234 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11235 vst1q_u16((ptr + 16), v.val[2]);
11237 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11239 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11240 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
11242 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11243 uint32x4x3_t v;
11244 __m128i tmp0, tmp1,tmp2;
11245 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11246 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11247 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11248 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11249 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11250 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11251 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11252 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11254 vst1q_u32(ptr, v.val[0]);
11255 vst1q_u32((ptr + 4), v.val[1]);
11256 vst1q_u32((ptr + 8), v.val[2]);
11258 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
11260 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11261 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
11262 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11264 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11265 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
11266 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11268 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11269 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
11270 #define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
11272 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11273 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
11274 // IA32 SIMD doesn't work with 16bit floats currently
11276 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11277 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
11279 float32x4x3_t v;
11280 __m128 tmp0, tmp1,tmp2;
11281 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11282 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11283 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11284 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11285 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11286 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11287 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11288 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11290 vst1q_f32( ptr, v.val[0]);
11291 vst1q_f32( (ptr + 4), v.val[1]);
11292 vst1q_f32( (ptr + 8), v.val[2]);
11294 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11296 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11297 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
11298 #define vst3q_p8 vst3q_u8
11300 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11301 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
11302 #define vst3q_p16 vst3q_u16
11304 //void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
11305 _NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
11307 __m128i tmp, sh0, sh1, val0, val2;
11308 _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11309 _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11310 _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11311 _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11312 tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
11313 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=16)
11314 val2 = _pM128i(val->val[2]);
11315 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11316 val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11317 vst1q_u8(ptr, val0); //store as 128 bit structure
11318 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=16)
11319 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11320 val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11321 _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
11323 #define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
11325 //void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
11326 _NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
11328 __m128i tmp, val0, val1, val2;
11329 _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11330 _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
11331 _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //where all ones, the result is taken from val0, otherwise from val1
11332 _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //where all ones, the result is taken from val1, otherwise from val0
11333 tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
11334 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11335 val2 = _pM128i(val->val[2]);
11336 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11337 val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11338 vst1q_u16(ptr, val0); //store as 128 bit structure
11339 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11340 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11341 val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
11342 _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
11344 #define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
11346 //void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
11347 _NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
11349 //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
11350 __m128i val0, val1;
11351 val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
11352 val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11353 val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11354 _M64((*(__m64_128*)(ptr + 4)), val1);
11355 val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
11356 val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11357 vst1q_u32(ptr, val0); //store as 128 bit structure
11359 #define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
11361 //void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
11362 _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
11364 *(ptr) = val->val[0].m64_u64[0];
11365 *(ptr + 1) = val->val[1].m64_u64[0];
11366 *(ptr + 2) = val->val[2].m64_u64[0];
11368 #define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
11370 //void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0]
11371 #define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
11373 //void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0]
11374 #define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
11376 //void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11377 #define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
11379 //void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
11380 #define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
11382 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11383 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
11384 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11386 //void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
11387 _NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
11389 //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5
11390 *(ptr) = val->val[0].m64_f32[0];
11391 *(ptr + 1) = val->val[1].m64_f32[0];
11392 *(ptr + 2) = val->val[2].m64_f32[0];
11393 *(ptr + 3) = val->val[0].m64_f32[1];
11394 *(ptr + 4) = val->val[1].m64_f32[1];
11395 *(ptr + 5) = val->val[2].m64_f32[1];
11397 #define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
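//Illustrative usage sketch (not part of the original mapping): the helper name and the x/y/z
//layout below are assumptions for this example only. It shows how vst3_f32 re-interleaves two
//points kept as separate x, y and z d-registers into a plain float array.
_NEON2SSE_INLINE void example_vst3_f32_usage(float32_t * interleaved_xyz, float32x2x3_t planes)
{
    //planes.val[0] = {x0,x1}, planes.val[1] = {y0,y1}, planes.val[2] = {z0,z1}
    //memory after the call: x0,y0,z0, x1,y1,z1
    vst3_f32(interleaved_xyz, planes);
}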
11399 //void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11400 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
11401 #define vst3_p8 vst3_u8
11403 //void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11404 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
11405 #define vst3_p16 vst3_s16
11407 //*************** Quadruples store ********************************
11408 //*********************************************************************
11409 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11410 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
11412 __m128i tmp1, tmp2, res;
11413 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11414 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11415 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11416 vst1q_u8(ptr, res);
11417 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11418 vst1q_u8((ptr + 16), res);
11419 tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //32,33, 36,37, 40,41, 44,45, 48,49, 52,53, 56,57, 60,61
11420 tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //34,35, 38,39, 42,43, 46,47, 50,51, 54,55, 58,59, 62,63
11421 res = _mm_unpacklo_epi16(tmp1, tmp2); //32,33, 34,35, 36,37, 38,39, 40,41, 42,43, 44,45, 46,47
11422 vst1q_u8((ptr + 32), res);
11423 res = _mm_unpackhi_epi16(tmp1, tmp2); //48,49, 50,51, 52,53, 54,55, 56,57, 58,59, 60,61, 62,63
11424 vst1q_u8((ptr + 48), res);
11426 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
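//Illustrative usage sketch (assumption for this example only: four planar channels R,G,B,A of
//16 pixels each). vst4q_u8 interleaves them into 64 bytes of R,G,B,A,R,G,B,A,...
_NEON2SSE_INLINE void example_vst4q_u8_usage(uint8_t * rgba, uint8x16x4_t planes)
{
    vst4q_u8(rgba, planes); //writes 64 interleaved bytes starting at rgba
}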
11428 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11429 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
11431 uint16x8x4_t v;
11432 __m128i tmp1, tmp2;
11433 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11434 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11435 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11436 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
11437 tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
11438 tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11439 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11440 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11441 vst1q_u16(ptr, v.val[0]);
11442 vst1q_u16((ptr + 8), v.val[1]);
11443 vst1q_u16((ptr + 16),v.val[2]);
11444 vst1q_u16((ptr + 24), v.val[3]);
11446 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11448 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11449 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
11451 uint32x4x4_t v;
11452 __m128i tmp1, tmp2;
11453 tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
11454 tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11455 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11456 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
11457 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
11458 tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11459 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11460 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11461 vst1q_u32(ptr, v.val[0]);
11462 vst1q_u32((ptr + 4), v.val[1]);
11463 vst1q_u32((ptr + 8), v.val[2]);
11464 vst1q_u32((ptr + 12), v.val[3]);
11466 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11468 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11469 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
11470 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11472 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11473 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
11474 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11476 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11477 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
11478 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11480 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11481 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
11482 // IA32 SIMD doesn't work with 16bit floats currently
11484 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11485 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
11487 __m128 tmp3, tmp2, tmp1, tmp0;
11488 float32x4x4_t v;
11489 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11490 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11491 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11492 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11493 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11494 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11495 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11496 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11497 vst1q_f32(ptr, v.val[0]);
11498 vst1q_f32((ptr + 4), v.val[1]);
11499 vst1q_f32((ptr + 8), v.val[2]);
11500 vst1q_f32((ptr + 12), v.val[3]);
11502 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
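//Illustrative sketch (the helper name is an assumption for this example only): because
//vst4q_f32 interleaves its four inputs, storing the rows of a 4x4 matrix through it writes
//the transposed matrix, i.e. dst[4*i + j] == rows.val[j][i].
_NEON2SSE_INLINE void example_store_transposed_4x4(float32_t * dst, float32x4x4_t rows)
{
    vst4q_f32(dst, rows); //16 floats, columns of the original become rows in dst
}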
11504 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11505 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
11506 #define vst4q_p8 vst4q_u8
11508 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11509 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
11510 #define vst4q_p16 vst4q_s16
11512 //void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
11513 _NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
11515 __m128i sh0, sh1, val0, val2;
11516 sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11517 sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11518 val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11519 val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11520 vst1q_u8(ptr, val0);
11521 vst1q_u8((ptr + 16), val2);
11523 #define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
11525 //void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
11526 _NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
11528 __m128i sh0, sh1, val0, val2;
11529 sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11530 sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11531 val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
11532 val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
11533 vst1q_u16(ptr, val0); //store as 128 bit structure
11534 vst1q_u16((ptr + 8), val2);
11536 #define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
11538 //void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
11539 _NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
11541 //0,4, 1,5, 2,6, 3,7
11542 __m128i sh0, sh1, val0, val1;
11543 sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
11544 sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
11545 val0 = _mm_unpacklo_epi64(sh0,sh1); //
11546 val1 = _mm_unpackhi_epi64(sh0,sh1); //
11547 vst1q_u32(ptr, val0); //store as 128 bit structure
11548 vst1q_u32((ptr + 4), val1);
11550 #define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
11552 //void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
11553 _NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
11555 *(ptr) = val->val[0].m64_u64[0];
11556 *(ptr + 1) = val->val[1].m64_u64[0];
11557 *(ptr + 2) = val->val[2].m64_u64[0];
11558 *(ptr + 3) = val->val[3].m64_u64[0];
11560 #define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
11562 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
11563 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11565 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
11566 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11568 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11569 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11571 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11572 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
11573 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11575 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11576 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
11577 // IA32 SIMD doesn't work with 16bit floats currently, so we need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for an example
11579 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
11580 _NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
11582 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
11583 *(ptr) = val->val[0].m64_f32[0];
11584 *(ptr + 1) = val->val[1].m64_f32[0];
11585 *(ptr + 2) = val->val[2].m64_f32[0];
11586 *(ptr + 3) = val->val[3].m64_f32[0];
11587 *(ptr + 4) = val->val[0].m64_f32[1];
11588 *(ptr + 5) = val->val[1].m64_f32[1];
11589 *(ptr + 6) = val->val[2].m64_f32[1];
11590 *(ptr + 7) = val->val[3].m64_f32[1];
11592 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
11594 //void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11595 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
11596 #define vst4_p8 vst4_u8
11598 //void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11599 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
11600 #define vst4_p16 vst4_u16
11602 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
11603 //********************************************************************************************************************
11604 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11605 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
11607 vst1q_lane_s16(ptr, val->val[0], lane);
11608 vst1q_lane_s16((ptr + 1), val->val[1], lane);
11610 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
11612 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11613 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
11615 vst1q_lane_u32(ptr, val->val[0], lane);
11616 vst1q_lane_u32((ptr + 1), val->val[1], lane);
11618 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11620 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11621 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
11622 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11624 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11625 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
11626 #define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
11628 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11629 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
11630 //current IA SIMD doesn't support float16
11632 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11633 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
11635 vst1q_lane_f32(ptr, val->val[0], lane);
11636 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11638 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11640 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11641 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
11642 #define vst2q_lane_p16 vst2q_lane_s16
11644 //void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11645 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
11646 _NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11648 *(ptr) = val->val[0].m64_u8[lane];
11649 *(ptr + 1) = val->val[1].m64_u8[lane];
11651 #define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
11653 //void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11654 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11655 _NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
11657 *(ptr) = val->val[0].m64_u16[lane];
11658 *(ptr + 1) = val->val[1].m64_u16[lane];
11660 #define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
11662 //void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11663 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11664 _NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
11666 *(ptr) = val->val[0].m64_u32[lane];
11667 *(ptr + 1) = val->val[1].m64_u32[lane];
11669 #define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
11671 //void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11672 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
11673 #define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
11675 //void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11676 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
11677 #define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
11679 //void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11680 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
11681 #define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
11683 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11684 //current IA SIMD doesn't support float16
11686 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11687 _NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
11689 *(ptr) = val->val[0].m64_f32[lane];
11690 *(ptr + 1) = val->val[1].m64_f32[lane];
11692 #define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
11694 //void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11695 #define vst2_lane_p8 vst2_lane_u8
11697 //void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11698 #define vst2_lane_p16 vst2_lane_u16
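//Illustrative sketch (names are assumptions for this example only): vst2_lane_u8 writes just
//one interleaved pair, here lane 3 of each of the two d-registers.
_NEON2SSE_INLINE void example_vst2_lane_u8_usage(uint8_t * dst, uint8x8x2_t v)
{
    vst2_lane_u8(dst, v, 3); //dst[0] = v.val[0].m64_u8[3], dst[1] = v.val[1].m64_u8[3]
}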
11700 //************************* Triple lanes stores *******************************************************
11701 //*******************************************************************************************************
11702 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11703 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
11705 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11706 vst1q_lane_u16((ptr + 2), val->val[2], lane);
11708 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11710 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11711 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
11713 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11714 vst1q_lane_u32((ptr + 2), val->val[2], lane);
11716 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11718 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11719 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
11720 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11722 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11723 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
11724 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11726 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11727 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
11728 //current IA SIMD doesn't support float16
11730 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11731 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
11733 vst1q_lane_f32(ptr, val->val[0], lane);
11734 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11735 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11737 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11739 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11740 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
11741 #define vst3q_lane_p16 vst3q_lane_s16
11743 //void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11744 _NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
11746 *(ptr) = val->val[0].m64_u8[lane];
11747 *(ptr + 1) = val->val[1].m64_u8[lane];
11748 *(ptr + 2) = val->val[2].m64_u8[lane];
11750 #define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
11752 //void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11753 _NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
11755 *(ptr) = val->val[0].m64_u16[lane];
11756 *(ptr + 1) = val->val[1].m64_u16[lane];
11757 *(ptr + 2) = val->val[2].m64_u16[lane];
11759 #define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
11761 //void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11762 _NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
11764 *(ptr) = val->val[0].m64_u32[lane];
11765 *(ptr + 1) = val->val[1].m64_u32[lane];
11766 *(ptr + 2) = val->val[2].m64_u32[lane];
11768 #define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
11770 //void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11771 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
11772 #define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11774 //void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11775 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
11776 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11778 //void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11779 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
11780 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11782 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11783 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
11784 //current IA SIMD doesn't support float16
11786 //void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11787 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
11788 _NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
11790 *(ptr) = val->val[0].m64_f32[lane];
11791 *(ptr + 1) = val->val[1].m64_f32[lane];
11792 *(ptr + 2) = val->val[2].m64_f32[lane];
11794 #define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
11796 //void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11797 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
11798 #define vst3_lane_p8 vst3_lane_u8
11800 //void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11801 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
11802 #define vst3_lane_p16 vst3_lane_s16
11804 //******************************** Quadruple lanes stores ***********************************************
11805 //*******************************************************************************************************
11806 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11807 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
11809 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
11810 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11812 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11814 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11815 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
11817 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
11818 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11820 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11822 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11823 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
11824 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11826 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11827 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
11828 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11830 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11831 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
11832 //current IA SIMD doesn't support float16
11834 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11835 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
11837 vst1q_lane_f32(ptr, val->val[0], lane);
11838 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11839 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11840 vst1q_lane_f32((ptr + 3), val->val[3], lane);
11842 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11844 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11845 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
11846 #define vst4q_lane_p16 vst4q_lane_u16
11848 //void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11849 _NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
11851 *(ptr) = val->val[0].m64_u8[lane];
11852 *(ptr + 1) = val->val[1].m64_u8[lane];
11853 *(ptr + 2) = val->val[2].m64_u8[lane];
11854 *(ptr + 3) = val->val[3].m64_u8[lane];
11856 #define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
11858 //void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11859 _NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
11861 *(ptr) = val->val[0].m64_u16[lane];
11862 *(ptr + 1) = val->val[1].m64_u16[lane];
11863 *(ptr + 2) = val->val[2].m64_u16[lane];
11864 *(ptr + 3) = val->val[3].m64_u16[lane];
11866 #define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
11868 //void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11869 _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
11871 *(ptr) = val->val[0].m64_u32[lane];
11872 *(ptr + 1) = val->val[1].m64_u32[lane];
11873 *(ptr + 2) = val->val[2].m64_u32[lane];
11874 *(ptr + 3) = val->val[3].m64_u32[lane];
11876 #define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
11878 //void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11879 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11881 //void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11882 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11884 //void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11885 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11887 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11888 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
11889 //current IA SIMD doesn't support float16
11891 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11892 _NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
11894 *(ptr) = val->val[0].m64_f32[lane];
11895 *(ptr + 1) = val->val[1].m64_f32[lane];
11896 *(ptr + 2) = val->val[2].m64_f32[lane];
11897 *(ptr + 3) = val->val[3].m64_f32[lane];
11899 #define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
11901 //void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11902 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
11903 #define vst4_lane_p8 vst4_lane_u8
11905 //void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11906 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
11907 #define vst4_lane_p16 vst4_lane_u16
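//Illustrative sketch (assumption for this example only: planar R,G,B,A data): writing a single
//RGBA pixel, taken from lane 0 of each plane, with one vst4_lane_u8 call.
_NEON2SSE_INLINE void example_vst4_lane_u8_usage(uint8_t * pixel, uint8x8x4_t planes)
{
    vst4_lane_u8(pixel, planes, 0); //pixel[0..3] = R[0], G[0], B[0], A[0]
}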
11909 //**************************************************************************************************
11910 //************************ Extract lanes from a vector ********************************************
11911 //**************************************************************************************************
11912 //These intrinsics extract a single lane (element) from a vector.
11913 uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11914 #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11916 uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11917 #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11920 uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11921 #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11923 int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11924 #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11926 int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11927 #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
11929 int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11930 #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
11932 poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11933 #define vget_lane_p8 vget_lane_u8
11935 poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11936 #define vget_lane_p16 vget_lane_u16
11938 float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11939 #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
11941 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11942 #define vgetq_lane_u8 _MM_EXTRACT_EPI8
11944 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11945 #define vgetq_lane_u16 _MM_EXTRACT_EPI16
11947 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11948 #define vgetq_lane_u32 _MM_EXTRACT_EPI32
11950 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
11951 #define vgetq_lane_s8 vgetq_lane_u8
11953 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
11954 #define vgetq_lane_s16 vgetq_lane_u16
11956 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11957 #define vgetq_lane_s32 vgetq_lane_u32
11959 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11960 #define vgetq_lane_p8 vgetq_lane_u8
11962 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11963 #define vgetq_lane_p16 vgetq_lane_u16
11965 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11966 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
11968 int32_t ilane;
11969 ilane = _MM_EXTRACT_PS(vec,lane);
11970 return *(float*)&ilane;
11973 int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11974 #define vget_lane_s64(vec, lane) vec.m64_i64[0]
11976 uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11977 #define vget_lane_u64(vec, lane) vec.m64_u64[0]
11980 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11981 #define vgetq_lane_s64 (int64_t) vgetq_lane_u64
11983 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11984 #define vgetq_lane_u64 _MM_EXTRACT_EPI64
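//Illustrative sketch (the helper name is an assumption for this example only): reading a
//single element back to a scalar; the lane must be a compile-time constant.
_NEON2SSE_INLINE uint16_t example_vgetq_lane_u16_usage(uint16x8_t v)
{
    return vgetq_lane_u16(v, 7); //highest 16-bit element, maps to _MM_EXTRACT_EPI16
}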
11986 // ***************** Set lanes within a vector ********************************************
11987 // **************************************************************************************
11988 //These intrinsics set a single lane (element) within a vector.
11989 //these are the same as the vld1_lane_xx functions, but take the value to be set directly.
11991 uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
11992 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
11994 uint8_t val;
11995 val = value;
11996 return vld1_lane_u8(&val, vec, lane);
11999 uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12000 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
12002 uint16_t val;
12003 val = value;
12004 return vld1_lane_u16(&val, vec, lane);
12007 uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12008 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
12010 uint32_t val;
12011 val = value;
12012 return vld1_lane_u32(&val, vec, lane);
12015 int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12016 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12018 int8_t val;
12019 val = value;
12020 return vld1_lane_s8(&val, vec, lane);
12023 int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12024 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12026 int16_t val;
12027 val = value;
12028 return vld1_lane_s16(&val, vec, lane);
12031 int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12032 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12034 int32_t val;
12035 val = value;
12036 return vld1_lane_s32(&val, vec, lane);
12039 poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12040 #define vset_lane_p8 vset_lane_u8
12042 poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12043 #define vset_lane_p16 vset_lane_u16
12045 float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12046 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12048 float32_t val;
12049 val = value;
12050 return vld1_lane_f32(&val, vec, lane);
12053 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12054 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12056 uint8_t val;
12057 val = value;
12058 return vld1q_lane_u8(&val, vec, lane);
12061 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12062 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12064 uint16_t val;
12065 val = value;
12066 return vld1q_lane_u16(&val, vec, lane);
12069 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12070 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12072 uint32_t val;
12073 val = value;
12074 return vld1q_lane_u32(&val, vec, lane);
12077 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12078 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12080 int8_t val;
12081 val = value;
12082 return vld1q_lane_s8(&val, vec, lane);
12085 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12086 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12088 int16_t val;
12089 val = value;
12090 return vld1q_lane_s16(&val, vec, lane);
12093 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12094 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12096 int32_t val;
12097 val = value;
12098 return vld1q_lane_s32(&val, vec, lane);
12101 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12102 #define vsetq_lane_p8 vsetq_lane_u8
12104 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12105 #define vsetq_lane_p16 vsetq_lane_u16
12107 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12108 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12110 float32_t val;
12111 val = value;
12112 return vld1q_lane_f32(&val, vec, lane);
12115 int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12116 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12118 int64_t val;
12119 val = value;
12120 return vld1_lane_s64(&val, vec, lane);
12123 uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12124 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12126 uint64_t val;
12127 val = value;
12128 return vld1_lane_u64(&val, vec, lane);
12131 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12132 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12134 uint64_t val;
12135 val = value;
12136 return vld1q_lane_s64(&val, vec, lane);
12139 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12140 #define vsetq_lane_u64 vsetq_lane_s64
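//Illustrative sketch (the helper name is an assumption for this example only): patching one
//element of a vector with a scalar while leaving the other lanes unchanged.
_NEON2SSE_INLINE float32x4_t example_vsetq_lane_f32_usage(float32x4_t v, float32_t x)
{
    return vsetq_lane_f32(x, v, 2); //element 2 becomes x, elements 0,1,3 are kept
}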
12142 // *******************************************************************************
12143 // **************** Initialize a vector from bit pattern ***************************
12144 // *******************************************************************************
12145 //These intrinsics create a vector from a literal bit pattern.
12146 int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12147 #define vcreate_s8(a) (*(__m64_128*)&(a))
12150 int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12151 #define vcreate_s16 vcreate_s8
12153 int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12154 #define vcreate_s32 vcreate_s8
12156 float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
12157 //no IA32 SIMD available
12159 float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12160 #define vcreate_f32(a) (*(__m64_128*)&(a))
12162 uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12163 #define vcreate_u8 vcreate_s8
12165 uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12166 #define vcreate_u16 vcreate_s16
12168 uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12169 #define vcreate_u32 vcreate_s32
12171 uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12172 #define vcreate_u64 vcreate_s8
12175 poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12176 #define vcreate_p8 vcreate_u8
12178 poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12179 #define vcreate_p16 vcreate_u16
12181 int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12182 #define vcreate_s64 vcreate_u64
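//Illustrative sketch (the helper name and the pattern value are assumptions for this example
//only): building an 8x8-bit vector from a 64-bit literal; the lowest byte becomes lane 0.
_NEON2SSE_INLINE uint8x8_t example_vcreate_u8_usage(void)
{
    uint64_t pattern = 0x0706050403020100ULL; //lanes 0..7 receive bytes 0x00..0x07
    return vcreate_u8(pattern);
}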
12184 //********************* Set all lanes to same value ********************************
12185 //*********************************************************************************
12186 //These intrinsics set all lanes to the same value.
12187 uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12190 uint8x8_t res;
12191 int i;
12192 for (i = 0; i<8; i++) {
12193 res.m64_u8[i] = value;
12195 return res;
12198 uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12199 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12201 uint16x4_t res;
12202 int i;
12203 for (i = 0; i<4; i++) {
12204 res.m64_u16[i] = value;
12206 return res;
12209 uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12212 uint32x2_t res;
12213 res.m64_u32[0] = value;
12214 res.m64_u32[1] = value;
12215 return res;
12218 int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12219 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12221 int8x8_t res;
12222 int i;
12223 for (i = 0; i<8; i++) {
12224 res.m64_i8[i] = value;
12226 return res;
12229 int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12232 int16x4_t res;
12233 int i;
12234 for (i = 0; i<4; i++) {
12235 res.m64_i16[i] = value;
12237 return res;
12240 int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12243 int32x2_t res;
12244 res.m64_i32[0] = value;
12245 res.m64_i32[1] = value;
12246 return res;
12249 poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12250 #define vdup_n_p8 vdup_n_u8
12252 poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12253 #define vdup_n_p16 vdup_n_s16
12255 float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12256 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12258 float32x2_t res;
12259 res.m64_f32[0] = value;
12260 res.m64_f32[1] = value;
12261 return res;
12264 uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12265 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
12267 uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12268 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
12270 uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12271 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
12273 int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12274 #define vdupq_n_s8 _mm_set1_epi8
12276 int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12277 #define vdupq_n_s16 _mm_set1_epi16
12279 int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12280 #define vdupq_n_s32 _mm_set1_epi32
12282 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12283 #define vdupq_n_p8 vdupq_n_u8
12285 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12286 #define vdupq_n_p16 vdupq_n_u16
12288 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12289 #define vdupq_n_f32 _mm_set1_ps
12291 int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12292 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12294 int64x1_t res;
12295 res.m64_i64[0] = value;
12296 return res;
12299 uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12300 _NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
12302 uint64x1_t res;
12303 res.m64_u64[0] = value;
12304 return res;
12307 int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12308 _NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
12310 _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
12311 return LOAD_SI128(value2);
12314 uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12315 _NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
12317 _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
12318 return LOAD_SI128(val);
12321 //**** Set all lanes to same value ************************
12322 //Same functions as above - just aliases.********************
12323 //Probably they reflect the fact that the 128-bit versions use the VMOV instruction **********
12324 uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12325 #define vmov_n_u8 vdup_n_s8
12327 uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12328 #define vmov_n_u16 vdup_n_s16
12330 uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12331 #define vmov_n_u32 vdup_n_u32
12333 int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12334 #define vmov_n_s8 vdup_n_s8
12336 int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12337 #define vmov_n_s16 vdup_n_s16
12339 int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12340 #define vmov_n_s32 vdup_n_s32
12342 poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12343 #define vmov_n_p8 vdup_n_u8
12345 poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12346 #define vmov_n_p16 vdup_n_s16
12348 float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12349 #define vmov_n_f32 vdup_n_f32
12351 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12352 #define vmovq_n_u8 vdupq_n_u8
12354 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12355 #define vmovq_n_u16 vdupq_n_s16
12357 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12358 #define vmovq_n_u32 vdupq_n_u32
12360 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12361 #define vmovq_n_s8 vdupq_n_s8
12363 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12364 #define vmovq_n_s16 vdupq_n_s16
12366 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12367 #define vmovq_n_s32 vdupq_n_s32
12369 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12370 #define vmovq_n_p8 vdupq_n_u8
12372 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12373 #define vmovq_n_p16 vdupq_n_s16
12375 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12376 #define vmovq_n_f32 vdupq_n_f32
12378 int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12379 #define vmov_n_s64 vdup_n_s64
12381 uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12382 #define vmov_n_u64 vdup_n_u64
12384 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12385 #define vmovq_n_s64 vdupq_n_s64
12387 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12388 #define vmovq_n_u64 vdupq_n_u64
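//Illustrative sketch (names are assumptions for this example only): broadcasting scalars in
//q-register and d-register form; the vmov_n / vmovq_n aliases above behave identically.
_NEON2SSE_INLINE void example_vdup_n_usage(uint8x16_t * q_out, float32x2_t * d_out)
{
    *q_out = vdupq_n_u8(0x80); //sixteen copies of 0x80 via _mm_set1_epi8
    *d_out = vdup_n_f32(1.0f); //two copies of 1.0f in the 64-bit emulation type
}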
12390 //**************Set all lanes to the value of one lane of a vector *************
12391 //****************************************************************************
12392 //here a shuffle is a better solution than lane extraction followed by a set1 function
12393 uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12394 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12396 uint8x8_t res;
12397 uint8_t valane;
12398 int i = 0;
12399 valane = vec.m64_u8[lane];
12400 for (i = 0; i<8; i++) {
12401 res.m64_u8[i] = valane;
12403 return res;
12406 uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12407 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12409 uint16x4_t res;
12410 uint16_t valane;
12411 valane = vec.m64_u16[lane];
12412 res.m64_u16[0] = valane;
12413 res.m64_u16[1] = valane;
12414 res.m64_u16[2] = valane;
12415 res.m64_u16[3] = valane;
12416 return res;
12419 uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12420 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12422 uint32x2_t res;
12423 res.m64_u32[0] = vec.m64_u32[lane];
12424 res.m64_u32[1] = res.m64_u32[0];
12425 return res;
12428 int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12429 #define vdup_lane_s8 vdup_lane_u8
12431 int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12432 #define vdup_lane_s16 vdup_lane_u16
12434 int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12435 #define vdup_lane_s32 vdup_lane_u32
12437 poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12438 #define vdup_lane_p8 vdup_lane_u8
12440 poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12441 #define vdup_lane_p16 vdup_lane_s16
12443 float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12444 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12446 float32x2_t res;
12447 res.m64_f32[0] = vec.m64_f32[lane];
12448 res.m64_f32[1] = res.m64_f32[0];
12449 return res;
12452 uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12453 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12455 _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
12456 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12459 uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12460 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12462 //we could use 8bit shuffle for 16 bit as well
12463 const int8_t lane16 = ((int8_t) lane) << 1;
12464 _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
12465 lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
12466 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12469 uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12470 #define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6))
12472 int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12473 #define vdupq_lane_s8 vdupq_lane_u8
12475 int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12476 #define vdupq_lane_s16 vdupq_lane_u16
12478 int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12479 #define vdupq_lane_s32 vdupq_lane_u32
12481 poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12482 #define vdupq_lane_p8 vdupq_lane_u8
12484 poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12485 #define vdupq_lane_p16 vdupq_lane_s16
12487 float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12488 #define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
12490 int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12491 #define vdup_lane_s64(vec,lane) vec
12493 uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12494 #define vdup_lane_u64(vec,lane) vec
12496 int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12499 __m128i vec128;
12500 vec128 = _pM128i(vec);
12501 return _mm_unpacklo_epi64(vec128,vec128);
12504 uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12505 #define vdupq_lane_u64 vdupq_lane_s64
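//Illustrative sketch (the helper name is an assumption for this example only): replicating
//lane 1 of a 2-element d-register across all four q-register lanes, which the mapping above
//turns into a single _mm_shuffle_epi32 with an immediate built from the lane number.
_NEON2SSE_INLINE uint32x4_t example_vdupq_lane_u32_usage(uint32x2_t v)
{
    return vdupq_lane_u32(v, 1); //result = {v[1], v[1], v[1], v[1]}
}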
12507 // ********************************************************************
12508 // ******************** Combining vectors *****************************
12509 // ********************************************************************
12510 //These intrinsics join two 64 bit vectors into a single 128bit vector.
12511 int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12512 #define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
12514 int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12515 #define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
12517 int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12518 #define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
12520 int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12521 #define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
12523 float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12524 //current IA SIMD doesn't support float16
12526 float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12527 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12529 __m128i res;
12530 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12531 return _M128(res);
12534 uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12535 #define vcombine_u8 vcombine_s8
12537 uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12538 #define vcombine_u16 vcombine_s16
12540 uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12541 #define vcombine_u32 vcombine_s32
12543 uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12544 #define vcombine_u64 vcombine_s64
12546 poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12547 #define vcombine_p8 vcombine_u8
12549 poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12550 #define vcombine_p16 vcombine_u16
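//Illustrative sketch (the helper name is an assumption for this example only): rebuilding a
//q-register from two d-register halves costs one _mm_unpacklo_epi64 in this mapping.
_NEON2SSE_INLINE int16x8_t example_vcombine_s16_usage(int16x4_t low, int16x4_t high)
{
    return vcombine_s16(low, high); //low becomes elements 0..3, high becomes elements 4..7
}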
12552 //**********************************************************************
12553 //************************* Splitting vectors **************************
12554 //**********************************************************************
12555 //**************** Get high part ******************************************
12556 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
12557 int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12558 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12560 int8x8_t res64;
12561 __m128i res;
12562 res = _mm_unpackhi_epi64(a,a); //SSE2
12563 return64(res);
12566 int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12567 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12569 int16x4_t res64;
12570 __m128i res;
12571 res = _mm_unpackhi_epi64(a,a); //SSE2
12572 return64(res);
12575 int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12576 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12578 int32x2_t res64;
12579 __m128i res;
12580 res = _mm_unpackhi_epi64(a,a); //SSE2
12581 return64(res);
12584 int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12585 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12587 int64x1_t res64;
12588 __m128i res;
12589 res = _mm_unpackhi_epi64(a,a); //SSE2
12590 return64(res);
12593 float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12594 // IA32 SIMD doesn't work with 16bit floats currently
12596 float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12597 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12599 __m128i res;
12600 __m64_128 res64;
12601 res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12602 return64(res);
12605 uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12606 #define vget_high_u8 vget_high_s8
12608 uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12609 #define vget_high_u16 vget_high_s16
12611 uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12612 #define vget_high_u32 vget_high_s32
12614 uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12615 #define vget_high_u64 vget_high_s64
12617 poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12618 #define vget_high_p8 vget_high_u8
12620 poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12621 #define vget_high_p16 vget_high_u16
12623 //********************** Get low part **********************
12624 //**********************************************************
12625 int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12626 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12628 int8x8_t res64;
12629 return64(a);
12632 int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12633 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12635 int16x4_t res64;
12636 return64(a);
12639 int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12640 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12642 int32x2_t res64;
12643 return64(a);
12646 int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12647 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12649 int64x1_t res64;
12650 return64 (a);
12653 float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12654 // IA32 SIMD doesn't work with 16bit floats currently
12656 float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12657 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12659 float32x2_t res64;
12660 _M64f(res64, a);
12661 return res64;
12664 uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12665 #define vget_low_u8 vget_low_s8
12667 uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12668 #define vget_low_u16 vget_low_s16
12670 uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12671 #define vget_low_u32 vget_low_s32
12673 uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12674 #define vget_low_u64 vget_low_s64
12676 poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12677 #define vget_low_p8 vget_low_u8
12679 poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12680 #define vget_low_p16 vget_low_s16
12682 //**************************************************************************
12683 //************************ Converting vectors **********************************
12684 //**************************************************************************
12685 //************* Convert from float ***************************************
12686 // the MXCSR rounding mode may need to be set with _MM_SET_ROUNDING_MODE(x) accordingly
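//For example (an illustrative sketch, not part of the original header), truncation can be
//selected explicitly around the conversions that depend on the MXCSR rounding mode:
//    unsigned int saved = _MM_GET_ROUNDING_MODE();
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//    ... float -> int conversions ...
//    _MM_SET_ROUNDING_MODE(saved);
//Note that _mm_cvttps_epi32 used below always truncates; the MXCSR mode matters mainly for
//the int -> float conversions further down that use _mm_cvtepi32_ps.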
12687 int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12688 _NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
12690 int32x2_t res64;
12691 __m128i res;
12692 res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
12693 return64(res);
12696 uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12697 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12699 //may not be as efficient as a serial solution
12700 uint32x2_t res64;
12701 __m128i res;
12702 res = vcvtq_u32_f32(_pM128(a));
12703 return64(res);
12706 int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12707 #define vcvtq_s32_f32 _mm_cvttps_epi32
12709 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12710 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12712 //No single-instruction SSE solution, but it can be implemented as follows:
12713 __m128i resi;
12714 __m128 zero, mask, a_pos, mask_f_max_si, res;
12715 _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
12716 zero = _mm_setzero_ps();
12717 mask = _mm_cmpgt_ps(a, zero);
12718 a_pos = _mm_and_ps(a, mask);
12719 mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
12720 res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits into the signed range nothing is subtracted
12721 resi = _mm_cvttps_epi32(res);
12722 return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
12725 // ***** Convert to the fixed point with the number of fraction bits specified by b ***********
12726 //*************************************************************************************************
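//Worked example (illustrative): with b = 8 fraction bits the scale factor is 2^8 = 256,
//so the float value 1.5f converts to the fixed point integer (int32_t)(1.5f * 256) = 384.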
12727 int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12728 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12730 int32x2_t res64;
12731 return64(vcvtq_n_s32_f32(_pM128(a),b));
12734 uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12735 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12737 uint32x2_t res;
12738 float convconst;
12739 convconst = (float)((uint32_t)1 << b);
12740 res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
12741 res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
12742 return res;
12745 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12746 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12748 float convconst;
12749 _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12750 __m128 cconst128;
12751 __m128i mask, res;
12752 convconst = (float)((uint32_t)1 << b);
12753 cconst128 = vdupq_n_f32(convconst);
12754 res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12755 mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12756 return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
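//Note: _mm_cvttps_epi32 yields 0x80000000 for out-of-range or NaN inputs; XORing those
//lanes with the all-ones mask produced above flips them to 0x7fffffff (positive saturation).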
12759 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12760 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12762 float convconst;
12763 __m128 cconst128;
12764 convconst = (float)((uint32_t)1 << b);
12765 cconst128 = vdupq_n_f32(convconst);
12766 return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12769 //***************** Convert to float *************************
12770 //*************************************************************
12771 float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12772 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12774 float32x2_t res;
12775 res.m64_f32[0] = (float) a.m64_i32[0];
12776 res.m64_f32[1] = (float) a.m64_i32[1];
12777 return res;
12780 float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12781 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12783 float32x2_t res;
12784 res.m64_f32[0] = (float) a.m64_u32[0];
12785 res.m64_f32[1] = (float) a.m64_u32[1];
12786 return res;
12789 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12790 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12792 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12793 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12795 //solution may not be optimal
12796 __m128 two16, fHi, fLo;
12797 __m128i hi, lo;
12798 two16 = _mm_set1_ps((float)0x10000); //2^16
12799 // Avoid double rounding by doing two exact conversions
12800 // of high and low 16-bit segments
12801 hi = _mm_srli_epi32(a, 16);
12802 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12803 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12804 fLo = _mm_cvtepi32_ps(lo);
12805 // do single rounding according to current rounding mode
12806 return _mm_add_ps(fHi, fLo);
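//Worked example: converting 0xFFFFFFFF (4294967295) splits into hi = 0xFFFF and lo = 0xFFFF;
//the two half conversions and the multiply by 2^16 are exact in single precision, so rounding
//happens only once, in the final addition.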
12809 // ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
12810 float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12811 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12813 float32x2_t res;
12814 float convconst;
12815 convconst = (float)(1. / ((uint32_t)1 << b));
12816 res.m64_f32[0] = a.m64_i32[0] * convconst;
12817 res.m64_f32[1] = a.m64_i32[1] * convconst;
12818 return res;
12821 float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12822 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12824 float32x2_t res;
12825 float convconst;
12826 convconst = (float)(1. / ((uint32_t)1 << b));
12827 res.m64_f32[0] = a.m64_u32[0] * convconst;
12828 res.m64_f32[1] = a.m64_u32[1] * convconst;
12829 return res;
12832 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12833 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12835 float convconst;
12836 __m128 cconst128, af;
12837 convconst = (float)(1. / ((uint32_t)1 << b));
12838 af = _mm_cvtepi32_ps(a);
12839 cconst128 = vdupq_n_f32(convconst);
12840 return _mm_mul_ps(af,cconst128);
12843 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
12844 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
12846 float convconst;
12847 __m128 cconst128, af;
12848 convconst = (float)(1. / ((uint32_t)1 << b));
12849 af = vcvtq_f32_u32(a);
12850 cconst128 = vdupq_n_f32(convconst);
12851 return _mm_mul_ps(af,cconst128);
12854 //**************Convert between floats ***********************
12855 //************************************************************
12856 float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
12857 //Intel SIMD doesn't support 16-bit floats currently
12859 float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
12860 //Intel SIMD doesn't support 16-bit floats currently; the only solution is to store the 16-bit floats and load them converted to 32-bit floats
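//A possible scalar fallback (illustrative sketch only, not part of the original header;
//it ignores NaN/Inf and flushes subnormal halves to +-0 for brevity; memcpy is from <string.h>):
//    static float f16_to_f32(uint16_t h)
//    {
//        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
//        uint32_t exp  = (h >> 10) & 0x1f;
//        uint32_t mant = h & 0x3ff;
//        uint32_t bits;
//        float res;
//        if (exp == 0) bits = sign;                                  //zero / subnormal -> +-0
//        else bits = sign | ((exp + 112) << 23) | (mant << 13);      //rebias exponent 15 -> 127
//        memcpy(&res, &bits, sizeof(res));
//        return res;
//    }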
12862 //************Vector narrow integer conversion (truncation) ******************
12863 //****************************************************************************
12864 int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
12865 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
12867 int8x8_t res64;
12868 __m128i res;
12869 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
12870 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
12871 return64(res);
12874 int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
12875 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
12877 int16x4_t res64;
12878 __m128i res;
12879 _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
12880 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
12881 return64(res);
12884 int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
12885 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
12887 //may not be as efficient as a serial implementation
12888 int32x2_t res64;
12889 __m128i res;
12890 res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
12891 return64(res);
12894 uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
12895 #define vmovn_u16 vmovn_s16
12897 uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
12898 #define vmovn_u32 vmovn_s32
12900 uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
12901 #define vmovn_u64 vmovn_s64
12903 //**************** Vector long move ***********************
12904 //***********************************************************
12905 int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
12906 #define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
12908 int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
12909 #define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
12911 int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
12912 #define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
12914 uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
12915 #define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
12917 uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
12918 #define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
12920 uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
12921 #define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
12923 //*************Vector saturating narrow integer*****************
12924 //**************************************************************
12925 int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
12926 _NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
12928 int8x8_t res64;
12929 __m128i res;
12930 res = _mm_packs_epi16(a, a);
12931 return64(res);
12934 int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
12935 _NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
12937 int16x4_t res64;
12938 __m128i res;
12939 res = _mm_packs_epi32(a, a);
12940 return64(res);
12943 int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
12944 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
12946 int32x2_t res;
12947 _NEON2SSE_ALIGN_16 int64_t atmp[2];
12948 _mm_store_si128((__m128i*)atmp, a);
12949 if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
12950 if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
12951 if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
12952 if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
12953 res.m64_i32[0] = (int32_t)atmp[0];
12954 res.m64_i32[1] = (int32_t)atmp[1];
12955 return res;
12958 uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
12959 _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
12961 //no uint16 to uint8 saturating conversion in SSE, need to clamp to the max signed 16-bit value first
12962 uint8x8_t res64;
12963 __m128i c7fff, a_trunc;
12964 c7fff = _mm_set1_epi16 (0x7fff); //max signed 16-bit value
12965 a_trunc = _mm_or_si128(_mm_and_si128(a, c7fff), _mm_srli_epi16(_mm_srai_epi16(a, 15), 1)); //clamp values with bit 15 set to 0x7fff
12966 a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //saturating pack to u8, use low 64 bits only
12967 return64(a_trunc);
12970 uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
12971 _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
12973 //no uint32 to uint16 saturating conversion in SSE, need to clamp to the max signed 32-bit value first
12974 uint16x4_t res64;
12975 __m128i c7fffffff, a_trunc;
12976 c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //max signed 32-bit value
12977 a_trunc = _mm_or_si128(_mm_and_si128(a, c7fffffff), _mm_srli_epi32(_mm_srai_epi32(a, 31), 1)); //clamp values with bit 31 set to 0x7fffffff
12978 a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //saturating pack to u16, use low 64 bits only
12979 return64(a_trunc);
12982 uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
12983 _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
12985 //serial solution may be faster
12986 uint32x2_t res64;
12987 __m128i res_hi, mask;
12988 mask = _mm_setzero_si128();
12989 res_hi = _mm_srli_epi64(a, 32);
12990 res_hi = _mm_cmpeq_epi32(res_hi, mask);
12991 mask = _mm_cmpeq_epi32(mask,mask); //all fff
12992 mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to flag the values that need more than 32 bits
12993 res_hi = _mm_or_si128(a, mask);
12994 res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
12995 return64(res_hi);
12997 //************* Vector saturating narrow integer signed->unsigned **************
12998 //*****************************************************************************
12999 uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13000 _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13002 uint8x8_t res64;
13003 __m128i res;
13004 res = _mm_packus_epi16(a, a); //use low 64bits only
13005 return64(res);
13008 uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13009 _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13011 uint16x4_t res64;
13012 __m128i res;
13013 res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13014 return64(res);
13017 uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13018 _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13020 uint32x2_t res64;
13021 __m128i res_hi,res_lo, zero, cmp;
13022 zero = _mm_setzero_si128();
13023 res_hi = _mm_srli_epi64(a, 32);
13024 cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
13025 res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
13026 cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
13027 res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive the value exceeds 32 bits and must saturate to 0xffffffff
13028 res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13029 return64(res_lo);
13032 // ********************************************************
13033 // **************** Table look up **************************
13034 // ********************************************************
13035 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13036 //in a table and generate a new vector. Indexes out of range return 0.
13037 //for Intel SIMD we need to set the MSB of out-of-range indexes to 1 to get a zero return from _mm_shuffle_epi8
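//Illustrative sketch of the trick: _mm_shuffle_epi8 writes 0 to an output byte whenever bit 7
//of the corresponding control byte is set, so out-of-range indexes are forced to have that bit set:
//    control byte 0x05 -> picks source byte 5
//    control byte 0x09 -> _mm_cmpgt_epi8(9,7) gives 0xff, 0x09 | 0xff = 0xff -> output byte is 0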
13038 uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13039 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13041 uint8x8_t res64;
13042 __m128i c7, maskgt, bmask, b128;
13043 c7 = _mm_set1_epi8 (7);
13044 b128 = _pM128i(b);
13045 maskgt = _mm_cmpgt_epi8(b128,c7);
13046 bmask = _mm_or_si128(b128,maskgt);
13047 bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13048 return64(bmask);
13051 int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
13052 #define vtbl1_s8 vtbl1_u8
13054 poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13055 #define vtbl1_p8 vtbl1_u8
13057 //Special trick to avoid the "__declspec(align('8')) won't be aligned" error
13058 //uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13059 uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13060 _NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
13062 uint8x8_t res64;
13063 __m128i c15, a01, maskgt15, bmask, b128;
13064 c15 = _mm_set1_epi8 (15);
13065 b128 = _pM128i(b);
13066 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13067 bmask = _mm_or_si128(b128, maskgt15);
13068 a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
13069 a01 = _mm_shuffle_epi8(a01, bmask);
13070 return64(a01);
13072 #define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
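//Usage note (illustrative): because the macro above takes the address of its first argument,
//the table must be an lvalue, e.g.
//    uint8x8x2_t tbl;  uint8x8_t idx, r;
//    r = vtbl2_u8(tbl, idx);      //expands to vtbl2_u8_ptr(&tbl, idx)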
13074 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13075 #define vtbl2_s8 vtbl2_u8
13077 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13078 #define vtbl2_p8 vtbl2_u8
13080 //Special trick to avoid the "__declspec(align('16')) won't be aligned" error
13081 //uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13082 _NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
13084 //solution may not be optimal
13085 uint8x8_t res64;
13086 __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13087 c15 = _mm_set1_epi8 (15);
13088 c23 = _mm_set1_epi8 (23);
13089 b128 = _pM128i(b);
13090 maskgt23 = _mm_cmpgt_epi8(b128,c23);
13091 bmask = _mm_or_si128(b128, maskgt23);
13092 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13093 a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
13094 sh0 = _mm_shuffle_epi8(a01, bmask);
13095 sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped modulo 16 (bi-=16)
13096 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13097 return64(sh0);
13099 #define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
13101 //int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13102 int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13103 #define vtbl3_s8 vtbl3_u8
13105 //poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13106 poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13107 #define vtbl3_p8 vtbl3_u8
13109 //uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13110 _NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
13112 //solution may not be optimal
13113 uint8x8_t res64;
13114 __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13115 c15 = _mm_set1_epi8 (15);
13116 c31 = _mm_set1_epi8 (31);
13117 b128 = _pM128i(b);
13118 maskgt31 = _mm_cmpgt_epi8(b128,c31);
13119 bmask = _mm_or_si128(b128, maskgt31);
13120 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13121 a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
13122 a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
13123 sh0 = _mm_shuffle_epi8(a01, bmask);
13124 sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped modulo 16 (bi-=16)
13125 sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13126 return64(sh0);
13128 #define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
13130 //int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13131 int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13132 #define vtbl4_s8 vtbl4_u8
13134 //poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13135 poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13136 #define vtbl4_p8 vtbl4_u8
13138 //****************** Extended table look up intrinsics ***************************
13139 //**********************************************************************************
13140 //VTBX (Vector Table Extension) works in the same way as VTBL do,
13141 // except that indexes out of range leave the destination element unchanged.
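//Illustrative sketch of the emulation below: the lookup is done as for VTBL and the original
//destination bytes are blended back wherever the index was out of range, e.g. for vtbx1_u8:
//    maskgt = (index > 7) ? 0xff : 0x00
//    result = (lookup & ~maskgt) | (a & maskgt)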
13143 uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13144 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13146 uint8x8_t res64;
13147 __m128i c7, maskgt, sh, c128;
13148 c7 = _mm_set1_epi8 (7);
13149 c128 = _pM128i(c);
13150 maskgt = _mm_cmpgt_epi8(c128,c7);
13151 c7 = _mm_and_si128(maskgt,_pM128i(a));
13152 sh = _mm_shuffle_epi8(_pM128i(b),c128);
13153 sh = _mm_andnot_si128(maskgt,sh);
13154 sh = _mm_or_si128(sh,c7);
13155 return64(sh);
13158 int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13159 #define vtbx1_s8 vtbx1_u8
13161 poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13162 #define vtbx1_p8 vtbx1_u8
13164 //Special trick to avoid the "__declspec(align('8')) won't be aligned" error
13165 //uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13166 uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13167 _NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
13169 uint8x8_t res64;
13170 __m128i c15, b01, maskgt15, sh, c128;
13171 c15 = _mm_set1_epi8 (15);
13172 c128 = _pM128i(c);
13173 maskgt15 = _mm_cmpgt_epi8(c128, c15);
13174 c15 = _mm_and_si128(maskgt15, _pM128i(a));
13175 b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
13176 sh = _mm_shuffle_epi8(b01, c128);
13177 sh = _mm_andnot_si128(maskgt15, sh);
13178 sh = _mm_or_si128(sh,c15);
13179 return64(sh);
13181 #define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
13183 //int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13184 #define vtbx2_s8 vtbx2_u8
13186 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13187 #define vtbx2_p8 vtbx2_u8
13189 //uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
13190 _NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
13192 //solution may not be optimal
13193 uint8x8_t res64;
13194 __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
13195 c15 = _mm_set1_epi8 (15);
13196 c23 = _mm_set1_epi8 (23);
13197 c128 = _pM128i(c);
13198 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13199 maskgt23 = _mm_cmpgt_epi8(c128,c23);
13200 c23 = _mm_and_si128(maskgt23, _pM128i(a));
13201 b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
13202 sh0 = _mm_shuffle_epi8(b01, c128);
13203 sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped modulo 16 (bi-=16)
13204 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13205 sh0 = _mm_andnot_si128(maskgt23,sh0);
13206 sh0 = _mm_or_si128(sh0,c23);
13207 return64(sh0);
13209 #define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
13211 //int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13212 int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
13213 #define vtbx3_s8 vtbx3_u8
13215 //poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13216 poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
13217 #define vtbx3_p8 vtbx3_u8
13219 //uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
13220 _NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
13222 //solution may not be optimal
13223 uint8x8_t res64;
13224 __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13225 c15 = _mm_set1_epi8 (15);
13226 c31 = _mm_set1_epi8 (31);
13227 c128 = _pM128i(c);
13228 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13229 maskgt31 = _mm_cmpgt_epi8(c128,c31);
13230 c31 = _mm_and_si128(maskgt31, _pM128i(a));
13232 b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
13233 b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
13234 sh0 = _mm_shuffle_epi8(b01, c128);
13235 sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped modulo 16 (bi-=16)
13236 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13237 sh0 = _mm_andnot_si128(maskgt31,sh0);
13238 sh0 = _mm_or_si128(sh0,c31);
13239 return64(sh0);
13241 #define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
13243 //int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13244 int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
13245 #define vtbx4_s8 vtbx4_u8
13247 //poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13248 poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
13249 #define vtbx4_p8 vtbx4_u8
13251 //*************************************************************************************************
13252 // *************************** Operations with a scalar value *********************************
13253 //*************************************************************************************************
13255 //******* Vector multiply accumulate by scalar *************************************************
13256 //**********************************************************************************************
13257 int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13258 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13260 int16_t c;
13261 int16x4_t scalar;
13262 c = vget_lane_s16(v, l);
13263 scalar = vdup_n_s16(c);
13264 return vmla_s16(a, b, scalar);
13267 int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13268 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13270 int32_t c;
13271 int32x2_t scalar;
13272 c = vget_lane_s32(v, l);
13273 scalar = vdup_n_s32(c);
13274 return vmla_s32(a, b, scalar);
13277 uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13278 #define vmla_lane_u16 vmla_lane_s16
13281 uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13282 #define vmla_lane_u32 vmla_lane_s32
13284 float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13285 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13287 float32_t vlane;
13288 float32x2_t c;
13289 vlane = vget_lane_f32(v, l);
13290 c = vdup_n_f32(vlane);
13291 return vmla_f32(a,b,c);
13294 int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13295 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13297 int16_t vlane;
13298 int16x8_t c;
13299 vlane = vget_lane_s16(v, l);
13300 c = vdupq_n_s16(vlane);
13301 return vmlaq_s16(a,b,c);
13304 int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13305 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13307 int32_t vlane;
13308 int32x4_t c;
13309 vlane = vget_lane_s32(v, l);
13310 c = vdupq_n_s32(vlane);
13311 return vmlaq_s32(a,b,c);
13314 uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13315 #define vmlaq_lane_u16 vmlaq_lane_s16
13317 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13318 #define vmlaq_lane_u32 vmlaq_lane_s32
13320 float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13321 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13323 float32_t vlane;
13324 float32x4_t c;
13325 vlane = vget_lane_f32(v, l);
13326 c = vdupq_n_f32(vlane);
13327 return vmlaq_f32(a,b,c);
13330 //***************** Vector widening multiply accumulate by scalar **********************
13331 //***************************************************************************************
13332 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13333 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13335 int16_t vlane;
13336 int16x4_t c;
13337 vlane = vget_lane_s16(v, l);
13338 c = vdup_n_s16(vlane);
13339 return vmlal_s16(a, b, c);
13342 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13343 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13345 int32_t vlane;
13346 int32x2_t c;
13347 vlane = vget_lane_s32(v, l);
13348 c = vdup_n_s32(vlane);
13349 return vmlal_s32(a, b, c);
13352 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13353 _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13355 uint16_t vlane;
13356 uint16x4_t c;
13357 vlane = vget_lane_u16(v, l);
13358 c = vdup_n_u16(vlane);
13359 return vmlal_u16(a, b, c);
13362 uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13363 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13365 uint32_t vlane;
13366 uint32x2_t c;
13367 vlane = vget_lane_u32(v, l);
13368 c = vdup_n_u32(vlane);
13369 return vmlal_u32(a, b, c);
13372 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13373 // ************************************************************************************************
13374 int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13375 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13377 int16_t vlane;
13378 int16x4_t c;
13379 vlane = vget_lane_s16(v, l);
13380 c = vdup_n_s16(vlane);
13381 return vqdmlal_s16(a, b, c);
13384 int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13385 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13387 int32_t vlane;
13388 int32x2_t c;
13389 vlane = vget_lane_s32(v, l);
13390 c = vdup_n_s32(vlane);
13391 return vqdmlal_s32(a, b, c);
13394 // ****** Vector multiply subtract by scalar *****************
13395 // *************************************************************
13396 int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13397 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13399 int16_t vlane;
13400 int16x4_t c;
13401 vlane = vget_lane_s16(v, l);
13402 c = vdup_n_s16(vlane);
13403 return vmls_s16(a, b, c);
13406 int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13407 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13409 int32_t vlane;
13410 int32x2_t c;
13411 vlane = vget_lane_s32(v, l);
13412 c = vdup_n_s32(vlane);
13413 return vmls_s32(a, b, c);
13416 uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13417 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13419 uint16_t vlane;
13420 uint16x4_t c;
13421 vlane = vget_lane_s16(v, l);
13422 c = vdup_n_s16(vlane);
13423 return vmls_s16(a, b, c);
13426 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13427 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13429 uint32_t vlane;
13430 uint32x2_t c;
13431 vlane = vget_lane_u32(v, l);
13432 c = vdup_n_u32(vlane);
13433 return vmls_u32(a, b, c);
13436 float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13437 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13439 float32_t vlane;
13440 float32x2_t c;
13441 vlane = (float) vget_lane_f32(v, l);
13442 c = vdup_n_f32(vlane);
13443 return vmls_f32(a,b,c);
13446 int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13447 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13449 int16_t vlane;
13450 int16x8_t c;
13451 vlane = vget_lane_s16(v, l);
13452 c = vdupq_n_s16(vlane);
13453 return vmlsq_s16(a, b,c);
13456 int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13457 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13459 int32_t vlane;
13460 int32x4_t c;
13461 vlane = vget_lane_s32(v, l);
13462 c = vdupq_n_s32(vlane);
13463 return vmlsq_s32(a,b,c);
13466 uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13467 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13469 uint16_t vlane;
13470 uint16x8_t c;
13471 vlane = vget_lane_u16(v, l);
13472 c = vdupq_n_u16(vlane);
13473 return vmlsq_u16(a,b,c);
13476 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13477 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13479 uint32_t vlane;
13480 uint32x4_t c;
13481 vlane = vget_lane_u32(v, l);
13482 c = vdupq_n_u32(vlane);
13483 return vmlsq_u32(a,b,c);
13486 float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
13487 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13489 float32_t vlane;
13490 float32x4_t c;
13491 vlane = (float) vget_lane_f32(v, l);
13492 c = vdupq_n_f32(vlane);
13493 return vmlsq_f32(a,b,c);
13496 // **** Vector widening multiply subtract by scalar ****
13497 // ****************************************************
13498 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
13499 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13501 int16_t vlane;
13502 int16x4_t c;
13503 vlane = vget_lane_s16(v, l);
13504 c = vdup_n_s16(vlane);
13505 return vmlsl_s16(a, b, c);
13508 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
13509 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13511 int32_t vlane;
13512 int32x2_t c;
13513 vlane = vget_lane_s32(v, l);
13514 c = vdup_n_s32(vlane);
13515 return vmlsl_s32(a, b, c);
13518 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.s16 q0, d0, d0[0]
13519 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.s16 q0, d0, d0[0]
13521 uint16_t vlane;
13522 uint16x4_t c;
13523 vlane = vget_lane_u16(v, l);
13524 c = vdup_n_u16(vlane);
13525 return vmlsl_u16(a, b, c);
13528 uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
13529 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13531 uint32_t vlane;
13532 uint32x2_t c;
13533 vlane = vget_lane_u32(v, l);
13534 c = vdup_n_u32(vlane);
13535 return vmlsl_u32(a, b, c);
13538 //********* Vector widening saturating doubling multiply subtract by scalar **************************
13539 //******************************************************************************************************
13540 int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13541 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13543 int16_t vlane;
13544 int16x4_t c;
13545 vlane = vget_lane_s16(v, l);
13546 c = vdup_n_s16(vlane);
13547 return vqdmlsl_s16(a, b, c);
13550 int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13551 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13553 int32_t vlane;
13554 int32x2_t c;
13555 vlane = vget_lane_s32(v, l);
13556 c = vdup_n_s32(vlane);
13557 return vqdmlsl_s32(a, b, c);
13559 //********** Vector multiply with scalar *****************************
13560 int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13561 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13563 int16x4_t b16x4;
13564 b16x4 = vdup_n_s16(b);
13565 return vmul_s16(a, b16x4);
13568 int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13569 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13571 //serial solution looks faster
13572 int32x2_t b32x2;
13573 b32x2 = vdup_n_s32(b);
13574 return vmul_s32(a, b32x2);
13577 float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13578 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13580 float32x2_t b32x2;
13581 b32x2 = vdup_n_f32(b);
13582 return vmul_f32(a, b32x2);
13585 uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13586 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13588 uint16x4_t b16x4;
13589 b16x4 = vdup_n_s16(b);
13590 return vmul_s16(a, b16x4);
13593 uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13594 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13596 //serial solution looks faster
13597 uint32x2_t b32x2;
13598 b32x2 = vdup_n_u32(b);
13599 return vmul_u32(a, b32x2);
13602 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13603 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13605 int16x8_t b16x8;
13606 b16x8 = vdupq_n_s16(b);
13607 return vmulq_s16(a, b16x8);
13610 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13611 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13613 int32x4_t b32x4;
13614 b32x4 = vdupq_n_s32(b);
13615 return vmulq_s32(a, b32x4);
13618 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13619 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13621 float32x4_t b32x4;
13622 b32x4 = vdupq_n_f32(b);
13623 return vmulq_f32(a, b32x4);
13626 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13627 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13629 uint16x8_t b16x8;
13630 b16x8 = vdupq_n_s16(b);
13631 return vmulq_s16(a, b16x8);
13634 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13635 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13637 uint32x4_t b32x4;
13638 b32x4 = vdupq_n_u32(b);
13639 return vmulq_u32(a, b32x4);
13642 //********** Vector multiply lane *****************************
13643 int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13644 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13646 int16x4_t b16x4;
13647 int16_t vlane;
13648 vlane = vget_lane_s16(b, c);
13649 b16x4 = vdup_n_s16(vlane);
13650 return vmul_s16(a, b16x4);
13653 int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13654 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13656 int32x2_t b32x2;
13657 int32_t vlane;
13658 vlane = vget_lane_s32(b, c);
13659 b32x2 = vdup_n_s32(vlane);
13660 return vmul_s32(a, b32x2);
13663 float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13664 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13666 float32x2_t b32x2;
13667 float32_t vlane;
13668 vlane = vget_lane_f32(b, c);
13669 b32x2 = vdup_n_f32(vlane);
13670 return vmul_f32(a, b32x2);
13673 uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13674 #define vmul_lane_u16 vmul_lane_s16
13676 uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13677 #define vmul_lane_u32 vmul_lane_s32
13679 int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13680 _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13682 int16x8_t b16x8;
13683 int16_t vlane;
13684 vlane = vget_lane_s16(b, c);
13685 b16x8 = vdupq_n_s16(vlane);
13686 return vmulq_s16(a, b16x8);
13689 int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13690 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13692 int32x4_t b32x4;
13693 int32_t vlane;
13694 vlane = vget_lane_s32(b, c);
13695 b32x4 = vdupq_n_s32(vlane);
13696 return vmulq_s32(a, b32x4);
13699 float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13700 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13702 float32x4_t b32x4;
13703 float32_t vlane;
13704 vlane = vget_lane_f32(b, c);
13705 b32x4 = vdupq_n_f32(vlane);
13706 return vmulq_f32(a, b32x4);
13709 uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13710 #define vmulq_lane_u16 vmulq_lane_s16
13712 uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13713 #define vmulq_lane_u32 vmulq_lane_s32
13715 //**** Vector long multiply with scalar ************
13716 int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13717 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13719 int16x4_t b16x4;
13720 b16x4 = vdup_n_s16(val2);
13721 return vmull_s16(vec1, b16x4);
13724 int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13725 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13727 int32x2_t b32x2;
13728 b32x2 = vdup_n_s32(val2);
13729 return vmull_s32(vec1, b32x2);
13732 uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13733 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13735 uint16x4_t b16x4;
13736 b16x4 = vdup_n_u16(val2);
13737 return vmull_u16(vec1, b16x4); //unsigned widening multiply
13740 uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13741 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13743 uint32x2_t b32x2;
13744 b32x2 = vdup_n_u32(val2);
13745 return vmull_u32(vec1, b32x2);
13748 //**** Vector long multiply by scalar ****
13749 int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13750 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13752 int16_t vlane;
13753 int16x4_t b;
13754 vlane = vget_lane_s16(val2, val3);
13755 b = vdup_n_s16(vlane);
13756 return vmull_s16(vec1, b);
13759 int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13760 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13762 int32_t vlane;
13763 int32x2_t b;
13764 vlane = vget_lane_s32(val2, val3);
13765 b = vdup_n_s32(vlane);
13766 return vmull_s32(vec1, b);
13769 uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
13770 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
13772 uint16_t vlane;
13773 uint16x4_t b;
13774 vlane = vget_lane_u16(val2, val3);
13775 b = vdup_n_u16(vlane);
13776 return vmull_u16(vec1, b); //unsigned widening multiply
13779 uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13780 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13782 uint32_t vlane;
13783 uint32x2_t b;
13784 vlane = vget_lane_u32(val2, val3);
13785 b = vdup_n_u32(vlane);
13786 return vmull_u32(vec1, b);
13789 //********* Vector saturating doubling long multiply with scalar *******************
13790 int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13791 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13793 //the serial solution may be faster due to saturation
13794 int16x4_t b;
13795 b = vdup_n_s16(val2);
13796 return vqdmull_s16(vec1, b);
13799 int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13800 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13802 int32x2_t b;
13803 b = vdup_n_s32(val2);
13804 return vqdmull_s32(vec1,b); //slow serial function!!!!
13807 //************* Vector saturating doubling long multiply by scalar ***********************************************
13808 int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
13809 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
13811 int16_t c;
13812 int16x4_t scalar;
13813 c = vget_lane_s16(val2, val3);
13814 scalar = vdup_n_s16(c);
13815 return vqdmull_s16(vec1, scalar);
13819 int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
13820 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
13822 int32_t c;
13823 int32x2_t scalar;
13824 c = vget_lane_s32(val2, val3);
13825 scalar = vdup_n_s32(c);
13826 return vqdmull_s32(vec1,scalar); //slow serial function!!!!
13829 // *****Vector saturating doubling multiply high with scalar *****
13830 int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
13831 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
13833 int16x4_t res64;
13834 return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
13837 int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
13838 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
13840 int32x2_t res64;
13841 return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
13844 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
13845 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
13847 //solution may not be optimal
13848 int16x8_t scalar;
13849 scalar = vdupq_n_s16(val2);
13850 return vqdmulhq_s16(vec1, scalar);
13853 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
13854 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13856 int32x4_t scalar;
13857 scalar = vdupq_n_s32(val2);
13858 return vqdmulhq_s32(vec1, scalar);
13861 //***** Vector saturating doubling multiply high by scalar ****************
13862 int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
13863 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
13865 //solution may not be optimal
13866 int16_t vlane;
13867 int16x4_t scalar;
13868 vlane = vget_lane_s16(val2, val3);
13869 scalar = vdup_n_s16(vlane);
13870 return vqdmulh_s16(vec1, scalar);
13873 int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
13874 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13876 int32_t vlane;
13877 int32x2_t scalar;
13878 vlane = vget_lane_s32(val2, val3);
13879 scalar = vdup_n_s32(vlane);
13880 return vqdmulh_s32(vec1, scalar);
13883 int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
13884 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
13886 //solution may not be optimal
13887 int16_t vlane;
13888 int16x8_t scalar;
13889 vlane = vget_lane_s16(val2, val3);
13890 scalar = vdupq_n_s16(vlane );
13891 return vqdmulhq_s16(vec1, scalar);
13894 int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
13895 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13897 //solution may not be optimal
13898 int32_t vlane;
13899 int32x4_t scalar;
13900 vlane = vgetq_lane_s32(_pM128i(val2), val3);
13901 scalar = vdupq_n_s32(vlane );
13902 return vqdmulhq_s32(vec1, scalar);
13905 //******** Vector saturating rounding doubling multiply high with scalar ***
13906 int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
13907 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
13909 //solution may not be optimal
13910 int16x4_t scalar;
13911 scalar = vdup_n_s16(val2);
13912 return vqrdmulh_s16(vec1, scalar);
13915 int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
13916 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13918 int32x2_t scalar;
13919 scalar = vdup_n_s32(val2);
13920 return vqrdmulh_s32(vec1, scalar);
13923 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
13924 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
13926 //solution may not be optimal
13927 int16x8_t scalar;
13928 scalar = vdupq_n_s16(val2);
13929 return vqrdmulhq_s16(vec1, scalar);
13932 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
13933 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13935 int32x4_t scalar;
13936 scalar = vdupq_n_s32(val2);
13937 return vqrdmulhq_s32(vec1, scalar);
13940 //********* Vector rounding saturating doubling multiply high by scalar ****
13941 int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
13942 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
13944 //solution may not be optimal
13945 int16_t vlane;
13946 int16x4_t scalar;
13947 vlane = vget_lane_s16(val2, val3);
13948 scalar = vdup_n_s16(vlane);
13949 return vqrdmulh_s16(vec1, scalar);
13952 int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
13953 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13955 int32_t vlane;
13956 int32x2_t scalar;
13957 vlane = vget_lane_s32(val2, val3);
13958 scalar = vdup_n_s32(vlane);
13959 return vqrdmulh_s32(vec1, scalar);
13962 int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
13963 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
13965 //solution may not be optimal
13966 int16_t vlane;
13967 int16x8_t scalar;
13968 vlane = vget_lane_s16(val2, val3);
13969 scalar = vdupq_n_s16(vlane);
13970 return vqrdmulhq_s16(vec1, scalar);
13973 int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
13974 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13976 //solution may not be optimal
13977 int32_t vlane;
13978 int32x4_t scalar;
13979 vlane = vgetq_lane_s32(_pM128i(val2), val3);
13980 scalar = vdupq_n_s32(vlane );
13981 return vqrdmulhq_s32(vec1, scalar);
13984 //**************Vector multiply accumulate with scalar *******************
13985 int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
13986 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
13988 int16x4_t scalar;
13989 scalar = vdup_n_s16(c);
13990 return vmla_s16(a, b, scalar);
13993 int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
13994 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
13996 int32x2_t scalar;
13997 scalar = vdup_n_s32(c);
13998 return vmla_s32(a, b, scalar);
14001 uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14002 #define vmla_n_u16 vmla_n_s16
14005 uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14006 #define vmla_n_u32 vmla_n_s32
14009 float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14010 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14012 float32x2_t scalar;
14013 scalar = vdup_n_f32(c);
14014 return vmla_f32(a, b, scalar);
14017 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14018 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14020 int16x8_t scalar;
14021 scalar = vdupq_n_s16(c);
14022 return vmlaq_s16(a,b,scalar);
14025 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14026 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14028 int32x4_t scalar;
14029 scalar = vdupq_n_s32(c);
14030 return vmlaq_s32(a,b,scalar);
14033 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14034 #define vmlaq_n_u16 vmlaq_n_s16
14036 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14037 #define vmlaq_n_u32 vmlaq_n_s32
14039 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14040 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14042 float32x4_t scalar;
14043 scalar = vdupq_n_f32(c);
14044 return vmlaq_f32(a,b,scalar);
14047 //************Vector widening multiply accumulate with scalar****************************
14048 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14049 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14051 int16x4_t vc;
14052 vc = vdup_n_s16(c);
14053 return vmlal_s16(a, b, vc);
14056 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14057 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14059 int32x2_t vc;
14060 vc = vdup_n_s32(c);
14061 return vmlal_s32(a, b, vc);
14064 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
14065 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
14067 uint16x4_t vc;
14068 vc = vdup_n_u16(c);
14069 return vmlal_u16(a, b, vc);
14072 uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14073 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14075 uint32x2_t vc;
14076 vc = vdup_n_u32(c);
14077 return vmlal_u32(a, b, vc);
14080 //************ Vector widening saturating doubling multiply accumulate with scalar **************
14081 int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14082 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14084 //not an optimal SIMD solution, serial code may be faster
14085 int16x4_t vc;
14086 vc = vdup_n_s16(c);
14087 return vqdmlal_s16(a, b, vc);
14090 int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14091 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14093 int32x2_t vc;
14094 vc = vdup_n_s32(c);
14095 return vqdmlal_s32(a, b, vc);
14098 //******** Vector multiply subtract with scalar **************
14099 int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14100 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14102 int16x4_t vc;
14103 vc = vdup_n_s16(c);
14104 return vmls_s16(a, b, vc);
14107 int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14108 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14110 int32x2_t vc;
14111 vc = vdup_n_s32(c);
14112 return vmls_s32(a, b, vc);
14115 uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14116 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14118 uint16x4_t vc;
14119 vc = vdup_n_u16(c);
14120 return vmls_u16(a, b, vc);
14123 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14124 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14126 uint32x2_t vc;
14127 vc = vdup_n_u32(c);
14128 return vmls_u32(a, b, vc);
14131 float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14132 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14134 float32x2_t res;
14135 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14136 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14137 return res;
14140 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14141 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14143 int16x8_t vc;
14144 vc = vdupq_n_s16(c);
14145 return vmlsq_s16(a, b,vc);
14148 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14149 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14151 int32x4_t vc;
14152 vc = vdupq_n_s32(c);
14153 return vmlsq_s32(a,b,vc);
14156 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14157 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14159 uint16x8_t vc;
14160 vc = vdupq_n_u16(c);
14161 return vmlsq_u16(a,b,vc);
14164 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14165 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14167 uint32x4_t vc;
14168 vc = vdupq_n_u32(c);
14169 return vmlsq_u32(a,b,vc);
14172 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14173 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14175 float32x4_t vc;
14176 vc = vdupq_n_f32(c);
14177 return vmlsq_f32(a,b,vc);
14180 //**** Vector widening multiply subtract with scalar ******
14181 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14182 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14184 int16x4_t vc;
14185 vc = vdup_n_s16(c);
14186 return vmlsl_s16(a, b, vc);
14189 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14190 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14192 int32x2_t vc;
14193 vc = vdup_n_s32(c);
14194 return vmlsl_s32(a, b, vc);
14197 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
14198 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
14200 uint16x4_t vc;
14201 vc = vdup_n_u16(c);
14202 return vmlsl_u16(a, b, vc);
14205 uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14206 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14208 uint32x2_t vc;
14209 vc = vdup_n_u32(c);
14210 return vmlsl_u32(a, b, vc);
14213 //***** Vector widening saturating doubling multiply subtract with scalar *********
14214 //**********************************************************************************
14215 int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14216 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14218 int16x4_t vc;
14219 vc = vdup_n_s16(c);
14220 return vqdmlsl_s16(a, b, vc);
14223 int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14226 int32x2_t vc;
14227 vc = vdup_n_s32(c);
14228 return vqdmlsl_s32(a, b, vc);
14231 //******************* Vector extract ***********************************************
14232 //*************************************************************************************
14233 //VEXT (Vector Extract) extracts elements from the bottom end of the second operand
14234 //vector and the top end of the first, concatenates them, and places the result in the destination vector
14235 //c elements from the bottom end of the second operand and (8-c) from the top end of the first
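//A scalar sketch of the 8-element case described above (illustrative only, not compiled; the helper name is hypothetical):
#if 0
void vext8_reference(unsigned char res[8], const unsigned char a[8], const unsigned char b[8], int c)
{
    int i;
    for (i = 0; i < 8 - c; i++) res[i] = a[i + c];     //(8-c) elements from the top end of a
    for (i = 0; i < c; i++) res[8 - c + i] = b[i];     //c elements from the bottom end of b
    //e.g. a = {0..7}, b = {8..15}, c = 3  ->  res = {3,4,5,6,7,8,9,10}
}
#endif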
14236 int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14237 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14239 int8x8_t res;
14240 int i;
14241 for (i = 0; i<8 - c; i++) {
14242 res.m64_i8[i] = a.m64_i8[i + c];
14244 for(i = 0; i<c; i++) {
14245 res.m64_i8[8 - c + i] = b.m64_i8[i];
14247 return res;
14250 uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14251 #define vext_u8 vext_s8
14252 //same result tested
14254 poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14255 #define vext_p8 vext_u8
14257 int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14258 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14260 int16x4_t res;
14261 int i;
14262 for (i = 0; i<4 - c; i++) {
14263 res.m64_i16[i] = a.m64_i16[i + c];
14265 for(i = 0; i<c; i++) {
14266 res.m64_i16[4 - c + i] = b.m64_i16[i];
14268 return res;
14271 uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14272 #define vext_u16 vext_s16
14274 poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14275 #define vext_p16 vext_s16
14277 int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14278 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14280 int32x2_t res;
14281 if (c==0) {
14282 res.m64_i32[0] = a.m64_i32[0];
14283 res.m64_i32[1] = a.m64_i32[1];
14284 } else {
14285 res.m64_i32[0] = a.m64_i32[1];
14286 res.m64_i32[1] = b.m64_i32[0];
14288 return res;
14291 float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14294 float32x2_t res;
14295 if (c==0) {
14296 res.m64_f32[0] = a.m64_f32[0];
14297 res.m64_f32[1] = a.m64_f32[1];
14298 } else {
14299 res.m64_f32[0] = a.m64_f32[1];
14300 res.m64_f32[1] = b.m64_f32[0];
14302 return res;
14305 uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14306 #define vext_u32 vext_s32
14309 int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14310 #define vext_s64(a,b,c) a
14312 uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14313 #define vext_u64(a,b,c) a
14315 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14316 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14318 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14319 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14321 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14322 #define vextq_p8 vextq_s8
14324 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14325 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14327 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14328 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14330 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14331 #define vextq_p16 vextq_s16
14333 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14334 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14336 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14337 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14339 float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14340 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14342 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14343 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14345 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14346 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14348 //************ Reverse vector elements (swap endianness)*****************
14349 //*************************************************************************
14350 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
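//E.g. VREV64.16 reverses the four 16-bit lanes within each 64-bit set; a scalar sketch of the d-register case
//(illustrative only, not compiled; the helper name is hypothetical):
#if 0
void vrev64_s16_reference(short res[4], const short v[4])
{
    res[0] = v[3]; res[1] = v[2]; res[2] = v[1]; res[3] = v[0];   //{v0,v1,v2,v3} -> {v3,v2,v1,v0}
}
#endif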
14351 int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
14352 _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14354 int8x8_t res64;
14355 __m128i res;
14356 res = vrev64q_s8(_pM128i(vec));
14357 return64(res);
14360 int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
14361 _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14363 int16x4_t res64;
14364 __m128i res;
14365 res = vrev64q_s16(_pM128i(vec));
14366 return64(res);
14369 int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
14370 _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14372 int32x2_t res;
14373 res.m64_i32[0] = vec.m64_i32[1];
14374 res.m64_i32[1] = vec.m64_i32[0];
14375 return res;
14378 uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14379 #define vrev64_u8 vrev64_s8
14381 uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14382 #define vrev64_u16 vrev64_s16
14384 uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14385 #define vrev64_u32 vrev64_s32
14387 poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14388 #define vrev64_p8 vrev64_u8
14390 poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14391 #define vrev64_p16 vrev64_u16
14393 float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
14394 _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14396 float32x2_t res;
14397 res.m64_f32[0] = vec.m64_f32[1];
14398 res.m64_f32[1] = vec.m64_f32[0];
14399 return res;
14402 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
14403 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14405 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14406 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14409 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
14410 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14412 //there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14413 _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14414 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14417 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
14418 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14420 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14423 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14424 #define vrev64q_u8 vrev64q_s8
14426 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14427 #define vrev64q_u16 vrev64q_s16
14429 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14430 #define vrev64q_u32 vrev64q_s32
14432 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14433 #define vrev64q_p8 vrev64q_u8
14435 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14436 #define vrev64q_p16 vrev64q_u16
14438 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14439 #define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
14441 //******************** 32 bit shuffles **********************
14442 //************************************************************
14443 int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
14444 _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14446 int8x8_t res64;
14447 __m128i res;
14448 res = vrev32q_s8(_pM128i(vec));
14449 return64(res);
14452 int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
14453 _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14455 int16x4_t res64;
14456 __m128i res;
14457 res = vrev32q_s16(_pM128i(vec));
14458 return64(res);
14461 uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14462 #define vrev32_u8 vrev32_s8
14464 uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14465 #define vrev32_u16 vrev32_s16
14467 poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14468 #define vrev32_p8 vrev32_u8
14470 poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14471 #define vrev32_p16 vrev32_u16
14473 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
14474 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14476 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14477 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14480 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
14481 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14483 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14484 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14487 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14488 #define vrev32q_u8 vrev32q_s8
14490 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14491 #define vrev32q_u16 vrev32q_s16
14493 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14494 #define vrev32q_p8 vrev32q_u8
14496 poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14497 #define vrev32q_p16 vrev32q_u16
14499 //************* 16 bit shuffles **********************
14500 //******************************************************
14501 int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
14502 _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14504 int8x8_t res64;
14505 __m128i res;
14506 res = vrev16q_s8(_pM128i(vec));
14507 return64(res);
14510 uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14511 #define vrev16_u8 vrev16_s8
14513 poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14514 #define vrev16_p8 vrev16_u8
14516 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
14517 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14519 _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14520 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
14523 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14524 #define vrev16q_u8 vrev16q_s8
14526 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14527 #define vrev16q_p8 vrev16q_u8
14529 //*********************************************************************
14530 //**************** Other single operand arithmetic *******************
14531 //*********************************************************************
14533 //*********** Absolute: Vd[i] = |Va[i]| **********************************
14534 //************************************************************************
14535 int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
14536 _NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
14538 int8x8_t res64;
14539 __m128i res;
14540 res = _mm_abs_epi8(_pM128i(a));
14541 return64(res);
14545 int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
14546 _NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
14548 int16x4_t res64;
14549 __m128i res;
14550 res = _mm_abs_epi16(_pM128i(a));
14551 return64(res);
14554 int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
14555 _NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
14557 int32x2_t res64;
14558 __m128i res;
14559 res = _mm_abs_epi32(_pM128i(a));
14560 return64(res);
14563 float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
14564 _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14566 float32x4_t res;
14567 __m64_128 res64;
14568 _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14569 res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14570 _M64f(res64, res);
14571 return res64;
14574 int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14575 #define vabsq_s8 _mm_abs_epi8
14577 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14578 #define vabsq_s16 _mm_abs_epi16
14580 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14581 #define vabsq_s32 _mm_abs_epi32
14583 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
14584 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14586 _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14587 return _mm_and_ps (a, *(__m128*)c7fffffff);
14590 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14591 //**********************************************************************
14592 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
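//E.g. vqabsq_s8 (below) maps -128 (0x80) to +127 instead of the unrepresentable +128; a usage sketch
//(illustrative only, not compiled; the function name is hypothetical):
#if 0
static int8x16_t vqabsq_s8_example(void)
{
    int8x16_t x = _mm_set1_epi8(-128);
    return vqabsq_s8(x);   //every byte lane becomes 127
}
#endif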
14593 int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
14594 _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14596 int8x8_t res64;
14597 __m128i res;
14598 res = vqabsq_s8(_pM128i(a));
14599 return64(res);
14602 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
14603 _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14605 int16x4_t res64;
14606 __m128i res;
14607 res = vqabsq_s16(_pM128i(a));
14608 return64(res);
14611 int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
14612 _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14614 int32x2_t res64;
14615 __m128i res;
14616 res = vqabsq_s32(_pM128i(a));
14617 return64(res);
14620 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
14621 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14623 __m128i c_128, abs, abs_cmp;
14624 c_128 = _mm_set1_epi8 (0x80); //-128
14625 abs = _mm_abs_epi8 (a);
14626 abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
14627 return _mm_xor_si128 (abs, abs_cmp);
14630 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
14631 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14633 __m128i c_32768, abs, abs_cmp;
14634 c_32768 = _mm_set1_epi16 (0x8000); //-32768
14635 abs = _mm_abs_epi16 (a);
14636 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14637 return _mm_xor_si128 (abs, abs_cmp);
14640 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
14641 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14643 __m128i c80000000, abs, abs_cmp;
14644 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14645 abs = _mm_abs_epi32 (a);
14646 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14647 return _mm_xor_si128 (abs, abs_cmp);
14650 //*************** Negate: Vd[i] = - Va[i] *************************************
14651 //*****************************************************************************
14652 //several Negate implementations are possible for SIMD,
14653 //e.g. the _mm_sign function (a, vector of negative numbers), but the following one gives good performance:
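//The _mm_sign_* alternative mentioned above would look like this sketch (illustrative only, not compiled; the function name is hypothetical):
#if 0
static __m128i negate_bytes_example(__m128i x)
{
    return _mm_sign_epi8(x, _mm_set1_epi8(-1));   //negates every byte lane (zero lanes stay zero)
}
#endif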
14654 int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
14655 _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14657 int8x8_t res64;
14658 __m128i res;
14659 res = vnegq_s8(_pM128i(a));
14660 return64(res);
14663 int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
14664 _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14666 int16x4_t res64;
14667 __m128i res;
14668 res = vnegq_s16(_pM128i(a));
14669 return64(res);
14672 int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
14673 _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14675 int32x2_t res64;
14676 __m128i res;
14677 res = vnegq_s32(_pM128i(a));
14678 return64(res);
14681 float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
14682 _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14684 float32x4_t res;
14685 __m64_128 res64;
14686 _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14687 res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14688 _M64f(res64, res);
14689 return res64;
14692 int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
14693 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14695 __m128i zero;
14696 zero = _mm_setzero_si128 ();
14697 return _mm_sub_epi8 (zero, a);
14698 } //or _mm_sign_epi8 (a, negative numbers vector)
14700 int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
14701 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14703 __m128i zero;
14704 zero = _mm_setzero_si128 ();
14705 return _mm_sub_epi16 (zero, a);
14706 } //or _mm_sign_epi16 (a, negative numbers vector)
14708 int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
14709 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14711 __m128i zero;
14712 zero = _mm_setzero_si128 ();
14713 return _mm_sub_epi32 (zero, a);
14714 } //or _mm_sign_epi32 (a, negative numbers vector)
14716 float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
14717 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14719 _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14720 return _mm_xor_ps (a, *(__m128*) c80000000);
14723 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14724 //***************************************************************************************
14725 //For signed-integer data types, the negation of the most negative value cannot be represented, so with saturation the result is the maximum positive value
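//E.g. vqnegq_s32 (below) maps INT32_MIN (0x80000000) to INT32_MAX (0x7fffffff), whereas plain vnegq_s32 would wrap back to INT32_MIN;
//a usage sketch (illustrative only, not compiled; the function name is hypothetical):
#if 0
static int32x4_t vqnegq_s32_example(void)
{
    int32x4_t x = _mm_set1_epi32((int)0x80000000);   //INT32_MIN in every lane
    return vqnegq_s32(x);                            //0x7fffffff in every lane
}
#endif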
14726 int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
14727 _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14729 int8x8_t res64;
14730 __m128i res;
14731 res = vqnegq_s8(_pM128i(a));
14732 return64(res);
14735 int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
14736 _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14738 int16x4_t res64;
14739 __m128i res;
14740 res = vqnegq_s16(_pM128i(a));
14741 return64(res);
14744 int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
14745 _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14747 int32x2_t res64;
14748 __m128i res;
14749 res = vqnegq_s32(_pM128i(a));
14750 return64(res);
14753 int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
14754 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14756 __m128i zero;
14757 zero = _mm_setzero_si128 ();
14758 return _mm_subs_epi8 (zero, a); //saturating subtraction
14761 int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
14762 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14764 __m128i zero;
14765 zero = _mm_setzero_si128 ();
14766 return _mm_subs_epi16 (zero, a); //saturating subtraction
14769 int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
14770 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14772 //solution may not be optimal compared with a serial one
14773 __m128i c80000000, zero, sub, cmp;
14774 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14775 zero = _mm_setzero_si128 ();
14776 sub = _mm_sub_epi32 (zero, a); //subtraction
14777 cmp = _mm_cmpeq_epi32 (a, c80000000);
14778 return _mm_xor_si128 (sub, cmp);
14781 //****************** Count leading zeros ********************************
14782 //**************************************************************************
14783 //no corresponding vector intrinsics in IA32, so it needs to be implemented. While the implementation is efficient for 8 bits, it may not be for 16 and 32 bits
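//Scalar reference of the per-byte count that vclzq_s8 below computes with two 4-bit table lookups
//(illustrative only, not compiled; the helper name is hypothetical):
#if 0
int clz8_reference(unsigned char x)
{
    int n = 0;
    while (n < 8 && ((x << n) & 0x80) == 0) n++;   //count zero bits from the MSB down
    return n;                                      //8 for x == 0
}
#endif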
14784 int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14785 _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14787 int8x8_t res64;
14788 __m128i res;
14789 res = vclzq_s8(_pM128i(a));
14790 return64(res);
14793 int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
14794 _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
14796 int16x4_t res64;
14797 __m128i res;
14798 res = vclzq_s16(_pM128i(a));
14799 return64(res);
14802 int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
14803 _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
14805 int32x2_t res64;
14806 __m128i res;
14807 res = vclzq_s32(_pM128i(a));
14808 return64(res);
14812 uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
14813 #define vclz_u8 vclz_s8
14815 uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
14816 #define vclz_u16 vclz_s16
14818 uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
14819 #define vclz_u32 vclz_s32
14821 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
14822 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
14824 _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
14825 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
14826 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
14827 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
14828 __m128i maskLOW, c4, lowclz, mask, hiclz;
14829 maskLOW = _mm_set1_epi8(0x0f); //0x0f mask; the low-nibble lookup below needs no masking - if a byte's MSB is set, _mm_shuffle_epi8 returns 0 for it, and that lane is zeroed later anyway
14830 c4 = _mm_set1_epi8(4);
14831 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
14832 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
14833 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
14834 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
14835 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
14836 lowclz = _mm_and_si128(lowclz,mask);
14837 return _mm_add_epi8(lowclz, hiclz);
14840 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
14841 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
14843 __m128i c7, res8x16, res8x16_swap;
14844 _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
14845 _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
14846 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
14847 res8x16 = vclzq_s8(a);
14848 res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
14849 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
14850 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
14851 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
14852 res8x16 = _mm_and_si128(res8x16, c7); //lowclz
14853 return _mm_add_epi16(res8x16_swap, res8x16);
14856 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
14857 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
14859 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
14860 c55555555 = _mm_set1_epi32(0x55555555);
14861 c33333333 = _mm_set1_epi32(0x33333333);
14862 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
14863 c3f = _mm_set1_epi32(0x3f);
14864 c32 = _mm_set1_epi32(32);
14865 tmp = _mm_srli_epi32(a, 1);
14866 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
14867 tmp = _mm_srli_epi32(res, 2);
14868 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
14869 tmp = _mm_srli_epi32(res, 4);
14870 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
14871 tmp = _mm_srli_epi32(res, 8);
14872 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
14873 tmp = _mm_srli_epi32(res, 16);
14874 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
14876 tmp = _mm_srli_epi32(res, 1);
14877 tmp = _mm_and_si128(tmp, c55555555);
14878 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
14880 tmp = _mm_srli_epi32(res, 2);
14881 tmp = _mm_and_si128(tmp, c33333333);
14882 tmp1 = _mm_and_si128(res, c33333333);
14883 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
14885 tmp = _mm_srli_epi32(res, 4);
14886 tmp = _mm_add_epi32(tmp, res);
14887 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
14889 tmp = _mm_srli_epi32(res, 8);
14890 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
14892 tmp = _mm_srli_epi32(res, 16);
14893 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
14895 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
14897 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
14900 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
14901 #define vclzq_u8 vclzq_s8
14903 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
14904 #define vclzq_u16 vclzq_s16
14906 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
14907 #define vclzq_u32 vclzq_s32
14909 //************** Count leading sign bits **************************
14910 //********************************************************************
14911 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
14912 // the topmost bit, that are the same as the topmost bit, in each element in a vector
14913 //No corresponding vector intrinsics in IA32, so it needs to be implemented.
14914 //While the implementation is efficient for 8 bits, it may not be for 16 and 32 bits
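//Scalar sketch of what the SIMD code below computes per element: flip negative inputs, then count leading zeros minus one
//(illustrative only, not compiled; the helper name is hypothetical):
#if 0
int cls32_reference(int x)
{
    unsigned int u = (x < 0) ? ~(unsigned int)x : (unsigned int)x;   //same flip as a_comb below
    int n = 0;
    while (n < 32 && (u & 0x80000000u) == 0) { n++; u <<= 1; }       //leading zeros of u
    return n - 1;                                                    //31 for x == 0 or x == -1
}
#endif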
14915 int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
14916 _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
14918 int8x8_t res64;
14919 __m128i res;
14920 res = vclsq_s8(_pM128i(a));
14921 return64(res);
14924 int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
14925 _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
14927 int16x4_t res64;
14928 __m128i res;
14929 res = vclsq_s16(_pM128i(a));
14930 return64(res);
14933 int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
14934 _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
14936 int32x2_t res64;
14937 __m128i res;
14938 res = vclsq_s32(_pM128i(a));
14939 return64(res);
14942 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
14943 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
14945 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
14946 cff = _mm_cmpeq_epi8 (a,a); //0xff
14947 c80 = _mm_set1_epi8(0x80);
14948 c1 = _mm_set1_epi8(1);
14949 a_mask = _mm_and_si128(a, c80);
14950 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
14951 a_neg = _mm_xor_si128(a, cff);
14952 a_neg = _mm_and_si128(a_mask, a_neg);
14953 a_pos = _mm_andnot_si128(a_mask, a);
14954 a_comb = _mm_or_si128(a_pos, a_neg);
14955 a_comb = vclzq_s8(a_comb);
14956 return _mm_sub_epi8(a_comb, c1);
14959 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
14960 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
14962 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
14963 cffff = _mm_cmpeq_epi16(a,a);
14964 c8000 = _mm_slli_epi16(cffff, 15); //0x8000
14965 c1 = _mm_srli_epi16(cffff,15); //0x1
14966 a_mask = _mm_and_si128(a, c8000);
14967 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
14968 a_neg = _mm_xor_si128(a, cffff);
14969 a_neg = _mm_and_si128(a_mask, a_neg);
14970 a_pos = _mm_andnot_si128(a_mask, a);
14971 a_comb = _mm_or_si128(a_pos, a_neg);
14972 a_comb = vclzq_s16(a_comb);
14973 return _mm_sub_epi16(a_comb, c1);
14976 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
14977 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
14979 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
14980 cffffffff = _mm_cmpeq_epi32(a,a);
14981 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
14982 c1 = _mm_srli_epi32(cffffffff,31); //0x1
14983 a_mask = _mm_and_si128(a, c80000000);
14984 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
14985 a_neg = _mm_xor_si128(a, cffffffff);
14986 a_neg = _mm_and_si128(a_mask, a_neg);
14987 a_pos = _mm_andnot_si128(a_mask, a);
14988 a_comb = _mm_or_si128(a_pos, a_neg);
14989 a_comb = vclzq_s32(a_comb);
14990 return _mm_sub_epi32(a_comb, c1);
14993 //************************* Count number of set bits ********************************
14994 //*************************************************************************************
14995 //No corresponding SIMD solution. One option is to extract the elements, widen each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) for each element
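//That per-element option might look like the sketch below (illustrative only, not compiled; the function name is hypothetical
//and this is usually slower than the table lookup used further down):
#if 0
#ifdef USE_SSE4
uint8x16_t vcntq_u8_popcnt_sketch(uint8x16_t a)
{
    _NEON2SSE_ALIGN_16 unsigned char tmp[16];
    int i;
    _mm_store_si128((__m128i*)tmp, a);
    for (i = 0; i < 16; i++) tmp[i] = (unsigned char)_mm_popcnt_u32(tmp[i]);   //scalar popcount per byte
    return _mm_load_si128((__m128i*)tmp);
}
#endif
#endif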
14996 //another option is to do the following algorithm:
14998 uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
14999 _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15001 uint8x8_t res64;
15002 __m128i res;
15003 res = vcntq_u8(_pM128i(a));
15004 return64(res);
15007 int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15008 #define vcnt_s8 vcnt_u8
15010 poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15011 #define vcnt_p8 vcnt_u8
15013 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
15014 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15016 _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15017 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15018 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15019 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 };
15020 __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15021 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15022 mask = _mm_and_si128(a, maskLOW);
15023 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15024 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15025 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15026 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15027 return _mm_add_epi8(lowpopcnt, hipopcnt);
15030 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15031 #define vcntq_s8 vcntq_u8
15033 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15034 #define vcntq_p8 vcntq_u8
15036 //**************************************************************************************
15037 //*********************** Logical operations ****************************************
15038 //**************************************************************************************
15039 //************************** Bitwise not ***********************************
15040 //several Bitwise not implementations are possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance
15041 int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
15042 _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15044 int8x8_t res64;
15045 __m128i res;
15046 res = vmvnq_s8(_pM128i(a));
15047 return64(res);
15050 int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
15051 _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15053 int16x4_t res64;
15054 __m128i res;
15055 res = vmvnq_s16(_pM128i(a));
15056 return64(res);
15059 int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
15060 _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15062 int32x2_t res64;
15063 __m128i res;
15064 res = vmvnq_s32(_pM128i(a));
15065 return64(res);
15068 uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15069 #define vmvn_u8 vmvn_s8
15071 uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15072 #define vmvn_u16 vmvn_s16
15074 uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15075 #define vmvn_u32 vmvn_s32
15077 poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15078 #define vmvn_p8 vmvn_u8
15080 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
15081 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15083 __m128i c1;
15084 c1 = _mm_cmpeq_epi8 (a,a); //0xff
15085 return _mm_andnot_si128 (a, c1);
15088 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
15089 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15091 __m128i c1;
15092 c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15093 return _mm_andnot_si128 (a, c1);
15096 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
15097 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15099 __m128i c1;
15100 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15101 return _mm_andnot_si128 (a, c1);
15104 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15105 #define vmvnq_u8 vmvnq_s8
15107 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15108 #define vmvnq_u16 vmvnq_s16
15110 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15111 #define vmvnq_u32 vmvnq_s32
15113 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15114 #define vmvnq_p8 vmvnq_u8
15116 //****************** Bitwise and ***********************
15117 //******************************************************
15118 int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
15119 _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15121 int8x8_t res64;
15122 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15125 int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
15126 _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15128 int16x4_t res64;
15129 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15132 int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
15133 _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15135 int32x2_t res64;
15136 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15140 int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
15141 _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
15143 int64x1_t res;
15144 res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15145 return res;
15148 uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15149 #define vand_u8 vand_s8
15151 uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15152 #define vand_u16 vand_s16
15154 uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15155 #define vand_u32 vand_s32
15157 uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
15158 #define vand_u64 vand_s64
15161 int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15162 #define vandq_s8 _mm_and_si128
15164 int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15165 #define vandq_s16 _mm_and_si128
15167 int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15168 #define vandq_s32 _mm_and_si128
15170 int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15171 #define vandq_s64 _mm_and_si128
15173 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15174 #define vandq_u8 _mm_and_si128
15176 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15177 #define vandq_u16 _mm_and_si128
15179 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15180 #define vandq_u32 _mm_and_si128
15182 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15183 #define vandq_u64 _mm_and_si128
15185 //******************** Bitwise or *********************************
15186 //******************************************************************
15187 int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
15188 _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15190 int8x8_t res64;
15191 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15195 int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
15196 _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15198 int16x4_t res64;
15199 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15203 int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
15204 _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15206 int32x2_t res64;
15207 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15211 int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
15212 _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
15214 int64x1_t res;
15215 res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15216 return res;
15219 uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15220 #define vorr_u8 vorr_s8
15222 uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15223 #define vorr_u16 vorr_s16
15225 uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15226 #define vorr_u32 vorr_s32
15228 uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
15229 #define vorr_u64 vorr_s64
15231 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15232 #define vorrq_s8 _mm_or_si128
15234 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15235 #define vorrq_s16 _mm_or_si128
15237 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15238 #define vorrq_s32 _mm_or_si128
15240 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15241 #define vorrq_s64 _mm_or_si128
15243 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15244 #define vorrq_u8 _mm_or_si128
15246 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15247 #define vorrq_u16 _mm_or_si128
15249 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15250 #define vorrq_u32 _mm_or_si128
15252 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15253 #define vorrq_u64 _mm_or_si128
15255 //************* Bitwise exclusive or (EOR or XOR) ******************
15256 //*******************************************************************
15257 int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
15258 _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15260 int8x8_t res64;
15261 return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15264 int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15265 #define veor_s16 veor_s8
15267 int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15268 #define veor_s32 veor_s8
15270 int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
15271 _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
15273 int64x1_t res;
15274 res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15275 return res;
15278 uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15279 #define veor_u8 veor_s8
15281 uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15282 #define veor_u16 veor_s16
15284 uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15285 #define veor_u32 veor_s32
15287 uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
15288 #define veor_u64 veor_s64
15290 int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15291 #define veorq_s8 _mm_xor_si128
15293 int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15294 #define veorq_s16 _mm_xor_si128
15296 int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15297 #define veorq_s32 _mm_xor_si128
15299 int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15300 #define veorq_s64 _mm_xor_si128
15302 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15303 #define veorq_u8 _mm_xor_si128
15305 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15306 #define veorq_u16 _mm_xor_si128
15308 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15309 #define veorq_u32 _mm_xor_si128
15311 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15312 #define veorq_u64 _mm_xor_si128
15314 //********************** Bit Clear **********************************
15315 //*******************************************************************
15316 //Logical AND complement (AND negation or AND NOT)
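//Note the operand order: NEON VBIC computes a & (~b), while _mm_andnot_si128(x, y) computes (~x) & y,
//hence the swapped arguments in the implementations below; a sketch of the equivalence (illustrative only, not compiled; the function name is hypothetical):
#if 0
static __m128i vbic_example(void)
{
    __m128i a = _mm_set1_epi8(0x0f);
    __m128i b = _mm_set1_epi8(0x03);
    return _mm_andnot_si128(b, a);   //equals a & ~b, i.e. 0x0c in every byte lane
}
#endif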
15317 int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
15318 _NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
15320 int8x8_t res64;
15321 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15324 int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15325 #define vbic_s16 vbic_s8
15327 int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15328 #define vbic_s32 vbic_s8
15330 int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
15331 _NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
15333 int64x1_t res;
15334 res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15335 return res;
15338 uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15339 #define vbic_u8 vbic_s8
15341 uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15342 #define vbic_u16 vbic_s16
15344 uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15345 #define vbic_u32 vbic_s32
15347 uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15348 #define vbic_u64 vbic_s64
15350 int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15351 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15353 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15354 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15356 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15357 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15359 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15360 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15362 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15363 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15365 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15366 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15368 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15369 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15371 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15372 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15374 //**************** Bitwise OR complement ********************************
15375 //*************************************************************************
15376 //no exact IA32 match, so it is implemented as follows
15377 int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
15378 _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
15380 int8x8_t res64;
15381 return64(vornq_s8(_pM128i(a), _pM128i(b)));
15385 int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
15386 _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
15388 int16x4_t res64;
15389 return64(vornq_s16(_pM128i(a), _pM128i(b)));
15393 int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
15394 _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
15396 int32x2_t res64;
15397 return64(vornq_s32(_pM128i(a), _pM128i(b)));
15401 int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
15402 _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15404 int64x1_t res;
15405 res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15406 return res;
15409 uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
15410 #define vorn_u8 vorn_s8
15413 uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
15414 #define vorn_u16 vorn_s16
15416 uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
15417 #define vorn_u32 vorn_s32
15419 uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15420 #define vorn_u64 vorn_s64
15423 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
15424 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15426 __m128i b1;
15427 b1 = vmvnq_s8( b); //bitwise not for b
15428 return _mm_or_si128 (a, b1);
15431 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
15432 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15434 __m128i b1;
15435 b1 = vmvnq_s16( b); //bitwise not for b
15436 return _mm_or_si128 (a, b1);
15439 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
15440 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15442 __m128i b1;
15443 b1 = vmvnq_s32( b); //bitwise not for b
15444 return _mm_or_si128 (a, b1);
15447 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
15448 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15450 __m128i c1, b1;
15451 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15452 b1 = _mm_andnot_si128 (b, c1);
15453 return _mm_or_si128 (a, b1);
15456 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
15457 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15459 __m128i b1;
15460 b1 = vmvnq_u8( b); //bitwise not for b
15461 return _mm_or_si128 (a, b1);
15464 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
15465 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15467 __m128i b1;
15468 b1 = vmvnq_s16( b); //bitwise not for b
15469 return _mm_or_si128 (a, b1);
15472 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
15473 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15475 __m128i b1;
15476 b1 = vmvnq_u32( b); //bitwise not for b
15477 return _mm_or_si128 (a, b1);
15479 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15480 #define vornq_u64 vornq_s64
15482 //********************* Bitwise Select *****************************
15483 //******************************************************************
15484 //Note: this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
15486 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15487 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15489 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15490 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15492 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15493 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15495 //Only VBSL is implemented for SIMD
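//Per bit the select is dst = (mask & b) | (~mask & c); a scalar sketch (illustrative only, not compiled; the helper name is hypothetical):
#if 0
unsigned int bsl32_reference(unsigned int mask, unsigned int b, unsigned int c)
{
    return (mask & b) | (~mask & c);   //bit i taken from b where mask bit i is 1, from c otherwise
}
#endif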
15496 int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
15497 _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15499 int8x8_t res64;
15500 __m128i res;
15501 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15502 return64(res);
15505 int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15506 #define vbsl_s16 vbsl_s8
15508 int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15509 #define vbsl_s32 vbsl_s8
15511 int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
15512 _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15514 int64x1_t res;
15515 res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15516 return res;
15519 uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15520 #define vbsl_u8 vbsl_s8
15522 uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15523 #define vbsl_u16 vbsl_s8
15525 uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15526 #define vbsl_u32 vbsl_s8
15528 uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15529 #define vbsl_u64 vbsl_s64
15531 float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
15532 _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15534 __m128 sel1, sel2;
15535 __m64_128 res64;
15536 sel1 = _mm_and_ps (_pM128(a), _pM128(b));
15537 sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15538 sel1 = _mm_or_ps (sel1, sel2);
15539 _M64f(res64, sel1);
15540 return res64;
15543 poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15544 #define vbsl_p8 vbsl_s8
15546 poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15547 #define vbsl_p16 vbsl_s8
15549 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
15550 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15552 __m128i sel1, sel2;
15553 sel1 = _mm_and_si128 (a, b);
15554 sel2 = _mm_andnot_si128 (a, c);
15555 return _mm_or_si128 (sel1, sel2);
15558 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15559 #define vbslq_s16 vbslq_s8
15561 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15562 #define vbslq_s32 vbslq_s8
15564 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15565 #define vbslq_s64 vbslq_s8
15567 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15568 #define vbslq_u8 vbslq_s8
15570 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15571 #define vbslq_u16 vbslq_s8
15573 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15574 #define vbslq_u32 vbslq_s8
15576 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15577 #define vbslq_u64 vbslq_s8
15579 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
15580 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15582 __m128 sel1, sel2;
15583 sel1 = _mm_and_ps (*(__m128*)&a, b);
15584 sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15585 return _mm_or_ps (sel1, sel2);
15588 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15589 #define vbslq_p8 vbslq_u8
15591 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15592 #define vbslq_p16 vbslq_s8
15594 //************************************************************************************
15595 //**************** Transposition operations ****************************************
15596 //************************************************************************************
15597 //***************** Vector Transpose ************************************************
15598 //************************************************************************************
15599 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
15600 // making the results look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
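//Illustrative scalar reference for the d-register forms (hypothetical helper, not part of the original header):
//    static void trn_ref_u8(uint8_t r0[8], uint8_t r1[8], const uint8_t a[8], const uint8_t b[8])
//    {
//        int i;
//        for (i = 0; i < 8; i += 2) {
//            r0[i] = a[i];     r0[i + 1] = b[i];     //even pairs: a0,b0, a2,b2, ...
//            r1[i] = a[i + 1]; r1[i + 1] = b[i + 1]; //odd pairs:  a1,b1, a3,b3, ...
//        }
//    }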
15601 int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
15602 _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15604 int8x8x2_t val;
15605 __m128i tmp, val0;
15606 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
15607 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15608 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15609 vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15610 return val;
15613 int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
15614 _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15616 int16x4x2_t val;
15617 __m128i tmp, val0;
15618 _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15619 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15620 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15621 vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15622 return val;
15625 int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
15626 _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15628 int32x2x2_t val;
15629 __m128i val0;
15630 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15631 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15632 return val;
15635 uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15636 #define vtrn_u8 vtrn_s8
15638 uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15639 #define vtrn_u16 vtrn_s16
15641 uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15642 #define vtrn_u32 vtrn_s32
15644 float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
15645 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15647 float32x2x2_t val;
15648 val.val[0].m64_f32[0] = a.m64_f32[0];
15649 val.val[0].m64_f32[1] = b.m64_f32[0];
15650 val.val[1].m64_f32[0] = a.m64_f32[1];
15651 val.val[1].m64_f32[1] = b.m64_f32[1];
15652 return val; //a0,b0,a1,b1
15655 poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15656 #define vtrn_p8 vtrn_u8
15658 poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15659 #define vtrn_p16 vtrn_s16
15661 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
15662 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
15664 int8x16x2_t r8x16;
15665 __m128i a_sh, b_sh;
15666 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
15667 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15668 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15670 r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
15671 r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
15672 return r8x16;
15675 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
15676 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
15678 int16x8x2_t v16x8;
15679 __m128i a_sh, b_sh;
15680 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
15681 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
15682 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
15683 v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
15684 v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
15685 return v16x8;
15688 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
15689 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
15691 //may not be the optimal solution compared with a serial one
15692 int32x4x2_t v32x4;
15693 __m128i a_sh, b_sh;
15694 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15695 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15697 v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
15698 v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
15699 return v32x4;
15702 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
15703 #define vtrnq_u8 vtrnq_s8
15705 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
15706 #define vtrnq_u16 vtrnq_s16
15708 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
15709 #define vtrnq_u32 vtrnq_s32
15711 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
15712 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
15714 //may not be the optimal solution compared with a serial one
15715 float32x4x2_t f32x4;
15716 __m128 a_sh, b_sh;
15717 a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
15718 b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
15720 f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
15721 f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
15722 return f32x4;
15725 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
15726 #define vtrnq_p8 vtrnq_s8
15728 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
15729 #define vtrnq_p16 vtrnq_s16
15731 //***************** Interleave elements ***************************
15732 //*****************************************************************
15733 //output has (a0,b0,a1,b1, a2,b2,.....)
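//Illustrative scalar reference for the d-register forms (hypothetical helper, not part of the original header):
//    static void zip_ref_u8(uint8_t lo[8], uint8_t hi[8], const uint8_t a[8], const uint8_t b[8])
//    {
//        int i;
//        for (i = 0; i < 4; i++) {
//            lo[2 * i] = a[i];     lo[2 * i + 1] = b[i];     //val[0]: a0,b0, a1,b1, a2,b2, a3,b3
//            hi[2 * i] = a[i + 4]; hi[2 * i + 1] = b[i + 4]; //val[1]: a4,b4, a5,b5, a6,b6, a7,b7
//        }
//    }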
15734 int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
15735 _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
15737 int8x8x2_t val;
15738 __m128i val0;
15739 val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
15740 vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15741 return val;
15744 int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
15745 _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
15747 int16x4x2_t val;
15748 __m128i val0;
15749 val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
15750 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15751 return val;
15754 int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
15755 #define vzip_s32 vtrn_s32
15757 uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
15758 #define vzip_u8 vzip_s8
15760 uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
15761 #define vzip_u16 vzip_s16
15763 uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
15764 #define vzip_u32 vzip_s32
15766 float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
15767 #define vzip_f32 vtrn_f32
15769 poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
15770 #define vzip_p8 vzip_u8
15772 poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
15773 #define vzip_p16 vzip_u16
15775 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
15776 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
15778 int8x16x2_t r8x16;
15779 r8x16.val[0] = _mm_unpacklo_epi8(a, b);
15780 r8x16.val[1] = _mm_unpackhi_epi8(a, b);
15781 return r8x16;
15784 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
15785 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
15787 int16x8x2_t r16x8;
15788 r16x8.val[0] = _mm_unpacklo_epi16(a, b);
15789 r16x8.val[1] = _mm_unpackhi_epi16(a, b);
15790 return r16x8;
15793 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
15794 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
15796 int32x4x2_t r32x4;
15797 r32x4.val[0] = _mm_unpacklo_epi32(a, b);
15798 r32x4.val[1] = _mm_unpackhi_epi32(a, b);
15799 return r32x4;
15802 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
15803 #define vzipq_u8 vzipq_s8
15805 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
15806 #define vzipq_u16 vzipq_s16
15808 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
15809 #define vzipq_u32 vzipq_s32
15811 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
15812 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
15814 float32x4x2_t f32x4;
15815 f32x4.val[0] = _mm_unpacklo_ps ( a, b);
15816 f32x4.val[1] = _mm_unpackhi_ps ( a, b);
15817 return f32x4;
15820 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
15821 #define vzipq_p8 vzipq_u8
15823 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
15824 #define vzipq_p16 vzipq_u16
15826 //*********************** De-Interleave elements *************************
15827 //*************************************************************************
15828 //As a result of these functions the first val contains (a0,a2,a4,...,b0,b2,b4,...) and the second val contains (a1,a3,a5,...,b1,b3,b5,...)
15829 //there are no such functions in IA32 SIMD, so a shuffle is required
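//Illustrative scalar reference for the d-register forms (hypothetical helper, not part of the original header):
//    static void uzp_ref_u8(uint8_t even[8], uint8_t odd[8], const uint8_t a[8], const uint8_t b[8])
//    {
//        int i;
//        for (i = 0; i < 4; i++) {
//            even[i] = a[2 * i];     even[i + 4] = b[2 * i];     //val[0]: a0,a2,a4,a6, b0,b2,b4,b6
//            odd[i]  = a[2 * i + 1]; odd[i + 4]  = b[2 * i + 1]; //val[1]: a1,a3,a5,a7, b1,b3,b5,b7
//        }
//    }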
15830 int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
15831 _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
15833 int8x8x2_t val;
15834 __m128i tmp, val0;
15835 _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
15836 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15837 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
15838 vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15839 return val;
15842 int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
15843 _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
15845 int16x4x2_t val;
15846 __m128i tmp, val0;
15847 _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
15848 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15849 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
15850 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15851 return val;
15854 int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
15855 _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
15857 int32x2x2_t val;
15858 __m128i val0;
15859 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
15860 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15861 return val;
15864 uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
15865 #define vuzp_u8 vuzp_s8
15867 uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
15868 #define vuzp_u16 vuzp_s16
15870 uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
15871 #define vuzp_u32 vuzp_s32
15873 float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
15874 #define vuzp_f32 vzip_f32
15876 poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
15877 #define vuzp_p8 vuzp_u8
15879 poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
15880 #define vuzp_p16 vuzp_u16
15882 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
15883 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
15885 int8x16x2_t v8x16;
15886 __m128i a_sh, b_sh;
15887 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
15888 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15889 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15890 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
15891 v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
15892 v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
15893 return v8x16;
15896 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
15897 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
15899 int16x8x2_t v16x8;
15900 __m128i a_sh, b_sh;
15901 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
15902 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
15903 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
15904 v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
15905 v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
15906 return v16x8;
15909 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
15910 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
15912 //may not be the optimal solution compared with a serial one
15913 int32x4x2_t v32x4;
15914 __m128i a_sh, b_sh;
15915 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15916 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15918 v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
15919 v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
15920 return v32x4;
15923 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
15924 #define vuzpq_u8 vuzpq_s8
15926 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
15927 #define vuzpq_u16 vuzpq_s16
15929 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
15930 #define vuzpq_u32 vuzpq_s32
15932 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
15933 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
15935 float32x4x2_t v32x4;
15936 v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
15937 v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
15938 return v32x4;
15941 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
15942 #define vuzpq_p8 vuzpq_u8
15944 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
15945 #define vuzpq_p16 vuzpq_u16
15947 //##############################################################################################
15948 //*********************** Reinterpret cast intrinsics.******************************************
15949 //##############################################################################################
15950 // Not a part of the official NEON instruction set but available in the gcc compiler *********************
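//These casts only relabel the bit pattern; no value conversion is performed.
//Illustrative usage (a sketch assuming the vdupq_n_u32/veorq_u32 mappings defined earlier in this header; negate_lanes is a hypothetical helper):
//    static float32x4_t negate_lanes(float32x4_t v)
//    {
//        uint32x4_t sign = vdupq_n_u32(0x80000000);
//        return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(v), sign ) ); //flip each lane's sign bit
//    }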
15951 poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
15952 #define vreinterpret_p8_u32
15954 poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
15955 #define vreinterpret_p8_u16
15957 poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
15958 #define vreinterpret_p8_u8
15960 poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
15961 #define vreinterpret_p8_s32
15963 poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
15964 #define vreinterpret_p8_s16
15966 poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
15967 #define vreinterpret_p8_s8
15969 poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
15970 #define vreinterpret_p8_u64
15972 poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
15973 #define vreinterpret_p8_s64
15975 poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
15976 #define vreinterpret_p8_f32
15978 poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
15979 #define vreinterpret_p8_p16
15981 poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
15982 #define vreinterpretq_p8_u32
15984 poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
15985 #define vreinterpretq_p8_u16
15987 poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
15988 #define vreinterpretq_p8_u8
15990 poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
15991 #define vreinterpretq_p8_s32
15993 poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
15994 #define vreinterpretq_p8_s16
15996 poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
15997 #define vreinterpretq_p8_s8
15999 poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
16000 #define vreinterpretq_p8_u64
16002 poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
16003 #define vreinterpretq_p8_s64
16005 poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
16006 #define vreinterpretq_p8_f32(t) _M128i(t)
16008 poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
16009 #define vreinterpretq_p8_p16
16011 poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
16012 #define vreinterpret_p16_u32
16014 poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
16015 #define vreinterpret_p16_u16
16017 poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
16018 #define vreinterpret_p16_u8
16020 poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
16021 #define vreinterpret_p16_s32
16023 poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
16024 #define vreinterpret_p16_s16
16026 poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
16027 #define vreinterpret_p16_s8
16029 poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
16030 #define vreinterpret_p16_u64
16032 poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
16033 #define vreinterpret_p16_s64
16035 poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
16036 #define vreinterpret_p16_f32
16038 poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
16039 #define vreinterpret_p16_p8
16041 poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
16042 #define vreinterpretq_p16_u32
16044 poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
16045 #define vreinterpretq_p16_u16
16047 poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
16048 #define vreinterpretq_p16_s32
16050 poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
16051 #define vreinterpretq_p16_s16
16053 poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
16054 #define vreinterpretq_p16_s8
16056 poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
16057 #define vreinterpretq_p16_u64
16059 poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
16060 #define vreinterpretq_p16_s64
16062 poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
16063 #define vreinterpretq_p16_f32(t) _M128i(t)
16065 poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
16066 #define vreinterpretq_p16_p8 vreinterpretq_s16_p8
16068 //**** Integer to float ******
16069 float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
16070 #define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
16073 float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
16074 #define vreinterpret_f32_u16 vreinterpret_f32_u32
16077 float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
16078 #define vreinterpret_f32_u8 vreinterpret_f32_u32
16081 float32x2_t vreinterpret_f32_s32 (int32x2_t t);
16082 #define vreinterpret_f32_s32 vreinterpret_f32_u32
16085 float32x2_t vreinterpret_f32_s16 (int16x4_t t);
16086 #define vreinterpret_f32_s16 vreinterpret_f32_u32
16088 float32x2_t vreinterpret_f32_s8 (int8x8_t t);
16089 #define vreinterpret_f32_s8 vreinterpret_f32_u32
16092 float32x2_t vreinterpret_f32_u64(uint64x1_t t);
16093 #define vreinterpret_f32_u64 vreinterpret_f32_u32
16096 float32x2_t vreinterpret_f32_s64 (int64x1_t t);
16097 #define vreinterpret_f32_s64 vreinterpret_f32_u32
16100 float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
16101 #define vreinterpret_f32_p16 vreinterpret_f32_u32
16103 float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
16104 #define vreinterpret_f32_p8 vreinterpret_f32_u32
16106 float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
16107 #define vreinterpretq_f32_u32(t) *(__m128*)&(t)
16109 float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
16110 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
16112 float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
16113 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
16115 float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
16116 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
16118 float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
16119 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
16121 float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
16122 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
16124 float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
16125 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
16127 float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
16128 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
16130 float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
16131 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
16133 float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
16134 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
16136 //*** Integer type conversions ******************
16137 //no conversion necessary for the following functions because the underlying data type is the same
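//Because all 64-bit (d-register) integer types share one underlying representation here, these macros expand to nothing
//and the "cast" is a pure compile-time relabelling. Illustrative usage (hypothetical variables):
//    uint32x2_t u;                           //some vector of two uint32 lanes
//    int64x1_t s = vreinterpret_s64_u32(u);  //expands to (u): same bits, viewed as one int64 lane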
16138 int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
16139 #define vreinterpret_s64_u32
16141 int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
16142 #define vreinterpret_s64_u16
16144 int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
16145 #define vreinterpret_s64_u8
16147 int64x1_t vreinterpret_s64_s32 (int32x2_t t);
16148 #define vreinterpret_s64_s32
16150 int64x1_t vreinterpret_s64_s16 (int16x4_t t);
16151 #define vreinterpret_s64_s16
16153 int64x1_t vreinterpret_s64_s8 (int8x8_t t);
16154 #define vreinterpret_s64_s8
16156 int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
16157 #define vreinterpret_s64_u64
16159 int64x1_t vreinterpret_s64_f32 (float32x2_t t);
16160 #define vreinterpret_s64_f32
16162 int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
16163 #define vreinterpret_s64_p16
16165 int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
16166 #define vreinterpret_s64_p8
16168 int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
16169 #define vreinterpretq_s64_u32
16171 int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
16172 #define vreinterpretq_s64_s16
16174 int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
16175 #define vreinterpretq_s64_u8
16177 int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
16178 #define vreinterpretq_s64_s32
16180 int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
16181 #define vreinterpretq_s64_u16
16183 int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
16184 #define vreinterpretq_s64_s8
16186 int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
16187 #define vreinterpretq_s64_u64
16189 int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
16190 #define vreinterpretq_s64_f32(t) _M128i(t)
16192 int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
16193 #define vreinterpretq_s64_p16
16195 int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
16196 #define vreinterpretq_s64_p8
16198 uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
16199 #define vreinterpret_u64_u32
16201 uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
16202 #define vreinterpret_u64_u16
16204 uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
16205 #define vreinterpret_u64_u8
16207 uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
16208 #define vreinterpret_u64_s32
16210 uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
16211 #define vreinterpret_u64_s16
16213 uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
16214 #define vreinterpret_u64_s8
16216 uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
16217 #define vreinterpret_u64_s64
16219 uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
16220 #define vreinterpret_u64_f32
16222 uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
16223 #define vreinterpret_u64_p16
16225 uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
16226 #define vreinterpret_u64_p8
16228 uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
16229 #define vreinterpretq_u64_u32
16231 uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
16232 #define vreinterpretq_u64_u16
16234 uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
16235 #define vreinterpretq_u64_u8
16237 uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
16238 #define vreinterpretq_u64_s32
16240 uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
16241 #define vreinterpretq_u64_s16
16243 uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
16244 #define vreinterpretq_u64_s8
16246 uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
16247 #define vreinterpretq_u64_s64
16249 uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
16250 #define vreinterpretq_u64_f32(t) _M128i(t)
16252 uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
16253 #define vreinterpretq_u64_p16
16255 uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
16256 #define vreinterpretq_u64_p8
16258 int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
16259 #define vreinterpret_s8_u32
16261 int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
16262 #define vreinterpret_s8_u16
16264 int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
16265 #define vreinterpret_s8_u8
16267 int8x8_t vreinterpret_s8_s32 (int32x2_t t);
16268 #define vreinterpret_s8_s32
16270 int8x8_t vreinterpret_s8_s16 (int16x4_t t);
16271 #define vreinterpret_s8_s16
16273 int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
16274 #define vreinterpret_s8_u64
16276 int8x8_t vreinterpret_s8_s64 (int64x1_t t);
16277 #define vreinterpret_s8_s64
16279 int8x8_t vreinterpret_s8_f32 (float32x2_t t);
16280 #define vreinterpret_s8_f32
16282 int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
16283 #define vreinterpret_s8_p16
16285 int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
16286 #define vreinterpret_s8_p8
16288 int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
16289 #define vreinterpretq_s8_u32
16291 int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
16292 #define vreinterpretq_s8_u16
16294 int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
16295 #define vreinterpretq_s8_u8
16297 int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
16298 #define vreinterpretq_s8_s32
16300 int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
16301 #define vreinterpretq_s8_s16
16303 int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
16304 #define vreinterpretq_s8_u64
16306 int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
16307 #define vreinterpretq_s8_s64
16309 int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
16310 #define vreinterpretq_s8_f32(t) _M128i(t)
16312 int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
16313 #define vreinterpretq_s8_p16
16315 int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
16316 #define vreinterpretq_s8_p8
16318 int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
16319 #define vreinterpret_s16_u32
16321 int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
16322 #define vreinterpret_s16_u16
16324 int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
16325 #define vreinterpret_s16_u8
16327 int16x4_t vreinterpret_s16_s32 (int32x2_t t);
16328 #define vreinterpret_s16_s32
16330 int16x4_t vreinterpret_s16_s8 (int8x8_t t);
16331 #define vreinterpret_s16_s8
16333 int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
16334 #define vreinterpret_s16_u64
16336 int16x4_t vreinterpret_s16_s64 (int64x1_t t);
16337 #define vreinterpret_s16_s64
16339 int16x4_t vreinterpret_s16_f32 (float32x2_t t);
16340 #define vreinterpret_s16_f32
16343 int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
16344 #define vreinterpret_s16_p16
16346 int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
16347 #define vreinterpret_s16_p8
16349 int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
16350 #define vreinterpretq_s16_u32
16352 int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
16353 #define vreinterpretq_s16_u16
16355 int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
16356 #define vreinterpretq_s16_u8
16358 int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
16359 #define vreinterpretq_s16_s32
16361 int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
16362 #define vreinterpretq_s16_s8
16364 int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
16365 #define vreinterpretq_s16_u64
16367 int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
16368 #define vreinterpretq_s16_s64
16370 int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
16371 #define vreinterpretq_s16_f32(t) _M128i(t)
16373 int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
16374 #define vreinterpretq_s16_p16
16376 int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
16377 #define vreinterpretq_s16_p8
16379 int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
16380 #define vreinterpret_s32_u32
16382 int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
16383 #define vreinterpret_s32_u16
16385 int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
16386 #define vreinterpret_s32_u8
16388 int32x2_t vreinterpret_s32_s16 (int16x4_t t);
16389 #define vreinterpret_s32_s16
16391 int32x2_t vreinterpret_s32_s8 (int8x8_t t);
16392 #define vreinterpret_s32_s8
16394 int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
16395 #define vreinterpret_s32_u64
16397 int32x2_t vreinterpret_s32_s64 (int64x1_t t);
16398 #define vreinterpret_s32_s64
16400 int32x2_t vreinterpret_s32_f32 (float32x2_t t);
16401 #define vreinterpret_s32_f32
16403 int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
16404 #define vreinterpret_s32_p16
16406 int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
16407 #define vreinterpret_s32_p8
16409 int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
16410 #define vreinterpretq_s32_u32
16412 int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
16413 #define vreinterpretq_s32_u16
16415 int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
16416 #define vreinterpretq_s32_u8
16418 int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
16419 #define vreinterpretq_s32_s16
16421 int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
16422 #define vreinterpretq_s32_s8
16424 int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
16425 #define vreinterpretq_s32_u64
16427 int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
16428 #define vreinterpretq_s32_s64
16430 int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
16431 #define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
16433 int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
16434 #define vreinterpretq_s32_p16
16436 int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
16437 #define vreinterpretq_s32_p8
16439 uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
16440 #define vreinterpret_u8_u32
16442 uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
16443 #define vreinterpret_u8_u16
16445 uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
16446 #define vreinterpret_u8_s32
16448 uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
16449 #define vreinterpret_u8_s16
16451 uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
16452 #define vreinterpret_u8_s8
16454 uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
16455 #define vreinterpret_u8_u64
16457 uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
16458 #define vreinterpret_u8_s64
16460 uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
16461 #define vreinterpret_u8_f32
16463 uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
16464 #define vreinterpret_u8_p16
16466 uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
16467 #define vreinterpret_u8_p8
16469 uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
16470 #define vreinterpretq_u8_u32
16472 uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
16473 #define vreinterpretq_u8_u16
16475 uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
16476 #define vreinterpretq_u8_s32
16478 uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
16479 #define vreinterpretq_u8_s16
16481 uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
16482 #define vreinterpretq_u8_s8
16484 uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
16485 #define vreinterpretq_u8_u64
16487 uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
16488 #define vreinterpretq_u8_s64
16490 uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
16491 #define vreinterpretq_u8_f32(t) _M128i(t)
16494 uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
16495 #define vreinterpretq_u8_p16
16497 uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
16498 #define vreinterpretq_u8_p8
16500 uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
16501 #define vreinterpret_u16_u32
16503 uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
16504 #define vreinterpret_u16_u8
16506 uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
16507 #define vreinterpret_u16_s32
16509 uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
16510 #define vreinterpret_u16_s16
16512 uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
16513 #define vreinterpret_u16_s8
16515 uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
16516 #define vreinterpret_u16_u64
16518 uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
16519 #define vreinterpret_u16_s64
16521 uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
16522 #define vreinterpret_u16_f32
16524 uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
16525 #define vreinterpret_u16_p16
16527 uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
16528 #define vreinterpret_u16_p8
16530 uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
16531 #define vreinterpretq_u16_u32
16533 uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
16534 #define vreinterpretq_u16_u8
16536 uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
16537 #define vreinterpretq_u16_s32
16539 uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
16540 #define vreinterpretq_u16_s16
16542 uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
16543 #define vreinterpretq_u16_s8
16545 uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
16546 #define vreinterpretq_u16_u64
16548 uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
16549 #define vreinterpretq_u16_s64
16551 uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
16552 #define vreinterpretq_u16_f32(t) _M128i(t)
16554 uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
16555 #define vreinterpretq_u16_p16
16557 uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
16558 #define vreinterpretq_u16_p8
16560 uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
16561 #define vreinterpret_u32_u16
16563 uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
16564 #define vreinterpret_u32_u8
16566 uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
16567 #define vreinterpret_u32_s32
16569 uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
16570 #define vreinterpret_u32_s16
16572 uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
16573 #define vreinterpret_u32_s8
16575 uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
16576 #define vreinterpret_u32_u64
16578 uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
16579 #define vreinterpret_u32_s64
16581 uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
16582 #define vreinterpret_u32_f32
16584 uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
16585 #define vreinterpret_u32_p16
16587 uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
16588 #define vreinterpret_u32_p8
16590 uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
16591 #define vreinterpretq_u32_u16
16593 uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
16594 #define vreinterpretq_u32_u8
16596 uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
16597 #define vreinterpretq_u32_s32
16599 uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
16600 #define vreinterpretq_u32_s16
16602 uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
16603 #define vreinterpretq_u32_s8
16605 uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
16606 #define vreinterpretq_u32_u64
16608 uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
16609 #define vreinterpretq_u32_s64
16611 uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
16612 #define vreinterpretq_u32_f32(t) _M128i(t)
16614 uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
16615 #define vreinterpretq_u32_p16
16617 uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
16618 #define vreinterpretq_u32_p8
16620 #endif /* NEON2SSE_H */